This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
do_gv_dump, do_gvgv_dump and do_hv_dump call all actually take a NULL
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 154#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 155#define RExC_emit (pRExC_state->emit)
fac92740 156#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
157#define RExC_naughty (pRExC_state->naughty)
158#define RExC_sawback (pRExC_state->sawback)
159#define RExC_seen (pRExC_state->seen)
160#define RExC_size (pRExC_state->size)
161#define RExC_npar (pRExC_state->npar)
e2e6a0f1 162#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
163#define RExC_extralen (pRExC_state->extralen)
164#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
165#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 166#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 167#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
168#define RExC_open_parens (pRExC_state->open_parens)
169#define RExC_close_parens (pRExC_state->close_parens)
170#define RExC_opend (pRExC_state->opend)
81714fb9 171#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
172#define RExC_recurse (pRExC_state->recurse)
173#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 174
a687059c
LW
175#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
176#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
177 ((*s) == '{' && regcurly(s)))
a687059c 178
35c8bce7
LW
179#ifdef SPSTART
180#undef SPSTART /* dratted cpp namespace... */
181#endif
a687059c
LW
182/*
183 * Flags to be passed up and down.
184 */
a687059c 185#define WORST 0 /* Worst case. */
821b33a5 186#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
187#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
188#define SPSTART 0x4 /* Starts with * or +. */
189#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 190
3dab1dad
YO
191#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
192
07be1b83
YO
193/* whether trie related optimizations are enabled */
194#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
195#define TRIE_STUDY_OPT
786e8c11 196#define FULL_TRIE_STUDY
07be1b83
YO
197#define TRIE_STCLASS
198#endif
1de06328
YO
199
200
40d049e4
YO
201
202#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
203#define PBITVAL(paren) (1 << ((paren) & 7))
204#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
205#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
206#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
207
208
1de06328
YO
209/* About scan_data_t.
210
211 During optimisation we recurse through the regexp program performing
212 various inplace (keyhole style) optimisations. In addition study_chunk
213 and scan_commit populate this data structure with information about
214 what strings MUST appear in the pattern. We look for the longest
215 string that must appear for at a fixed location, and we look for the
216 longest string that may appear at a floating location. So for instance
217 in the pattern:
218
219 /FOO[xX]A.*B[xX]BAR/
220
221 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
222 strings (because they follow a .* construct). study_chunk will identify
223 both FOO and BAR as being the longest fixed and floating strings respectively.
224
225 The strings can be composites, for instance
226
227 /(f)(o)(o)/
228
229 will result in a composite fixed substring 'foo'.
230
231 For each string some basic information is maintained:
232
233 - offset or min_offset
234 This is the position the string must appear at, or not before.
235 It also implicitly (when combined with minlenp) tells us how many
236 character must match before the string we are searching.
237 Likewise when combined with minlenp and the length of the string
238 tells us how many characters must appear after the string we have
239 found.
240
241 - max_offset
242 Only used for floating strings. This is the rightmost point that
243 the string can appear at. Ifset to I32 max it indicates that the
244 string can occur infinitely far to the right.
245
246 - minlenp
247 A pointer to the minimum length of the pattern that the string
248 was found inside. This is important as in the case of positive
249 lookahead or positive lookbehind we can have multiple patterns
250 involved. Consider
251
252 /(?=FOO).*F/
253
254 The minimum length of the pattern overall is 3, the minimum length
255 of the lookahead part is 3, but the minimum length of the part that
256 will actually match is 1. So 'FOO's minimum length is 3, but the
257 minimum length for the F is 1. This is important as the minimum length
258 is used to determine offsets in front of and behind the string being
259 looked for. Since strings can be composites this is the length of the
260 pattern at the time it was commited with a scan_commit. Note that
261 the length is calculated by study_chunk, so that the minimum lengths
262 are not known until the full pattern has been compiled, thus the
263 pointer to the value.
264
265 - lookbehind
266
267 In the case of lookbehind the string being searched for can be
268 offset past the start point of the final matching string.
269 If this value was just blithely removed from the min_offset it would
270 invalidate some of the calculations for how many chars must match
271 before or after (as they are derived from min_offset and minlen and
272 the length of the string being searched for).
273 When the final pattern is compiled and the data is moved from the
274 scan_data_t structure into the regexp structure the information
275 about lookbehind is factored in, with the information that would
276 have been lost precalculated in the end_shift field for the
277 associated string.
278
279 The fields pos_min and pos_delta are used to store the minimum offset
280 and the delta to the maximum offset at the current point in the pattern.
281
282*/
2c2d71f5
JH
283
284typedef struct scan_data_t {
1de06328
YO
285 /*I32 len_min; unused */
286 /*I32 len_delta; unused */
2c2d71f5
JH
287 I32 pos_min;
288 I32 pos_delta;
289 SV *last_found;
1de06328 290 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
291 I32 last_start_min;
292 I32 last_start_max;
1de06328
YO
293 SV **longest; /* Either &l_fixed, or &l_float. */
294 SV *longest_fixed; /* longest fixed string found in pattern */
295 I32 offset_fixed; /* offset where it starts */
296 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
297 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
298 SV *longest_float; /* longest floating string found in pattern */
299 I32 offset_float_min; /* earliest point in string it can appear */
300 I32 offset_float_max; /* latest point in string it can appear */
301 I32 *minlen_float; /* pointer to the minlen relevent to the string */
302 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
303 I32 flags;
304 I32 whilem_c;
cb434fcc 305 I32 *last_closep;
653099ff 306 struct regnode_charclass_class *start_class;
2c2d71f5
JH
307} scan_data_t;
308
a687059c 309/*
e50aee73 310 * Forward declarations for pregcomp()'s friends.
a687059c 311 */
a0d0e21e 312
27da23d5 313static const scan_data_t zero_scan_data =
1de06328 314 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
315
316#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
317#define SF_BEFORE_SEOL 0x0001
318#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
319#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
320#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
321
09b7f37c
CB
322#ifdef NO_UNARY_PLUS
323# define SF_FIX_SHIFT_EOL (0+2)
324# define SF_FL_SHIFT_EOL (0+4)
325#else
326# define SF_FIX_SHIFT_EOL (+2)
327# define SF_FL_SHIFT_EOL (+4)
328#endif
c277df42
IZ
329
330#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
331#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
332
333#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
334#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
335#define SF_IS_INF 0x0040
336#define SF_HAS_PAR 0x0080
337#define SF_IN_PAR 0x0100
338#define SF_HAS_EVAL 0x0200
339#define SCF_DO_SUBSTR 0x0400
653099ff
GS
340#define SCF_DO_STCLASS_AND 0x0800
341#define SCF_DO_STCLASS_OR 0x1000
342#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 343#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 344
786e8c11 345#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 346#define SCF_SEEN_ACCEPT 0x8000
07be1b83 347
eb160463 348#define UTF (RExC_utf8 != 0)
bbe252da
YO
349#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
350#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 351
ffc61ed2 352#define OOB_UNICODE 12345678
93733859 353#define OOB_NAMEDCLASS -1
b8c5462f 354
a0ed51b3
LW
355#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
356#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
357
8615cb43 358
b45f050a
JF
359/* length of regex to show in messages that don't mark a position within */
360#define RegexLengthToShowInErrorMessages 127
361
362/*
363 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
364 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
365 * op/pragma/warn/regcomp.
366 */
7253e4e3
RK
367#define MARKER1 "<-- HERE" /* marker as it appears in the description */
368#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 369
7253e4e3 370#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
371
372/*
373 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
374 * arg. Show regex, up to a maximum length. If it's too long, chop and add
375 * "...".
376 */
58e23c8d 377#define _FAIL(code) STMT_START { \
bfed75c6 378 const char *ellipses = ""; \
ccb2c380
MP
379 IV len = RExC_end - RExC_precomp; \
380 \
381 if (!SIZE_ONLY) \
382 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
383 if (len > RegexLengthToShowInErrorMessages) { \
384 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
385 len = RegexLengthToShowInErrorMessages - 10; \
386 ellipses = "..."; \
387 } \
58e23c8d 388 code; \
ccb2c380 389} STMT_END
8615cb43 390
58e23c8d
YO
391#define FAIL(msg) _FAIL( \
392 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
393 msg, (int)len, RExC_precomp, ellipses))
394
395#define FAIL2(msg,arg) _FAIL( \
396 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
397 arg, (int)len, RExC_precomp, ellipses))
398
b45f050a 399/*
b45f050a
JF
400 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
401 */
ccb2c380 402#define Simple_vFAIL(m) STMT_START { \
a28509cc 403 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
404 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
405 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
b45f050a
JF
407
408/*
409 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
410 */
ccb2c380
MP
411#define vFAIL(m) STMT_START { \
412 if (!SIZE_ONLY) \
413 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 Simple_vFAIL(m); \
415} STMT_END
b45f050a
JF
416
417/*
418 * Like Simple_vFAIL(), but accepts two arguments.
419 */
ccb2c380 420#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 421 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
422 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
423 (int)offset, RExC_precomp, RExC_precomp + offset); \
424} STMT_END
b45f050a
JF
425
426/*
427 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
428 */
ccb2c380
MP
429#define vFAIL2(m,a1) STMT_START { \
430 if (!SIZE_ONLY) \
431 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
432 Simple_vFAIL2(m, a1); \
433} STMT_END
b45f050a
JF
434
435
436/*
437 * Like Simple_vFAIL(), but accepts three arguments.
438 */
ccb2c380 439#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 440 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
441 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
442 (int)offset, RExC_precomp, RExC_precomp + offset); \
443} STMT_END
b45f050a
JF
444
445/*
446 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
447 */
ccb2c380
MP
448#define vFAIL3(m,a1,a2) STMT_START { \
449 if (!SIZE_ONLY) \
450 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
451 Simple_vFAIL3(m, a1, a2); \
452} STMT_END
b45f050a
JF
453
454/*
455 * Like Simple_vFAIL(), but accepts four arguments.
456 */
ccb2c380 457#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 458 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
459 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
460 (int)offset, RExC_precomp, RExC_precomp + offset); \
461} STMT_END
b45f050a 462
ccb2c380 463#define vWARN(loc,m) STMT_START { \
a28509cc 464 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
465 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
466 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
468
469#define vWARNdep(loc,m) STMT_START { \
a28509cc 470 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
471 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
472 "%s" REPORT_LOCATION, \
473 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476
477#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 478 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
479 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
480 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
481} STMT_END
482
483#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 484 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
485 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
486 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
487} STMT_END
488
489#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 490 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
491 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
493} STMT_END
494
495#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 496 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
9d1d55b5 500
8615cb43 501
cd439c50 502/* Allow for side effects in s */
ccb2c380
MP
503#define REGC(c,s) STMT_START { \
504 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
505} STMT_END
cd439c50 506
fac92740
MJD
507/* Macros for recording node offsets. 20001227 mjd@plover.com
508 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
509 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
510 * Element 0 holds the number n.
07be1b83 511 * Position is 1 indexed.
fac92740
MJD
512 */
513
ccb2c380
MP
514#define Set_Node_Offset_To_R(node,byte) STMT_START { \
515 if (! SIZE_ONLY) { \
516 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 517 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 518 if((node) < 0) { \
551405c4 519 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
520 } else { \
521 RExC_offsets[2*(node)-1] = (byte); \
522 } \
523 } \
524} STMT_END
525
526#define Set_Node_Offset(node,byte) \
527 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
528#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
529
530#define Set_Node_Length_To_R(node,len) STMT_START { \
531 if (! SIZE_ONLY) { \
532 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 533 __LINE__, (int)(node), (int)(len))); \
ccb2c380 534 if((node) < 0) { \
551405c4 535 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
536 } else { \
537 RExC_offsets[2*(node)] = (len); \
538 } \
539 } \
540} STMT_END
541
542#define Set_Node_Length(node,len) \
543 Set_Node_Length_To_R((node)-RExC_emit_start, len)
544#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
545#define Set_Node_Cur_Length(node) \
546 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
547
548/* Get offsets and lengths */
549#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
550#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
551
07be1b83
YO
552#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
553 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
554 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
555} STMT_END
556
557
558#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
559#define EXPERIMENTAL_INPLACESCAN
560#endif
561
304ee84b
YO
562#define DEBUG_STUDYDATA(str,data,depth) \
563DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 564 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
565 "%*s" str "Pos:%"IVdf"/%"IVdf \
566 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
567 (int)(depth)*2, "", \
568 (IV)((data)->pos_min), \
569 (IV)((data)->pos_delta), \
304ee84b 570 (UV)((data)->flags), \
1de06328 571 (IV)((data)->whilem_c), \
304ee84b
YO
572 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
573 is_inf ? "INF " : "" \
1de06328
YO
574 ); \
575 if ((data)->last_found) \
576 PerlIO_printf(Perl_debug_log, \
577 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
578 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
579 SvPVX_const((data)->last_found), \
580 (IV)((data)->last_end), \
581 (IV)((data)->last_start_min), \
582 (IV)((data)->last_start_max), \
583 ((data)->longest && \
584 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
585 SvPVX_const((data)->longest_fixed), \
586 (IV)((data)->offset_fixed), \
587 ((data)->longest && \
588 (data)->longest==&((data)->longest_float)) ? "*" : "", \
589 SvPVX_const((data)->longest_float), \
590 (IV)((data)->offset_float_min), \
591 (IV)((data)->offset_float_max) \
592 ); \
593 PerlIO_printf(Perl_debug_log,"\n"); \
594});
595
acfe0abc 596static void clear_re(pTHX_ void *r);
4327152a 597
653099ff 598/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 599 Update the longest found anchored substring and the longest found
653099ff
GS
600 floating substrings if needed. */
601
4327152a 602STATIC void
304ee84b 603S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 604{
e1ec3a88
AL
605 const STRLEN l = CHR_SVLEN(data->last_found);
606 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 607 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 608
c277df42 609 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 610 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
611 if (*data->longest == data->longest_fixed) {
612 data->offset_fixed = l ? data->last_start_min : data->pos_min;
613 if (data->flags & SF_BEFORE_EOL)
b81d288d 614 data->flags
c277df42
IZ
615 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
616 else
617 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
618 data->minlen_fixed=minlenp;
619 data->lookbehind_fixed=0;
a0ed51b3 620 }
304ee84b 621 else { /* *data->longest == data->longest_float */
c277df42 622 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
623 data->offset_float_max = (l
624 ? data->last_start_max
c277df42 625 : data->pos_min + data->pos_delta);
304ee84b 626 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 627 data->offset_float_max = I32_MAX;
c277df42 628 if (data->flags & SF_BEFORE_EOL)
b81d288d 629 data->flags
c277df42
IZ
630 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
631 else
632 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
633 data->minlen_float=minlenp;
634 data->lookbehind_float=0;
c277df42
IZ
635 }
636 }
637 SvCUR_set(data->last_found, 0);
0eda9292 638 {
a28509cc 639 SV * const sv = data->last_found;
097eb12c
AL
640 if (SvUTF8(sv) && SvMAGICAL(sv)) {
641 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
642 if (mg)
643 mg->mg_len = 0;
644 }
0eda9292 645 }
c277df42
IZ
646 data->last_end = -1;
647 data->flags &= ~SF_BEFORE_EOL;
304ee84b 648 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
649}
650
653099ff
GS
651/* Can match anything (initialization) */
652STATIC void
097eb12c 653S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 654{
653099ff 655 ANYOF_CLASS_ZERO(cl);
f8bef550 656 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 657 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
658 if (LOC)
659 cl->flags |= ANYOF_LOCALE;
660}
661
662/* Can match anything (initialization) */
663STATIC int
5f66b61c 664S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
665{
666 int value;
667
aaa51d5e 668 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
669 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
670 return 1;
1aa99e6b
IH
671 if (!(cl->flags & ANYOF_UNICODE_ALL))
672 return 0;
10edeb5d 673 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 674 return 0;
653099ff
GS
675 return 1;
676}
677
678/* Can match anything (initialization) */
679STATIC void
097eb12c 680S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 681{
8ecf7187 682 Zero(cl, 1, struct regnode_charclass_class);
653099ff 683 cl->type = ANYOF;
830247a4 684 cl_anything(pRExC_state, cl);
653099ff
GS
685}
686
687STATIC void
097eb12c 688S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 689{
8ecf7187 690 Zero(cl, 1, struct regnode_charclass_class);
653099ff 691 cl->type = ANYOF;
830247a4 692 cl_anything(pRExC_state, cl);
653099ff
GS
693 if (LOC)
694 cl->flags |= ANYOF_LOCALE;
695}
696
697/* 'And' a given class with another one. Can create false positives */
698/* We assume that cl is not inverted */
699STATIC void
5f66b61c 700S_cl_and(struct regnode_charclass_class *cl,
a28509cc 701 const struct regnode_charclass_class *and_with)
653099ff 702{
40d049e4
YO
703
704 assert(and_with->type == ANYOF);
653099ff
GS
705 if (!(and_with->flags & ANYOF_CLASS)
706 && !(cl->flags & ANYOF_CLASS)
707 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
708 && !(and_with->flags & ANYOF_FOLD)
709 && !(cl->flags & ANYOF_FOLD)) {
710 int i;
711
712 if (and_with->flags & ANYOF_INVERT)
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= ~and_with->bitmap[i];
715 else
716 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
717 cl->bitmap[i] &= and_with->bitmap[i];
718 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
719 if (!(and_with->flags & ANYOF_EOS))
720 cl->flags &= ~ANYOF_EOS;
1aa99e6b 721
14ebb1a2
JH
722 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
723 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
724 cl->flags &= ~ANYOF_UNICODE_ALL;
725 cl->flags |= ANYOF_UNICODE;
726 ARG_SET(cl, ARG(and_with));
727 }
14ebb1a2
JH
728 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
731 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
732 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 733 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
734}
735
736/* 'OR' a given class with another one. Can create false positives */
737/* We assume that cl is not inverted */
738STATIC void
097eb12c 739S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 740{
653099ff
GS
741 if (or_with->flags & ANYOF_INVERT) {
742 /* We do not use
743 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
744 * <= (B1 | !B2) | (CL1 | !CL2)
745 * which is wasteful if CL2 is small, but we ignore CL2:
746 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
747 * XXXX Can we handle case-fold? Unclear:
748 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
749 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
750 */
751 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
752 && !(or_with->flags & ANYOF_FOLD)
753 && !(cl->flags & ANYOF_FOLD) ) {
754 int i;
755
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= ~or_with->bitmap[i];
758 } /* XXXX: logic is complicated otherwise */
759 else {
830247a4 760 cl_anything(pRExC_state, cl);
653099ff
GS
761 }
762 } else {
763 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
764 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 765 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
766 || (cl->flags & ANYOF_FOLD)) ) {
767 int i;
768
769 /* OR char bitmap and class bitmap separately */
770 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
771 cl->bitmap[i] |= or_with->bitmap[i];
772 if (or_with->flags & ANYOF_CLASS) {
773 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
774 cl->classflags[i] |= or_with->classflags[i];
775 cl->flags |= ANYOF_CLASS;
776 }
777 }
778 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 779 cl_anything(pRExC_state, cl);
653099ff
GS
780 }
781 }
782 if (or_with->flags & ANYOF_EOS)
783 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
784
785 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
786 ARG(cl) != ARG(or_with)) {
787 cl->flags |= ANYOF_UNICODE_ALL;
788 cl->flags &= ~ANYOF_UNICODE;
789 }
790 if (or_with->flags & ANYOF_UNICODE_ALL) {
791 cl->flags |= ANYOF_UNICODE_ALL;
792 cl->flags &= ~ANYOF_UNICODE;
793 }
653099ff
GS
794}
795
a3621e74
YO
796#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
797#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
798#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
799#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
800
3dab1dad
YO
801
802#ifdef DEBUGGING
07be1b83 803/*
2b8b4781
NC
804 dump_trie(trie,widecharmap,revcharmap)
805 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
806 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
807
808 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
809 The _interim_ variants are used for debugging the interim
810 tables that are used to generate the final compressed
811 representation which is what dump_trie expects.
812
3dab1dad
YO
813 Part of the reason for their existance is to provide a form
814 of documentation as to how the different representations function.
07be1b83
YO
815
816*/
3dab1dad
YO
817
818/*
3dab1dad
YO
819 Dumps the final compressed table form of the trie to Perl_debug_log.
820 Used for debugging make_trie().
821*/
822
823STATIC void
2b8b4781
NC
824S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
825 AV *revcharmap, U32 depth)
3dab1dad
YO
826{
827 U32 state;
ab3bbdeb 828 SV *sv=sv_newmortal();
55eed653 829 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
830 GET_RE_DEBUG_FLAGS_DECL;
831
ab3bbdeb 832
3dab1dad
YO
833 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
834 (int)depth * 2 + 2,"",
835 "Match","Base","Ofs" );
836
837 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 838 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 839 if ( tmp ) {
ab3bbdeb
YO
840 PerlIO_printf( Perl_debug_log, "%*s",
841 colwidth,
ddc5bc0f 842 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
843 PL_colors[0], PL_colors[1],
844 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
845 PERL_PV_ESCAPE_FIRSTCHAR
846 )
847 );
3dab1dad
YO
848 }
849 }
850 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
851 (int)depth * 2 + 2,"");
852
853 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 854 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
855 PerlIO_printf( Perl_debug_log, "\n");
856
1e2e3d02 857 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 858 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
859
860 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
861
862 if ( trie->states[ state ].wordnum ) {
863 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
864 } else {
865 PerlIO_printf( Perl_debug_log, "%6s", "" );
866 }
867
868 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
869
870 if ( base ) {
871 U32 ofs = 0;
872
873 while( ( base + ofs < trie->uniquecharcount ) ||
874 ( base + ofs - trie->uniquecharcount < trie->lasttrans
875 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
876 ofs++;
877
878 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
879
880 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
881 if ( ( base + ofs >= trie->uniquecharcount ) &&
882 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
883 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
884 {
ab3bbdeb
YO
885 PerlIO_printf( Perl_debug_log, "%*"UVXf,
886 colwidth,
3dab1dad
YO
887 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
888 } else {
ab3bbdeb 889 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
890 }
891 }
892
893 PerlIO_printf( Perl_debug_log, "]");
894
895 }
896 PerlIO_printf( Perl_debug_log, "\n" );
897 }
898}
899/*
3dab1dad
YO
900 Dumps a fully constructed but uncompressed trie in list form.
901 List tries normally only are used for construction when the number of
902 possible chars (trie->uniquecharcount) is very high.
903 Used for debugging make_trie().
904*/
905STATIC void
55eed653 906S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
907 HV *widecharmap, AV *revcharmap, U32 next_alloc,
908 U32 depth)
3dab1dad
YO
909{
910 U32 state;
ab3bbdeb 911 SV *sv=sv_newmortal();
55eed653 912 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
913 GET_RE_DEBUG_FLAGS_DECL;
914 /* print out the table precompression. */
ab3bbdeb
YO
915 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
916 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
917 "------:-----+-----------------\n" );
3dab1dad
YO
918
919 for( state=1 ; state < next_alloc ; state ++ ) {
920 U16 charid;
921
ab3bbdeb 922 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
923 (int)depth * 2 + 2,"", (UV)state );
924 if ( ! trie->states[ state ].wordnum ) {
925 PerlIO_printf( Perl_debug_log, "%5s| ","");
926 } else {
927 PerlIO_printf( Perl_debug_log, "W%4x| ",
928 trie->states[ state ].wordnum
929 );
930 }
931 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 932 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
933 if ( tmp ) {
934 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
935 colwidth,
ddc5bc0f 936 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
937 PL_colors[0], PL_colors[1],
938 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
939 PERL_PV_ESCAPE_FIRSTCHAR
940 ) ,
1e2e3d02
YO
941 TRIE_LIST_ITEM(state,charid).forid,
942 (UV)TRIE_LIST_ITEM(state,charid).newstate
943 );
944 if (!(charid % 10))
664e119d
RGS
945 PerlIO_printf(Perl_debug_log, "\n%*s| ",
946 (int)((depth * 2) + 14), "");
1e2e3d02 947 }
ab3bbdeb
YO
948 }
949 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
950 }
951}
952
953/*
3dab1dad
YO
954 Dumps a fully constructed but uncompressed trie in table form.
955 This is the normal DFA style state transition table, with a few
956 twists to facilitate compression later.
957 Used for debugging make_trie().
958*/
959STATIC void
55eed653 960S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
962 U32 depth)
3dab1dad
YO
963{
964 U32 state;
965 U16 charid;
ab3bbdeb 966 SV *sv=sv_newmortal();
55eed653 967 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
968 GET_RE_DEBUG_FLAGS_DECL;
969
970 /*
971 print out the table precompression so that we can do a visual check
972 that they are identical.
973 */
974
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
976
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 978 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 979 if ( tmp ) {
ab3bbdeb
YO
980 PerlIO_printf( Perl_debug_log, "%*s",
981 colwidth,
ddc5bc0f 982 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
983 PL_colors[0], PL_colors[1],
984 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
985 PERL_PV_ESCAPE_FIRSTCHAR
986 )
987 );
3dab1dad
YO
988 }
989 }
990
991 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
992
993 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 994 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
995 }
996
997 PerlIO_printf( Perl_debug_log, "\n" );
998
999 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1000
1001 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1002 (int)depth * 2 + 2,"",
1003 (UV)TRIE_NODENUM( state ) );
1004
1005 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1006 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1007 if (v)
1008 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1009 else
1010 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1011 }
1012 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1013 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1014 } else {
1015 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1016 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1017 }
1018 }
07be1b83 1019}
3dab1dad
YO
1020
1021#endif
1022
786e8c11
YO
1023/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1024 startbranch: the first branch in the whole branch sequence
1025 first : start branch of sequence of branch-exact nodes.
1026 May be the same as startbranch
1027 last : Thing following the last branch.
1028 May be the same as tail.
1029 tail : item following the branch sequence
1030 count : words in the sequence
1031 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1032 depth : indent depth
3dab1dad 1033
786e8c11 1034Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1035
786e8c11
YO
1036A trie is an N'ary tree where the branches are determined by digital
1037decomposition of the key. IE, at the root node you look up the 1st character and
1038follow that branch repeat until you find the end of the branches. Nodes can be
1039marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1040
786e8c11 1041 /he|she|his|hers/
72f13be8 1042
786e8c11
YO
1043would convert into the following structure. Numbers represent states, letters
1044following numbers represent valid transitions on the letter from that state, if
1045the number is in square brackets it represents an accepting state, otherwise it
1046will be in parenthesis.
07be1b83 1047
786e8c11
YO
1048 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1049 | |
1050 | (2)
1051 | |
1052 (1) +-i->(6)-+-s->[7]
1053 |
1054 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1055
786e8c11
YO
1056 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1057
1058This shows that when matching against the string 'hers' we will begin at state 1
1059read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1060then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1061is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1062single traverse. We store a mapping from accepting to state to which word was
1063matched, and then when we have multiple possibilities we try to complete the
1064rest of the regex in the order in which they occured in the alternation.
1065
1066The only prior NFA like behaviour that would be changed by the TRIE support is
1067the silent ignoring of duplicate alternations which are of the form:
1068
1069 / (DUPE|DUPE) X? (?{ ... }) Y /x
1070
1071Thus EVAL blocks follwing a trie may be called a different number of times with
1072and without the optimisation. With the optimisations dupes will be silently
1073ignored. This inconsistant behaviour of EVAL type nodes is well established as
1074the following demonstrates:
1075
1076 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1077
1078which prints out 'word' three times, but
1079
1080 'words'=~/(word|word|word)(?{ print $1 })S/
1081
1082which doesnt print it out at all. This is due to other optimisations kicking in.
1083
1084Example of what happens on a structural level:
1085
1086The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: BRANCH(8)
1090 6: EXACT <ac>(16)
1091 8: BRANCH(11)
1092 9: EXACT <ad>(16)
1093 11: BRANCH(14)
1094 12: EXACT <ab>(16)
1095 16: SUCCEED(0)
1096 17: NOTHING(18)
1097 18: END(0)
1098
1099This would be optimizable with startbranch=5, first=5, last=16, tail=16
1100and should turn into:
1101
1102 1: CURLYM[1] {1,32767}(18)
1103 5: TRIE(16)
1104 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1105 <ac>
1106 <ad>
1107 <ab>
1108 16: SUCCEED(0)
1109 17: NOTHING(18)
1110 18: END(0)
1111
1112Cases where tail != last would be like /(?foo|bar)baz/:
1113
1114 1: BRANCH(4)
1115 2: EXACT <foo>(8)
1116 4: BRANCH(7)
1117 5: EXACT <bar>(8)
1118 7: TAIL(8)
1119 8: EXACT <baz>(10)
1120 10: END(0)
1121
1122which would be optimizable with startbranch=1, first=1, last=7, tail=8
1123and would end up looking like:
1124
1125 1: TRIE(8)
1126 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1127 <foo>
1128 <bar>
1129 7: TAIL(8)
1130 8: EXACT <baz>(10)
1131 10: END(0)
1132
1133 d = uvuni_to_utf8_flags(d, uv, 0);
1134
1135is the recommended Unicode-aware way of saying
1136
1137 *(d++) = uv;
1138*/
1139
1e2e3d02 1140#define TRIE_STORE_REVCHAR \
786e8c11 1141 STMT_START { \
1e2e3d02 1142 SV *tmp = newSVpvs(""); \
786e8c11 1143 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1144 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1145 av_push( revcharmap, tmp ); \
786e8c11
YO
1146 } STMT_END
1147
1148#define TRIE_READ_CHAR STMT_START { \
1149 wordlen++; \
1150 if ( UTF ) { \
1151 if ( folder ) { \
1152 if ( foldlen > 0 ) { \
1153 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1154 foldlen -= len; \
1155 scan += len; \
1156 len = 0; \
1157 } else { \
1158 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1159 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1160 foldlen -= UNISKIP( uvc ); \
1161 scan = foldbuf + UNISKIP( uvc ); \
1162 } \
1163 } else { \
1164 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1165 } \
1166 } else { \
1167 uvc = (U32)*uc; \
1168 len = 1; \
1169 } \
1170} STMT_END
1171
1172
1173
1174#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1175 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1176 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1177 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1178 } \
1179 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1180 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1181 TRIE_LIST_CUR( state )++; \
1182} STMT_END
07be1b83 1183
786e8c11
YO
1184#define TRIE_LIST_NEW(state) STMT_START { \
1185 Newxz( trie->states[ state ].trans.list, \
1186 4, reg_trie_trans_le ); \
1187 TRIE_LIST_CUR( state ) = 1; \
1188 TRIE_LIST_LEN( state ) = 4; \
1189} STMT_END
07be1b83 1190
786e8c11
YO
1191#define TRIE_HANDLE_WORD(state) STMT_START { \
1192 U16 dupe= trie->states[ state ].wordnum; \
1193 regnode * const noper_next = regnext( noper ); \
1194 \
1195 if (trie->wordlen) \
1196 trie->wordlen[ curword ] = wordlen; \
1197 DEBUG_r({ \
1198 /* store the word for dumping */ \
1199 SV* tmp; \
1200 if (OP(noper) != NOTHING) \
1201 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1202 else \
1203 tmp = newSVpvn( "", 0 ); \
1204 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1205 av_push( trie_words, tmp ); \
786e8c11
YO
1206 }); \
1207 \
1208 curword++; \
1209 \
1210 if ( noper_next < tail ) { \
1211 if (!trie->jump) \
c944940b 1212 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1213 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1214 if (!jumper) \
1215 jumper = noper_next; \
1216 if (!nextbranch) \
1217 nextbranch= regnext(cur); \
1218 } \
1219 \
1220 if ( dupe ) { \
1221 /* So it's a dupe. This means we need to maintain a */\
1222 /* linked-list from the first to the next. */\
1223 /* we only allocate the nextword buffer when there */\
1224 /* a dupe, so first time we have to do the allocation */\
1225 if (!trie->nextword) \
c944940b 1226 trie->nextword = (U16 *) \
446bd890 1227 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1228 while ( trie->nextword[dupe] ) \
1229 dupe= trie->nextword[dupe]; \
1230 trie->nextword[dupe]= curword; \
1231 } else { \
1232 /* we haven't inserted this word yet. */ \
1233 trie->states[ state ].wordnum = curword; \
1234 } \
1235} STMT_END
07be1b83 1236
3dab1dad 1237
786e8c11
YO
1238#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1239 ( ( base + charid >= ucharcount \
1240 && base + charid < ubound \
1241 && state == trie->trans[ base - ucharcount + charid ].check \
1242 && trie->trans[ base - ucharcount + charid ].next ) \
1243 ? trie->trans[ base - ucharcount + charid ].next \
1244 : ( state==1 ? special : 0 ) \
1245 )
3dab1dad 1246
786e8c11
YO
1247#define MADE_TRIE 1
1248#define MADE_JUMP_TRIE 2
1249#define MADE_EXACT_TRIE 4
3dab1dad 1250
a3621e74 1251STATIC I32
786e8c11 1252S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1253{
27da23d5 1254 dVAR;
a3621e74
YO
1255 /* first pass, loop through and scan words */
1256 reg_trie_data *trie;
55eed653 1257 HV *widecharmap = NULL;
2b8b4781 1258 AV *revcharmap = newAV();
a3621e74 1259 regnode *cur;
9f7f3913 1260 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1261 STRLEN len = 0;
1262 UV uvc = 0;
1263 U16 curword = 0;
1264 U32 next_alloc = 0;
786e8c11
YO
1265 regnode *jumper = NULL;
1266 regnode *nextbranch = NULL;
7f69552c 1267 regnode *convert = NULL;
a3621e74 1268 /* we just use folder as a flag in utf8 */
e1ec3a88 1269 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1270 ? PL_fold
1271 : ( flags == EXACTFL
1272 ? PL_fold_locale
1273 : NULL
1274 )
1275 );
1276
2b8b4781
NC
1277#ifdef DEBUGGING
1278 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1279 AV *trie_words = NULL;
1280 /* along with revcharmap, this only used during construction but both are
1281 * useful during debugging so we store them in the struct when debugging.
8e11feef 1282 */
2b8b4781
NC
1283#else
1284 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1285 STRLEN trie_charcount=0;
3dab1dad 1286#endif
2b8b4781 1287 SV *re_trie_maxbuff;
a3621e74 1288 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1289#ifndef DEBUGGING
1290 PERL_UNUSED_ARG(depth);
1291#endif
a3621e74 1292
c944940b 1293 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1294 trie->refcount = 1;
3dab1dad 1295 trie->startstate = 1;
786e8c11 1296 trie->wordcount = word_count;
f8fc2ecf 1297 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1298 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1299 if (!(UTF && folder))
c944940b 1300 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1301 DEBUG_r({
2b8b4781 1302 trie_words = newAV();
a3621e74 1303 });
a3621e74 1304
0111c4fd 1305 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1306 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1307 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1308 }
3dab1dad
YO
1309 DEBUG_OPTIMISE_r({
1310 PerlIO_printf( Perl_debug_log,
786e8c11 1311 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1312 (int)depth * 2 + 2, "",
1313 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1314 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1315 (int)depth);
3dab1dad 1316 });
7f69552c
YO
1317
1318 /* Find the node we are going to overwrite */
1319 if ( first == startbranch && OP( last ) != BRANCH ) {
1320 /* whole branch chain */
1321 convert = first;
1322 } else {
1323 /* branch sub-chain */
1324 convert = NEXTOPER( first );
1325 }
1326
a3621e74
YO
1327 /* -- First loop and Setup --
1328
1329 We first traverse the branches and scan each word to determine if it
1330 contains widechars, and how many unique chars there are, this is
1331 important as we have to build a table with at least as many columns as we
1332 have unique chars.
1333
1334 We use an array of integers to represent the character codes 0..255
1335 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1336 native representation of the character value as the key and IV's for the
1337 coded index.
1338
1339 *TODO* If we keep track of how many times each character is used we can
1340 remap the columns so that the table compression later on is more
1341 efficient in terms of memory by ensuring most common value is in the
1342 middle and the least common are on the outside. IMO this would be better
1343 than a most to least common mapping as theres a decent chance the most
1344 common letter will share a node with the least common, meaning the node
1345 will not be compressable. With a middle is most common approach the worst
1346 case is when we have the least common nodes twice.
1347
1348 */
1349
a3621e74 1350 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1351 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1352 const U8 *uc = (U8*)STRING( noper );
a28509cc 1353 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1354 STRLEN foldlen = 0;
1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1356 const U8 *scan = (U8*)NULL;
07be1b83 1357 U32 wordlen = 0; /* required init */
3dab1dad 1358 STRLEN chars=0;
a3621e74 1359
3dab1dad
YO
1360 if (OP(noper) == NOTHING) {
1361 trie->minlen= 0;
1362 continue;
1363 }
1364 if (trie->bitmap) {
1365 TRIE_BITMAP_SET(trie,*uc);
1366 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1367 }
a3621e74 1368 for ( ; uc < e ; uc += len ) {
3dab1dad 1369 TRIE_CHARCOUNT(trie)++;
a3621e74 1370 TRIE_READ_CHAR;
3dab1dad 1371 chars++;
a3621e74
YO
1372 if ( uvc < 256 ) {
1373 if ( !trie->charmap[ uvc ] ) {
1374 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1375 if ( folder )
1376 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1377 TRIE_STORE_REVCHAR;
a3621e74
YO
1378 }
1379 } else {
1380 SV** svpp;
55eed653
NC
1381 if ( !widecharmap )
1382 widecharmap = newHV();
a3621e74 1383
55eed653 1384 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1385
1386 if ( !svpp )
e4584336 1387 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1388
1389 if ( !SvTRUE( *svpp ) ) {
1390 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1391 TRIE_STORE_REVCHAR;
a3621e74
YO
1392 }
1393 }
1394 }
3dab1dad
YO
1395 if( cur == first ) {
1396 trie->minlen=chars;
1397 trie->maxlen=chars;
1398 } else if (chars < trie->minlen) {
1399 trie->minlen=chars;
1400 } else if (chars > trie->maxlen) {
1401 trie->maxlen=chars;
1402 }
1403
a3621e74
YO
1404 } /* end first pass */
1405 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1406 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1407 (int)depth * 2 + 2,"",
55eed653 1408 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1409 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1410 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1411 );
c944940b 1412 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1413
1414 /*
1415 We now know what we are dealing with in terms of unique chars and
1416 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1417 representation using a flat table will take. If it's over a reasonable
1418 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1419 conservative but potentially much slower representation using an array
1420 of lists.
1421
1422 At the end we convert both representations into the same compressed
1423 form that will be used in regexec.c for matching with. The latter
1424 is a form that cannot be used to construct with but has memory
1425 properties similar to the list form and access properties similar
1426 to the table form making it both suitable for fast searches and
1427 small enough that its feasable to store for the duration of a program.
1428
1429 See the comment in the code where the compressed table is produced
1430 inplace from the flat tabe representation for an explanation of how
1431 the compression works.
1432
1433 */
1434
1435
3dab1dad 1436 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1437 /*
1438 Second Pass -- Array Of Lists Representation
1439
1440 Each state will be represented by a list of charid:state records
1441 (reg_trie_trans_le) the first such element holds the CUR and LEN
1442 points of the allocated array. (See defines above).
1443
1444 We build the initial structure using the lists, and then convert
1445 it into the compressed table form which allows faster lookups
1446 (but cant be modified once converted).
a3621e74
YO
1447 */
1448
a3621e74
YO
1449 STRLEN transcount = 1;
1450
1e2e3d02
YO
1451 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1452 "%*sCompiling trie using list compiler\n",
1453 (int)depth * 2 + 2, ""));
446bd890 1454
c944940b
JH
1455 trie->states = (reg_trie_state *)
1456 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1457 sizeof(reg_trie_state) );
a3621e74
YO
1458 TRIE_LIST_NEW(1);
1459 next_alloc = 2;
1460
1461 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1462
c445ea15
AL
1463 regnode * const noper = NEXTOPER( cur );
1464 U8 *uc = (U8*)STRING( noper );
1465 const U8 * const e = uc + STR_LEN( noper );
1466 U32 state = 1; /* required init */
1467 U16 charid = 0; /* sanity init */
1468 U8 *scan = (U8*)NULL; /* sanity init */
1469 STRLEN foldlen = 0; /* required init */
07be1b83 1470 U32 wordlen = 0; /* required init */
c445ea15
AL
1471 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1472
3dab1dad 1473 if (OP(noper) != NOTHING) {
786e8c11 1474 for ( ; uc < e ; uc += len ) {
c445ea15 1475
786e8c11 1476 TRIE_READ_CHAR;
c445ea15 1477
786e8c11
YO
1478 if ( uvc < 256 ) {
1479 charid = trie->charmap[ uvc ];
c445ea15 1480 } else {
55eed653 1481 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1482 if ( !svpp ) {
1483 charid = 0;
1484 } else {
1485 charid=(U16)SvIV( *svpp );
1486 }
c445ea15 1487 }
786e8c11
YO
1488 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1489 if ( charid ) {
a3621e74 1490
786e8c11
YO
1491 U16 check;
1492 U32 newstate = 0;
a3621e74 1493
786e8c11
YO
1494 charid--;
1495 if ( !trie->states[ state ].trans.list ) {
1496 TRIE_LIST_NEW( state );
c445ea15 1497 }
786e8c11
YO
1498 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1499 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1500 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1501 break;
1502 }
1503 }
1504 if ( ! newstate ) {
1505 newstate = next_alloc++;
1506 TRIE_LIST_PUSH( state, charid, newstate );
1507 transcount++;
1508 }
1509 state = newstate;
1510 } else {
1511 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1512 }
a28509cc 1513 }
c445ea15 1514 }
3dab1dad 1515 TRIE_HANDLE_WORD(state);
a3621e74
YO
1516
1517 } /* end second pass */
1518
1e2e3d02
YO
1519 /* next alloc is the NEXT state to be allocated */
1520 trie->statecount = next_alloc;
c944940b
JH
1521 trie->states = (reg_trie_state *)
1522 PerlMemShared_realloc( trie->states,
1523 next_alloc
1524 * sizeof(reg_trie_state) );
a3621e74 1525
3dab1dad 1526 /* and now dump it out before we compress it */
2b8b4781
NC
1527 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1528 revcharmap, next_alloc,
1529 depth+1)
1e2e3d02 1530 );
a3621e74 1531
c944940b
JH
1532 trie->trans = (reg_trie_trans *)
1533 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1534 {
1535 U32 state;
a3621e74
YO
1536 U32 tp = 0;
1537 U32 zp = 0;
1538
1539
1540 for( state=1 ; state < next_alloc ; state ++ ) {
1541 U32 base=0;
1542
1543 /*
1544 DEBUG_TRIE_COMPILE_MORE_r(
1545 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1546 );
1547 */
1548
1549 if (trie->states[state].trans.list) {
1550 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1551 U16 maxid=minid;
a28509cc 1552 U16 idx;
a3621e74
YO
1553
1554 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1555 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1556 if ( forid < minid ) {
1557 minid=forid;
1558 } else if ( forid > maxid ) {
1559 maxid=forid;
1560 }
a3621e74
YO
1561 }
1562 if ( transcount < tp + maxid - minid + 1) {
1563 transcount *= 2;
c944940b
JH
1564 trie->trans = (reg_trie_trans *)
1565 PerlMemShared_realloc( trie->trans,
446bd890
NC
1566 transcount
1567 * sizeof(reg_trie_trans) );
a3621e74
YO
1568 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1569 }
1570 base = trie->uniquecharcount + tp - minid;
1571 if ( maxid == minid ) {
1572 U32 set = 0;
1573 for ( ; zp < tp ; zp++ ) {
1574 if ( ! trie->trans[ zp ].next ) {
1575 base = trie->uniquecharcount + zp - minid;
1576 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1577 trie->trans[ zp ].check = state;
1578 set = 1;
1579 break;
1580 }
1581 }
1582 if ( !set ) {
1583 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1584 trie->trans[ tp ].check = state;
1585 tp++;
1586 zp = tp;
1587 }
1588 } else {
1589 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1590 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1591 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1592 trie->trans[ tid ].check = state;
1593 }
1594 tp += ( maxid - minid + 1 );
1595 }
1596 Safefree(trie->states[ state ].trans.list);
1597 }
1598 /*
1599 DEBUG_TRIE_COMPILE_MORE_r(
1600 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1601 );
1602 */
1603 trie->states[ state ].trans.base=base;
1604 }
cc601c31 1605 trie->lasttrans = tp + 1;
a3621e74
YO
1606 }
1607 } else {
1608 /*
1609 Second Pass -- Flat Table Representation.
1610
1611 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1612 We know that we will need Charcount+1 trans at most to store the data
1613 (one row per char at worst case) So we preallocate both structures
1614 assuming worst case.
1615
1616 We then construct the trie using only the .next slots of the entry
1617 structs.
1618
1619 We use the .check field of the first entry of the node temporarily to
1620 make compression both faster and easier by keeping track of how many non
1621 zero fields are in the node.
1622
1623 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1624 transition.
1625
1626 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1627 number representing the first entry of the node, and state as a
1628 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1629 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1630 are 2 entrys per node. eg:
1631
1632 A B A B
1633 1. 2 4 1. 3 7
1634 2. 0 3 3. 0 5
1635 3. 0 0 5. 0 0
1636 4. 0 0 7. 0 0
1637
1638 The table is internally in the right hand, idx form. However as we also
1639 have to deal with the states array which is indexed by nodenum we have to
1640 use TRIE_NODENUM() to convert.
1641
1642 */
1e2e3d02
YO
1643 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1644 "%*sCompiling trie using table compiler\n",
1645 (int)depth * 2 + 2, ""));
3dab1dad 1646
c944940b
JH
1647 trie->trans = (reg_trie_trans *)
1648 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1649 * trie->uniquecharcount + 1,
1650 sizeof(reg_trie_trans) );
1651 trie->states = (reg_trie_state *)
1652 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1653 sizeof(reg_trie_state) );
a3621e74
YO
1654 next_alloc = trie->uniquecharcount + 1;
1655
3dab1dad 1656
a3621e74
YO
1657 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1658
c445ea15 1659 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1660 const U8 *uc = (U8*)STRING( noper );
1661 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1662
1663 U32 state = 1; /* required init */
1664
1665 U16 charid = 0; /* sanity init */
1666 U32 accept_state = 0; /* sanity init */
1667 U8 *scan = (U8*)NULL; /* sanity init */
1668
1669 STRLEN foldlen = 0; /* required init */
07be1b83 1670 U32 wordlen = 0; /* required init */
a3621e74
YO
1671 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1672
3dab1dad 1673 if ( OP(noper) != NOTHING ) {
786e8c11 1674 for ( ; uc < e ; uc += len ) {
a3621e74 1675
786e8c11 1676 TRIE_READ_CHAR;
a3621e74 1677
786e8c11
YO
1678 if ( uvc < 256 ) {
1679 charid = trie->charmap[ uvc ];
1680 } else {
55eed653 1681 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1682 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1683 }
786e8c11
YO
1684 if ( charid ) {
1685 charid--;
1686 if ( !trie->trans[ state + charid ].next ) {
1687 trie->trans[ state + charid ].next = next_alloc;
1688 trie->trans[ state ].check++;
1689 next_alloc += trie->uniquecharcount;
1690 }
1691 state = trie->trans[ state + charid ].next;
1692 } else {
1693 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1694 }
1695 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1696 }
a3621e74 1697 }
3dab1dad
YO
1698 accept_state = TRIE_NODENUM( state );
1699 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1700
1701 } /* end second pass */
1702
3dab1dad 1703 /* and now dump it out before we compress it */
2b8b4781
NC
1704 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1705 revcharmap,
1706 next_alloc, depth+1));
a3621e74 1707
a3621e74
YO
1708 {
1709 /*
1710 * Inplace compress the table.*
1711
1712 For sparse data sets the table constructed by the trie algorithm will
1713 be mostly 0/FAIL transitions or to put it another way mostly empty.
1714 (Note that leaf nodes will not contain any transitions.)
1715
1716 This algorithm compresses the tables by eliminating most such
1717 transitions, at the cost of a modest bit of extra work during lookup:
1718
1719 - Each states[] entry contains a .base field which indicates the
1720 index in the state[] array wheres its transition data is stored.
1721
1722 - If .base is 0 there are no valid transitions from that node.
1723
1724 - If .base is nonzero then charid is added to it to find an entry in
1725 the trans array.
1726
1727 -If trans[states[state].base+charid].check!=state then the
1728 transition is taken to be a 0/Fail transition. Thus if there are fail
1729 transitions at the front of the node then the .base offset will point
1730 somewhere inside the previous nodes data (or maybe even into a node
1731 even earlier), but the .check field determines if the transition is
1732 valid.
1733
786e8c11 1734 XXX - wrong maybe?
a3621e74
YO
1735 The following process inplace converts the table to the compressed
1736 table: We first do not compress the root node 1,and mark its all its
1737 .check pointers as 1 and set its .base pointer as 1 as well. This
1738 allows to do a DFA construction from the compressed table later, and
1739 ensures that any .base pointers we calculate later are greater than
1740 0.
1741
1742 - We set 'pos' to indicate the first entry of the second node.
1743
1744 - We then iterate over the columns of the node, finding the first and
1745 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1746 and set the .check pointers accordingly, and advance pos
1747 appropriately and repreat for the next node. Note that when we copy
1748 the next pointers we have to convert them from the original
1749 NODEIDX form to NODENUM form as the former is not valid post
1750 compression.
1751
1752 - If a node has no transitions used we mark its base as 0 and do not
1753 advance the pos pointer.
1754
1755 - If a node only has one transition we use a second pointer into the
1756 structure to fill in allocated fail transitions from other states.
1757 This pointer is independent of the main pointer and scans forward
1758 looking for null transitions that are allocated to a state. When it
1759 finds one it writes the single transition into the "hole". If the
786e8c11 1760 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1761
1762 - Once compressed we can Renew/realloc the structures to release the
1763 excess space.
1764
1765 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1766 specifically Fig 3.47 and the associated pseudocode.
1767
1768 demq
1769 */
a3b680e6 1770 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1771 U32 state, charid;
a3621e74 1772 U32 pos = 0, zp=0;
1e2e3d02 1773 trie->statecount = laststate;
a3621e74
YO
1774
1775 for ( state = 1 ; state < laststate ; state++ ) {
1776 U8 flag = 0;
a28509cc
AL
1777 const U32 stateidx = TRIE_NODEIDX( state );
1778 const U32 o_used = trie->trans[ stateidx ].check;
1779 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1780 trie->trans[ stateidx ].check = 0;
1781
1782 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1783 if ( flag || trie->trans[ stateidx + charid ].next ) {
1784 if ( trie->trans[ stateidx + charid ].next ) {
1785 if (o_used == 1) {
1786 for ( ; zp < pos ; zp++ ) {
1787 if ( ! trie->trans[ zp ].next ) {
1788 break;
1789 }
1790 }
1791 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1792 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1793 trie->trans[ zp ].check = state;
1794 if ( ++zp > pos ) pos = zp;
1795 break;
1796 }
1797 used--;
1798 }
1799 if ( !flag ) {
1800 flag = 1;
1801 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1802 }
1803 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1804 trie->trans[ pos ].check = state;
1805 pos++;
1806 }
1807 }
1808 }
cc601c31 1809 trie->lasttrans = pos + 1;
c944940b
JH
1810 trie->states = (reg_trie_state *)
1811 PerlMemShared_realloc( trie->states, laststate
1812 * sizeof(reg_trie_state) );
a3621e74 1813 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1814 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1815 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1816 (int)depth * 2 + 2,"",
1817 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1818 (IV)next_alloc,
1819 (IV)pos,
a3621e74
YO
1820 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1821 );
1822
1823 } /* end table compress */
1824 }
1e2e3d02
YO
1825 DEBUG_TRIE_COMPILE_MORE_r(
1826 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1827 (int)depth * 2 + 2, "",
1828 (UV)trie->statecount,
1829 (UV)trie->lasttrans)
1830 );
cc601c31 1831 /* resize the trans array to remove unused space */
c944940b
JH
1832 trie->trans = (reg_trie_trans *)
1833 PerlMemShared_realloc( trie->trans, trie->lasttrans
1834 * sizeof(reg_trie_trans) );
a3621e74 1835
3dab1dad 1836 /* and now dump out the compressed format */
2b8b4781 1837 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1838
3dab1dad 1839 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1840 U8 nodetype =(U8)(flags & 0xFF);
1841 char *str=NULL;
786e8c11 1842
07be1b83 1843#ifdef DEBUGGING
e62cc96a 1844 regnode *optimize = NULL;
b57a0404
JH
1845 U32 mjd_offset = 0;
1846 U32 mjd_nodelen = 0;
07be1b83 1847#endif
a3621e74 1848 /*
3dab1dad
YO
1849 This means we convert either the first branch or the first Exact,
1850 depending on whether the thing following (in 'last') is a branch
1851 or not and whther first is the startbranch (ie is it a sub part of
1852 the alternation or is it the whole thing.)
1853 Assuming its a sub part we conver the EXACT otherwise we convert
1854 the whole branch sequence, including the first.
a3621e74 1855 */
3dab1dad 1856 /* Find the node we are going to overwrite */
7f69552c 1857 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1858 /* branch sub-chain */
3dab1dad 1859 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1860 DEBUG_r({
1861 mjd_offset= Node_Offset((convert));
1862 mjd_nodelen= Node_Length((convert));
1863 });
7f69552c
YO
1864 /* whole branch chain */
1865 } else {
1866 DEBUG_r({
1867 const regnode *nop = NEXTOPER( convert );
1868 mjd_offset= Node_Offset((nop));
1869 mjd_nodelen= Node_Length((nop));
1870 });
07be1b83 1871 }
7f69552c 1872
07be1b83
YO
1873 DEBUG_OPTIMISE_r(
1874 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1875 (int)depth * 2 + 2, "",
786e8c11 1876 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1877 );
a3621e74 1878
3dab1dad
YO
1879 /* But first we check to see if there is a common prefix we can
1880 split out as an EXACT and put in front of the TRIE node. */
1881 trie->startstate= 1;
55eed653 1882 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1883 U32 state;
1e2e3d02 1884 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1885 U32 ofs = 0;
8e11feef
RGS
1886 I32 idx = -1;
1887 U32 count = 0;
1888 const U32 base = trie->states[ state ].trans.base;
a3621e74 1889
3dab1dad 1890 if ( trie->states[state].wordnum )
8e11feef 1891 count = 1;
a3621e74 1892
8e11feef 1893 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1894 if ( ( base + ofs >= trie->uniquecharcount ) &&
1895 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1896 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1897 {
3dab1dad 1898 if ( ++count > 1 ) {
2b8b4781 1899 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1900 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1901 if ( state == 1 ) break;
3dab1dad
YO
1902 if ( count == 2 ) {
1903 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1904 DEBUG_OPTIMISE_r(
8e11feef
RGS
1905 PerlIO_printf(Perl_debug_log,
1906 "%*sNew Start State=%"UVuf" Class: [",
1907 (int)depth * 2 + 2, "",
786e8c11 1908 (UV)state));
be8e71aa 1909 if (idx >= 0) {
2b8b4781 1910 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1911 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1912
3dab1dad 1913 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1914 if ( folder )
1915 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1916 DEBUG_OPTIMISE_r(
07be1b83 1917 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1918 );
8e11feef
RGS
1919 }
1920 }
1921 TRIE_BITMAP_SET(trie,*ch);
1922 if ( folder )
1923 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1924 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1925 }
1926 idx = ofs;
1927 }
3dab1dad
YO
1928 }
1929 if ( count == 1 ) {
2b8b4781 1930 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1931 char *ch = SvPV_nolen( *tmp );
1932 DEBUG_OPTIMISE_r({
1933 SV *sv=sv_newmortal();
8e11feef
RGS
1934 PerlIO_printf( Perl_debug_log,
1935 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1936 (int)depth * 2 + 2, "",
de734bd5
A
1937 (UV)state, (UV)idx,
1938 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1939 PL_colors[0], PL_colors[1],
1940 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1941 PERL_PV_ESCAPE_FIRSTCHAR
1942 )
1943 );
1944 });
3dab1dad
YO
1945 if ( state==1 ) {
1946 OP( convert ) = nodetype;
1947 str=STRING(convert);
1948 STR_LEN(convert)=0;
1949 }
de734bd5
A
1950 while (*ch) {
1951 *str++ = *ch++;
1952 STR_LEN(convert)++;
1953 }
1954
8e11feef 1955 } else {
f9049ba1 1956#ifdef DEBUGGING
8e11feef
RGS
1957 if (state>1)
1958 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1959#endif
8e11feef
RGS
1960 break;
1961 }
1962 }
3dab1dad 1963 if (str) {
8e11feef 1964 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1965 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1966 trie->startstate = state;
07be1b83
YO
1967 trie->minlen -= (state - 1);
1968 trie->maxlen -= (state - 1);
1969 DEBUG_r({
1970 regnode *fix = convert;
de734bd5 1971 U32 word = trie->wordcount;
07be1b83
YO
1972 mjd_nodelen++;
1973 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1974 while( ++fix < n ) {
1975 Set_Node_Offset_Length(fix, 0, 0);
1976 }
de734bd5 1977 while (word--) {
2b8b4781 1978 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1979 if (tmp) {
1980 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1981 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1982 else
1983 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1984 }
1985 }
07be1b83 1986 });
8e11feef
RGS
1987 if (trie->maxlen) {
1988 convert = n;
1989 } else {
3dab1dad 1990 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1991 DEBUG_r(optimize= n);
3dab1dad
YO
1992 }
1993 }
1994 }
a5ca303d
YO
1995 if (!jumper)
1996 jumper = last;
3dab1dad 1997 if ( trie->maxlen ) {
8e11feef
RGS
1998 NEXT_OFF( convert ) = (U16)(tail - convert);
1999 ARG_SET( convert, data_slot );
786e8c11
YO
2000 /* Store the offset to the first unabsorbed branch in
2001 jump[0], which is otherwise unused by the jump logic.
2002 We use this when dumping a trie and during optimisation. */
2003 if (trie->jump)
7f69552c 2004 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2005
786e8c11
YO
2006 /* XXXX */
2007 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2008 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2009 {
2010 OP( convert ) = TRIEC;
2011 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2012 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2013 trie->bitmap= NULL;
2014 } else
2015 OP( convert ) = TRIE;
a3621e74 2016
3dab1dad
YO
2017 /* store the type in the flags */
2018 convert->flags = nodetype;
a5ca303d
YO
2019 DEBUG_r({
2020 optimize = convert
2021 + NODE_STEP_REGNODE
2022 + regarglen[ OP( convert ) ];
2023 });
2024 /* XXX We really should free up the resource in trie now,
2025 as we won't use them - (which resources?) dmq */
3dab1dad 2026 }
a3621e74 2027 /* needed for dumping*/
e62cc96a 2028 DEBUG_r(if (optimize) {
07be1b83 2029 regnode *opt = convert;
e62cc96a 2030 while ( ++opt < optimize) {
07be1b83
YO
2031 Set_Node_Offset_Length(opt,0,0);
2032 }
786e8c11
YO
2033 /*
2034 Try to clean up some of the debris left after the
2035 optimisation.
a3621e74 2036 */
786e8c11 2037 while( optimize < jumper ) {
07be1b83 2038 mjd_nodelen += Node_Length((optimize));
a3621e74 2039 OP( optimize ) = OPTIMIZED;
07be1b83 2040 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2041 optimize++;
2042 }
07be1b83 2043 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2044 });
2045 } /* end node insert */
55eed653 2046 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2047#ifdef DEBUGGING
2048 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2049 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2050#else
2051 SvREFCNT_dec(revcharmap);
07be1b83 2052#endif
786e8c11
YO
2053 return trie->jump
2054 ? MADE_JUMP_TRIE
2055 : trie->startstate>1
2056 ? MADE_EXACT_TRIE
2057 : MADE_TRIE;
2058}
2059
2060STATIC void
2061S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2062{
2063/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2064
2065 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2066 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2067 ISBN 0-201-10088-6
2068
2069 We find the fail state for each state in the trie, this state is the longest proper
2070 suffix of the current states 'word' that is also a proper prefix of another word in our
2071 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2072 the DFA not to have to restart after its tried and failed a word at a given point, it
2073 simply continues as though it had been matching the other word in the first place.
2074 Consider
2075 'abcdgu'=~/abcdefg|cdgu/
2076 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2077 fail, which would bring use to the state representing 'd' in the second word where we would
2078 try 'g' and succeed, prodceding to match 'cdgu'.
2079 */
2080 /* add a fail transition */
3251b653
NC
2081 const U32 trie_offset = ARG(source);
2082 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2083 U32 *q;
2084 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2085 const U32 numstates = trie->statecount;
786e8c11
YO
2086 const U32 ubound = trie->lasttrans + ucharcount;
2087 U32 q_read = 0;
2088 U32 q_write = 0;
2089 U32 charid;
2090 U32 base = trie->states[ 1 ].trans.base;
2091 U32 *fail;
2092 reg_ac_data *aho;
2093 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2094 GET_RE_DEBUG_FLAGS_DECL;
2095#ifndef DEBUGGING
2096 PERL_UNUSED_ARG(depth);
2097#endif
2098
2099
2100 ARG_SET( stclass, data_slot );
c944940b 2101 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2102 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2103 aho->trie=trie_offset;
446bd890
NC
2104 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2105 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2106 Newxz( q, numstates, U32);
c944940b 2107 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2108 aho->refcount = 1;
2109 fail = aho->fail;
2110 /* initialize fail[0..1] to be 1 so that we always have
2111 a valid final fail state */
2112 fail[ 0 ] = fail[ 1 ] = 1;
2113
2114 for ( charid = 0; charid < ucharcount ; charid++ ) {
2115 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2116 if ( newstate ) {
2117 q[ q_write ] = newstate;
2118 /* set to point at the root */
2119 fail[ q[ q_write++ ] ]=1;
2120 }
2121 }
2122 while ( q_read < q_write) {
2123 const U32 cur = q[ q_read++ % numstates ];
2124 base = trie->states[ cur ].trans.base;
2125
2126 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2127 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2128 if (ch_state) {
2129 U32 fail_state = cur;
2130 U32 fail_base;
2131 do {
2132 fail_state = fail[ fail_state ];
2133 fail_base = aho->states[ fail_state ].trans.base;
2134 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2135
2136 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2137 fail[ ch_state ] = fail_state;
2138 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2139 {
2140 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2141 }
2142 q[ q_write++ % numstates] = ch_state;
2143 }
2144 }
2145 }
2146 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2147 when we fail in state 1, this allows us to use the
2148 charclass scan to find a valid start char. This is based on the principle
2149 that theres a good chance the string being searched contains lots of stuff
2150 that cant be a start char.
2151 */
2152 fail[ 0 ] = fail[ 1 ] = 0;
2153 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2154 PerlIO_printf(Perl_debug_log,
2155 "%*sStclass Failtable (%"UVuf" states): 0",
2156 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2157 );
786e8c11
YO
2158 for( q_read=1; q_read<numstates; q_read++ ) {
2159 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2160 }
2161 PerlIO_printf(Perl_debug_log, "\n");
2162 });
2163 Safefree(q);
2164 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2165}
2166
786e8c11 2167
a3621e74 2168/*
5d1c421c
JH
2169 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2170 * These need to be revisited when a newer toolchain becomes available.
2171 */
2172#if defined(__sparc64__) && defined(__GNUC__)
2173# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2174# undef SPARC64_GCC_WORKAROUND
2175# define SPARC64_GCC_WORKAROUND 1
2176# endif
2177#endif
2178
07be1b83 2179#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2180 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2181 SV * const mysv=sv_newmortal(); \
2182 regnode *Next = regnext(scan); \
2183 regprop(RExC_rx, mysv, scan); \
7f69552c 2184 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2185 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2186 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2187 }});
07be1b83 2188
1de06328
YO
2189
2190
2191
2192
07be1b83
YO
2193#define JOIN_EXACT(scan,min,flags) \
2194 if (PL_regkind[OP(scan)] == EXACT) \
2195 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2196
be8e71aa 2197STATIC U32
07be1b83
YO
2198S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2199 /* Merge several consecutive EXACTish nodes into one. */
2200 regnode *n = regnext(scan);
2201 U32 stringok = 1;
2202 regnode *next = scan + NODE_SZ_STR(scan);
2203 U32 merged = 0;
2204 U32 stopnow = 0;
2205#ifdef DEBUGGING
2206 regnode *stop = scan;
72f13be8 2207 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2208#else
d47053eb
RGS
2209 PERL_UNUSED_ARG(depth);
2210#endif
2211#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2212 PERL_UNUSED_ARG(flags);
2213 PERL_UNUSED_ARG(val);
07be1b83 2214#endif
07be1b83
YO
2215 DEBUG_PEEP("join",scan,depth);
2216
2217 /* Skip NOTHING, merge EXACT*. */
2218 while (n &&
2219 ( PL_regkind[OP(n)] == NOTHING ||
2220 (stringok && (OP(n) == OP(scan))))
2221 && NEXT_OFF(n)
2222 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2223
2224 if (OP(n) == TAIL || n > next)
2225 stringok = 0;
2226 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2227 DEBUG_PEEP("skip:",n,depth);
2228 NEXT_OFF(scan) += NEXT_OFF(n);
2229 next = n + NODE_STEP_REGNODE;
2230#ifdef DEBUGGING
2231 if (stringok)
2232 stop = n;
2233#endif
2234 n = regnext(n);
2235 }
2236 else if (stringok) {
786e8c11 2237 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2238 regnode * const nnext = regnext(n);
2239
2240 DEBUG_PEEP("merg",n,depth);
2241
2242 merged++;
2243 if (oldl + STR_LEN(n) > U8_MAX)
2244 break;
2245 NEXT_OFF(scan) += NEXT_OFF(n);
2246 STR_LEN(scan) += STR_LEN(n);
2247 next = n + NODE_SZ_STR(n);
2248 /* Now we can overwrite *n : */
2249 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2250#ifdef DEBUGGING
2251 stop = next - 1;
2252#endif
2253 n = nnext;
2254 if (stopnow) break;
2255 }
2256
d47053eb
RGS
2257#ifdef EXPERIMENTAL_INPLACESCAN
2258 if (flags && !NEXT_OFF(n)) {
2259 DEBUG_PEEP("atch", val, depth);
2260 if (reg_off_by_arg[OP(n)]) {
2261 ARG_SET(n, val - n);
2262 }
2263 else {
2264 NEXT_OFF(n) = val - n;
2265 }
2266 stopnow = 1;
2267 }
07be1b83
YO
2268#endif
2269 }
2270
2271 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2272 /*
2273 Two problematic code points in Unicode casefolding of EXACT nodes:
2274
2275 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2276 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2277
2278 which casefold to
2279
2280 Unicode UTF-8
2281
2282 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2283 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2284
2285 This means that in case-insensitive matching (or "loose matching",
2286 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2287 length of the above casefolded versions) can match a target string
2288 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2289 This would rather mess up the minimum length computation.
2290
2291 What we'll do is to look for the tail four bytes, and then peek
2292 at the preceding two bytes to see whether we need to decrease
2293 the minimum length by four (six minus two).
2294
2295 Thanks to the design of UTF-8, there cannot be false matches:
2296 A sequence of valid UTF-8 bytes cannot be a subsequence of
2297 another valid sequence of UTF-8 bytes.
2298
2299 */
2300 char * const s0 = STRING(scan), *s, *t;
2301 char * const s1 = s0 + STR_LEN(scan) - 1;
2302 char * const s2 = s1 - 4;
e294cc5d
JH
2303#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2304 const char t0[] = "\xaf\x49\xaf\x42";
2305#else
07be1b83 2306 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2307#endif
07be1b83
YO
2308 const char * const t1 = t0 + 3;
2309
2310 for (s = s0 + 2;
2311 s < s2 && (t = ninstr(s, s1, t0, t1));
2312 s = t + 4) {
e294cc5d
JH
2313#ifdef EBCDIC
2314 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2315 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2316#else
07be1b83
YO
2317 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2318 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2319#endif
07be1b83
YO
2320 *min -= 4;
2321 }
2322 }
2323
2324#ifdef DEBUGGING
2325 /* Allow dumping */
2326 n = scan + NODE_SZ_STR(scan);
2327 while (n <= stop) {
2328 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2329 OP(n) = OPTIMIZED;
2330 NEXT_OFF(n) = 0;
2331 }
2332 n++;
2333 }
2334#endif
2335 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2336 return stopnow;
2337}
2338
653099ff
GS
2339/* REx optimizer. Converts nodes into quickier variants "in place".
2340 Finds fixed substrings. */
2341
a0288114 2342/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2343 to the position after last scanned or to NULL. */
2344
40d049e4
YO
2345#define INIT_AND_WITHP \
2346 assert(!and_withp); \
2347 Newx(and_withp,1,struct regnode_charclass_class); \
2348 SAVEFREEPV(and_withp)
07be1b83 2349
b515a41d
YO
2350/* this is a chain of data about sub patterns we are processing that
2351 need to be handled seperately/specially in study_chunk. Its so
2352 we can simulate recursion without losing state. */
2353struct scan_frame;
2354typedef struct scan_frame {
2355 regnode *last; /* last node to process in this frame */
2356 regnode *next; /* next node to process when last is reached */
2357 struct scan_frame *prev; /*previous frame*/
2358 I32 stop; /* what stopparen do we use */
2359} scan_frame;
2360
304ee84b
YO
2361
2362#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2363
76e3520e 2364STATIC I32
40d049e4 2365S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2366 I32 *minlenp, I32 *deltap,
40d049e4
YO
2367 regnode *last,
2368 scan_data_t *data,
2369 I32 stopparen,
2370 U8* recursed,
2371 struct regnode_charclass_class *and_withp,
2372 U32 flags, U32 depth)
c277df42
IZ
2373 /* scanp: Start here (read-write). */
2374 /* deltap: Write maxlen-minlen here. */
2375 /* last: Stop before this one. */
40d049e4
YO
2376 /* data: string data about the pattern */
2377 /* stopparen: treat close N as END */
2378 /* recursed: which subroutines have we recursed into */
2379 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2380{
97aff369 2381 dVAR;
c277df42
IZ
2382 I32 min = 0, pars = 0, code;
2383 regnode *scan = *scanp, *next;
2384 I32 delta = 0;
2385 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2386 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2387 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2388 scan_data_t data_fake;
a3621e74 2389 SV *re_trie_maxbuff = NULL;
786e8c11 2390 regnode *first_non_open = scan;
e2e6a0f1 2391 I32 stopmin = I32_MAX;
8aa23a47
YO
2392 scan_frame *frame = NULL;
2393
a3621e74 2394 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2395
13a24bad 2396#ifdef DEBUGGING
40d049e4 2397 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2398#endif
40d049e4 2399
786e8c11 2400 if ( depth == 0 ) {
40d049e4 2401 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2402 first_non_open=regnext(first_non_open);
2403 }
2404
b81d288d 2405
8aa23a47
YO
2406 fake_study_recurse:
2407 while ( scan && OP(scan) != END && scan < last ){
2408 /* Peephole optimizer: */
304ee84b 2409 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2410 DEBUG_PEEP("Peep",scan,depth);
2411 JOIN_EXACT(scan,&min,0);
2412
2413 /* Follow the next-chain of the current node and optimize
2414 away all the NOTHINGs from it. */
2415 if (OP(scan) != CURLYX) {
2416 const int max = (reg_off_by_arg[OP(scan)]
2417 ? I32_MAX
2418 /* I32 may be smaller than U16 on CRAYs! */
2419 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2420 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2421 int noff;
2422 regnode *n = scan;
2423
2424 /* Skip NOTHING and LONGJMP. */
2425 while ((n = regnext(n))
2426 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2427 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2428 && off + noff < max)
2429 off += noff;
2430 if (reg_off_by_arg[OP(scan)])
2431 ARG(scan) = off;
2432 else
2433 NEXT_OFF(scan) = off;
2434 }
a3621e74 2435
c277df42 2436
8aa23a47
YO
2437
2438 /* The principal pseudo-switch. Cannot be a switch, since we
2439 look into several different things. */
2440 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2441 || OP(scan) == IFTHEN) {
2442 next = regnext(scan);
2443 code = OP(scan);
2444 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2445
2446 if (OP(next) == code || code == IFTHEN) {
2447 /* NOTE - There is similar code to this block below for handling
2448 TRIE nodes on a re-study. If you change stuff here check there
2449 too. */
2450 I32 max1 = 0, min1 = I32_MAX, num = 0;
2451 struct regnode_charclass_class accum;
2452 regnode * const startbranch=scan;
2453
2454 if (flags & SCF_DO_SUBSTR)
304ee84b 2455 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2456 if (flags & SCF_DO_STCLASS)
2457 cl_init_zero(pRExC_state, &accum);
2458
2459 while (OP(scan) == code) {
2460 I32 deltanext, minnext, f = 0, fake;
2461 struct regnode_charclass_class this_class;
2462
2463 num++;
2464 data_fake.flags = 0;
2465 if (data) {
2466 data_fake.whilem_c = data->whilem_c;
2467 data_fake.last_closep = data->last_closep;
2468 }
2469 else
2470 data_fake.last_closep = &fake;
58e23c8d
YO
2471
2472 data_fake.pos_delta = delta;
8aa23a47
YO
2473 next = regnext(scan);
2474 scan = NEXTOPER(scan);
2475 if (code != BRANCH)
c277df42 2476 scan = NEXTOPER(scan);
8aa23a47
YO
2477 if (flags & SCF_DO_STCLASS) {
2478 cl_init(pRExC_state, &this_class);
2479 data_fake.start_class = &this_class;
2480 f = SCF_DO_STCLASS_AND;
58e23c8d 2481 }
8aa23a47
YO
2482 if (flags & SCF_WHILEM_VISITED_POS)
2483 f |= SCF_WHILEM_VISITED_POS;
2484
2485 /* we suppose the run is continuous, last=next...*/
2486 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2487 next, &data_fake,
2488 stopparen, recursed, NULL, f,depth+1);
2489 if (min1 > minnext)
2490 min1 = minnext;
2491 if (max1 < minnext + deltanext)
2492 max1 = minnext + deltanext;
2493 if (deltanext == I32_MAX)
2494 is_inf = is_inf_internal = 1;
2495 scan = next;
2496 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2497 pars++;
2498 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2499 if ( stopmin > minnext)
2500 stopmin = min + min1;
2501 flags &= ~SCF_DO_SUBSTR;
2502 if (data)
2503 data->flags |= SCF_SEEN_ACCEPT;
2504 }
2505 if (data) {
2506 if (data_fake.flags & SF_HAS_EVAL)
2507 data->flags |= SF_HAS_EVAL;
2508 data->whilem_c = data_fake.whilem_c;
3dab1dad 2509 }
8aa23a47
YO
2510 if (flags & SCF_DO_STCLASS)
2511 cl_or(pRExC_state, &accum, &this_class);
2512 }
2513 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2514 min1 = 0;
2515 if (flags & SCF_DO_SUBSTR) {
2516 data->pos_min += min1;
2517 data->pos_delta += max1 - min1;
2518 if (max1 != min1 || is_inf)
2519 data->longest = &(data->longest_float);
2520 }
2521 min += min1;
2522 delta += max1 - min1;
2523 if (flags & SCF_DO_STCLASS_OR) {
2524 cl_or(pRExC_state, data->start_class, &accum);
2525 if (min1) {
2526 cl_and(data->start_class, and_withp);
2527 flags &= ~SCF_DO_STCLASS;
653099ff 2528 }
8aa23a47
YO
2529 }
2530 else if (flags & SCF_DO_STCLASS_AND) {
2531 if (min1) {
2532 cl_and(data->start_class, &accum);
2533 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2534 }
8aa23a47
YO
2535 else {
2536 /* Switch to OR mode: cache the old value of
2537 * data->start_class */
2538 INIT_AND_WITHP;
2539 StructCopy(data->start_class, and_withp,
2540 struct regnode_charclass_class);
2541 flags &= ~SCF_DO_STCLASS_AND;
2542 StructCopy(&accum, data->start_class,
2543 struct regnode_charclass_class);
2544 flags |= SCF_DO_STCLASS_OR;
2545 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2546 }
8aa23a47 2547 }
a3621e74 2548
8aa23a47
YO
2549 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2550 /* demq.
a3621e74 2551
8aa23a47
YO
2552 Assuming this was/is a branch we are dealing with: 'scan' now
2553 points at the item that follows the branch sequence, whatever
2554 it is. We now start at the beginning of the sequence and look
2555 for subsequences of
a3621e74 2556
8aa23a47
YO
2557 BRANCH->EXACT=>x1
2558 BRANCH->EXACT=>x2
2559 tail
a3621e74 2560
8aa23a47 2561 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2562
8aa23a47
YO
2563 If we can find such a subseqence we need to turn the first
2564 element into a trie and then add the subsequent branch exact
2565 strings to the trie.
a3621e74 2566
8aa23a47 2567 We have two cases
a3621e74 2568
8aa23a47 2569 1. patterns where the whole set of branch can be converted.
a3621e74 2570
8aa23a47 2571 2. patterns where only a subset can be converted.
a3621e74 2572
8aa23a47
YO
2573 In case 1 we can replace the whole set with a single regop
2574 for the trie. In case 2 we need to keep the start and end
2575 branchs so
a3621e74 2576
8aa23a47
YO
2577 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2578 becomes BRANCH TRIE; BRANCH X;
786e8c11 2579
8aa23a47
YO
2580 There is an additional case, that being where there is a
2581 common prefix, which gets split out into an EXACT like node
2582 preceding the TRIE node.
a3621e74 2583
8aa23a47
YO
2584 If x(1..n)==tail then we can do a simple trie, if not we make
2585 a "jump" trie, such that when we match the appropriate word
2586 we "jump" to the appopriate tail node. Essentailly we turn
2587 a nested if into a case structure of sorts.
b515a41d 2588
8aa23a47
YO
2589 */
2590
2591 int made=0;
2592 if (!re_trie_maxbuff) {
2593 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2594 if (!SvIOK(re_trie_maxbuff))
2595 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2596 }
2597 if ( SvIV(re_trie_maxbuff)>=0 ) {
2598 regnode *cur;
2599 regnode *first = (regnode *)NULL;
2600 regnode *last = (regnode *)NULL;
2601 regnode *tail = scan;
2602 U8 optype = 0;
2603 U32 count=0;
a3621e74
YO
2604
2605#ifdef DEBUGGING
8aa23a47 2606 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2607#endif
8aa23a47
YO
2608 /* var tail is used because there may be a TAIL
2609 regop in the way. Ie, the exacts will point to the
2610 thing following the TAIL, but the last branch will
2611 point at the TAIL. So we advance tail. If we
2612 have nested (?:) we may have to move through several
2613 tails.
2614 */
2615
2616 while ( OP( tail ) == TAIL ) {
2617 /* this is the TAIL generated by (?:) */
2618 tail = regnext( tail );
2619 }
a3621e74 2620
8aa23a47
YO
2621
2622 DEBUG_OPTIMISE_r({
2623 regprop(RExC_rx, mysv, tail );
2624 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2625 (int)depth * 2 + 2, "",
2626 "Looking for TRIE'able sequences. Tail node is: ",
2627 SvPV_nolen_const( mysv )
2628 );
2629 });
2630
2631 /*
2632
2633 step through the branches, cur represents each
2634 branch, noper is the first thing to be matched
2635 as part of that branch and noper_next is the
2636 regnext() of that node. if noper is an EXACT
2637 and noper_next is the same as scan (our current
2638 position in the regex) then the EXACT branch is
2639 a possible optimization target. Once we have
2640 two or more consequetive such branches we can
2641 create a trie of the EXACT's contents and stich
2642 it in place. If the sequence represents all of
2643 the branches we eliminate the whole thing and
2644 replace it with a single TRIE. If it is a
2645 subsequence then we need to stitch it in. This
2646 means the first branch has to remain, and needs
2647 to be repointed at the item on the branch chain
2648 following the last branch optimized. This could
2649 be either a BRANCH, in which case the
2650 subsequence is internal, or it could be the
2651 item following the branch sequence in which
2652 case the subsequence is at the end.
2653
2654 */
2655
2656 /* dont use tail as the end marker for this traverse */
2657 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2658 regnode * const noper = NEXTOPER( cur );
b515a41d 2659#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2660 regnode * const noper_next = regnext( noper );
b515a41d
YO
2661#endif
2662
8aa23a47
YO
2663 DEBUG_OPTIMISE_r({
2664 regprop(RExC_rx, mysv, cur);
2665 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2666 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2667
2668 regprop(RExC_rx, mysv, noper);
2669 PerlIO_printf( Perl_debug_log, " -> %s",
2670 SvPV_nolen_const(mysv));
2671
2672 if ( noper_next ) {
2673 regprop(RExC_rx, mysv, noper_next );
2674 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2675 SvPV_nolen_const(mysv));
2676 }
2677 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2678 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2679 });
2680 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2681 : PL_regkind[ OP( noper ) ] == EXACT )
2682 || OP(noper) == NOTHING )
786e8c11 2683#ifdef NOJUMPTRIE
8aa23a47 2684 && noper_next == tail
786e8c11 2685#endif
8aa23a47
YO
2686 && count < U16_MAX)
2687 {
2688 count++;
2689 if ( !first || optype == NOTHING ) {
2690 if (!first) first = cur;
2691 optype = OP( noper );
2692 } else {
2693 last = cur;
2694 }
2695 } else {
2696 if ( last ) {
2697 make_trie( pRExC_state,
2698 startbranch, first, cur, tail, count,
2699 optype, depth+1 );
2700 }
2701 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2702#ifdef NOJUMPTRIE
8aa23a47 2703 && noper_next == tail
786e8c11 2704#endif
8aa23a47
YO
2705 ){
2706 count = 1;
2707 first = cur;
2708 optype = OP( noper );
2709 } else {
2710 count = 0;
2711 first = NULL;
2712 optype = 0;
2713 }
2714 last = NULL;
2715 }
2716 }
2717 DEBUG_OPTIMISE_r({
2718 regprop(RExC_rx, mysv, cur);
2719 PerlIO_printf( Perl_debug_log,
2720 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2721 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2722
2723 });
2724 if ( last ) {
2725 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2726#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2727 if ( ((made == MADE_EXACT_TRIE &&
2728 startbranch == first)
2729 || ( first_non_open == first )) &&
2730 depth==0 ) {
2731 flags |= SCF_TRIE_RESTUDY;
2732 if ( startbranch == first
2733 && scan == tail )
2734 {
2735 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2736 }
2737 }
3dab1dad 2738#endif
8aa23a47
YO
2739 }
2740 }
2741
2742 } /* do trie */
2743
653099ff 2744 }
8aa23a47
YO
2745 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2746 scan = NEXTOPER(NEXTOPER(scan));
2747 } else /* single branch is optimized. */
2748 scan = NEXTOPER(scan);
2749 continue;
2750 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2751 scan_frame *newframe = NULL;
2752 I32 paren;
2753 regnode *start;
2754 regnode *end;
2755
2756 if (OP(scan) != SUSPEND) {
2757 /* set the pointer */
2758 if (OP(scan) == GOSUB) {
2759 paren = ARG(scan);
2760 RExC_recurse[ARG2L(scan)] = scan;
2761 start = RExC_open_parens[paren-1];
2762 end = RExC_close_parens[paren-1];
2763 } else {
2764 paren = 0;
f8fc2ecf 2765 start = RExC_rxi->program + 1;
8aa23a47
YO
2766 end = RExC_opend;
2767 }
2768 if (!recursed) {
2769 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2770 SAVEFREEPV(recursed);
2771 }
2772 if (!PAREN_TEST(recursed,paren+1)) {
2773 PAREN_SET(recursed,paren+1);
2774 Newx(newframe,1,scan_frame);
2775 } else {
2776 if (flags & SCF_DO_SUBSTR) {
304ee84b 2777 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2778 data->longest = &(data->longest_float);
2779 }
2780 is_inf = is_inf_internal = 1;
2781 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2782 cl_anything(pRExC_state, data->start_class);
2783 flags &= ~SCF_DO_STCLASS;
2784 }
2785 } else {
2786 Newx(newframe,1,scan_frame);
2787 paren = stopparen;
2788 start = scan+2;
2789 end = regnext(scan);
2790 }
2791 if (newframe) {
2792 assert(start);
2793 assert(end);
2794 SAVEFREEPV(newframe);
2795 newframe->next = regnext(scan);
2796 newframe->last = last;
2797 newframe->stop = stopparen;
2798 newframe->prev = frame;
2799
2800 frame = newframe;
2801 scan = start;
2802 stopparen = paren;
2803 last = end;
2804
2805 continue;
2806 }
2807 }
2808 else if (OP(scan) == EXACT) {
2809 I32 l = STR_LEN(scan);
2810 UV uc;
2811 if (UTF) {
2812 const U8 * const s = (U8*)STRING(scan);
2813 l = utf8_length(s, s + l);
2814 uc = utf8_to_uvchr(s, NULL);
2815 } else {
2816 uc = *((U8*)STRING(scan));
2817 }
2818 min += l;
2819 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2820 /* The code below prefers earlier match for fixed
2821 offset, later match for variable offset. */
2822 if (data->last_end == -1) { /* Update the start info. */
2823 data->last_start_min = data->pos_min;
2824 data->last_start_max = is_inf
2825 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2826 }
8aa23a47
YO
2827 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2828 if (UTF)
2829 SvUTF8_on(data->last_found);
2830 {
2831 SV * const sv = data->last_found;
2832 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2833 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2834 if (mg && mg->mg_len >= 0)
2835 mg->mg_len += utf8_length((U8*)STRING(scan),
2836 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2837 }
8aa23a47
YO
2838 data->last_end = data->pos_min + l;
2839 data->pos_min += l; /* As in the first entry. */
2840 data->flags &= ~SF_BEFORE_EOL;
2841 }
2842 if (flags & SCF_DO_STCLASS_AND) {
2843 /* Check whether it is compatible with what we know already! */
2844 int compat = 1;
2845
2846 if (uc >= 0x100 ||
2847 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2848 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2849 && (!(data->start_class->flags & ANYOF_FOLD)
2850 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2851 )
2852 compat = 0;
2853 ANYOF_CLASS_ZERO(data->start_class);
2854 ANYOF_BITMAP_ZERO(data->start_class);
2855 if (compat)
2856 ANYOF_BITMAP_SET(data->start_class, uc);
2857 data->start_class->flags &= ~ANYOF_EOS;
2858 if (uc < 0x100)
2859 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2860 }
2861 else if (flags & SCF_DO_STCLASS_OR) {
2862 /* false positive possible if the class is case-folded */
2863 if (uc < 0x100)
2864 ANYOF_BITMAP_SET(data->start_class, uc);
2865 else
2866 data->start_class->flags |= ANYOF_UNICODE_ALL;
2867 data->start_class->flags &= ~ANYOF_EOS;
2868 cl_and(data->start_class, and_withp);
2869 }
2870 flags &= ~SCF_DO_STCLASS;
2871 }
2872 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2873 I32 l = STR_LEN(scan);
2874 UV uc = *((U8*)STRING(scan));
2875
2876 /* Search for fixed substrings supports EXACT only. */
2877 if (flags & SCF_DO_SUBSTR) {
2878 assert(data);
304ee84b 2879 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2880 }
2881 if (UTF) {
2882 const U8 * const s = (U8 *)STRING(scan);
2883 l = utf8_length(s, s + l);
2884 uc = utf8_to_uvchr(s, NULL);
2885 }
2886 min += l;
2887 if (flags & SCF_DO_SUBSTR)
2888 data->pos_min += l;
2889 if (flags & SCF_DO_STCLASS_AND) {
2890 /* Check whether it is compatible with what we know already! */
2891 int compat = 1;
2892
2893 if (uc >= 0x100 ||
2894 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2895 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2896 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2897 compat = 0;
2898 ANYOF_CLASS_ZERO(data->start_class);
2899 ANYOF_BITMAP_ZERO(data->start_class);
2900 if (compat) {
2901 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2902 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2903 data->start_class->flags |= ANYOF_FOLD;
2904 if (OP(scan) == EXACTFL)
2905 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2906 }
8aa23a47
YO
2907 }
2908 else if (flags & SCF_DO_STCLASS_OR) {
2909 if (data->start_class->flags & ANYOF_FOLD) {
2910 /* false positive possible if the class is case-folded.
2911 Assume that the locale settings are the same... */
1aa99e6b
IH
2912 if (uc < 0x100)
2913 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2914 data->start_class->flags &= ~ANYOF_EOS;
2915 }
8aa23a47 2916 cl_and(data->start_class, and_withp);
653099ff 2917 }
8aa23a47
YO
2918 flags &= ~SCF_DO_STCLASS;
2919 }
2920 else if (strchr((const char*)PL_varies,OP(scan))) {
2921 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2922 I32 f = flags, pos_before = 0;
2923 regnode * const oscan = scan;
2924 struct regnode_charclass_class this_class;
2925 struct regnode_charclass_class *oclass = NULL;
2926 I32 next_is_eval = 0;
2927
2928 switch (PL_regkind[OP(scan)]) {
2929 case WHILEM: /* End of (?:...)* . */
2930 scan = NEXTOPER(scan);
2931 goto finish;
2932 case PLUS:
2933 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2934 next = NEXTOPER(scan);
2935 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2936 mincount = 1;
2937 maxcount = REG_INFTY;
2938 next = regnext(scan);
2939 scan = NEXTOPER(scan);
2940 goto do_curly;
2941 }
2942 }
2943 if (flags & SCF_DO_SUBSTR)
2944 data->pos_min++;
2945 min++;
2946 /* Fall through. */
2947 case STAR:
2948 if (flags & SCF_DO_STCLASS) {
2949 mincount = 0;
2950 maxcount = REG_INFTY;
2951 next = regnext(scan);
2952 scan = NEXTOPER(scan);
2953 goto do_curly;
2954 }
2955 is_inf = is_inf_internal = 1;
2956 scan = regnext(scan);
c277df42 2957 if (flags & SCF_DO_SUBSTR) {
304ee84b 2958 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2959 data->longest = &(data->longest_float);
c277df42 2960 }
8aa23a47
YO
2961 goto optimize_curly_tail;
2962 case CURLY:
2963 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2964 && (scan->flags == stopparen))
2965 {
2966 mincount = 1;
2967 maxcount = 1;
2968 } else {
2969 mincount = ARG1(scan);
2970 maxcount = ARG2(scan);
653099ff 2971 }
8aa23a47
YO
2972 next = regnext(scan);
2973 if (OP(scan) == CURLYX) {
2974 I32 lp = (data ? *(data->last_closep) : 0);
2975 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2976 }
8aa23a47
YO
2977 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2978 next_is_eval = (OP(scan) == EVAL);
2979 do_curly:
2980 if (flags & SCF_DO_SUBSTR) {
304ee84b 2981 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 2982 pos_before = data->pos_min;
b45f050a 2983 }
8aa23a47
YO
2984 if (data) {
2985 fl = data->flags;
2986 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2987 if (is_inf)
2988 data->flags |= SF_IS_INF;
2989 }
2990 if (flags & SCF_DO_STCLASS) {
2991 cl_init(pRExC_state, &this_class);
2992 oclass = data->start_class;
2993 data->start_class = &this_class;
2994 f |= SCF_DO_STCLASS_AND;
2995 f &= ~SCF_DO_STCLASS_OR;
2996 }
2997 /* These are the cases when once a subexpression
2998 fails at a particular position, it cannot succeed
2999 even after backtracking at the enclosing scope.
3000
3001 XXXX what if minimal match and we are at the
3002 initial run of {n,m}? */
3003 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3004 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3005
8aa23a47
YO
3006 /* This will finish on WHILEM, setting scan, or on NULL: */
3007 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3008 last, data, stopparen, recursed, NULL,
3009 (mincount == 0
3010 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3011
8aa23a47
YO
3012 if (flags & SCF_DO_STCLASS)
3013 data->start_class = oclass;
3014 if (mincount == 0 || minnext == 0) {
3015 if (flags & SCF_DO_STCLASS_OR) {
3016 cl_or(pRExC_state, data->start_class, &this_class);
3017 }
3018 else if (flags & SCF_DO_STCLASS_AND) {
3019 /* Switch to OR mode: cache the old value of
3020 * data->start_class */
3021 INIT_AND_WITHP;
3022 StructCopy(data->start_class, and_withp,
3023 struct regnode_charclass_class);
3024 flags &= ~SCF_DO_STCLASS_AND;
3025 StructCopy(&this_class, data->start_class,
3026 struct regnode_charclass_class);
3027 flags |= SCF_DO_STCLASS_OR;
3028 data->start_class->flags |= ANYOF_EOS;
3029 }
3030 } else { /* Non-zero len */
3031 if (flags & SCF_DO_STCLASS_OR) {
3032 cl_or(pRExC_state, data->start_class, &this_class);
3033 cl_and(data->start_class, and_withp);
3034 }
3035 else if (flags & SCF_DO_STCLASS_AND)
3036 cl_and(data->start_class, &this_class);
3037 flags &= ~SCF_DO_STCLASS;
3038 }
3039 if (!scan) /* It was not CURLYX, but CURLY. */
3040 scan = next;
3041 if ( /* ? quantifier ok, except for (?{ ... }) */
3042 (next_is_eval || !(mincount == 0 && maxcount == 1))
3043 && (minnext == 0) && (deltanext == 0)
3044 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3045 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3046 && ckWARN(WARN_REGEXP))
3047 {
3048 vWARN(RExC_parse,
3049 "Quantifier unexpected on zero-length expression");
3050 }
3051
3052 min += minnext * mincount;
3053 is_inf_internal |= ((maxcount == REG_INFTY
3054 && (minnext + deltanext) > 0)
3055 || deltanext == I32_MAX);
3056 is_inf |= is_inf_internal;
3057 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3058
3059 /* Try powerful optimization CURLYX => CURLYN. */
3060 if ( OP(oscan) == CURLYX && data
3061 && data->flags & SF_IN_PAR
3062 && !(data->flags & SF_HAS_EVAL)
3063 && !deltanext && minnext == 1 ) {
3064 /* Try to optimize to CURLYN. */
3065 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3066 regnode * const nxt1 = nxt;
497b47a8 3067#ifdef DEBUGGING
8aa23a47 3068 regnode *nxt2;
497b47a8 3069#endif
c277df42 3070
8aa23a47
YO
3071 /* Skip open. */
3072 nxt = regnext(nxt);
3073 if (!strchr((const char*)PL_simple,OP(nxt))
3074 && !(PL_regkind[OP(nxt)] == EXACT
3075 && STR_LEN(nxt) == 1))
3076 goto nogo;
497b47a8 3077#ifdef DEBUGGING
8aa23a47 3078 nxt2 = nxt;
497b47a8 3079#endif
8aa23a47
YO
3080 nxt = regnext(nxt);
3081 if (OP(nxt) != CLOSE)
3082 goto nogo;
3083 if (RExC_open_parens) {
3084 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3085 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3086 }
3087 /* Now we know that nxt2 is the only contents: */
3088 oscan->flags = (U8)ARG(nxt);
3089 OP(oscan) = CURLYN;
3090 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3091
c277df42 3092#ifdef DEBUGGING
8aa23a47
YO
3093 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3094 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3095 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3096 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3097 OP(nxt + 1) = OPTIMIZED; /* was count. */
3098 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3099#endif
8aa23a47
YO
3100 }
3101 nogo:
3102
3103 /* Try optimization CURLYX => CURLYM. */
3104 if ( OP(oscan) == CURLYX && data
3105 && !(data->flags & SF_HAS_PAR)
3106 && !(data->flags & SF_HAS_EVAL)
3107 && !deltanext /* atom is fixed width */
3108 && minnext != 0 /* CURLYM can't handle zero width */
3109 ) {
3110 /* XXXX How to optimize if data == 0? */
3111 /* Optimize to a simpler form. */
3112 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3113 regnode *nxt2;
3114
3115 OP(oscan) = CURLYM;
3116 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3117 && (OP(nxt2) != WHILEM))
3118 nxt = nxt2;
3119 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3120 /* Need to optimize away parenths. */
3121 if (data->flags & SF_IN_PAR) {
3122 /* Set the parenth number. */
3123 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3124
3125 if (OP(nxt) != CLOSE)
3126 FAIL("Panic opt close");
3127 oscan->flags = (U8)ARG(nxt);
3128 if (RExC_open_parens) {
3129 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3130 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3131 }
8aa23a47
YO
3132 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3133 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3134
c277df42 3135#ifdef DEBUGGING
8aa23a47
YO
3136 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3137 OP(nxt + 1) = OPTIMIZED; /* was count. */
3138 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3139 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3140#endif
c277df42 3141#if 0
8aa23a47
YO
3142 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3143 regnode *nnxt = regnext(nxt1);
3144
3145 if (nnxt == nxt) {
3146 if (reg_off_by_arg[OP(nxt1)])
3147 ARG_SET(nxt1, nxt2 - nxt1);
3148 else if (nxt2 - nxt1 < U16_MAX)
3149 NEXT_OFF(nxt1) = nxt2 - nxt1;
3150 else
3151 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3152 }
8aa23a47 3153 nxt1 = nnxt;
c277df42 3154 }
5d1c421c 3155#endif
8aa23a47
YO
3156 /* Optimize again: */
3157 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3158 NULL, stopparen, recursed, NULL, 0,depth+1);
3159 }
3160 else
3161 oscan->flags = 0;
3162 }
3163 else if ((OP(oscan) == CURLYX)
3164 && (flags & SCF_WHILEM_VISITED_POS)
3165 /* See the comment on a similar expression above.
3166 However, this time it not a subexpression
3167 we care about, but the expression itself. */
3168 && (maxcount == REG_INFTY)
3169 && data && ++data->whilem_c < 16) {
3170 /* This stays as CURLYX, we can put the count/of pair. */
3171 /* Find WHILEM (as in regexec.c) */
3172 regnode *nxt = oscan + NEXT_OFF(oscan);
3173
3174 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3175 nxt += ARG(nxt);
3176 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3177 | (RExC_whilem_seen << 4)); /* On WHILEM */
3178 }
3179 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3180 pars++;
3181 if (flags & SCF_DO_SUBSTR) {
3182 SV *last_str = NULL;
3183 int counted = mincount != 0;
a0ed51b3 3184
8aa23a47
YO
3185 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3186#if defined(SPARC64_GCC_WORKAROUND)
3187 I32 b = 0;
3188 STRLEN l = 0;
3189 const char *s = NULL;
3190 I32 old = 0;
b515a41d 3191
8aa23a47
YO
3192 if (pos_before >= data->last_start_min)
3193 b = pos_before;
3194 else
3195 b = data->last_start_min;
b515a41d 3196
8aa23a47
YO
3197 l = 0;
3198 s = SvPV_const(data->last_found, l);
3199 old = b - data->last_start_min;
3200
3201#else
3202 I32 b = pos_before >= data->last_start_min
3203 ? pos_before : data->last_start_min;
3204 STRLEN l;
3205 const char * const s = SvPV_const(data->last_found, l);
3206 I32 old = b - data->last_start_min;
3207#endif
3208
3209 if (UTF)
3210 old = utf8_hop((U8*)s, old) - (U8*)s;
3211
3212 l -= old;
3213 /* Get the added string: */
3214 last_str = newSVpvn(s + old, l);
3215 if (UTF)
3216 SvUTF8_on(last_str);
3217 if (deltanext == 0 && pos_before == b) {
3218 /* What was added is a constant string */
3219 if (mincount > 1) {
3220 SvGROW(last_str, (mincount * l) + 1);
3221 repeatcpy(SvPVX(last_str) + l,
3222 SvPVX_const(last_str), l, mincount - 1);
3223 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3224 /* Add additional parts. */
3225 SvCUR_set(data->last_found,
3226 SvCUR(data->last_found) - l);
3227 sv_catsv(data->last_found, last_str);
3228 {
3229 SV * sv = data->last_found;
3230 MAGIC *mg =
3231 SvUTF8(sv) && SvMAGICAL(sv) ?
3232 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3233 if (mg && mg->mg_len >= 0)
3234 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3235 }
8aa23a47 3236 data->last_end += l * (mincount - 1);
b515a41d 3237 }
8aa23a47
YO
3238 } else {
3239 /* start offset must point into the last copy */
3240 data->last_start_min += minnext * (mincount - 1);
3241 data->last_start_max += is_inf ? I32_MAX
3242 : (maxcount - 1) * (minnext + data->pos_delta);
3243 }
c277df42 3244 }
8aa23a47
YO
3245 /* It is counted once already... */
3246 data->pos_min += minnext * (mincount - counted);
3247 data->pos_delta += - counted * deltanext +
3248 (minnext + deltanext) * maxcount - minnext * mincount;
3249 if (mincount != maxcount) {
3250 /* Cannot extend fixed substrings found inside
3251 the group. */
304ee84b 3252 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3253 if (mincount && last_str) {
3254 SV * const sv = data->last_found;
3255 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3256 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3257
3258 if (mg)
3259 mg->mg_len = -1;
3260 sv_setsv(sv, last_str);
3261 data->last_end = data->pos_min;
3262 data->last_start_min =
3263 data->pos_min - CHR_SVLEN(last_str);
3264 data->last_start_max = is_inf
3265 ? I32_MAX
3266 : data->pos_min + data->pos_delta
3267 - CHR_SVLEN(last_str);
3268 }
3269 data->longest = &(data->longest_float);
3270 }
3271 SvREFCNT_dec(last_str);
c277df42 3272 }
8aa23a47
YO
3273 if (data && (fl & SF_HAS_EVAL))
3274 data->flags |= SF_HAS_EVAL;
3275 optimize_curly_tail:
3276 if (OP(oscan) != CURLYX) {
3277 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3278 && NEXT_OFF(next))
3279 NEXT_OFF(oscan) += NEXT_OFF(next);
3280 }
3281 continue;
3282 default: /* REF and CLUMP only? */
3283 if (flags & SCF_DO_SUBSTR) {
304ee84b 3284 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3285 data->longest = &(data->longest_float);
3286 }
3287 is_inf = is_inf_internal = 1;
3288 if (flags & SCF_DO_STCLASS_OR)
3289 cl_anything(pRExC_state, data->start_class);
3290 flags &= ~SCF_DO_STCLASS;
3291 break;
c277df42 3292 }
8aa23a47
YO
3293 }
3294 else if (strchr((const char*)PL_simple,OP(scan))) {
3295 int value = 0;
653099ff 3296
8aa23a47 3297 if (flags & SCF_DO_SUBSTR) {
304ee84b 3298 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3299 data->pos_min++;
3300 }
3301 min++;
3302 if (flags & SCF_DO_STCLASS) {
3303 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3304
8aa23a47
YO
3305 /* Some of the logic below assumes that switching
3306 locale on will only add false positives. */
3307 switch (PL_regkind[OP(scan)]) {
3308 case SANY:
3309 default:
3310 do_default:
3311 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3312 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3313 cl_anything(pRExC_state, data->start_class);
3314 break;
3315 case REG_ANY:
3316 if (OP(scan) == SANY)
3317 goto do_default;
3318 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3319 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3320 || (data->start_class->flags & ANYOF_CLASS));
3321 cl_anything(pRExC_state, data->start_class);
653099ff 3322 }
8aa23a47
YO
3323 if (flags & SCF_DO_STCLASS_AND || !value)
3324 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3325 break;
3326 case ANYOF:
3327 if (flags & SCF_DO_STCLASS_AND)
3328 cl_and(data->start_class,
3329 (struct regnode_charclass_class*)scan);
653099ff 3330 else
8aa23a47
YO
3331 cl_or(pRExC_state, data->start_class,
3332 (struct regnode_charclass_class*)scan);
3333 break;
3334 case ALNUM:
3335 if (flags & SCF_DO_STCLASS_AND) {
3336 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3337 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3338 for (value = 0; value < 256; value++)
3339 if (!isALNUM(value))
3340 ANYOF_BITMAP_CLEAR(data->start_class, value);
3341 }
653099ff 3342 }
8aa23a47
YO
3343 else {
3344 if (data->start_class->flags & ANYOF_LOCALE)
3345 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3346 else {
3347 for (value = 0; value < 256; value++)
3348 if (isALNUM(value))
3349 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3350 }
8aa23a47
YO
3351 }
3352 break;
3353 case ALNUML:
3354 if (flags & SCF_DO_STCLASS_AND) {
3355 if (data->start_class->flags & ANYOF_LOCALE)
3356 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3357 }
3358 else {
3359 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3360 data->start_class->flags |= ANYOF_LOCALE;
3361 }
3362 break;
3363 case NALNUM:
3364 if (flags & SCF_DO_STCLASS_AND) {
3365 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3366 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3367 for (value = 0; value < 256; value++)
3368 if (isALNUM(value))
3369 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3370 }
3371 }
8aa23a47
YO
3372 else {
3373 if (data->start_class->flags & ANYOF_LOCALE)
3374 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3375 else {
3376 for (value = 0; value < 256; value++)
3377 if (!isALNUM(value))
3378 ANYOF_BITMAP_SET(data->start_class, value);
3379 }
653099ff 3380 }
8aa23a47
YO
3381 break;
3382 case NALNUML:
3383 if (flags & SCF_DO_STCLASS_AND) {
3384 if (data->start_class->flags & ANYOF_LOCALE)
3385 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3386 }
8aa23a47
YO
3387 else {
3388 data->start_class->flags |= ANYOF_LOCALE;
3389 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3390 }
3391 break;
3392 case SPACE:
3393 if (flags & SCF_DO_STCLASS_AND) {
3394 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3395 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3396 for (value = 0; value < 256; value++)
3397 if (!isSPACE(value))
3398 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3399 }
3400 }
8aa23a47
YO
3401 else {
3402 if (data->start_class->flags & ANYOF_LOCALE)
3403 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3404 else {
3405 for (value = 0; value < 256; value++)
3406 if (isSPACE(value))
3407 ANYOF_BITMAP_SET(data->start_class, value);
3408 }
653099ff 3409 }
8aa23a47
YO
3410 break;
3411 case SPACEL:
3412 if (flags & SCF_DO_STCLASS_AND) {
3413 if (data->start_class->flags & ANYOF_LOCALE)
3414 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3415 }
3416 else {
3417 data->start_class->flags |= ANYOF_LOCALE;
3418 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3419 }
3420 break;
3421 case NSPACE:
3422 if (flags & SCF_DO_STCLASS_AND) {
3423 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3425 for (value = 0; value < 256; value++)
3426 if (isSPACE(value))
3427 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3428 }
8aa23a47
YO
3429 }
3430 else {
3431 if (data->start_class->flags & ANYOF_LOCALE)
3432 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3433 else {
3434 for (value = 0; value < 256; value++)
3435 if (!isSPACE(value))
3436 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3437 }
3438 }
8aa23a47
YO
3439 break;
3440 case NSPACEL:
3441 if (flags & SCF_DO_STCLASS_AND) {
3442 if (data->start_class->flags & ANYOF_LOCALE) {
3443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3444 for (value = 0; value < 256; value++)
3445 if (!isSPACE(value))
3446 ANYOF_BITMAP_CLEAR(data->start_class, value);
3447 }
653099ff 3448 }
8aa23a47
YO
3449 else {
3450 data->start_class->flags |= ANYOF_LOCALE;
3451 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3452 }
3453 break;
3454 case DIGIT:
3455 if (flags & SCF_DO_STCLASS_AND) {
3456 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3457 for (value = 0; value < 256; value++)
3458 if (!isDIGIT(value))
3459 ANYOF_BITMAP_CLEAR(data->start_class, value);
3460 }
3461 else {
3462 if (data->start_class->flags & ANYOF_LOCALE)
3463 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3464 else {
3465 for (value = 0; value < 256; value++)
3466 if (isDIGIT(value))
3467 ANYOF_BITMAP_SET(data->start_class, value);
3468 }
3469 }
3470 break;
3471 case NDIGIT:
3472 if (flags & SCF_DO_STCLASS_AND) {
3473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3474 for (value = 0; value < 256; value++)
3475 if (isDIGIT(value))
3476 ANYOF_BITMAP_CLEAR(data->start_class, value);
3477 }
3478 else {
3479 if (data->start_class->flags & ANYOF_LOCALE)
3480 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3481 else {
3482 for (value = 0; value < 256; value++)
3483 if (!isDIGIT(value))
3484 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3485 }
3486 }
8aa23a47
YO
3487 break;
3488 }
3489 if (flags & SCF_DO_STCLASS_OR)
3490 cl_and(data->start_class, and_withp);
3491 flags &= ~SCF_DO_STCLASS;
3492 }
3493 }
3494 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3495 data->flags |= (OP(scan) == MEOL
3496 ? SF_BEFORE_MEOL
3497 : SF_BEFORE_SEOL);
3498 }
3499 else if ( PL_regkind[OP(scan)] == BRANCHJ
3500 /* Lookbehind, or need to calculate parens/evals/stclass: */
3501 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3502 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3503 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3504 || OP(scan) == UNLESSM )
3505 {
3506 /* Negative Lookahead/lookbehind
3507 In this case we can't do fixed string optimisation.
3508 */
1de06328 3509
8aa23a47
YO
3510 I32 deltanext, minnext, fake = 0;
3511 regnode *nscan;
3512 struct regnode_charclass_class intrnl;
3513 int f = 0;
1de06328 3514
8aa23a47
YO
3515 data_fake.flags = 0;
3516 if (data) {
3517 data_fake.whilem_c = data->whilem_c;
3518 data_fake.last_closep = data->last_closep;
c277df42 3519 }
8aa23a47
YO
3520 else
3521 data_fake.last_closep = &fake;
58e23c8d 3522 data_fake.pos_delta = delta;
8aa23a47
YO
3523 if ( flags & SCF_DO_STCLASS && !scan->flags
3524 && OP(scan) == IFMATCH ) { /* Lookahead */
3525 cl_init(pRExC_state, &intrnl);
3526 data_fake.start_class = &intrnl;
3527 f |= SCF_DO_STCLASS_AND;
3528 }
3529 if (flags & SCF_WHILEM_VISITED_POS)
3530 f |= SCF_WHILEM_VISITED_POS;
3531 next = regnext(scan);
3532 nscan = NEXTOPER(NEXTOPER(scan));
3533 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3534 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3535 if (scan->flags) {
3536 if (deltanext) {
58e23c8d 3537 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3538 }
3539 else if (minnext > (I32)U8_MAX) {
58e23c8d 3540 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3541 }
3542 scan->flags = (U8)minnext;
3543 }
3544 if (data) {
3545 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3546 pars++;
3547 if (data_fake.flags & SF_HAS_EVAL)
3548 data->flags |= SF_HAS_EVAL;
3549 data->whilem_c = data_fake.whilem_c;
3550 }
3551 if (f & SCF_DO_STCLASS_AND) {
3552 const int was = (data->start_class->flags & ANYOF_EOS);
3553
3554 cl_and(data->start_class, &intrnl);
3555 if (was)
3556 data->start_class->flags |= ANYOF_EOS;
3557 }
cb434fcc 3558 }
8aa23a47
YO
3559#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3560 else {
3561 /* Positive Lookahead/lookbehind
3562 In this case we can do fixed string optimisation,
3563 but we must be careful about it. Note in the case of
3564 lookbehind the positions will be offset by the minimum
3565 length of the pattern, something we won't know about
3566 until after the recurse.
3567 */
3568 I32 deltanext, fake = 0;
3569 regnode *nscan;
3570 struct regnode_charclass_class intrnl;
3571 int f = 0;
3572 /* We use SAVEFREEPV so that when the full compile
3573 is finished perl will clean up the allocated
3574 minlens when its all done. This was we don't
3575 have to worry about freeing them when we know
3576 they wont be used, which would be a pain.
3577 */
3578 I32 *minnextp;
3579 Newx( minnextp, 1, I32 );
3580 SAVEFREEPV(minnextp);
3581
3582 if (data) {
3583 StructCopy(data, &data_fake, scan_data_t);
3584 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3585 f |= SCF_DO_SUBSTR;
3586 if (scan->flags)
304ee84b 3587 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3588 data_fake.last_found=newSVsv(data->last_found);
3589 }
3590 }
3591 else
3592 data_fake.last_closep = &fake;
3593 data_fake.flags = 0;
58e23c8d 3594 data_fake.pos_delta = delta;
8aa23a47
YO
3595 if (is_inf)
3596 data_fake.flags |= SF_IS_INF;
3597 if ( flags & SCF_DO_STCLASS && !scan->flags
3598 && OP(scan) == IFMATCH ) { /* Lookahead */
3599 cl_init(pRExC_state, &intrnl);
3600 data_fake.start_class = &intrnl;
3601 f |= SCF_DO_STCLASS_AND;
3602 }
3603 if (flags & SCF_WHILEM_VISITED_POS)
3604 f |= SCF_WHILEM_VISITED_POS;
3605 next = regnext(scan);
3606 nscan = NEXTOPER(NEXTOPER(scan));
3607
3608 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3609 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3610 if (scan->flags) {
3611 if (deltanext) {
58e23c8d 3612 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3613 }
3614 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3615 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3616 }
3617 scan->flags = (U8)*minnextp;
3618 }
3619
3620 *minnextp += min;
3621
3622 if (f & SCF_DO_STCLASS_AND) {
3623 const int was = (data->start_class->flags & ANYOF_EOS);
3624
3625 cl_and(data->start_class, &intrnl);
3626 if (was)
3627 data->start_class->flags |= ANYOF_EOS;
3628 }
3629 if (data) {
3630 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3631 pars++;
3632 if (data_fake.flags & SF_HAS_EVAL)
3633 data->flags |= SF_HAS_EVAL;
3634 data->whilem_c = data_fake.whilem_c;
3635 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3636 if (RExC_rx->minlen<*minnextp)
3637 RExC_rx->minlen=*minnextp;
304ee84b 3638 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3639 SvREFCNT_dec(data_fake.last_found);
3640
3641 if ( data_fake.minlen_fixed != minlenp )
3642 {
3643 data->offset_fixed= data_fake.offset_fixed;
3644 data->minlen_fixed= data_fake.minlen_fixed;
3645 data->lookbehind_fixed+= scan->flags;
3646 }
3647 if ( data_fake.minlen_float != minlenp )
3648 {
3649 data->minlen_float= data_fake.minlen_float;
3650 data->offset_float_min=data_fake.offset_float_min;
3651 data->offset_float_max=data_fake.offset_float_max;
3652 data->lookbehind_float+= scan->flags;
3653 }
3654 }
3655 }
3656
3657
40d049e4 3658 }
8aa23a47
YO
3659#endif
3660 }
3661 else if (OP(scan) == OPEN) {
3662 if (stopparen != (I32)ARG(scan))
3663 pars++;
3664 }
3665 else if (OP(scan) == CLOSE) {
3666 if (stopparen == (I32)ARG(scan)) {
3667 break;
3668 }
3669 if ((I32)ARG(scan) == is_par) {
3670 next = regnext(scan);
b515a41d 3671
8aa23a47
YO
3672 if ( next && (OP(next) != WHILEM) && next < last)
3673 is_par = 0; /* Disable optimization */
40d049e4 3674 }
8aa23a47
YO
3675 if (data)
3676 *(data->last_closep) = ARG(scan);
3677 }
3678 else if (OP(scan) == EVAL) {
c277df42
IZ
3679 if (data)
3680 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3681 }
3682 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3683 if (flags & SCF_DO_SUBSTR) {
304ee84b 3684 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3685 flags &= ~SCF_DO_SUBSTR;
40d049e4 3686 }
8aa23a47
YO
3687 if (data && OP(scan)==ACCEPT) {
3688 data->flags |= SCF_SEEN_ACCEPT;
3689 if (stopmin > min)
3690 stopmin = min;
e2e6a0f1 3691 }
8aa23a47
YO
3692 }
3693 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3694 {
0f5d15d6 3695 if (flags & SCF_DO_SUBSTR) {
304ee84b 3696 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3697 data->longest = &(data->longest_float);
3698 }
3699 is_inf = is_inf_internal = 1;
653099ff 3700 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3701 cl_anything(pRExC_state, data->start_class);
96776eda 3702 flags &= ~SCF_DO_STCLASS;
8aa23a47 3703 }
58e23c8d 3704 else if (OP(scan) == GPOS) {
bbe252da 3705 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3706 !(delta || is_inf || (data && data->pos_delta)))
3707 {
bbe252da
YO
3708 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3709 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3710 if (RExC_rx->gofs < (U32)min)
3711 RExC_rx->gofs = min;
3712 } else {
bbe252da 3713 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3714 RExC_rx->gofs = 0;
3715 }
3716 }
786e8c11 3717#ifdef TRIE_STUDY_OPT
40d049e4 3718#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3719 else if (PL_regkind[OP(scan)] == TRIE) {
3720 /* NOTE - There is similar code to this block above for handling
3721 BRANCH nodes on the initial study. If you change stuff here
3722 check there too. */
3723 regnode *trie_node= scan;
3724 regnode *tail= regnext(scan);
f8fc2ecf 3725 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3726 I32 max1 = 0, min1 = I32_MAX;
3727 struct regnode_charclass_class accum;
3728
3729 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3730 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3731 if (flags & SCF_DO_STCLASS)
3732 cl_init_zero(pRExC_state, &accum);
3733
3734 if (!trie->jump) {
3735 min1= trie->minlen;
3736 max1= trie->maxlen;
3737 } else {
3738 const regnode *nextbranch= NULL;
3739 U32 word;
3740
3741 for ( word=1 ; word <= trie->wordcount ; word++)
3742 {
3743 I32 deltanext=0, minnext=0, f = 0, fake;
3744 struct regnode_charclass_class this_class;
3745
3746 data_fake.flags = 0;
3747 if (data) {
3748 data_fake.whilem_c = data->whilem_c;
3749 data_fake.last_closep = data->last_closep;
3750 }
3751 else
3752 data_fake.last_closep = &fake;
58e23c8d 3753 data_fake.pos_delta = delta;
8aa23a47
YO
3754 if (flags & SCF_DO_STCLASS) {
3755 cl_init(pRExC_state, &this_class);
3756 data_fake.start_class = &this_class;
3757 f = SCF_DO_STCLASS_AND;
3758 }
3759 if (flags & SCF_WHILEM_VISITED_POS)
3760 f |= SCF_WHILEM_VISITED_POS;
3761
3762 if (trie->jump[word]) {
3763 if (!nextbranch)
3764 nextbranch = trie_node + trie->jump[0];
3765 scan= trie_node + trie->jump[word];
3766 /* We go from the jump point to the branch that follows
3767 it. Note this means we need the vestigal unused branches
3768 even though they arent otherwise used.
3769 */
3770 minnext = study_chunk(pRExC_state, &scan, minlenp,
3771 &deltanext, (regnode *)nextbranch, &data_fake,
3772 stopparen, recursed, NULL, f,depth+1);
3773 }
3774 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3775 nextbranch= regnext((regnode*)nextbranch);
3776
3777 if (min1 > (I32)(minnext + trie->minlen))
3778 min1 = minnext + trie->minlen;
3779 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3780 max1 = minnext + deltanext + trie->maxlen;
3781 if (deltanext == I32_MAX)
3782 is_inf = is_inf_internal = 1;
3783
3784 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3785 pars++;
3786 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3787 if ( stopmin > min + min1)
3788 stopmin = min + min1;
3789 flags &= ~SCF_DO_SUBSTR;
3790 if (data)
3791 data->flags |= SCF_SEEN_ACCEPT;
3792 }
3793 if (data) {
3794 if (data_fake.flags & SF_HAS_EVAL)
3795 data->flags |= SF_HAS_EVAL;
3796 data->whilem_c = data_fake.whilem_c;
3797 }
3798 if (flags & SCF_DO_STCLASS)
3799 cl_or(pRExC_state, &accum, &this_class);
3800 }
3801 }
3802 if (flags & SCF_DO_SUBSTR) {
3803 data->pos_min += min1;
3804 data->pos_delta += max1 - min1;
3805 if (max1 != min1 || is_inf)
3806 data->longest = &(data->longest_float);
3807 }
3808 min += min1;
3809 delta += max1 - min1;
3810 if (flags & SCF_DO_STCLASS_OR) {
3811 cl_or(pRExC_state, data->start_class, &accum);
3812 if (min1) {
3813 cl_and(data->start_class, and_withp);
3814 flags &= ~SCF_DO_STCLASS;
3815 }
3816 }
3817 else if (flags & SCF_DO_STCLASS_AND) {
3818 if (min1) {
3819 cl_and(data->start_class, &accum);
3820 flags &= ~SCF_DO_STCLASS;
3821 }
3822 else {
3823 /* Switch to OR mode: cache the old value of
3824 * data->start_class */
3825 INIT_AND_WITHP;
3826 StructCopy(data->start_class, and_withp,
3827 struct regnode_charclass_class);
3828 flags &= ~SCF_DO_STCLASS_AND;
3829 StructCopy(&accum, data->start_class,
3830 struct regnode_charclass_class);
3831 flags |= SCF_DO_STCLASS_OR;
3832 data->start_class->flags |= ANYOF_EOS;
3833 }
3834 }
3835 scan= tail;
3836 continue;
3837 }
786e8c11 3838#else
8aa23a47 3839 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3840 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3841 U8*bang=NULL;
3842
3843 min += trie->minlen;
3844 delta += (trie->maxlen - trie->minlen);
3845 flags &= ~SCF_DO_STCLASS; /* xxx */
3846 if (flags & SCF_DO_SUBSTR) {
304ee84b 3847 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3848 data->pos_min += trie->minlen;
3849 data->pos_delta += (trie->maxlen - trie->minlen);
3850 if (trie->maxlen != trie->minlen)
3851 data->longest = &(data->longest_float);
3852 }
3853 if (trie->jump) /* no more substrings -- for now /grr*/
3854 flags &= ~SCF_DO_SUBSTR;
b515a41d 3855 }
8aa23a47
YO
3856#endif /* old or new */
3857#endif /* TRIE_STUDY_OPT */
3858 /* Else: zero-length, ignore. */
3859 scan = regnext(scan);
3860 }
3861 if (frame) {
3862 last = frame->last;
3863 scan = frame->next;
3864 stopparen = frame->stop;
3865 frame = frame->prev;
3866 goto fake_study_recurse;
c277df42
IZ
3867 }
3868
3869 finish:
8aa23a47 3870 assert(!frame);
304ee84b 3871 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3872
c277df42 3873 *scanp = scan;
aca2d497 3874 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3875 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3876 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3877 if (is_par > (I32)U8_MAX)
c277df42
IZ
3878 is_par = 0;
3879 if (is_par && pars==1 && data) {
3880 data->flags |= SF_IN_PAR;
3881 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3882 }
3883 else if (pars && data) {
c277df42
IZ
3884 data->flags |= SF_HAS_PAR;
3885 data->flags &= ~SF_IN_PAR;
3886 }
653099ff 3887 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3888 cl_and(data->start_class, and_withp);
786e8c11
YO
3889 if (flags & SCF_TRIE_RESTUDY)
3890 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3891
304ee84b 3892 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3893
e2e6a0f1 3894 return min < stopmin ? min : stopmin;
c277df42
IZ
3895}
3896
2eccd3b2
NC
3897STATIC U32
3898S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3899{
4a4e7719
NC
3900 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3901
3902 Renewc(RExC_rxi->data,
3903 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3904 char, struct reg_data);
3905 if(count)
f8fc2ecf 3906 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3907 else
f8fc2ecf 3908 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3909 RExC_rxi->data->count = count + n;
3910 Copy(s, RExC_rxi->data->what + count, n, U8);
3911 return count;
c277df42
IZ
3912}
3913
f8149455 3914/*XXX: todo make this not included in a non debugging perl */
76234dfb 3915#ifndef PERL_IN_XSUB_RE
d88dccdf 3916void
864dbfa3 3917Perl_reginitcolors(pTHX)
d88dccdf 3918{
97aff369 3919 dVAR;
1df70142 3920 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3921 if (s) {
1df70142
AL
3922 char *t = savepv(s);
3923 int i = 0;
3924 PL_colors[0] = t;
d88dccdf 3925 while (++i < 6) {
1df70142
AL
3926 t = strchr(t, '\t');
3927 if (t) {
3928 *t = '\0';
3929 PL_colors[i] = ++t;
d88dccdf
IZ
3930 }
3931 else
1df70142 3932 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3933 }
3934 } else {
1df70142 3935 int i = 0;
b81d288d 3936 while (i < 6)
06b5626a 3937 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3938 }
3939 PL_colorset = 1;
3940}
76234dfb 3941#endif
8615cb43 3942
07be1b83 3943
786e8c11
YO
3944#ifdef TRIE_STUDY_OPT
3945#define CHECK_RESTUDY_GOTO \
3946 if ( \
3947 (data.flags & SCF_TRIE_RESTUDY) \
3948 && ! restudied++ \
3949 ) goto reStudy
3950#else
3951#define CHECK_RESTUDY_GOTO
3952#endif
f9f4320a 3953
a687059c 3954/*
e50aee73 3955 - pregcomp - compile a regular expression into internal code
a687059c
LW
3956 *
3957 * We can't allocate space until we know how big the compiled form will be,
3958 * but we can't compile it (and thus know how big it is) until we've got a
3959 * place to put the code. So we cheat: we compile it twice, once with code
3960 * generation turned off and size counting turned on, and once "for real".
3961 * This also means that we don't allocate space until we are sure that the
3962 * thing really will compile successfully, and we never have to move the
3963 * code and thus invalidate pointers into it. (Note that it has to be in
3964 * one piece because free() must be able to free it all.) [NB: not true in perl]
3965 *
3966 * Beware that the optimization-preparation code in here knows about some
3967 * of the structure of the compiled regexp. [I'll say.]
3968 */
b9b4dddf
YO
3969
3970
3971
f9f4320a 3972#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3973#define RE_ENGINE_PTR &PL_core_reg_engine
3974#else
f9f4320a
YO
3975extern const struct regexp_engine my_reg_engine;
3976#define RE_ENGINE_PTR &my_reg_engine
3977#endif
6d5c990f
RGS
3978
3979#ifndef PERL_IN_XSUB_RE
a687059c 3980regexp *
864dbfa3 3981Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3982{
97aff369 3983 dVAR;
6d5c990f 3984 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
3985 /* Dispatch a request to compile a regexp to correct
3986 regexp engine. */
f9f4320a
YO
3987 if (table) {
3988 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 3989 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 3990 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3991 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3992 DEBUG_COMPILE_r({
8d8756e7 3993 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3994 SvIV(*ptr));
3995 });
f2f78491 3996 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3997 }
b9b4dddf 3998 }
2a5d9b1d
RGS
3999 return Perl_re_compile(aTHX_ exp, xend, pm);
4000}
6d5c990f 4001#endif
2a5d9b1d
RGS
4002
4003regexp *
4004Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4005{
4006 dVAR;
a0d0e21e 4007 register regexp *r;
f8fc2ecf 4008 register regexp_internal *ri;
c277df42 4009 regnode *scan;
c277df42 4010 regnode *first;
a0d0e21e 4011 I32 flags;
a0d0e21e
LW
4012 I32 minlen = 0;
4013 I32 sawplus = 0;
4014 I32 sawopen = 0;
2c2d71f5 4015 scan_data_t data;
830247a4 4016 RExC_state_t RExC_state;
be8e71aa 4017 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4018#ifdef TRIE_STUDY_OPT
4019 int restudied= 0;
4020 RExC_state_t copyRExC_state;
4021#endif
2a5d9b1d 4022 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4023 DEBUG_r(if (!PL_colorset) reginitcolors());
4024
a0d0e21e 4025 if (exp == NULL)
c277df42 4026 FAIL("NULL regexp argument");
a0d0e21e 4027
a5961de5 4028 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4029
5cfc7842 4030 RExC_precomp = exp;
a3621e74 4031 DEBUG_COMPILE_r({
ab3bbdeb
YO
4032 SV *dsv= sv_newmortal();
4033 RE_PV_QUOTED_DECL(s, RExC_utf8,
4034 dsv, RExC_precomp, (xend - exp), 60);
4035 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4036 PL_colors[4],PL_colors[5],s);
a5961de5 4037 });
e2509266 4038 RExC_flags = pm->op_pmflags;
830247a4 4039 RExC_sawback = 0;
bbce6d69 4040
830247a4
IZ
4041 RExC_seen = 0;
4042 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4043 RExC_seen_evals = 0;
4044 RExC_extralen = 0;
c277df42 4045
bbce6d69 4046 /* First pass: determine size, legality. */
830247a4 4047 RExC_parse = exp;
fac92740 4048 RExC_start = exp;
830247a4
IZ
4049 RExC_end = xend;
4050 RExC_naughty = 0;
4051 RExC_npar = 1;
e2e6a0f1 4052 RExC_nestroot = 0;
830247a4
IZ
4053 RExC_size = 0L;
4054 RExC_emit = &PL_regdummy;
4055 RExC_whilem_seen = 0;
fc8cd66c 4056 RExC_charnames = NULL;
40d049e4
YO
4057 RExC_open_parens = NULL;
4058 RExC_close_parens = NULL;
4059 RExC_opend = NULL;
81714fb9 4060 RExC_paren_names = NULL;
1f1031fe
YO
4061#ifdef DEBUGGING
4062 RExC_paren_name_list = NULL;
4063#endif
40d049e4
YO
4064 RExC_recurse = NULL;
4065 RExC_recurse_count = 0;
81714fb9 4066
85ddcde9
JH
4067#if 0 /* REGC() is (currently) a NOP at the first pass.
4068 * Clever compilers notice this and complain. --jhi */
830247a4 4069 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4070#endif
3dab1dad
YO
4071 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4072 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4073 RExC_precomp = NULL;
a0d0e21e
LW
4074 return(NULL);
4075 }
07be1b83 4076 DEBUG_PARSE_r({
81714fb9
YO
4077 PerlIO_printf(Perl_debug_log,
4078 "Required size %"IVdf" nodes\n"
4079 "Starting second pass (creation)\n",
4080 (IV)RExC_size);
07be1b83
YO
4081 RExC_lastnum=0;
4082 RExC_lastparse=NULL;
4083 });
c277df42
IZ
4084 /* Small enough for pointer-storage convention?
4085 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4086 if (RExC_size >= 0x10000L && RExC_extralen)
4087 RExC_size += RExC_extralen;
c277df42 4088 else
830247a4
IZ
4089 RExC_extralen = 0;
4090 if (RExC_whilem_seen > 15)
4091 RExC_whilem_seen = 15;
a0d0e21e 4092
e2e6a0f1
YO
4093#ifdef DEBUGGING
4094 /* Make room for a sentinel value at the end of the program */
4095 RExC_size++;
4096#endif
4097
f9f4320a
YO
4098 /* Allocate space and zero-initialize. Note, the two step process
4099 of zeroing when in debug mode, thus anything assigned has to
4100 happen after that */
f8fc2ecf
YO
4101 Newxz(r, 1, regexp);
4102 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4103 char, regexp_internal);
4104 if ( r == NULL || ri == NULL )
b45f050a 4105 FAIL("Regexp out of space");
0f79a09d
GS
4106#ifdef DEBUGGING
4107 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4108 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4109#else
f8fc2ecf
YO
4110 /* bulk initialize base fields with 0. */
4111 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4112#endif
58e23c8d
YO
4113
4114 /* non-zero initialization begins here */
f8fc2ecf 4115 RXi_SET( r, ri );
f9f4320a 4116 r->engine= RE_ENGINE_PTR;
c277df42 4117 r->refcnt = 1;
bbce6d69 4118 r->prelen = xend - exp;
5cfc7842 4119 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4120 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4121 r->intflags = 0;
830247a4 4122 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4123
6bda09f9 4124 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4125 Newxz(RExC_open_parens, RExC_npar,regnode *);
4126 SAVEFREEPV(RExC_open_parens);
4127 Newxz(RExC_close_parens,RExC_npar,regnode *);
4128 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4129 }
4130
4131 /* Useful during FAIL. */
f8fc2ecf
YO
4132 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4133 if (ri->offsets) {
4134 ri->offsets[0] = RExC_size;
fac92740 4135 }
a3621e74 4136 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4137 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4138 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4139 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4140
830247a4 4141 RExC_rx = r;
f8fc2ecf 4142 RExC_rxi = ri;
bbce6d69 4143
4144 /* Second pass: emit code. */
e2509266 4145 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4146 RExC_parse = exp;
4147 RExC_end = xend;
4148 RExC_naughty = 0;
4149 RExC_npar = 1;
f8fc2ecf
YO
4150 RExC_emit_start = ri->program;
4151 RExC_emit = ri->program;
e2e6a0f1
YO
4152#ifdef DEBUGGING
4153 /* put a sentinal on the end of the program so we can check for
4154 overwrites */
f8fc2ecf 4155 ri->program[RExC_size].type = 255;
e2e6a0f1 4156#endif
2cd61cdb 4157 /* Store the count of eval-groups for security checks: */
f8149455 4158 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4159 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4160 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4161 return(NULL);
6bda09f9 4162
07be1b83
YO
4163 /* XXXX To minimize changes to RE engine we always allocate
4164 3-units-long substrs field. */
4165 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4166 if (RExC_recurse_count) {
4167 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4168 SAVEFREEPV(RExC_recurse);
4169 }
a0d0e21e 4170
07be1b83 4171reStudy:
1de06328 4172 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4173 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4174
07be1b83
YO
4175#ifdef TRIE_STUDY_OPT
4176 if ( restudied ) {
5d458dd8 4177 U32 seen=RExC_seen;
07be1b83 4178 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4179
4180 RExC_state = copyRExC_state;
4181 if (seen & REG_TOP_LEVEL_BRANCHES)
4182 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4183 else
4184 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4185 if (data.last_found) {
07be1b83 4186 SvREFCNT_dec(data.longest_fixed);
07be1b83 4187 SvREFCNT_dec(data.longest_float);
07be1b83 4188 SvREFCNT_dec(data.last_found);
1de06328 4189 }
40d049e4 4190 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4191 } else {
40d049e4 4192 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4193 copyRExC_state = RExC_state;
07be1b83 4194 }
40d049e4
YO
4195#else
4196 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4197#endif
fc8cd66c 4198
a0d0e21e 4199 /* Dig out information for optimizations. */
bbe252da 4200 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4201 pm->op_pmflags = RExC_flags;
a0ed51b3 4202 if (UTF)
bbe252da 4203 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4204 ri->regstclass = NULL;
830247a4 4205 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4206 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4207 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4208
1de06328
YO
4209 /* testing for BRANCH here tells us whether there is "must appear"
4210 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4211 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4212 I32 fake;
c5254dd6 4213 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4214 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4215 int stclass_flag;
07be1b83 4216 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4217
4218 first = scan;
c277df42 4219 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4220 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4221 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4222 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4223 /* for now we can't handle lookbehind IFMATCH*/
4224 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4225 (OP(first) == PLUS) ||
4226 (OP(first) == MINMOD) ||
653099ff 4227 /* An {n,m} with n>0 */
07be1b83
YO
4228 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4229 {
786e8c11 4230
a0d0e21e
LW
4231 if (OP(first) == PLUS)
4232 sawplus = 1;
4233 else
3dab1dad 4234 first += regarglen[OP(first)];
07be1b83
YO
4235 if (OP(first) == IFMATCH) {
4236 first = NEXTOPER(first);
4237 first += EXTRA_STEP_2ARGS;
7c167cea 4238 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4239 first = NEXTOPER(first);
a687059c
LW
4240 }
4241
a0d0e21e
LW
4242 /* Starting-point info. */
4243 again:
786e8c11 4244 DEBUG_PEEP("first:",first,0);
07be1b83 4245 /* Ignore EXACT as we deal with it later. */
3dab1dad 4246 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4247 if (OP(first) == EXACT)
6f207bd3 4248 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4249 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4250 ri->regstclass = first;
b3c9acc1 4251 }
07be1b83 4252#ifdef TRIE_STCLASS
786e8c11 4253 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4254 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4255 {
786e8c11 4256 regnode *trie_op;
07be1b83 4257 /* this can happen only on restudy */
786e8c11 4258 if ( OP(first) == TRIE ) {
c944940b 4259 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4260 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4261 StructCopy(first,trieop,struct regnode_1);
4262 trie_op=(regnode *)trieop;
4263 } else {
c944940b 4264 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4265 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4266 StructCopy(first,trieop,struct regnode_charclass);
4267 trie_op=(regnode *)trieop;
4268 }
1de06328 4269 OP(trie_op)+=2;
786e8c11 4270 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4271 ri->regstclass = trie_op;
07be1b83
YO
4272 }
4273#endif
bfed75c6 4274 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4275 ri->regstclass = first;
3dab1dad
YO
4276 else if (PL_regkind[OP(first)] == BOUND ||
4277 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4278 ri->regstclass = first;
3dab1dad 4279 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4280 r->extflags |= (OP(first) == MBOL
4281 ? RXf_ANCH_MBOL
cad2e5aa 4282 : (OP(first) == SBOL
bbe252da
YO
4283 ? RXf_ANCH_SBOL
4284 : RXf_ANCH_BOL));
a0d0e21e 4285 first = NEXTOPER(first);
774d564b 4286 goto again;
4287 }
4288 else if (OP(first) == GPOS) {
bbe252da 4289 r->extflags |= RXf_ANCH_GPOS;
774d564b 4290 first = NEXTOPER(first);
4291 goto again;
a0d0e21e 4292 }
cf2a2b69
YO
4293 else if ((!sawopen || !RExC_sawback) &&
4294 (OP(first) == STAR &&
3dab1dad 4295 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4296 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4297 {
4298 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4299 const int type =
4300 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4301 ? RXf_ANCH_MBOL
4302 : RXf_ANCH_SBOL;
4303 r->extflags |= type;
4304 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4305 first = NEXTOPER(first);
774d564b 4306 goto again;
a0d0e21e 4307 }
b81d288d 4308 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4309 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4310 /* x+ must match at the 1st pos of run of x's */
bbe252da 4311 r->intflags |= PREGf_SKIP;
a0d0e21e 4312
c277df42 4313 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4314#ifdef TRIE_STUDY_OPT
81714fb9 4315 DEBUG_PARSE_r(
be8e71aa
YO
4316 if (!restudied)
4317 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4318 (IV)(first - scan + 1))
4319 );
4320#else
81714fb9 4321 DEBUG_PARSE_r(
be8e71aa
YO
4322 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4323 (IV)(first - scan + 1))
4324 );
4325#endif
4326
4327
a0d0e21e
LW
4328 /*
4329 * If there's something expensive in the r.e., find the
4330 * longest literal string that must appear and make it the
4331 * regmust. Resolve ties in favor of later strings, since
4332 * the regstart check works with the beginning of the r.e.
4333 * and avoiding duplication strengthens checking. Not a
4334 * strong reason, but sufficient in the absence of others.
4335 * [Now we resolve ties in favor of the earlier string if
c277df42 4336 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4337 * earlier string may buy us something the later one won't.]
4338 */
de8c5301 4339
396482e1
GA
4340 data.longest_fixed = newSVpvs("");
4341 data.longest_float = newSVpvs("");
4342 data.last_found = newSVpvs("");
c277df42
IZ
4343 data.longest = &(data.longest_fixed);
4344 first = scan;
f8fc2ecf 4345 if (!ri->regstclass) {
830247a4 4346 cl_init(pRExC_state, &ch_class);
653099ff
GS
4347 data.start_class = &ch_class;
4348 stclass_flag = SCF_DO_STCLASS_AND;
4349 } else /* XXXX Check for BOUND? */
4350 stclass_flag = 0;
cb434fcc 4351 data.last_closep = &last_close;
de8c5301 4352
1de06328 4353 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4354 &data, -1, NULL, NULL,
4355 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4356
07be1b83 4357
786e8c11
YO
4358 CHECK_RESTUDY_GOTO;
4359
4360
830247a4 4361 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4362 && data.last_start_min == 0 && data.last_end > 0
830247a4 4363 && !RExC_seen_zerolen
2bf803e2 4364 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4365 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4366 r->extflags |= RXf_CHECK_ALL;
304ee84b 4367 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4368 SvREFCNT_dec(data.last_found);
4369
1de06328
YO
4370 /* Note that code very similar to this but for anchored string
4371 follows immediately below, changes may need to be made to both.
4372 Be careful.
4373 */
a0ed51b3 4374 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4375 if (longest_float_length
c277df42
IZ
4376 || (data.flags & SF_FL_BEFORE_EOL
4377 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4378 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4379 {
1182767e 4380 I32 t,ml;
cf93c79d 4381
1de06328 4382 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4383 && data.offset_fixed == data.offset_float_min
4384 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4385 goto remove_float; /* As in (a)+. */
4386
1de06328
YO
4387 /* copy the information about the longest float from the reg_scan_data
4388 over to the program. */
33b8afdf
JH
4389 if (SvUTF8(data.longest_float)) {
4390 r->float_utf8 = data.longest_float;
c445ea15 4391 r->float_substr = NULL;
33b8afdf
JH
4392 } else {
4393 r->float_substr = data.longest_float;
c445ea15 4394 r->float_utf8 = NULL;
33b8afdf 4395 }
1de06328
YO
4396 /* float_end_shift is how many chars that must be matched that
4397 follow this item. We calculate it ahead of time as once the
4398 lookbehind offset is added in we lose the ability to correctly
4399 calculate it.*/
4400 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4401 : (I32)longest_float_length;
1de06328
YO
4402 r->float_end_shift = ml - data.offset_float_min
4403 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4404 + data.lookbehind_float;
4405 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4406 r->float_max_offset = data.offset_float_max;
1182767e 4407 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4408 r->float_max_offset -= data.lookbehind_float;
4409
cf93c79d
IZ
4410 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4411 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4412 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4413 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4414 }
4415 else {
aca2d497 4416 remove_float:
c445ea15 4417 r->float_substr = r->float_utf8 = NULL;
c277df42 4418 SvREFCNT_dec(data.longest_float);
c5254dd6 4419 longest_float_length = 0;
a0d0e21e 4420 }
c277df42 4421
1de06328
YO
4422 /* Note that code very similar to this but for floating string
4423 is immediately above, changes may need to be made to both.
4424 Be careful.
4425 */
a0ed51b3 4426 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4427 if (longest_fixed_length
c277df42
IZ
4428 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4429 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4430 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4431 {
1182767e 4432 I32 t,ml;
cf93c79d 4433
1de06328
YO
4434 /* copy the information about the longest fixed
4435 from the reg_scan_data over to the program. */
33b8afdf
JH
4436 if (SvUTF8(data.longest_fixed)) {
4437 r->anchored_utf8 = data.longest_fixed;
c445ea15 4438 r->anchored_substr = NULL;
33b8afdf
JH
4439 } else {
4440 r->anchored_substr = data.longest_fixed;
c445ea15 4441 r->anchored_utf8 = NULL;
33b8afdf 4442 }
1de06328
YO
4443 /* fixed_end_shift is how many chars that must be matched that
4444 follow this item. We calculate it ahead of time as once the
4445 lookbehind offset is added in we lose the ability to correctly
4446 calculate it.*/
4447 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4448 : (I32)longest_fixed_length;
1de06328
YO
4449 r->anchored_end_shift = ml - data.offset_fixed
4450 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4451 + data.lookbehind_fixed;
4452 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4453
cf93c79d
IZ
4454 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4455 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4456 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4457 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4458 }
4459 else {
c445ea15 4460 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4461 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4462 longest_fixed_length = 0;
a0d0e21e 4463 }
f8fc2ecf
YO
4464 if (ri->regstclass
4465 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4466 ri->regstclass = NULL;
33b8afdf
JH
4467 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4468 && stclass_flag
653099ff 4469 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4470 && !cl_is_anything(data.start_class))
4471 {
2eccd3b2 4472 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4473
f8fc2ecf 4474 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4475 struct regnode_charclass_class);
4476 StructCopy(data.start_class,
f8fc2ecf 4477 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4478 struct regnode_charclass_class);
f8fc2ecf 4479 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4480 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4481 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4482 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4483 PerlIO_printf(Perl_debug_log,
a0288114 4484 "synthetic stclass \"%s\".\n",
3f7c398e 4485 SvPVX_const(sv));});
653099ff 4486 }
c277df42
IZ
4487
4488 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4489 if (longest_fixed_length > longest_float_length) {
1de06328 4490 r->check_end_shift = r->anchored_end_shift;
c277df42 4491 r->check_substr = r->anchored_substr;
33b8afdf 4492 r->check_utf8 = r->anchored_utf8;
c277df42 4493 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4494 if (r->extflags & RXf_ANCH_SINGLE)
4495 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4496 }
4497 else {
1de06328 4498 r->check_end_shift = r->float_end_shift;
c277df42 4499 r->check_substr = r->float_substr;
33b8afdf 4500 r->check_utf8 = r->float_utf8;
1de06328
YO
4501 r->check_offset_min = r->float_min_offset;
4502 r->check_offset_max = r->float_max_offset;
a0d0e21e 4503 }
30382c73
IZ
4504 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4505 This should be changed ASAP! */
bbe252da
YO
4506 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4507 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4508 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4509 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4510 }
1de06328
YO
4511 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4512 if ( (STRLEN)minlen < longest_float_length )
4513 minlen= longest_float_length;
4514 if ( (STRLEN)minlen < longest_fixed_length )
4515 minlen= longest_fixed_length;
4516 */
a0ed51b3
LW
4517 }
4518 else {
c277df42
IZ
4519 /* Several toplevels. Best we can is to set minlen. */
4520 I32 fake;
653099ff 4521 struct regnode_charclass_class ch_class;
cb434fcc 4522 I32 last_close = 0;
c277df42 4523
5d458dd8 4524 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4525
f8fc2ecf 4526 scan = ri->program + 1;
830247a4 4527 cl_init(pRExC_state, &ch_class);
653099ff 4528 data.start_class = &ch_class;
cb434fcc 4529 data.last_closep = &last_close;
07be1b83 4530
de8c5301 4531
1de06328 4532 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4533 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4534
786e8c11 4535 CHECK_RESTUDY_GOTO;
07be1b83 4536
33b8afdf 4537 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4538 = r->float_substr = r->float_utf8 = NULL;
653099ff 4539 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4540 && !cl_is_anything(data.start_class))
4541 {
2eccd3b2 4542 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4543
f8fc2ecf 4544 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4545 struct regnode_charclass_class);
4546 StructCopy(data.start_class,
f8fc2ecf 4547 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4548 struct regnode_charclass_class);
f8fc2ecf 4549 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4550 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4551 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4552 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4553 PerlIO_printf(Perl_debug_log,
a0288114 4554 "synthetic stclass \"%s\".\n",
3f7c398e 4555 SvPVX_const(sv));});
653099ff 4556 }
a0d0e21e
LW
4557 }
4558
1de06328
YO
4559 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4560 the "real" pattern. */
cf9788e3
RGS
4561 DEBUG_OPTIMISE_r({
4562 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4563 (IV)minlen, (IV)r->minlen);
cf9788e3 4564 });
de8c5301 4565 r->minlenret = minlen;
1de06328
YO
4566 if (r->minlen < minlen)
4567 r->minlen = minlen;
4568
b81d288d 4569 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4570 r->extflags |= RXf_GPOS_SEEN;
830247a4 4571 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4572 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4573 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4574 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4575 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4576 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4577 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4578 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4579 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4580 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4581 if (RExC_paren_names)
4582 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4583 else
4584 r->paren_names = NULL;
e357fc67
YO
4585 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4586 r->extflags |= RXf_WHITE;
4587 else if (r->prelen == 1 && r->precomp[0] == '^')
4588 r->extflags |= RXf_START_ONLY;
4589
1f1031fe
YO
4590#ifdef DEBUGGING
4591 if (RExC_paren_names) {
4592 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4593 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4594 } else
4595 ri->name_list_idx = 0;
4596#endif
4597
40d049e4
YO
4598 if (RExC_recurse_count) {
4599 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4600 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4601 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4602 }
4603 }
a02a5408
JC
4604 Newxz(r->startp, RExC_npar, I32);
4605 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4606 /* assume we don't need to swap parens around before we match */
4607
be8e71aa
YO
4608 DEBUG_DUMP_r({
4609 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4610 regdump(r);
4611 });
f8fc2ecf
YO
4612 DEBUG_OFFSETS_r(if (ri->offsets) {
4613 const U32 len = ri->offsets[0];
8e9a8a48
YO
4614 U32 i;
4615 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4616 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4617 for (i = 1; i <= len; i++) {
f8fc2ecf 4618 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4619 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4620 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4621 }
4622 PerlIO_printf(Perl_debug_log, "\n");
4623 });
a0d0e21e 4624 return(r);
a687059c
LW
4625}
4626
f9f4320a 4627#undef CORE_ONLY_BLOCK
f9f4320a 4628#undef RE_ENGINE_PTR
3dab1dad 4629
9af228c6 4630#ifndef PERL_IN_XSUB_RE
81714fb9 4631SV*
44a2ac75 4632Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
81714fb9 4633{
44a2ac75
YO
4634 AV *retarray = NULL;
4635 SV *ret;
4636 if (flags & 1)
4637 retarray=newAV();
4638
4639 if (from_re || PL_curpm) {
4640 const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
81714fb9
YO
4641 if (rx && rx->paren_names) {
4642 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4643 if (he_str) {
4644 IV i;
4645 SV* sv_dat=HeVAL(he_str);
4646 I32 *nums=(I32*)SvPVX(sv_dat);
4647 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4648 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4649 rx->endp[nums[i]] != -1)
4650 {
44a2ac75
YO
4651 ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
4652 if (!retarray)
4653 return ret;
4654 } else {
4655 ret = newSVsv(&PL_sv_undef);
4656 }
4657 if (retarray) {
4658 SvREFCNT_inc(ret);
4659 av_push(retarray, ret);
81714fb9
YO
4660 }
4661 }
44a2ac75
YO
4662 if (retarray)
4663 return (SV*)retarray;
81714fb9
YO
4664 }
4665 }
4666 }
44a2ac75
YO
4667 return NULL;
4668}
4669
4670SV*
4671Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
4672{
4673 char *s = NULL;
a9d504c3 4674 I32 i = 0;
44a2ac75
YO
4675 I32 s1, t1;
4676 SV *sv = usesv ? usesv : newSVpvs("");
4677
4678 if (paren == -2 && (s = rx->subbeg) && rx->startp[0] != -1) {
4679 /* $` */
4680 i = rx->startp[0];
4681 }
4682 else
4683 if (paren == -1 && rx->subbeg && rx->endp[0] != -1) {
4684 /* $' */
4685 s = rx->subbeg + rx->endp[0];
4686 i = rx->sublen - rx->endp[0];
4687 }
4688 else
4689 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4690 (s1 = rx->startp[paren]) != -1 &&
4691 (t1 = rx->endp[paren]) != -1)
4692 {
4693 /* $& $1 ... */
4694 i = t1 - s1;
4695 s = rx->subbeg + s1;
4696 }
4697
4698 if (s) {
4699 assert(rx->subbeg);
4700 assert(rx->sublen >= (s - rx->subbeg) + i );
4701
4702 if (i >= 0) {
4703 const int oldtainted = PL_tainted;
4704 TAINT_NOT;
4705 sv_setpvn(sv, s, i);
4706 PL_tainted = oldtainted;
4707 if ( (rx->extflags & RXf_CANY_SEEN)
4708 ? (RX_MATCH_UTF8(rx)
4709 && (!i || is_utf8_string((U8*)s, i)))
4710 : (RX_MATCH_UTF8(rx)) )
4711 {
4712 SvUTF8_on(sv);
4713 }
4714 else
4715 SvUTF8_off(sv);
4716 if (PL_tainting) {
4717 if (RX_MATCH_TAINTED(rx)) {
4718 if (SvTYPE(sv) >= SVt_PVMG) {
4719 MAGIC* const mg = SvMAGIC(sv);
4720 MAGIC* mgt;
4721 PL_tainted = 1;
4722 SvMAGIC_set(sv, mg->mg_moremagic);
4723 SvTAINT(sv);
4724 if ((mgt = SvMAGIC(sv))) {
4725 mg->mg_moremagic = mgt;
4726 SvMAGIC_set(sv, mg);
4727 }
4728 } else {
4729 PL_tainted = 1;
4730 SvTAINT(sv);
4731 }
4732 } else
4733 SvTAINTED_off(sv);
4734 }
4735 } else {
4736 sv_setsv(sv,&PL_sv_undef);
4737 }
81714fb9 4738 } else {
44a2ac75 4739 sv_setsv(sv,&PL_sv_undef);
81714fb9 4740 }
44a2ac75 4741 return sv;
81714fb9 4742}
9af228c6 4743#endif
0a4db386 4744
894be9b7 4745/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4746 * If flags is REG_RSN_RETURN_NULL returns null.
4747 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4748 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4749 * to the parsed name as looked up in the RExC_paren_names hash.
4750 * If there is an error throws a vFAIL().. type exception.
894be9b7 4751 */
0a4db386
YO
4752
4753#define REG_RSN_RETURN_NULL 0
4754#define REG_RSN_RETURN_NAME 1
4755#define REG_RSN_RETURN_DATA 2
4756
894be9b7
YO
4757STATIC SV*
4758S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4759 char *name_start = RExC_parse;
1f1031fe
YO
4760
4761 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4762 /* skip IDFIRST by using do...while */
4763 if (UTF)
4764 do {
4765 RExC_parse += UTF8SKIP(RExC_parse);
4766 } while (isALNUM_utf8((U8*)RExC_parse));
4767 else
4768 do {
4769 RExC_parse++;
4770 } while (isALNUM(*RExC_parse));
894be9b7 4771 }
1f1031fe 4772
0a4db386
YO
4773 if ( flags ) {
4774 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4775 (int)(RExC_parse - name_start)));
894be9b7 4776 if (UTF)
0a4db386
YO
4777 SvUTF8_on(sv_name);
4778 if ( flags == REG_RSN_RETURN_NAME)
4779 return sv_name;
4780 else if (flags==REG_RSN_RETURN_DATA) {
4781 HE *he_str = NULL;
4782 SV *sv_dat = NULL;
4783 if ( ! sv_name ) /* should not happen*/
4784 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4785 if (RExC_paren_names)
4786 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4787 if ( he_str )
4788 sv_dat = HeVAL(he_str);
4789 if ( ! sv_dat )
4790 vFAIL("Reference to nonexistent named group");
4791 return sv_dat;
4792 }
4793 else {
4794 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4795 }
4796 /* NOT REACHED */
894be9b7 4797 }
0a4db386 4798 return NULL;
894be9b7
YO
4799}
4800
3dab1dad
YO
4801#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4802 int rem=(int)(RExC_end - RExC_parse); \
4803 int cut; \
4804 int num; \
4805 int iscut=0; \
4806 if (rem>10) { \
4807 rem=10; \
4808 iscut=1; \
4809 } \
4810 cut=10-rem; \
4811 if (RExC_lastparse!=RExC_parse) \
4812 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4813 rem, RExC_parse, \
4814 cut + 4, \
4815 iscut ? "..." : "<" \
4816 ); \
4817 else \
4818 PerlIO_printf(Perl_debug_log,"%16s",""); \
4819 \
4820 if (SIZE_ONLY) \
4821 num=RExC_size; \
4822 else \
4823 num=REG_NODE_NUM(RExC_emit); \
4824 if (RExC_lastnum!=num) \
0a4db386 4825 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4826 else \
0a4db386 4827 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4828 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4829 (int)((depth*2)), "", \
3dab1dad
YO
4830 (funcname) \
4831 ); \
4832 RExC_lastnum=num; \
4833 RExC_lastparse=RExC_parse; \
4834})
4835
07be1b83
YO
4836
4837
3dab1dad
YO
4838#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4839 DEBUG_PARSE_MSG((funcname)); \
4840 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4841})
6bda09f9
YO
4842#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4843 DEBUG_PARSE_MSG((funcname)); \
4844 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4845})
a687059c
LW
4846/*
4847 - reg - regular expression, i.e. main body or parenthesized thing
4848 *
4849 * Caller must absorb opening parenthesis.
4850 *
4851 * Combining parenthesis handling with the base level of regular expression
4852 * is a trifle forced, but the need to tie the tails of the branches to what
4853 * follows makes it hard to avoid.
4854 */
07be1b83
YO
4855#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4856#ifdef DEBUGGING
4857#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4858#else
4859#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4860#endif
3dab1dad 4861
e2e6a0f1
YO
4862/* this idea is borrowed from STR_WITH_LEN in handy.h */
4863#define CHECK_WORD(s,v,l) \
4864 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4865
76e3520e 4866STATIC regnode *
3dab1dad 4867S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4868 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4869{
27da23d5 4870 dVAR;
c277df42
IZ
4871 register regnode *ret; /* Will be the head of the group. */
4872 register regnode *br;
4873 register regnode *lastbr;
cbbf8932 4874 register regnode *ender = NULL;
a0d0e21e 4875 register I32 parno = 0;
cbbf8932
AL
4876 I32 flags;
4877 const I32 oregflags = RExC_flags;
6136c704
AL
4878 bool have_branch = 0;
4879 bool is_open = 0;
9d1d55b5
JP
4880
4881 /* for (?g), (?gc), and (?o) warnings; warning
4882 about (?c) will warn about (?g) -- japhy */
4883
6136c704
AL
4884#define WASTED_O 0x01
4885#define WASTED_G 0x02
4886#define WASTED_C 0x04
4887#define WASTED_GC (0x02|0x04)
cbbf8932 4888 I32 wastedflags = 0x00;
9d1d55b5 4889
fac92740 4890 char * parse_start = RExC_parse; /* MJD */
a28509cc 4891 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4892
3dab1dad
YO
4893 GET_RE_DEBUG_FLAGS_DECL;
4894 DEBUG_PARSE("reg ");
4895
4896
821b33a5 4897 *flagp = 0; /* Tentatively. */
a0d0e21e 4898
9d1d55b5 4899
a0d0e21e
LW
4900 /* Make an OPEN node, if parenthesized. */
4901 if (paren) {
e2e6a0f1
YO
4902 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4903 char *start_verb = RExC_parse;
4904 STRLEN verb_len = 0;
4905 char *start_arg = NULL;
4906 unsigned char op = 0;
4907 int argok = 1;
4908 int internal_argval = 0; /* internal_argval is only useful if !argok */
4909 while ( *RExC_parse && *RExC_parse != ')' ) {
4910 if ( *RExC_parse == ':' ) {
4911 start_arg = RExC_parse + 1;
4912 break;
4913 }
4914 RExC_parse++;
4915 }
4916 ++start_verb;
4917 verb_len = RExC_parse - start_verb;
4918 if ( start_arg ) {
4919 RExC_parse++;
4920 while ( *RExC_parse && *RExC_parse != ')' )
4921 RExC_parse++;
4922 if ( *RExC_parse != ')' )
4923 vFAIL("Unterminated verb pattern argument");
4924 if ( RExC_parse == start_arg )
4925 start_arg = NULL;
4926 } else {
4927 if ( *RExC_parse != ')' )
4928 vFAIL("Unterminated verb pattern");
4929 }
5d458dd8 4930
e2e6a0f1
YO
4931 switch ( *start_verb ) {
4932 case 'A': /* (*ACCEPT) */
4933 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4934 op = ACCEPT;
4935 internal_argval = RExC_nestroot;
4936 }
4937 break;
4938 case 'C': /* (*COMMIT) */
4939 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4940 op = COMMIT;
e2e6a0f1
YO
4941 break;
4942 case 'F': /* (*FAIL) */
4943 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4944 op = OPFAIL;
4945 argok = 0;
4946 }
4947 break;
5d458dd8
YO
4948 case ':': /* (*:NAME) */
4949 case 'M': /* (*MARK:NAME) */
4950 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4951 op = MARKPOINT;
5d458dd8
YO
4952 argok = -1;
4953 }
4954 break;
4955 case 'P': /* (*PRUNE) */
4956 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4957 op = PRUNE;
e2e6a0f1 4958 break;
5d458dd8
YO
4959 case 'S': /* (*SKIP) */
4960 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4961 op = SKIP;
4962 break;
4963 case 'T': /* (*THEN) */
4964 /* [19:06] <TimToady> :: is then */
4965 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4966 op = CUTGROUP;
4967 RExC_seen |= REG_SEEN_CUTGROUP;
4968 }
e2e6a0f1
YO
4969 break;
4970 }
4971 if ( ! op ) {
4972 RExC_parse++;
4973 vFAIL3("Unknown verb pattern '%.*s'",
4974 verb_len, start_verb);
4975 }
4976 if ( argok ) {
4977 if ( start_arg && internal_argval ) {
4978 vFAIL3("Verb pattern '%.*s' may not have an argument",
4979 verb_len, start_verb);
4980 } else if ( argok < 0 && !start_arg ) {
4981 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4982 verb_len, start_verb);
4983 } else {
4984 ret = reganode(pRExC_state, op, internal_argval);
4985 if ( ! internal_argval && ! SIZE_ONLY ) {
4986 if (start_arg) {
4987 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4988 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4989 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4990 ret->flags = 0;
4991 } else {
4992 ret->flags = 1;
4993 }
4994 }
4995 }
4996 if (!internal_argval)
4997 RExC_seen |= REG_SEEN_VERBARG;
4998 } else if ( start_arg ) {
4999 vFAIL3("Verb pattern '%.*s' may not have an argument",
5000 verb_len, start_verb);
5001 } else {
5002 ret = reg_node(pRExC_state, op);
5003 }
5004 nextchar(pRExC_state);
5005 return ret;
5006 } else
fac92740 5007 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
5008 U32 posflags = 0, negflags = 0;
5009 U32 *flagsp = &posflags;
6136c704 5010 bool is_logical = 0;
a28509cc 5011 const char * const seqstart = RExC_parse;
ca9dfc88 5012
830247a4
IZ
5013 RExC_parse++;
5014 paren = *RExC_parse++;
c277df42 5015 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5016 switch (paren) {
894be9b7 5017
1f1031fe
YO
5018 case 'P': /* (?P...) variants for those used to PCRE/Python */
5019 paren = *RExC_parse++;
5020 if ( paren == '<') /* (?P<...>) named capture */
5021 goto named_capture;
5022 else if (paren == '>') { /* (?P>name) named recursion */
5023 goto named_recursion;
5024 }
5025 else if (paren == '=') { /* (?P=...) named backref */
5026 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5027 you change this make sure you change that */
5028 char* name_start = RExC_parse;
5029 U32 num = 0;
5030 SV *sv_dat = reg_scan_name(pRExC_state,
5031 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5032 if (RExC_parse == name_start || *RExC_parse != ')')
5033 vFAIL2("Sequence %.3s... not terminated",parse_start);
5034
5035 if (!SIZE_ONLY) {
5036 num = add_data( pRExC_state, 1, "S" );
5037 RExC_rxi->data->data[num]=(void*)sv_dat;
5038 SvREFCNT_inc(sv_dat);
5039 }
5040 RExC_sawback = 1;
5041 ret = reganode(pRExC_state,
5042 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5043 num);
5044 *flagp |= HASWIDTH;
5045
5046 Set_Node_Offset(ret, parse_start+1);
5047 Set_Node_Cur_Length(ret); /* MJD */
5048
5049 nextchar(pRExC_state);
5050 return ret;
5051 }
5052 goto unknown;
fac92740 5053 case '<': /* (?<...) */
b81d288d 5054 if (*RExC_parse == '!')
c277df42 5055 paren = ',';
0a4db386 5056 else if (*RExC_parse != '=')
1f1031fe 5057 named_capture:
0a4db386 5058 { /* (?<...>) */
81714fb9 5059 char *name_start;
894be9b7 5060 SV *svname;
81714fb9
YO
5061 paren= '>';
5062 case '\'': /* (?'...') */
5063 name_start= RExC_parse;
0a4db386
YO
5064 svname = reg_scan_name(pRExC_state,
5065 SIZE_ONLY ? /* reverse test from the others */
5066 REG_RSN_RETURN_NAME :
5067 REG_RSN_RETURN_NULL);
81714fb9
YO
5068 if (RExC_parse == name_start)
5069 goto unknown;
5070 if (*RExC_parse != paren)
5071 vFAIL2("Sequence (?%c... not terminated",
5072 paren=='>' ? '<' : paren);
5073 if (SIZE_ONLY) {
e62cc96a
YO
5074 HE *he_str;
5075 SV *sv_dat = NULL;
894be9b7
YO
5076 if (!svname) /* shouldnt happen */
5077 Perl_croak(aTHX_
5078 "panic: reg_scan_name returned NULL");
81714fb9
YO
5079 if (!RExC_paren_names) {
5080 RExC_paren_names= newHV();
5081 sv_2mortal((SV*)RExC_paren_names);
1f1031fe
YO
5082#ifdef DEBUGGING
5083 RExC_paren_name_list= newAV();
5084 sv_2mortal((SV*)RExC_paren_name_list);
5085#endif
81714fb9
YO
5086 }
5087 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5088 if ( he_str )
81714fb9 5089 sv_dat = HeVAL(he_str);
e62cc96a 5090 if ( ! sv_dat ) {
81714fb9 5091 /* croak baby croak */
e62cc96a
YO
5092 Perl_croak(aTHX_
5093 "panic: paren_name hash element allocation failed");
5094 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
5095 IV count=SvIV(sv_dat);
5096 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
5097 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
5098 pv[count]=RExC_npar;
5099 SvIVX(sv_dat)++;
5100 } else {
5101 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5102 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5103 SvIOK_on(sv_dat);
5104 SvIVX(sv_dat)= 1;
e62cc96a 5105 }
1f1031fe
YO
5106#ifdef DEBUGGING
5107 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5108 SvREFCNT_dec(svname);
5109#endif
e62cc96a 5110
81714fb9
YO
5111 /*sv_dump(sv_dat);*/
5112 }
5113 nextchar(pRExC_state);
5114 paren = 1;
5115 goto capturing_parens;
5116 }
5117 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5118 RExC_parse++;
fac92740
MJD
5119 case '=': /* (?=...) */
5120 case '!': /* (?!...) */
830247a4 5121 RExC_seen_zerolen++;
e2e6a0f1
YO
5122 if (*RExC_parse == ')') {
5123 ret=reg_node(pRExC_state, OPFAIL);
5124 nextchar(pRExC_state);
5125 return ret;
5126 }
fac92740
MJD
5127 case ':': /* (?:...) */
5128 case '>': /* (?>...) */
a0d0e21e 5129 break;
fac92740
MJD
5130 case '$': /* (?$...) */
5131 case '@': /* (?@...) */
8615cb43 5132 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5133 break;
fac92740 5134 case '#': /* (?#...) */
830247a4
IZ
5135 while (*RExC_parse && *RExC_parse != ')')
5136 RExC_parse++;
5137 if (*RExC_parse != ')')
c277df42 5138 FAIL("Sequence (?#... not terminated");
830247a4 5139 nextchar(pRExC_state);
a0d0e21e
LW
5140 *flagp = TRYAGAIN;
5141 return NULL;
894be9b7
YO
5142 case '0' : /* (?0) */
5143 case 'R' : /* (?R) */
5144 if (*RExC_parse != ')')
6bda09f9 5145 FAIL("Sequence (?R) not terminated");
1a147d38 5146 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
5147 nextchar(pRExC_state);
5148 return ret;
5149 /*notreached*/
894be9b7
YO
5150 { /* named and numeric backreferences */
5151 I32 num;
894be9b7
YO
5152 case '&': /* (?&NAME) */
5153 parse_start = RExC_parse - 1;
1f1031fe 5154 named_recursion:
894be9b7 5155 {
0a4db386
YO
5156 SV *sv_dat = reg_scan_name(pRExC_state,
5157 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5158 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5159 }
5160 goto gen_recurse_regop;
5161 /* NOT REACHED */
542fa716
YO
5162 case '+':
5163 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5164 RExC_parse++;
5165 vFAIL("Illegal pattern");
5166 }
5167 goto parse_recursion;
5168 /* NOT REACHED*/
5169 case '-': /* (?-1) */
5170 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5171 RExC_parse--; /* rewind to let it be handled later */
5172 goto parse_flags;
5173 }
5174 /*FALLTHROUGH */
6bda09f9
YO
5175 case '1': case '2': case '3': case '4': /* (?1) */
5176 case '5': case '6': case '7': case '8': case '9':
5177 RExC_parse--;
542fa716 5178 parse_recursion:
894be9b7
YO
5179 num = atoi(RExC_parse);
5180 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5181 if (*RExC_parse == '-')
5182 RExC_parse++;
6bda09f9
YO
5183 while (isDIGIT(*RExC_parse))
5184 RExC_parse++;
5185 if (*RExC_parse!=')')
5186 vFAIL("Expecting close bracket");
894be9b7
YO
5187
5188 gen_recurse_regop:
542fa716
YO
5189 if ( paren == '-' ) {
5190 /*
5191 Diagram of capture buffer numbering.
5192 Top line is the normal capture buffer numbers
5193 Botton line is the negative indexing as from
5194 the X (the (?-2))
5195
5196 + 1 2 3 4 5 X 6 7
5197 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5198 - 5 4 3 2 1 X x x
5199
5200 */
5201 num = RExC_npar + num;
5202 if (num < 1) {
5203 RExC_parse++;
5204 vFAIL("Reference to nonexistent group");
5205 }
5206 } else if ( paren == '+' ) {
5207 num = RExC_npar + num - 1;
5208 }
5209
1a147d38 5210 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5211 if (!SIZE_ONLY) {
5212 if (num > (I32)RExC_rx->nparens) {
5213 RExC_parse++;
5214 vFAIL("Reference to nonexistent group");
5215 }
40d049e4 5216 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5217 RExC_emit++;
226de585 5218 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5219 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5220 } else {
6bda09f9 5221 RExC_size++;
6bda09f9 5222 }
0a4db386 5223 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5224 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5225 Set_Node_Offset(ret, parse_start); /* MJD */
5226
6bda09f9
YO
5227 nextchar(pRExC_state);
5228 return ret;
894be9b7
YO
5229 } /* named and numeric backreferences */
5230 /* NOT REACHED */
5231
fac92740 5232 case 'p': /* (?p...) */
9014280d 5233 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5234 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5235 /* FALL THROUGH*/
fac92740 5236 case '?': /* (??...) */
6136c704 5237 is_logical = 1;
438a3801
YST
5238 if (*RExC_parse != '{')
5239 goto unknown;
830247a4 5240 paren = *RExC_parse++;
0f5d15d6 5241 /* FALL THROUGH */
fac92740 5242 case '{': /* (?{...}) */
c277df42 5243 {
2eccd3b2
NC
5244 I32 count = 1;
5245 U32 n = 0;
c277df42 5246 char c;
830247a4 5247 char *s = RExC_parse;
c277df42 5248
830247a4
IZ
5249 RExC_seen_zerolen++;
5250 RExC_seen |= REG_SEEN_EVAL;
5251 while (count && (c = *RExC_parse)) {
6136c704
AL
5252 if (c == '\\') {
5253 if (RExC_parse[1])
5254 RExC_parse++;
5255 }
b81d288d 5256 else if (c == '{')
c277df42 5257 count++;
b81d288d 5258 else if (c == '}')
c277df42 5259 count--;
830247a4 5260 RExC_parse++;
c277df42 5261 }
6136c704 5262 if (*RExC_parse != ')') {
b81d288d 5263 RExC_parse = s;
b45f050a
JF
5264 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5265 }
c277df42 5266 if (!SIZE_ONLY) {
f3548bdc 5267 PAD *pad;
6136c704
AL
5268 OP_4tree *sop, *rop;
5269 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5270
569233ed
SB
5271 ENTER;
5272 Perl_save_re_context(aTHX);
f3548bdc 5273 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5274 sop->op_private |= OPpREFCOUNTED;
5275 /* re_dup will OpREFCNT_inc */
5276 OpREFCNT_set(sop, 1);
569233ed 5277 LEAVE;
c277df42 5278
830247a4 5279 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5280 RExC_rxi->data->data[n] = (void*)rop;
5281 RExC_rxi->data->data[n+1] = (void*)sop;
5282 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5283 SvREFCNT_dec(sv);
a0ed51b3 5284 }
e24b16f9 5285 else { /* First pass */
830247a4 5286 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5287 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5288 /* No compiled RE interpolated, has runtime
5289 components ===> unsafe. */
5290 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5291 if (PL_tainting && PL_tainted)
cc6b7395 5292 FAIL("Eval-group in insecure regular expression");
54df2634 5293#if PERL_VERSION > 8
923e4eb5 5294 if (IN_PERL_COMPILETIME)
b5c19bd7 5295 PL_cv_has_eval = 1;
54df2634 5296#endif
c277df42 5297 }
b5c19bd7 5298
830247a4 5299 nextchar(pRExC_state);
6136c704 5300 if (is_logical) {
830247a4 5301 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5302 if (!SIZE_ONLY)
5303 ret->flags = 2;
3dab1dad 5304 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5305 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5306 return ret;
5307 }
ccb2c380
MP
5308 ret = reganode(pRExC_state, EVAL, n);
5309 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5310 Set_Node_Offset(ret, parse_start);
5311 return ret;
c277df42 5312 }
fac92740 5313 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5314 {
0a4db386 5315 int is_define= 0;
fac92740 5316 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5317 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5318 || RExC_parse[1] == '<'
830247a4 5319 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5320 I32 flag;
5321
830247a4 5322 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5323 if (!SIZE_ONLY)
5324 ret->flags = 1;
3dab1dad 5325 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5326 goto insert_if;
b81d288d 5327 }
a0ed51b3 5328 }
0a4db386
YO
5329 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5330 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5331 {
5332 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5333 char *name_start= RExC_parse++;
2eccd3b2 5334 U32 num = 0;
0a4db386
YO
5335 SV *sv_dat=reg_scan_name(pRExC_state,
5336 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5337 if (RExC_parse == name_start || *RExC_parse != ch)
5338 vFAIL2("Sequence (?(%c... not terminated",
5339 (ch == '>' ? '<' : ch));
5340 RExC_parse++;
5341 if (!SIZE_ONLY) {
5342 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5343 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5344 SvREFCNT_inc(sv_dat);
5345 }
5346 ret = reganode(pRExC_state,NGROUPP,num);
5347 goto insert_if_check_paren;
5348 }
5349 else if (RExC_parse[0] == 'D' &&
5350 RExC_parse[1] == 'E' &&
5351 RExC_parse[2] == 'F' &&
5352 RExC_parse[3] == 'I' &&
5353 RExC_parse[4] == 'N' &&
5354 RExC_parse[5] == 'E')
5355 {
5356 ret = reganode(pRExC_state,DEFINEP,0);
5357 RExC_parse +=6 ;
5358 is_define = 1;
5359 goto insert_if_check_paren;
5360 }
5361 else if (RExC_parse[0] == 'R') {
5362 RExC_parse++;
5363 parno = 0;
5364 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5365 parno = atoi(RExC_parse++);
5366 while (isDIGIT(*RExC_parse))
5367 RExC_parse++;
5368 } else if (RExC_parse[0] == '&') {
5369 SV *sv_dat;
5370 RExC_parse++;
5371 sv_dat = reg_scan_name(pRExC_state,
5372 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5373 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5374 }
1a147d38 5375 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5376 goto insert_if_check_paren;
5377 }
830247a4 5378 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5379 /* (?(1)...) */
6136c704 5380 char c;
830247a4 5381 parno = atoi(RExC_parse++);
c277df42 5382
830247a4
IZ
5383 while (isDIGIT(*RExC_parse))
5384 RExC_parse++;
fac92740 5385 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5386
0a4db386 5387 insert_if_check_paren:
830247a4 5388 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5389 vFAIL("Switch condition not recognized");
c277df42 5390 insert_if:
3dab1dad
YO
5391 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5392 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5393 if (br == NULL)
830247a4 5394 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5395 else
3dab1dad 5396 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5397 c = *nextchar(pRExC_state);
d1b80229
IZ
5398 if (flags&HASWIDTH)
5399 *flagp |= HASWIDTH;
c277df42 5400 if (c == '|') {
0a4db386
YO
5401 if (is_define)
5402 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5403 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5404 regbranch(pRExC_state, &flags, 1,depth+1);
5405 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5406 if (flags&HASWIDTH)
5407 *flagp |= HASWIDTH;
830247a4 5408 c = *nextchar(pRExC_state);
a0ed51b3
LW
5409 }
5410 else
c277df42
IZ
5411 lastbr = NULL;
5412 if (c != ')')
8615cb43 5413 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5414 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5415 REGTAIL(pRExC_state, br, ender);
c277df42 5416 if (lastbr) {
3dab1dad
YO
5417 REGTAIL(pRExC_state, lastbr, ender);
5418 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5419 }
5420 else
3dab1dad 5421 REGTAIL(pRExC_state, ret, ender);
c277df42 5422 return ret;
a0ed51b3
LW
5423 }
5424 else {
830247a4 5425 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5426 }
5427 }
1b1626e4 5428 case 0:
830247a4 5429 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5430 vFAIL("Sequence (? incomplete");
1b1626e4 5431 break;
a0d0e21e 5432 default:
830247a4 5433 --RExC_parse;
fac92740 5434 parse_flags: /* (?i) */
830247a4 5435 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5436 /* (?g), (?gc) and (?o) are useless here
5437 and must be globally applied -- japhy */
5438
5439 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5440 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5441 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5442 if (! (wastedflags & wflagbit) ) {
5443 wastedflags |= wflagbit;
5444 vWARN5(
5445 RExC_parse + 1,
5446 "Useless (%s%c) - %suse /%c modifier",
5447 flagsp == &negflags ? "?-" : "?",
5448 *RExC_parse,
5449 flagsp == &negflags ? "don't " : "",
5450 *RExC_parse
5451 );
5452 }
5453 }
5454 }
5455 else if (*RExC_parse == 'c') {
5456 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5457 if (! (wastedflags & WASTED_C) ) {
5458 wastedflags |= WASTED_GC;
9d1d55b5
JP
5459 vWARN3(
5460 RExC_parse + 1,
5461 "Useless (%sc) - %suse /gc modifier",
5462 flagsp == &negflags ? "?-" : "?",
5463 flagsp == &negflags ? "don't " : ""
5464 );
5465 }
5466 }
5467 }
5468 else { pmflag(flagsp, *RExC_parse); }
5469
830247a4 5470 ++RExC_parse;
ca9dfc88 5471 }
830247a4 5472 if (*RExC_parse == '-') {
ca9dfc88 5473 flagsp = &negflags;
9d1d55b5 5474 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5475 ++RExC_parse;
ca9dfc88 5476 goto parse_flags;
48c036b1 5477 }
e2509266
JH
5478 RExC_flags |= posflags;
5479 RExC_flags &= ~negflags;
830247a4
IZ
5480 if (*RExC_parse == ':') {
5481 RExC_parse++;
ca9dfc88
IZ
5482 paren = ':';
5483 break;
5484 }
c277df42 5485 unknown:
830247a4
IZ
5486 if (*RExC_parse != ')') {
5487 RExC_parse++;
5488 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5489 }
830247a4 5490 nextchar(pRExC_state);
a0d0e21e
LW
5491 *flagp = TRYAGAIN;
5492 return NULL;
5493 }
5494 }
fac92740 5495 else { /* (...) */
81714fb9 5496 capturing_parens:
830247a4
IZ
5497 parno = RExC_npar;
5498 RExC_npar++;
e2e6a0f1 5499
830247a4 5500 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5501 if (!SIZE_ONLY ){
5502 if (!RExC_nestroot)
5503 RExC_nestroot = parno;
5504 if (RExC_seen & REG_SEEN_RECURSE) {
5505 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5506 "Setting open paren #%"IVdf" to %d\n",
5507 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5508 RExC_open_parens[parno-1]= ret;
5509 }
6bda09f9 5510 }
fac92740
MJD
5511 Set_Node_Length(ret, 1); /* MJD */
5512 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5513 is_open = 1;
a0d0e21e 5514 }
a0ed51b3 5515 }
fac92740 5516 else /* ! paren */
a0d0e21e
LW
5517 ret = NULL;
5518
5519 /* Pick up the branches, linking them together. */
fac92740 5520 parse_start = RExC_parse; /* MJD */
3dab1dad 5521 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5522 /* branch_len = (paren != 0); */
2af232bd 5523
a0d0e21e
LW
5524 if (br == NULL)
5525 return(NULL);
830247a4
IZ
5526 if (*RExC_parse == '|') {
5527 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5528 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5529 }
fac92740 5530 else { /* MJD */
6bda09f9 5531 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5532 Set_Node_Length(br, paren != 0);
5533 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5534 }
c277df42
IZ
5535 have_branch = 1;
5536 if (SIZE_ONLY)
830247a4 5537 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5538 }
5539 else if (paren == ':') {
c277df42
IZ
5540 *flagp |= flags&SIMPLE;
5541 }
6136c704 5542 if (is_open) { /* Starts with OPEN. */
3dab1dad 5543 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5544 }
5545 else if (paren != '?') /* Not Conditional */
a0d0e21e 5546 ret = br;
32a0ca98 5547 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5548 lastbr = br;
830247a4
IZ
5549 while (*RExC_parse == '|') {
5550 if (!SIZE_ONLY && RExC_extralen) {
5551 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5552 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5553 }
5554 if (SIZE_ONLY)
830247a4
IZ
5555 RExC_extralen += 2; /* Account for LONGJMP. */
5556 nextchar(pRExC_state);
3dab1dad 5557 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5558
a687059c 5559 if (br == NULL)
a0d0e21e 5560 return(NULL);
3dab1dad 5561 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5562 lastbr = br;
821b33a5
IZ
5563 if (flags&HASWIDTH)
5564 *flagp |= HASWIDTH;
a687059c 5565 *flagp |= flags&SPSTART;
a0d0e21e
LW
5566 }
5567
c277df42
IZ
5568 if (have_branch || paren != ':') {
5569 /* Make a closing node, and hook it on the end. */
5570 switch (paren) {
5571 case ':':
830247a4 5572 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5573 break;
5574 case 1:
830247a4 5575 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5576 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5577 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5578 "Setting close paren #%"IVdf" to %d\n",
5579 (IV)parno, REG_NODE_NUM(ender)));
5580 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5581 if (RExC_nestroot == parno)
5582 RExC_nestroot = 0;
40d049e4 5583 }
fac92740
MJD
5584 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5585 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5586 break;
5587 case '<':
c277df42
IZ
5588 case ',':
5589 case '=':
5590 case '!':
c277df42 5591 *flagp &= ~HASWIDTH;
821b33a5
IZ
5592 /* FALL THROUGH */
5593 case '>':
830247a4 5594 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5595 break;
5596 case 0:
830247a4 5597 ender = reg_node(pRExC_state, END);
40d049e4
YO
5598 if (!SIZE_ONLY) {
5599 assert(!RExC_opend); /* there can only be one! */
5600 RExC_opend = ender;
5601 }
c277df42
IZ
5602 break;
5603 }
eaf3ca90 5604 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5605
9674d46a 5606 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5607 if (depth==1)
5608 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5609
c277df42 5610 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5611 for (br = ret; br; br = regnext(br)) {
5612 const U8 op = PL_regkind[OP(br)];
5613 if (op == BRANCH) {
07be1b83 5614 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5615 }
5616 else if (op == BRANCHJ) {
07be1b83 5617 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5618 }
c277df42
IZ
5619 }
5620 }
a0d0e21e 5621 }
c277df42
IZ
5622
5623 {
e1ec3a88
AL
5624 const char *p;
5625 static const char parens[] = "=!<,>";
c277df42
IZ
5626
5627 if (paren && (p = strchr(parens, paren))) {
eb160463 5628 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5629 int flag = (p - parens) > 1;
5630
5631 if (paren == '>')
5632 node = SUSPEND, flag = 0;
6bda09f9 5633 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5634 Set_Node_Cur_Length(ret);
5635 Set_Node_Offset(ret, parse_start + 1);
c277df42 5636 ret->flags = flag;
07be1b83 5637 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5638 }
a0d0e21e
LW
5639 }
5640
5641 /* Check for proper termination. */
ce3e6498 5642 if (paren) {
e2509266 5643 RExC_flags = oregflags;
830247a4
IZ
5644 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5645 RExC_parse = oregcomp_parse;
380a0633 5646 vFAIL("Unmatched (");
ce3e6498 5647 }
a0ed51b3 5648 }
830247a4
IZ
5649 else if (!paren && RExC_parse < RExC_end) {
5650 if (*RExC_parse == ')') {
5651 RExC_parse++;
380a0633 5652 vFAIL("Unmatched )");
a0ed51b3
LW
5653 }
5654 else
b45f050a 5655 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5656 /* NOTREACHED */
5657 }
a687059c 5658
a0d0e21e 5659 return(ret);
a687059c
LW
5660}
5661
5662/*
5663 - regbranch - one alternative of an | operator
5664 *
5665 * Implements the concatenation operator.
5666 */
76e3520e 5667STATIC regnode *
3dab1dad 5668S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5669{
97aff369 5670 dVAR;
c277df42
IZ
5671 register regnode *ret;
5672 register regnode *chain = NULL;
5673 register regnode *latest;
5674 I32 flags = 0, c = 0;
3dab1dad
YO
5675 GET_RE_DEBUG_FLAGS_DECL;
5676 DEBUG_PARSE("brnc");
b81d288d 5677 if (first)
c277df42
IZ
5678 ret = NULL;
5679 else {
b81d288d 5680 if (!SIZE_ONLY && RExC_extralen)
830247a4 5681 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5682 else {
830247a4 5683 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5684 Set_Node_Length(ret, 1);
5685 }
c277df42
IZ
5686 }
5687
b81d288d 5688 if (!first && SIZE_ONLY)
830247a4 5689 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5690
c277df42 5691 *flagp = WORST; /* Tentatively. */
a0d0e21e 5692
830247a4
IZ
5693 RExC_parse--;
5694 nextchar(pRExC_state);
5695 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5696 flags &= ~TRYAGAIN;
3dab1dad 5697 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5698 if (latest == NULL) {
5699 if (flags & TRYAGAIN)
5700 continue;
5701 return(NULL);
a0ed51b3
LW
5702 }
5703 else if (ret == NULL)
c277df42 5704 ret = latest;
a0d0e21e 5705 *flagp |= flags&HASWIDTH;
c277df42 5706 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5707 *flagp |= flags&SPSTART;
5708 else {
830247a4 5709 RExC_naughty++;
3dab1dad 5710 REGTAIL(pRExC_state, chain, latest);
a687059c 5711 }
a0d0e21e 5712 chain = latest;
c277df42
IZ
5713 c++;
5714 }
5715 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5716 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5717 if (ret == NULL)
5718 ret = chain;
5719 }
5720 if (c == 1) {
5721 *flagp |= flags&SIMPLE;
a0d0e21e 5722 }
a687059c 5723
d4c19fe8 5724 return ret;
a687059c
LW
5725}
5726
5727/*
5728 - regpiece - something followed by possible [*+?]
5729 *
5730 * Note that the branching code sequences used for ? and the general cases
5731 * of * and + are somewhat optimized: they use the same NOTHING node as
5732 * both the endmarker for their branch list and the body of the last branch.
5733 * It might seem that this node could be dispensed with entirely, but the
5734 * endmarker role is not redundant.
5735 */
76e3520e 5736STATIC regnode *
3dab1dad 5737S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5738{
97aff369 5739 dVAR;
c277df42 5740 register regnode *ret;
a0d0e21e
LW
5741 register char op;
5742 register char *next;
5743 I32 flags;
1df70142 5744 const char * const origparse = RExC_parse;
a0d0e21e 5745 I32 min;
c277df42 5746 I32 max = REG_INFTY;
fac92740 5747 char *parse_start;
10edeb5d 5748 const char *maxpos = NULL;
3dab1dad
YO
5749 GET_RE_DEBUG_FLAGS_DECL;
5750 DEBUG_PARSE("piec");
a0d0e21e 5751
3dab1dad 5752 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5753 if (ret == NULL) {
5754 if (flags & TRYAGAIN)
5755 *flagp |= TRYAGAIN;
5756 return(NULL);
5757 }
5758
830247a4 5759 op = *RExC_parse;
a0d0e21e 5760
830247a4 5761 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5762 maxpos = NULL;
fac92740 5763 parse_start = RExC_parse; /* MJD */
830247a4 5764 next = RExC_parse + 1;
a0d0e21e
LW
5765 while (isDIGIT(*next) || *next == ',') {
5766 if (*next == ',') {
5767 if (maxpos)
5768 break;
5769 else
5770 maxpos = next;
a687059c 5771 }
a0d0e21e
LW
5772 next++;
5773 }
5774 if (*next == '}') { /* got one */
5775 if (!maxpos)
5776 maxpos = next;
830247a4
IZ
5777 RExC_parse++;
5778 min = atoi(RExC_parse);
a0d0e21e
LW
5779 if (*maxpos == ',')
5780 maxpos++;
5781 else
830247a4 5782 maxpos = RExC_parse;
a0d0e21e
LW
5783 max = atoi(maxpos);
5784 if (!max && *maxpos != '0')
c277df42
IZ
5785 max = REG_INFTY; /* meaning "infinity" */
5786 else if (max >= REG_INFTY)
8615cb43 5787 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5788 RExC_parse = next;
5789 nextchar(pRExC_state);
a0d0e21e
LW
5790
5791 do_curly:
5792 if ((flags&SIMPLE)) {
830247a4 5793 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5794 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5795 Set_Node_Offset(ret, parse_start+1); /* MJD */
5796 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5797 }
5798 else {
3dab1dad 5799 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5800
5801 w->flags = 0;
3dab1dad 5802 REGTAIL(pRExC_state, ret, w);
830247a4 5803 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5804 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5805 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5806 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5807 }
6bda09f9 5808 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5809 /* MJD hk */
5810 Set_Node_Offset(ret, parse_start+1);
2af232bd 5811 Set_Node_Length(ret,
fac92740 5812 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5813
830247a4 5814 if (!SIZE_ONLY && RExC_extralen)
c277df42 5815 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5816 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5817 if (SIZE_ONLY)
830247a4
IZ
5818 RExC_whilem_seen++, RExC_extralen += 3;
5819 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5820 }
c277df42 5821 ret->flags = 0;
a0d0e21e
LW
5822
5823 if (min > 0)
821b33a5
IZ
5824 *flagp = WORST;
5825 if (max > 0)
5826 *flagp |= HASWIDTH;
a0d0e21e 5827 if (max && max < min)
8615cb43 5828 vFAIL("Can't do {n,m} with n > m");
c277df42 5829 if (!SIZE_ONLY) {
eb160463
GS
5830 ARG1_SET(ret, (U16)min);
5831 ARG2_SET(ret, (U16)max);
a687059c 5832 }
a687059c 5833
a0d0e21e 5834 goto nest_check;
a687059c 5835 }
a0d0e21e 5836 }
a687059c 5837
a0d0e21e
LW
5838 if (!ISMULT1(op)) {
5839 *flagp = flags;
a687059c 5840 return(ret);
a0d0e21e 5841 }
bb20fd44 5842
c277df42 5843#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5844
5845 /* if this is reinstated, don't forget to put this back into perldiag:
5846
5847 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5848
5849 (F) The part of the regexp subject to either the * or + quantifier
5850 could match an empty string. The {#} shows in the regular
5851 expression about where the problem was discovered.
5852
5853 */
5854
bb20fd44 5855 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5856 vFAIL("Regexp *+ operand could be empty");
b81d288d 5857#endif
bb20fd44 5858
fac92740 5859 parse_start = RExC_parse;
830247a4 5860 nextchar(pRExC_state);
a0d0e21e 5861
821b33a5 5862 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5863
5864 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5865 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5866 ret->flags = 0;
830247a4 5867 RExC_naughty += 4;
a0d0e21e
LW
5868 }
5869 else if (op == '*') {
5870 min = 0;
5871 goto do_curly;
a0ed51b3
LW
5872 }
5873 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5874 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5875 ret->flags = 0;
830247a4 5876 RExC_naughty += 3;
a0d0e21e
LW
5877 }
5878 else if (op == '+') {
5879 min = 1;
5880 goto do_curly;
a0ed51b3
LW
5881 }
5882 else if (op == '?') {
a0d0e21e
LW
5883 min = 0; max = 1;
5884 goto do_curly;
5885 }
5886 nest_check:
041457d9 5887 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5888 vWARN3(RExC_parse,
b45f050a 5889 "%.*s matches null string many times",
afd78fd5 5890 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5891 origparse);
a0d0e21e
LW
5892 }
5893
b9b4dddf 5894 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5895 nextchar(pRExC_state);
6bda09f9 5896 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5897 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5898 }
b9b4dddf
YO
5899#ifndef REG_ALLOW_MINMOD_SUSPEND
5900 else
5901#endif
5902 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5903 regnode *ender;
5904 nextchar(pRExC_state);
5905 ender = reg_node(pRExC_state, SUCCEED);
5906 REGTAIL(pRExC_state, ret, ender);
5907 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5908 ret->flags = 0;
5909 ender = reg_node(pRExC_state, TAIL);
5910 REGTAIL(pRExC_state, ret, ender);
5911 /*ret= ender;*/
5912 }
5913
5914 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5915 RExC_parse++;
b45f050a
JF
5916 vFAIL("Nested quantifiers");
5917 }
a0d0e21e
LW
5918
5919 return(ret);
a687059c
LW
5920}
5921
fc8cd66c
YO
5922
5923/* reg_namedseq(pRExC_state,UVp)
5924
5925 This is expected to be called by a parser routine that has
5926 recognized'\N' and needs to handle the rest. RExC_parse is
5927 expected to point at the first char following the N at the time
5928 of the call.
5929
5930 If valuep is non-null then it is assumed that we are parsing inside
5931 of a charclass definition and the first codepoint in the resolved
5932 string is returned via *valuep and the routine will return NULL.
5933 In this mode if a multichar string is returned from the charnames
5934 handler a warning will be issued, and only the first char in the
5935 sequence will be examined. If the string returned is zero length
5936 then the value of *valuep is undefined and NON-NULL will
5937 be returned to indicate failure. (This will NOT be a valid pointer
5938 to a regnode.)
5939
5940 If value is null then it is assumed that we are parsing normal text
5941 and inserts a new EXACT node into the program containing the resolved
5942 string and returns a pointer to the new node. If the string is
5943 zerolength a NOTHING node is emitted.
5944
5945 On success RExC_parse is set to the char following the endbrace.
5946 Parsing failures will generate a fatal errorvia vFAIL(...)
5947
5948 NOTE: We cache all results from the charnames handler locally in
5949 the RExC_charnames hash (created on first use) to prevent a charnames
5950 handler from playing silly-buggers and returning a short string and
5951 then a long string for a given pattern. Since the regexp program
5952 size is calculated during an initial parse this would result
5953 in a buffer overrun so we cache to prevent the charname result from
5954 changing during the course of the parse.
5955
5956 */
5957STATIC regnode *
5958S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5959{
5960 char * name; /* start of the content of the name */
5961 char * endbrace; /* endbrace following the name */
5962 SV *sv_str = NULL;
5963 SV *sv_name = NULL;
5964 STRLEN len; /* this has various purposes throughout the code */
5965 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5966 regnode *ret = NULL;
5967
5968 if (*RExC_parse != '{') {
5969 vFAIL("Missing braces on \\N{}");
5970 }
5971 name = RExC_parse+1;
5972 endbrace = strchr(RExC_parse, '}');
5973 if ( ! endbrace ) {
5974 RExC_parse++;
5975 vFAIL("Missing right brace on \\N{}");
5976 }
5977 RExC_parse = endbrace + 1;
5978
5979
5980 /* RExC_parse points at the beginning brace,
5981 endbrace points at the last */
5982 if ( name[0]=='U' && name[1]=='+' ) {
5983 /* its a "unicode hex" notation {U+89AB} */
5984 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5985 | PERL_SCAN_DISALLOW_PREFIX
5986 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5987 UV cp;
196f1508 5988 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5989 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5990 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5991 cp = 0xFFFD;
5992 }
5993 if (cp > 0xff)
5994 RExC_utf8 = 1;
5995 if ( valuep ) {
5996 *valuep = cp;
5997 return NULL;
5998 }
5999 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6000 } else {
6001 /* fetch the charnames handler for this scope */
6002 HV * const table = GvHV(PL_hintgv);
6003 SV **cvp= table ?
6004 hv_fetchs(table, "charnames", FALSE) :
6005 NULL;
6006 SV *cv= cvp ? *cvp : NULL;
6007 HE *he_str;
6008 int count;
6009 /* create an SV with the name as argument */
6010 sv_name = newSVpvn(name, endbrace - name);
6011
6012 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6013 vFAIL2("Constant(\\N{%s}) unknown: "
6014 "(possibly a missing \"use charnames ...\")",
6015 SvPVX(sv_name));
6016 }
6017 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6018 vFAIL2("Constant(\\N{%s}): "
6019 "$^H{charnames} is not defined",SvPVX(sv_name));
6020 }
6021
6022
6023
6024 if (!RExC_charnames) {
6025 /* make sure our cache is allocated */
6026 RExC_charnames = newHV();
6bda09f9 6027 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
6028 }
6029 /* see if we have looked this one up before */
6030 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6031 if ( he_str ) {
6032 sv_str = HeVAL(he_str);
6033 cached = 1;
6034 } else {
6035 dSP ;
6036
6037 ENTER ;
6038 SAVETMPS ;
6039 PUSHMARK(SP) ;
6040
6041 XPUSHs(sv_name);
6042
6043 PUTBACK ;
6044
6045 count= call_sv(cv, G_SCALAR);
6046
6047 if (count == 1) { /* XXXX is this right? dmq */
6048 sv_str = POPs;
6049 SvREFCNT_inc_simple_void(sv_str);
6050 }
6051
6052 SPAGAIN ;
6053 PUTBACK ;
6054 FREETMPS ;
6055 LEAVE ;
6056
6057 if ( !sv_str || !SvOK(sv_str) ) {
6058 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6059 "did not return a defined value",SvPVX(sv_name));
6060 }
6061 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6062 cached = 1;
6063 }
6064 }
6065 if (valuep) {
6066 char *p = SvPV(sv_str, len);
6067 if (len) {
6068 STRLEN numlen = 1;
6069 if ( SvUTF8(sv_str) ) {
196f1508 6070 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6071 if (*valuep > 0x7F)
6072 RExC_utf8 = 1;
6073 /* XXXX
6074 We have to turn on utf8 for high bit chars otherwise
6075 we get failures with
6076
6077 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6078 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6079
6080 This is different from what \x{} would do with the same
6081 codepoint, where the condition is > 0xFF.
6082 - dmq
6083 */
6084
6085
6086 } else {
6087 *valuep = (UV)*p;
6088 /* warn if we havent used the whole string? */
6089 }
6090 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6091 vWARN2(RExC_parse,
6092 "Ignoring excess chars from \\N{%s} in character class",
6093 SvPVX(sv_name)
6094 );
6095 }
6096 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6097 vWARN2(RExC_parse,
6098 "Ignoring zero length \\N{%s} in character class",
6099 SvPVX(sv_name)
6100 );
6101 }
6102 if (sv_name)
6103 SvREFCNT_dec(sv_name);
6104 if (!cached)
6105 SvREFCNT_dec(sv_str);
6106 return len ? NULL : (regnode *)&len;
6107 } else if(SvCUR(sv_str)) {
6108
6109 char *s;
6110 char *p, *pend;
6111 STRLEN charlen = 1;
6112 char * parse_start = name-3; /* needed for the offsets */
6113 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6114
6115 ret = reg_node(pRExC_state,
6116 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6117 s= STRING(ret);
6118
6119 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6120 sv_utf8_upgrade(sv_str);
6121 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6122 RExC_utf8= 1;
6123 }
6124
6125 p = SvPV(sv_str, len);
6126 pend = p + len;
6127 /* len is the length written, charlen is the size the char read */
6128 for ( len = 0; p < pend; p += charlen ) {
6129 if (UTF) {
196f1508 6130 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6131 if (FOLD) {
6132 STRLEN foldlen,numlen;
6133 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6134 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6135 /* Emit all the Unicode characters. */
6136
6137 for (foldbuf = tmpbuf;
6138 foldlen;
6139 foldlen -= numlen)
6140 {
6141 uvc = utf8_to_uvchr(foldbuf, &numlen);
6142 if (numlen > 0) {
6143 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6144 s += unilen;
6145 len += unilen;
6146 /* In EBCDIC the numlen
6147 * and unilen can differ. */
6148 foldbuf += numlen;
6149 if (numlen >= foldlen)
6150 break;
6151 }
6152 else
6153 break; /* "Can't happen." */
6154 }
6155 } else {
6156 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6157 if (unilen > 0) {
6158 s += unilen;
6159 len += unilen;
6160 }
6161 }
6162 } else {
6163 len++;
6164 REGC(*p, s++);
6165 }
6166 }
6167 if (SIZE_ONLY) {
6168 RExC_size += STR_SZ(len);
6169 } else {
6170 STR_LEN(ret) = len;
6171 RExC_emit += STR_SZ(len);
6172 }
6173 Set_Node_Cur_Length(ret); /* MJD */
6174 RExC_parse--;
6175 nextchar(pRExC_state);
6176 } else {
6177 ret = reg_node(pRExC_state,NOTHING);
6178 }
6179 if (!cached) {
6180 SvREFCNT_dec(sv_str);
6181 }
6182 if (sv_name) {
6183 SvREFCNT_dec(sv_name);
6184 }
6185 return ret;
6186
6187}
6188
6189
9e08bc66
TS
6190/*
6191 * reg_recode
6192 *
6193 * It returns the code point in utf8 for the value in *encp.
6194 * value: a code value in the source encoding
6195 * encp: a pointer to an Encode object
6196 *
6197 * If the result from Encode is not a single character,
6198 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6199 */
6200STATIC UV
6201S_reg_recode(pTHX_ const char value, SV **encp)
6202{
6203 STRLEN numlen = 1;
6204 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6205 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6206 : SvPVX(sv);
6207 const STRLEN newlen = SvCUR(sv);
6208 UV uv = UNICODE_REPLACEMENT;
6209
6210 if (newlen)
6211 uv = SvUTF8(sv)
6212 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6213 : *(U8*)s;
6214
6215 if (!newlen || numlen != newlen) {
6216 uv = UNICODE_REPLACEMENT;
6217 if (encp)
6218 *encp = NULL;
6219 }
6220 return uv;
6221}
6222
fc8cd66c 6223
a687059c
LW
6224/*
6225 - regatom - the lowest level
6226 *
6227 * Optimization: gobbles an entire sequence of ordinary characters so that
6228 * it can turn them into a single node, which is smaller to store and
6229 * faster to run. Backslashed characters are exceptions, each becoming a
6230 * separate node; the code is simpler that way and it's not worth fixing.
6231 *
7f6f358c
YO
6232 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6233 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6234 */
76e3520e 6235STATIC regnode *
3dab1dad 6236S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6237{
97aff369 6238 dVAR;
cbbf8932 6239 register regnode *ret = NULL;
a0d0e21e 6240 I32 flags;
45948336 6241 char *parse_start = RExC_parse;
3dab1dad
YO
6242 GET_RE_DEBUG_FLAGS_DECL;
6243 DEBUG_PARSE("atom");
a0d0e21e
LW
6244 *flagp = WORST; /* Tentatively. */
6245
6246tryagain:
830247a4 6247 switch (*RExC_parse) {
a0d0e21e 6248 case '^':
830247a4
IZ
6249 RExC_seen_zerolen++;
6250 nextchar(pRExC_state);
bbe252da 6251 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6252 ret = reg_node(pRExC_state, MBOL);
bbe252da 6253 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6254 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6255 else
830247a4 6256 ret = reg_node(pRExC_state, BOL);
fac92740 6257 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6258 break;
6259 case '$':
830247a4 6260 nextchar(pRExC_state);
b81d288d 6261 if (*RExC_parse)
830247a4 6262 RExC_seen_zerolen++;
bbe252da 6263 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6264 ret = reg_node(pRExC_state, MEOL);
bbe252da 6265 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6266 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6267 else
830247a4 6268 ret = reg_node(pRExC_state, EOL);
fac92740 6269 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6270 break;
6271 case '.':
830247a4 6272 nextchar(pRExC_state);
bbe252da 6273 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6274 ret = reg_node(pRExC_state, SANY);
6275 else
6276 ret = reg_node(pRExC_state, REG_ANY);
6277 *flagp |= HASWIDTH|SIMPLE;
830247a4 6278 RExC_naughty++;
fac92740 6279 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6280 break;
6281 case '[':
b45f050a 6282 {
3dab1dad
YO
6283 char * const oregcomp_parse = ++RExC_parse;
6284 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6285 if (*RExC_parse != ']') {
6286 RExC_parse = oregcomp_parse;
b45f050a
JF
6287 vFAIL("Unmatched [");
6288 }
830247a4 6289 nextchar(pRExC_state);
a0d0e21e 6290 *flagp |= HASWIDTH|SIMPLE;
fac92740 6291 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6292 break;
b45f050a 6293 }
a0d0e21e 6294 case '(':
830247a4 6295 nextchar(pRExC_state);
3dab1dad 6296 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6297 if (ret == NULL) {
bf93d4cc 6298 if (flags & TRYAGAIN) {
830247a4 6299 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6300 /* Make parent create an empty node if needed. */
6301 *flagp |= TRYAGAIN;
6302 return(NULL);
6303 }
a0d0e21e 6304 goto tryagain;
bf93d4cc 6305 }
a0d0e21e
LW
6306 return(NULL);
6307 }
c277df42 6308 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6309 break;
6310 case '|':
6311 case ')':
6312 if (flags & TRYAGAIN) {
6313 *flagp |= TRYAGAIN;
6314 return NULL;
6315 }
b45f050a 6316 vFAIL("Internal urp");
a0d0e21e
LW
6317 /* Supposed to be caught earlier. */
6318 break;
85afd4ae 6319 case '{':
830247a4
IZ
6320 if (!regcurly(RExC_parse)) {
6321 RExC_parse++;
85afd4ae
CS
6322 goto defchar;
6323 }
6324 /* FALL THROUGH */
a0d0e21e
LW
6325 case '?':
6326 case '+':
6327 case '*':
830247a4 6328 RExC_parse++;
b45f050a 6329 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6330 break;
6331 case '\\':
830247a4 6332 switch (*++RExC_parse) {
a0d0e21e 6333 case 'A':
830247a4
IZ
6334 RExC_seen_zerolen++;
6335 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6336 *flagp |= SIMPLE;
830247a4 6337 nextchar(pRExC_state);
fac92740 6338 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6339 break;
6340 case 'G':
830247a4
IZ
6341 ret = reg_node(pRExC_state, GPOS);
6342 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6343 *flagp |= SIMPLE;
830247a4 6344 nextchar(pRExC_state);
fac92740 6345 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6346 break;
6347 case 'Z':
830247a4 6348 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6349 *flagp |= SIMPLE;
a1917ab9 6350 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6351 nextchar(pRExC_state);
a0d0e21e 6352 break;
b85d18e9 6353 case 'z':
830247a4 6354 ret = reg_node(pRExC_state, EOS);
b85d18e9 6355 *flagp |= SIMPLE;
830247a4
IZ
6356 RExC_seen_zerolen++; /* Do not optimize RE away */
6357 nextchar(pRExC_state);
fac92740 6358 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6359 break;
4a2d328f 6360 case 'C':
f33976b4
DB
6361 ret = reg_node(pRExC_state, CANY);
6362 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6363 *flagp |= HASWIDTH|SIMPLE;
830247a4 6364 nextchar(pRExC_state);
fac92740 6365 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6366 break;
6367 case 'X':
830247a4 6368 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6369 *flagp |= HASWIDTH;
830247a4 6370 nextchar(pRExC_state);
fac92740 6371 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6372 break;
a0d0e21e 6373 case 'w':
eb160463 6374 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6375 *flagp |= HASWIDTH|SIMPLE;
830247a4 6376 nextchar(pRExC_state);
fac92740 6377 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6378 break;
6379 case 'W':
eb160463 6380 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6381 *flagp |= HASWIDTH|SIMPLE;
830247a4 6382 nextchar(pRExC_state);
fac92740 6383 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6384 break;
6385 case 'b':
830247a4
IZ
6386 RExC_seen_zerolen++;
6387 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6388 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6389 *flagp |= SIMPLE;
830247a4 6390 nextchar(pRExC_state);
fac92740 6391 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6392 break;
6393 case 'B':
830247a4
IZ
6394 RExC_seen_zerolen++;
6395 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6396 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6397 *flagp |= SIMPLE;
830247a4 6398 nextchar(pRExC_state);
fac92740 6399 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6400 break;
6401 case 's':
eb160463 6402 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6403 *flagp |= HASWIDTH|SIMPLE;
830247a4 6404 nextchar(pRExC_state);
fac92740 6405 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6406 break;
6407 case 'S':
eb160463 6408 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6409 *flagp |= HASWIDTH|SIMPLE;
830247a4 6410 nextchar(pRExC_state);
fac92740 6411 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6412 break;
6413 case 'd':
ffc61ed2 6414 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6415 *flagp |= HASWIDTH|SIMPLE;
830247a4 6416 nextchar(pRExC_state);
fac92740 6417 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6418 break;
6419 case 'D':
ffc61ed2 6420 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6421 *flagp |= HASWIDTH|SIMPLE;
830247a4 6422 nextchar(pRExC_state);
fac92740 6423 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6424 break;
a14b48bc
LW
6425 case 'p':
6426 case 'P':
3568d838 6427 {
3dab1dad 6428 char* const oldregxend = RExC_end;
ccb2c380 6429 char* parse_start = RExC_parse - 2;
a14b48bc 6430
830247a4 6431 if (RExC_parse[1] == '{') {
3568d838 6432 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6433 RExC_end = strchr(RExC_parse, '}');
6434 if (!RExC_end) {
3dab1dad 6435 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6436 RExC_parse += 2;
6437 RExC_end = oldregxend;
0da60cf5 6438 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6439 }
830247a4 6440 RExC_end++;
a14b48bc 6441 }
af6f566e 6442 else {
830247a4 6443 RExC_end = RExC_parse + 2;
af6f566e
HS
6444 if (RExC_end > oldregxend)
6445 RExC_end = oldregxend;
6446 }
830247a4 6447 RExC_parse--;
a14b48bc 6448
3dab1dad 6449 ret = regclass(pRExC_state,depth+1);
a14b48bc 6450
830247a4
IZ
6451 RExC_end = oldregxend;
6452 RExC_parse--;
ccb2c380
MP
6453
6454 Set_Node_Offset(ret, parse_start + 2);
6455 Set_Node_Cur_Length(ret);
830247a4 6456 nextchar(pRExC_state);
a14b48bc
LW
6457 *flagp |= HASWIDTH|SIMPLE;
6458 }
6459 break;
fc8cd66c
YO
6460 case 'N':
6461 /* Handle \N{NAME} here and not below because it can be
6462 multicharacter. join_exact() will join them up later on.
6463 Also this makes sure that things like /\N{BLAH}+/ and
6464 \N{BLAH} being multi char Just Happen. dmq*/
6465 ++RExC_parse;
6466 ret= reg_namedseq(pRExC_state, NULL);
6467 break;
0a4db386 6468 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 6469 parse_named_seq:
81714fb9
YO
6470 {
6471 char ch= RExC_parse[1];
1f1031fe
YO
6472 if (ch != '<' && ch != '\'' && ch != '{') {
6473 RExC_parse++;
6474 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 6475 } else {
1f1031fe
YO
6476 /* this pretty much dupes the code for (?P=...) in reg(), if
6477 you change this make sure you change that */
81714fb9 6478 char* name_start = (RExC_parse += 2);
2eccd3b2 6479 U32 num = 0;
0a4db386
YO
6480 SV *sv_dat = reg_scan_name(pRExC_state,
6481 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 6482 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 6483 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
6484 vFAIL2("Sequence %.3s... not terminated",parse_start);
6485
6486 if (!SIZE_ONLY) {
6487 num = add_data( pRExC_state, 1, "S" );
6488 RExC_rxi->data->data[num]=(void*)sv_dat;
6489 SvREFCNT_inc(sv_dat);
6490 }
6491
81714fb9
YO
6492 RExC_sawback = 1;
6493 ret = reganode(pRExC_state,
6494 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6495 num);
6496 *flagp |= HASWIDTH;
1f1031fe 6497
81714fb9
YO
6498 /* override incorrect value set in reganode MJD */
6499 Set_Node_Offset(ret, parse_start+1);
6500 Set_Node_Cur_Length(ret); /* MJD */
6501 nextchar(pRExC_state);
1f1031fe 6502
81714fb9
YO
6503 }
6504 break;
1f1031fe 6505 }
a0d0e21e
LW
6506 case 'n':
6507 case 'r':
6508 case 't':
6509 case 'f':
6510 case 'e':
6511 case 'a':
6512 case 'x':
6513 case 'c':
6514 case '0':
6515 goto defchar;
2bf803e2 6516 case 'g':
a0d0e21e
LW
6517 case '1': case '2': case '3': case '4':
6518 case '5': case '6': case '7': case '8': case '9':
6519 {
c74340f9 6520 I32 num;
2bf803e2
YO
6521 bool isg = *RExC_parse == 'g';
6522 bool isrel = 0;
6523 bool hasbrace = 0;
6524 if (isg) {
c74340f9 6525 RExC_parse++;
2bf803e2
YO
6526 if (*RExC_parse == '{') {
6527 RExC_parse++;
6528 hasbrace = 1;
6529 }
6530 if (*RExC_parse == '-') {
6531 RExC_parse++;
6532 isrel = 1;
6533 }
1f1031fe
YO
6534 if (hasbrace && !isDIGIT(*RExC_parse)) {
6535 if (isrel) RExC_parse--;
6536 RExC_parse -= 2;
6537 goto parse_named_seq;
6538 } }
c74340f9
YO
6539 num = atoi(RExC_parse);
6540 if (isrel) {
5624f11d 6541 num = RExC_npar - num;
c74340f9
YO
6542 if (num < 1)
6543 vFAIL("Reference to nonexistent or unclosed group");
6544 }
2bf803e2 6545 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6546 goto defchar;
6547 else {
3dab1dad 6548 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6549 while (isDIGIT(*RExC_parse))
6550 RExC_parse++;
1f1031fe
YO
6551 if (parse_start == RExC_parse - 1)
6552 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
6553 if (hasbrace) {
6554 if (*RExC_parse != '}')
6555 vFAIL("Unterminated \\g{...} pattern");
6556 RExC_parse++;
6557 }
c74340f9
YO
6558 if (!SIZE_ONLY) {
6559 if (num > (I32)RExC_rx->nparens)
6560 vFAIL("Reference to nonexistent group");
c74340f9 6561 }
830247a4 6562 RExC_sawback = 1;
eb160463
GS
6563 ret = reganode(pRExC_state,
6564 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6565 num);
a0d0e21e 6566 *flagp |= HASWIDTH;
2af232bd 6567
fac92740 6568 /* override incorrect value set in reganode MJD */
2af232bd 6569 Set_Node_Offset(ret, parse_start+1);
fac92740 6570 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6571 RExC_parse--;
6572 nextchar(pRExC_state);
a0d0e21e
LW
6573 }
6574 }
6575 break;
6576 case '\0':
830247a4 6577 if (RExC_parse >= RExC_end)
b45f050a 6578 FAIL("Trailing \\");
a0d0e21e
LW
6579 /* FALL THROUGH */
6580 default:
a0288114 6581 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6582 back into the quick-grab loop below */
45948336 6583 parse_start--;
a0d0e21e
LW
6584 goto defchar;
6585 }
6586 break;
4633a7c4
LW
6587
6588 case '#':
bbe252da 6589 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6590 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6591 RExC_parse++;
830247a4 6592 if (RExC_parse < RExC_end)
4633a7c4
LW
6593 goto tryagain;
6594 }
6595 /* FALL THROUGH */
6596
a0d0e21e 6597 default: {
ba210ebe 6598 register STRLEN len;
58ae7d3f 6599 register UV ender;
a0d0e21e 6600 register char *p;
3dab1dad 6601 char *s;
80aecb99 6602 STRLEN foldlen;
89ebb4a3 6603 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6604
6605 parse_start = RExC_parse - 1;
a0d0e21e 6606
830247a4 6607 RExC_parse++;
a0d0e21e
LW
6608
6609 defchar:
58ae7d3f 6610 ender = 0;
eb160463
GS
6611 ret = reg_node(pRExC_state,
6612 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6613 s = STRING(ret);
830247a4
IZ
6614 for (len = 0, p = RExC_parse - 1;
6615 len < 127 && p < RExC_end;
a0d0e21e
LW
6616 len++)
6617 {
3dab1dad 6618 char * const oldp = p;
5b5a24f7 6619
bbe252da 6620 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6621 p = regwhite(p, RExC_end);
a0d0e21e
LW
6622 switch (*p) {
6623 case '^':
6624 case '$':
6625 case '.':
6626 case '[':
6627 case '(':
6628 case ')':
6629 case '|':
6630 goto loopdone;
6631 case '\\':
6632 switch (*++p) {
6633 case 'A':
1ed8eac0
JF
6634 case 'C':
6635 case 'X':
a0d0e21e 6636 case 'G':
2bf803e2 6637 case 'g':
a0d0e21e 6638 case 'Z':
b85d18e9 6639 case 'z':
a0d0e21e
LW
6640 case 'w':
6641 case 'W':
6642 case 'b':
6643 case 'B':
6644 case 's':
6645 case 'S':
6646 case 'd':
6647 case 'D':
a14b48bc
LW
6648 case 'p':
6649 case 'P':
fc8cd66c 6650 case 'N':
c74340f9 6651 case 'R':
99ed40ff 6652 case 'k':
a0d0e21e
LW
6653 --p;
6654 goto loopdone;
6655 case 'n':
6656 ender = '\n';
6657 p++;
a687059c 6658 break;
a0d0e21e
LW
6659 case 'r':
6660 ender = '\r';
6661 p++;
a687059c 6662 break;
a0d0e21e
LW
6663 case 't':
6664 ender = '\t';
6665 p++;
a687059c 6666 break;
a0d0e21e
LW
6667 case 'f':
6668 ender = '\f';
6669 p++;
a687059c 6670 break;
a0d0e21e 6671 case 'e':
c7f1f016 6672 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6673 p++;
a687059c 6674 break;
a0d0e21e 6675 case 'a':
c7f1f016 6676 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6677 p++;
a687059c 6678 break;
a0d0e21e 6679 case 'x':
a0ed51b3 6680 if (*++p == '{') {
1df70142 6681 char* const e = strchr(p, '}');
b81d288d 6682
b45f050a 6683 if (!e) {
830247a4 6684 RExC_parse = p + 1;
b45f050a
JF
6685 vFAIL("Missing right brace on \\x{}");
6686 }
de5f0749 6687 else {
a4c04bdc
NC
6688 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6689 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6690 STRLEN numlen = e - p - 1;
53305cf1 6691 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6692 if (ender > 0xff)
6693 RExC_utf8 = 1;
a0ed51b3
LW
6694 p = e + 1;
6695 }
a0ed51b3
LW
6696 }
6697 else {
a4c04bdc 6698 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6699 STRLEN numlen = 2;
53305cf1 6700 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6701 p += numlen;
6702 }
9e08bc66
TS
6703 if (PL_encoding && ender < 0x100)
6704 goto recode_encoding;
a687059c 6705 break;
a0d0e21e
LW
6706 case 'c':
6707 p++;
bbce6d69 6708 ender = UCHARAT(p++);
6709 ender = toCTRL(ender);
a687059c 6710 break;
a0d0e21e
LW
6711 case '0': case '1': case '2': case '3':case '4':
6712 case '5': case '6': case '7': case '8':case '9':
6713 if (*p == '0' ||
830247a4 6714 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6715 I32 flags = 0;
1df70142 6716 STRLEN numlen = 3;
53305cf1 6717 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6718 p += numlen;
6719 }
6720 else {
6721 --p;
6722 goto loopdone;
a687059c 6723 }
9e08bc66
TS
6724 if (PL_encoding && ender < 0x100)
6725 goto recode_encoding;
6726 break;
6727 recode_encoding:
6728 {
6729 SV* enc = PL_encoding;
6730 ender = reg_recode((const char)(U8)ender, &enc);
6731 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6732 vWARN(p, "Invalid escape in the specified encoding");
6733 RExC_utf8 = 1;
6734 }
a687059c 6735 break;
a0d0e21e 6736 case '\0':
830247a4 6737 if (p >= RExC_end)
b45f050a 6738 FAIL("Trailing \\");
a687059c 6739 /* FALL THROUGH */
a0d0e21e 6740 default:
041457d9 6741 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6742 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6743 goto normal_default;
a0d0e21e
LW
6744 }
6745 break;
a687059c 6746 default:
a0ed51b3 6747 normal_default:
fd400ab9 6748 if (UTF8_IS_START(*p) && UTF) {
1df70142 6749 STRLEN numlen;
5e12f4fb 6750 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6751 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6752 p += numlen;
6753 }
6754 else
6755 ender = *p++;
a0d0e21e 6756 break;
a687059c 6757 }
bbe252da 6758 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6759 p = regwhite(p, RExC_end);
60a8b682
JH
6760 if (UTF && FOLD) {
6761 /* Prime the casefolded buffer. */
ac7e0132 6762 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6763 }
a0d0e21e
LW
6764 if (ISMULT2(p)) { /* Back off on ?+*. */
6765 if (len)
6766 p = oldp;
16ea2a2e 6767 else if (UTF) {
80aecb99 6768 if (FOLD) {
60a8b682 6769 /* Emit all the Unicode characters. */
1df70142 6770 STRLEN numlen;
80aecb99
JH
6771 for (foldbuf = tmpbuf;
6772 foldlen;
6773 foldlen -= numlen) {
6774 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6775 if (numlen > 0) {
71207a34 6776 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6777 s += unilen;
6778 len += unilen;
6779 /* In EBCDIC the numlen
6780 * and unilen can differ. */
9dc45d57 6781 foldbuf += numlen;
47654450
JH
6782 if (numlen >= foldlen)
6783 break;
9dc45d57
JH
6784 }
6785 else
6786 break; /* "Can't happen." */
80aecb99
JH
6787 }
6788 }
6789 else {
71207a34 6790 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6791 if (unilen > 0) {
0ebc6274
JH
6792 s += unilen;
6793 len += unilen;
9dc45d57 6794 }
80aecb99 6795 }
a0ed51b3 6796 }
a0d0e21e
LW
6797 else {
6798 len++;
eb160463 6799 REGC((char)ender, s++);
a0d0e21e
LW
6800 }
6801 break;
a687059c 6802 }
16ea2a2e 6803 if (UTF) {
80aecb99 6804 if (FOLD) {
60a8b682 6805 /* Emit all the Unicode characters. */
1df70142 6806 STRLEN numlen;
80aecb99
JH
6807 for (foldbuf = tmpbuf;
6808 foldlen;
6809 foldlen -= numlen) {
6810 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6811 if (numlen > 0) {
71207a34 6812 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6813 len += unilen;
6814 s += unilen;
6815 /* In EBCDIC the numlen
6816 * and unilen can differ. */
9dc45d57 6817 foldbuf += numlen;
47654450
JH
6818 if (numlen >= foldlen)
6819 break;
9dc45d57
JH
6820 }
6821 else
6822 break;
80aecb99
JH
6823 }
6824 }
6825 else {
71207a34 6826 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6827 if (unilen > 0) {
0ebc6274
JH
6828 s += unilen;
6829 len += unilen;
9dc45d57 6830 }
80aecb99
JH
6831 }
6832 len--;
a0ed51b3
LW
6833 }
6834 else
eb160463 6835 REGC((char)ender, s++);
a0d0e21e
LW
6836 }
6837 loopdone:
830247a4 6838 RExC_parse = p - 1;
fac92740 6839 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6840 nextchar(pRExC_state);
793db0cb
JH
6841 {
6842 /* len is STRLEN which is unsigned, need to copy to signed */
6843 IV iv = len;
6844 if (iv < 0)
6845 vFAIL("Internal disaster");
6846 }
a0d0e21e
LW
6847 if (len > 0)
6848 *flagp |= HASWIDTH;
090f7165 6849 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6850 *flagp |= SIMPLE;
3dab1dad 6851
cd439c50 6852 if (SIZE_ONLY)
830247a4 6853 RExC_size += STR_SZ(len);
3dab1dad
YO
6854 else {
6855 STR_LEN(ret) = len;
830247a4 6856 RExC_emit += STR_SZ(len);
07be1b83 6857 }
3dab1dad 6858 }
a0d0e21e
LW
6859 break;
6860 }
a687059c 6861
a0d0e21e 6862 return(ret);
a687059c
LW
6863}
6864
873ef191 6865STATIC char *
5f66b61c 6866S_regwhite(char *p, const char *e)
5b5a24f7
CS
6867{
6868 while (p < e) {
6869 if (isSPACE(*p))
6870 ++p;
6871 else if (*p == '#') {
6872 do {
6873 p++;
6874 } while (p < e && *p != '\n');
6875 }
6876 else
6877 break;
6878 }
6879 return p;
6880}
6881
b8c5462f
JH
6882/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6883 Character classes ([:foo:]) can also be negated ([:^foo:]).
6884 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6885 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6886 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6887
6888#define POSIXCC_DONE(c) ((c) == ':')
6889#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6890#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6891
b8c5462f 6892STATIC I32
830247a4 6893S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6894{
97aff369 6895 dVAR;
936ed897 6896 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6897
830247a4 6898 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6899 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6900 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6901 const char c = UCHARAT(RExC_parse);
097eb12c 6902 char* const s = RExC_parse++;
b81d288d 6903
9a86a77b 6904 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6905 RExC_parse++;
6906 if (RExC_parse == RExC_end)
620e46c5 6907 /* Grandfather lone [:, [=, [. */
830247a4 6908 RExC_parse = s;
620e46c5 6909 else {
3dab1dad 6910 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6911 assert(*t == c);
6912
9a86a77b 6913 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6914 const char *posixcc = s + 1;
830247a4 6915 RExC_parse++; /* skip over the ending ] */
3dab1dad 6916
b8c5462f 6917 if (*s == ':') {
1df70142
AL
6918 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6919 const I32 skip = t - posixcc;
80916619
NC
6920
6921 /* Initially switch on the length of the name. */
6922 switch (skip) {
6923 case 4:
3dab1dad
YO
6924 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6925 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6926 break;
80916619
NC
6927 case 5:
6928 /* Names all of length 5. */
6929 /* alnum alpha ascii blank cntrl digit graph lower
6930 print punct space upper */
6931 /* Offset 4 gives the best switch position. */
6932 switch (posixcc[4]) {
6933 case 'a':
3dab1dad
YO
6934 if (memEQ(posixcc, "alph", 4)) /* alpha */
6935 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6936 break;
6937 case 'e':
3dab1dad
YO
6938 if (memEQ(posixcc, "spac", 4)) /* space */
6939 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6940 break;
6941 case 'h':
3dab1dad
YO
6942 if (memEQ(posixcc, "grap", 4)) /* graph */
6943 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6944 break;
6945 case 'i':
3dab1dad
YO
6946 if (memEQ(posixcc, "asci", 4)) /* ascii */
6947 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6948 break;
6949 case 'k':
3dab1dad
YO
6950 if (memEQ(posixcc, "blan", 4)) /* blank */
6951 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6952 break;
6953 case 'l':
3dab1dad
YO
6954 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6955 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6956 break;
6957 case 'm':
3dab1dad
YO
6958 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6959 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6960 break;
6961 case 'r':
3dab1dad
YO
6962 if (memEQ(posixcc, "lowe", 4)) /* lower */
6963 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6964 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6965 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6966 break;
6967 case 't':
3dab1dad
YO
6968 if (memEQ(posixcc, "digi", 4)) /* digit */
6969 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6970 else if (memEQ(posixcc, "prin", 4)) /* print */
6971 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6972 else if (memEQ(posixcc, "punc", 4)) /* punct */
6973 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6974 break;
b8c5462f
JH
6975 }
6976 break;
80916619 6977 case 6:
3dab1dad
YO
6978 if (memEQ(posixcc, "xdigit", 6))
6979 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6980 break;
6981 }
80916619
NC
6982
6983 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6984 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6985 t - s - 1, s + 1);
80916619
NC
6986 assert (posixcc[skip] == ':');
6987 assert (posixcc[skip+1] == ']');
b45f050a 6988 } else if (!SIZE_ONLY) {
b8c5462f 6989 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6990
830247a4 6991 /* adjust RExC_parse so the warning shows after
b45f050a 6992 the class closes */
9a86a77b 6993 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6994 RExC_parse++;
b45f050a
JF
6995 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6996 }
b8c5462f
JH
6997 } else {
6998 /* Maternal grandfather:
6999 * "[:" ending in ":" but not in ":]" */
830247a4 7000 RExC_parse = s;
767d463e 7001 }
620e46c5
JH
7002 }
7003 }
7004
b8c5462f
JH
7005 return namedclass;
7006}
7007
7008STATIC void
830247a4 7009S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7010{
97aff369 7011 dVAR;
3dab1dad 7012 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7013 const char *s = RExC_parse;
7014 const char c = *s++;
b8c5462f 7015
3dab1dad 7016 while (isALNUM(*s))
b8c5462f
JH
7017 s++;
7018 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7019 if (ckWARN(WARN_REGEXP))
7020 vWARN3(s+2,
7021 "POSIX syntax [%c %c] belongs inside character classes",
7022 c, c);
b45f050a
JF
7023
7024 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7025 if (POSIXCC_NOTYET(c)) {
830247a4 7026 /* adjust RExC_parse so the error shows after
b45f050a 7027 the class closes */
9a86a77b 7028 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7029 NOOP;
b45f050a
JF
7030 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7031 }
b8c5462f
JH
7032 }
7033 }
620e46c5
JH
7034}
7035
7f6f358c 7036
89836f1f
YO
7037#define _C_C_T_(NAME,TEST,WORD) \
7038ANYOF_##NAME: \
7039 if (LOC) \
7040 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7041 else { \
7042 for (value = 0; value < 256; value++) \
7043 if (TEST) \
7044 ANYOF_BITMAP_SET(ret, value); \
7045 } \
7046 yesno = '+'; \
7047 what = WORD; \
7048 break; \
7049case ANYOF_N##NAME: \
7050 if (LOC) \
7051 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7052 else { \
7053 for (value = 0; value < 256; value++) \
7054 if (!TEST) \
7055 ANYOF_BITMAP_SET(ret, value); \
7056 } \
7057 yesno = '!'; \
7058 what = WORD; \
7059 break
7060
7061
7f6f358c
YO
7062/*
7063 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7064 matches the pattern or if the pattern matches a single char only and
7065 that char is < 256 and we are case insensitive then we produce an
7066 EXACT node instead.
7f6f358c 7067*/
89836f1f 7068
76e3520e 7069STATIC regnode *
3dab1dad 7070S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7071{
97aff369 7072 dVAR;
9ef43ace 7073 register UV value = 0;
9a86a77b 7074 register UV nextvalue;
3568d838 7075 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7076 register IV range = 0;
c277df42 7077 register regnode *ret;
ba210ebe 7078 STRLEN numlen;
ffc61ed2 7079 IV namedclass;
cbbf8932 7080 char *rangebegin = NULL;
936ed897 7081 bool need_class = 0;
c445ea15 7082 SV *listsv = NULL;
ffc61ed2 7083 UV n;
9e55ce06 7084 bool optimize_invert = TRUE;
cbbf8932 7085 AV* unicode_alternate = NULL;
1b2d223b
JH
7086#ifdef EBCDIC
7087 UV literal_endpoint = 0;
7088#endif
7f6f358c 7089 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7090
3dab1dad 7091 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7092 case we need to change the emitted regop to an EXACT. */
07be1b83 7093 const char * orig_parse = RExC_parse;
72f13be8 7094 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
7095#ifndef DEBUGGING
7096 PERL_UNUSED_ARG(depth);
7097#endif
72f13be8 7098
3dab1dad 7099 DEBUG_PARSE("clas");
7f6f358c
YO
7100
7101 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7102 ret = reganode(pRExC_state, ANYOF, 0);
7103
7104 if (!SIZE_ONLY)
7105 ANYOF_FLAGS(ret) = 0;
7106
9a86a77b 7107 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7108 RExC_naughty++;
7109 RExC_parse++;
7110 if (!SIZE_ONLY)
7111 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7112 }
a0d0e21e 7113
73060fc4 7114 if (SIZE_ONLY) {
830247a4 7115 RExC_size += ANYOF_SKIP;
73060fc4
JH
7116 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7117 }
936ed897 7118 else {
830247a4 7119 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7120 if (FOLD)
7121 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7122 if (LOC)
7123 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7124 ANYOF_BITMAP_ZERO(ret);
396482e1 7125 listsv = newSVpvs("# comment\n");
a0d0e21e 7126 }
b8c5462f 7127
9a86a77b
JH
7128 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7129
b938889d 7130 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7131 checkposixcc(pRExC_state);
b8c5462f 7132
f064b6ad
HS
7133 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7134 if (UCHARAT(RExC_parse) == ']')
7135 goto charclassloop;
ffc61ed2 7136
fc8cd66c 7137parseit:
9a86a77b 7138 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7139
7140 charclassloop:
7141
7142 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7143
73b437c8 7144 if (!range)
830247a4 7145 rangebegin = RExC_parse;
ffc61ed2 7146 if (UTF) {
5e12f4fb 7147 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7148 RExC_end - RExC_parse,
9f7f3913 7149 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7150 RExC_parse += numlen;
7151 }
7152 else
7153 value = UCHARAT(RExC_parse++);
7f6f358c 7154
9a86a77b
JH
7155 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7156 if (value == '[' && POSIXCC(nextvalue))
830247a4 7157 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7158 else if (value == '\\') {
ffc61ed2 7159 if (UTF) {
5e12f4fb 7160 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7161 RExC_end - RExC_parse,
9f7f3913 7162 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7163 RExC_parse += numlen;
7164 }
7165 else
7166 value = UCHARAT(RExC_parse++);
470c3474 7167 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7168 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7169 * be a problem later if we want switch on Unicode.
7170 * A similar issue a little bit later when switching on
7171 * namedclass. --jhi */
ffc61ed2 7172 switch ((I32)value) {
b8c5462f
JH
7173 case 'w': namedclass = ANYOF_ALNUM; break;
7174 case 'W': namedclass = ANYOF_NALNUM; break;
7175 case 's': namedclass = ANYOF_SPACE; break;
7176 case 'S': namedclass = ANYOF_NSPACE; break;
7177 case 'd': namedclass = ANYOF_DIGIT; break;
7178 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
7179 case 'N': /* Handle \N{NAME} in class */
7180 {
7181 /* We only pay attention to the first char of
7182 multichar strings being returned. I kinda wonder
7183 if this makes sense as it does change the behaviour
7184 from earlier versions, OTOH that behaviour was broken
7185 as well. */
7186 UV v; /* value is register so we cant & it /grrr */
7187 if (reg_namedseq(pRExC_state, &v)) {
7188 goto parseit;
7189 }
7190 value= v;
7191 }
7192 break;
ffc61ed2
JH
7193 case 'p':
7194 case 'P':
3dab1dad
YO
7195 {
7196 char *e;
af6f566e 7197 if (RExC_parse >= RExC_end)
2a4859cd 7198 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7199 if (*RExC_parse == '{') {
1df70142 7200 const U8 c = (U8)value;
ffc61ed2
JH
7201 e = strchr(RExC_parse++, '}');
7202 if (!e)
0da60cf5 7203 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7204 while (isSPACE(UCHARAT(RExC_parse)))
7205 RExC_parse++;
7206 if (e == RExC_parse)
0da60cf5 7207 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7208 n = e - RExC_parse;
ab13f0c7
JH
7209 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7210 n--;
ffc61ed2
JH
7211 }
7212 else {
7213 e = RExC_parse;
7214 n = 1;
7215 }
7216 if (!SIZE_ONLY) {
ab13f0c7
JH
7217 if (UCHARAT(RExC_parse) == '^') {
7218 RExC_parse++;
7219 n--;
7220 value = value == 'p' ? 'P' : 'p'; /* toggle */
7221 while (isSPACE(UCHARAT(RExC_parse))) {
7222 RExC_parse++;
7223 n--;
7224 }
7225 }
097eb12c
AL
7226 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7227 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7228 }
7229 RExC_parse = e + 1;
7230 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7231 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7232 }
f81125e2 7233 break;
b8c5462f
JH
7234 case 'n': value = '\n'; break;
7235 case 'r': value = '\r'; break;
7236 case 't': value = '\t'; break;
7237 case 'f': value = '\f'; break;
7238 case 'b': value = '\b'; break;
c7f1f016
NIS
7239 case 'e': value = ASCII_TO_NATIVE('\033');break;
7240 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7241 case 'x':
ffc61ed2 7242 if (*RExC_parse == '{') {
a4c04bdc
NC
7243 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7244 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7245 char * const e = strchr(RExC_parse++, '}');
b81d288d 7246 if (!e)
ffc61ed2 7247 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7248
7249 numlen = e - RExC_parse;
7250 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7251 RExC_parse = e + 1;
7252 }
7253 else {
a4c04bdc 7254 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7255 numlen = 2;
7256 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7257 RExC_parse += numlen;
7258 }
9e08bc66
TS
7259 if (PL_encoding && value < 0x100)
7260 goto recode_encoding;
b8c5462f
JH
7261 break;
7262 case 'c':
830247a4 7263 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7264 value = toCTRL(value);
7265 break;
7266 case '0': case '1': case '2': case '3': case '4':
7267 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7268 {
7269 I32 flags = 0;
7270 numlen = 3;
7271 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7272 RExC_parse += numlen;
7273 if (PL_encoding && value < 0x100)
7274 goto recode_encoding;
7275 break;
7276 }
7277 recode_encoding:
7278 {
7279 SV* enc = PL_encoding;
7280 value = reg_recode((const char)(U8)value, &enc);
7281 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7282 vWARN(RExC_parse,
7283 "Invalid escape in the specified encoding");
7284 break;
7285 }
1028017a 7286 default:
041457d9 7287 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7288 vWARN2(RExC_parse,
7289 "Unrecognized escape \\%c in character class passed through",
7290 (int)value);
1028017a 7291 break;
b8c5462f 7292 }
ffc61ed2 7293 } /* end of \blah */
1b2d223b
JH
7294#ifdef EBCDIC
7295 else
7296 literal_endpoint++;
7297#endif
ffc61ed2
JH
7298
7299 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7300
7301 if (!SIZE_ONLY && !need_class)
936ed897 7302 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7303
936ed897 7304 need_class = 1;
ffc61ed2
JH
7305
7306 /* a bad range like a-\d, a-[:digit:] ? */
7307 if (range) {
73b437c8 7308 if (!SIZE_ONLY) {
afd78fd5 7309 if (ckWARN(WARN_REGEXP)) {
097eb12c 7310 const int w =
afd78fd5
JH
7311 RExC_parse >= rangebegin ?
7312 RExC_parse - rangebegin : 0;
830247a4 7313 vWARN4(RExC_parse,
b45f050a 7314 "False [] range \"%*.*s\"",
097eb12c 7315 w, w, rangebegin);
afd78fd5 7316 }
3568d838
JH
7317 if (prevvalue < 256) {
7318 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7319 ANYOF_BITMAP_SET(ret, '-');
7320 }
7321 else {
7322 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7323 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7324 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7325 }
b8c5462f 7326 }
ffc61ed2
JH
7327
7328 range = 0; /* this was not a true range */
73b437c8 7329 }
ffc61ed2 7330
89836f1f
YO
7331
7332
73b437c8 7333 if (!SIZE_ONLY) {
c49a72a9
NC
7334 const char *what = NULL;
7335 char yesno = 0;
7336
3568d838
JH
7337 if (namedclass > OOB_NAMEDCLASS)
7338 optimize_invert = FALSE;
e2962f66
JH
7339 /* Possible truncation here but in some 64-bit environments
7340 * the compiler gets heartburn about switch on 64-bit values.
7341 * A similar issue a little earlier when switching on value.
98f323fa 7342 * --jhi */
e2962f66 7343 switch ((I32)namedclass) {
89836f1f
YO
7344 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7345 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7346 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7347 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7348 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7349 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7350 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7351 case _C_C_T_(PRINT, isPRINT(value), "Print");
7352 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7353 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7354 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7355 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7356 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
73b437c8
JH
7357 case ANYOF_ASCII:
7358 if (LOC)
936ed897 7359 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7360 else {
c7f1f016 7361#ifndef EBCDIC
1ba5c669
JH
7362 for (value = 0; value < 128; value++)
7363 ANYOF_BITMAP_SET(ret, value);
7364#else /* EBCDIC */
ffbc6a93 7365 for (value = 0; value < 256; value++) {
3a3c4447
JH
7366 if (isASCII(value))
7367 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7368 }
1ba5c669 7369#endif /* EBCDIC */
73b437c8 7370 }
c49a72a9
NC
7371 yesno = '+';
7372 what = "ASCII";
73b437c8
JH
7373 break;
7374 case ANYOF_NASCII:
7375 if (LOC)
936ed897 7376 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7377 else {
c7f1f016 7378#ifndef EBCDIC
1ba5c669
JH
7379 for (value = 128; value < 256; value++)
7380 ANYOF_BITMAP_SET(ret, value);
7381#else /* EBCDIC */
ffbc6a93 7382 for (value = 0; value < 256; value++) {
3a3c4447
JH
7383 if (!isASCII(value))
7384 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7385 }
1ba5c669 7386#endif /* EBCDIC */
73b437c8 7387 }
c49a72a9
NC
7388 yesno = '!';
7389 what = "ASCII";
89836f1f 7390 break;
ffc61ed2
JH
7391 case ANYOF_DIGIT:
7392 if (LOC)
7393 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7394 else {
7395 /* consecutive digits assumed */
7396 for (value = '0'; value <= '9'; value++)
7397 ANYOF_BITMAP_SET(ret, value);
7398 }
c49a72a9
NC
7399 yesno = '+';
7400 what = "Digit";
ffc61ed2
JH
7401 break;
7402 case ANYOF_NDIGIT:
7403 if (LOC)
7404 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7405 else {
7406 /* consecutive digits assumed */
7407 for (value = 0; value < '0'; value++)
7408 ANYOF_BITMAP_SET(ret, value);
7409 for (value = '9' + 1; value < 256; value++)
7410 ANYOF_BITMAP_SET(ret, value);
7411 }
c49a72a9
NC
7412 yesno = '!';
7413 what = "Digit";
89836f1f 7414 break;
f81125e2
JP
7415 case ANYOF_MAX:
7416 /* this is to handle \p and \P */
7417 break;
73b437c8 7418 default:
b45f050a 7419 vFAIL("Invalid [::] class");
73b437c8 7420 break;
b8c5462f 7421 }
c49a72a9
NC
7422 if (what) {
7423 /* Strings such as "+utf8::isWord\n" */
7424 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7425 }
b8c5462f 7426 if (LOC)
936ed897 7427 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7428 continue;
a0d0e21e 7429 }
ffc61ed2
JH
7430 } /* end of namedclass \blah */
7431
a0d0e21e 7432 if (range) {
eb160463 7433 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7434 const int w = RExC_parse - rangebegin;
7435 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7436 range = 0; /* not a valid range */
73b437c8 7437 }
a0d0e21e
LW
7438 }
7439 else {
3568d838 7440 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7441 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7442 RExC_parse[1] != ']') {
7443 RExC_parse++;
ffc61ed2
JH
7444
7445 /* a bad range like \w-, [:word:]- ? */
7446 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7447 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7448 const int w =
afd78fd5
JH
7449 RExC_parse >= rangebegin ?
7450 RExC_parse - rangebegin : 0;
830247a4 7451 vWARN4(RExC_parse,
b45f050a 7452 "False [] range \"%*.*s\"",
097eb12c 7453 w, w, rangebegin);
afd78fd5 7454 }
73b437c8 7455 if (!SIZE_ONLY)
936ed897 7456 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7457 } else
ffc61ed2
JH
7458 range = 1; /* yeah, it's a range! */
7459 continue; /* but do it the next time */
a0d0e21e 7460 }
a687059c 7461 }
ffc61ed2 7462
93733859 7463 /* now is the next time */
07be1b83 7464 /*stored += (value - prevvalue + 1);*/
ae5c130c 7465 if (!SIZE_ONLY) {
3568d838 7466 if (prevvalue < 256) {
1df70142 7467 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7468 IV i;
3568d838 7469#ifdef EBCDIC
1b2d223b
JH
7470 /* In EBCDIC [\x89-\x91] should include
7471 * the \x8e but [i-j] should not. */
7472 if (literal_endpoint == 2 &&
7473 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7474 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7475 {
3568d838
JH
7476 if (isLOWER(prevvalue)) {
7477 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7478 if (isLOWER(i))
7479 ANYOF_BITMAP_SET(ret, i);
7480 } else {
3568d838 7481 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7482 if (isUPPER(i))
7483 ANYOF_BITMAP_SET(ret, i);
7484 }
8ada0baa 7485 }
ffc61ed2 7486 else
8ada0baa 7487#endif
07be1b83
YO
7488 for (i = prevvalue; i <= ceilvalue; i++) {
7489 if (!ANYOF_BITMAP_TEST(ret,i)) {
7490 stored++;
7491 ANYOF_BITMAP_SET(ret, i);
7492 }
7493 }
3568d838 7494 }
a5961de5 7495 if (value > 255 || UTF) {
1df70142
AL
7496 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7497 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7498 stored+=2; /* can't optimize this class */
ffc61ed2 7499 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7500 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7501 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7502 prevnatvalue, natvalue);
7503 }
7504 else if (prevnatvalue == natvalue) {
7505 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7506 if (FOLD) {
89ebb4a3 7507 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7508 STRLEN foldlen;
1df70142 7509 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7510
e294cc5d
JH
7511#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7512 if (RExC_precomp[0] == ':' &&
7513 RExC_precomp[1] == '[' &&
7514 (f == 0xDF || f == 0x92)) {
7515 f = NATIVE_TO_UNI(f);
7516 }
7517#endif
c840d2a2
JH
7518 /* If folding and foldable and a single
7519 * character, insert also the folded version
7520 * to the charclass. */
9e55ce06 7521 if (f != value) {
e294cc5d
JH
7522#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7523 if ((RExC_precomp[0] == ':' &&
7524 RExC_precomp[1] == '[' &&
7525 (f == 0xA2 &&
7526 (value == 0xFB05 || value == 0xFB06))) ?
7527 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7528 foldlen == (STRLEN)UNISKIP(f) )
7529#else
eb160463 7530 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7531#endif
9e55ce06
JH
7532 Perl_sv_catpvf(aTHX_ listsv,
7533 "%04"UVxf"\n", f);
7534 else {
7535 /* Any multicharacter foldings
7536 * require the following transform:
7537 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7538 * where E folds into "pq" and F folds
7539 * into "rst", all other characters
7540 * fold to single characters. We save
7541 * away these multicharacter foldings,
7542 * to be later saved as part of the
7543 * additional "s" data. */
7544 SV *sv;
7545
7546 if (!unicode_alternate)
7547 unicode_alternate = newAV();
7548 sv = newSVpvn((char*)foldbuf, foldlen);
7549 SvUTF8_on(sv);
7550 av_push(unicode_alternate, sv);
7551 }
7552 }
254ba52a 7553
60a8b682
JH
7554 /* If folding and the value is one of the Greek
7555 * sigmas insert a few more sigmas to make the
7556 * folding rules of the sigmas to work right.
7557 * Note that not all the possible combinations
7558 * are handled here: some of them are handled
9e55ce06
JH
7559 * by the standard folding rules, and some of
7560 * them (literal or EXACTF cases) are handled
7561 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7562 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7563 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7564 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7565 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7566 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7567 }
7568 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7569 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7570 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7571 }
7572 }
ffc61ed2 7573 }
1b2d223b
JH
7574#ifdef EBCDIC
7575 literal_endpoint = 0;
7576#endif
8ada0baa 7577 }
ffc61ed2
JH
7578
7579 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7580 }
ffc61ed2 7581
936ed897 7582 if (need_class) {
4f66b38d 7583 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7584 if (SIZE_ONLY)
830247a4 7585 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7586 else
830247a4 7587 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7588 }
ffc61ed2 7589
7f6f358c
YO
7590
7591 if (SIZE_ONLY)
7592 return ret;
7593 /****** !SIZE_ONLY AFTER HERE *********/
7594
7595 if( stored == 1 && value < 256
7596 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7597 ) {
7598 /* optimize single char class to an EXACT node
7599 but *only* when its not a UTF/high char */
07be1b83
YO
7600 const char * cur_parse= RExC_parse;
7601 RExC_emit = (regnode *)orig_emit;
7602 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7603 ret = reg_node(pRExC_state,
7604 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7605 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7606 *STRING(ret)= (char)value;
7607 STR_LEN(ret)= 1;
7608 RExC_emit += STR_SZ(1);
7609 return ret;
7610 }
ae5c130c 7611 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7612 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7613 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7614 ) {
a0ed51b3 7615 for (value = 0; value < 256; ++value) {
936ed897 7616 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7617 UV fold = PL_fold[value];
ffc61ed2
JH
7618
7619 if (fold != value)
7620 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7621 }
7622 }
936ed897 7623 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7624 }
ffc61ed2 7625
ae5c130c 7626 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7627 if (optimize_invert &&
ffc61ed2
JH
7628 /* If the only flag is inversion. */
7629 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7630 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7631 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7632 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7633 }
7f6f358c 7634 {
097eb12c 7635 AV * const av = newAV();
ffc61ed2 7636 SV *rv;
9e55ce06 7637 /* The 0th element stores the character class description
6a0407ee 7638 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7639 * to initialize the appropriate swash (which gets stored in
7640 * the 1st element), and also useful for dumping the regnode.
7641 * The 2nd element stores the multicharacter foldings,
6a0407ee 7642 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7643 av_store(av, 0, listsv);
7644 av_store(av, 1, NULL);
9e55ce06 7645 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7646 rv = newRV_noinc((SV*)av);
19860706 7647 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7648 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7649 ARG_SET(ret, n);
a0ed51b3 7650 }
a0ed51b3
LW
7651 return ret;
7652}
89836f1f
YO
7653#undef _C_C_T_
7654
a0ed51b3 7655
76e3520e 7656STATIC char*
830247a4 7657S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7658{
097eb12c 7659 char* const retval = RExC_parse++;
a0d0e21e 7660
4633a7c4 7661 for (;;) {
830247a4
IZ
7662 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7663 RExC_parse[2] == '#') {
e994fd66
AE
7664 while (*RExC_parse != ')') {
7665 if (RExC_parse == RExC_end)
7666 FAIL("Sequence (?#... not terminated");
830247a4 7667 RExC_parse++;
e994fd66 7668 }
830247a4 7669 RExC_parse++;
4633a7c4
LW
7670 continue;
7671 }
bbe252da 7672 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7673 if (isSPACE(*RExC_parse)) {
7674 RExC_parse++;
748a9306
LW
7675 continue;
7676 }
830247a4 7677 else if (*RExC_parse == '#') {
e994fd66
AE
7678 while (RExC_parse < RExC_end)
7679 if (*RExC_parse++ == '\n') break;
748a9306
LW
7680 continue;
7681 }
748a9306 7682 }
4633a7c4 7683 return retval;
a0d0e21e 7684 }
a687059c
LW
7685}
7686
7687/*
c277df42 7688- reg_node - emit a node
a0d0e21e 7689*/
76e3520e 7690STATIC regnode * /* Location. */
830247a4 7691S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7692{
97aff369 7693 dVAR;
c277df42 7694 register regnode *ptr;
504618e9 7695 regnode * const ret = RExC_emit;
07be1b83 7696 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7697
c277df42 7698 if (SIZE_ONLY) {
830247a4
IZ
7699 SIZE_ALIGN(RExC_size);
7700 RExC_size += 1;
a0d0e21e
LW
7701 return(ret);
7702 }
e2e6a0f1
YO
7703#ifdef DEBUGGING
7704 if (OP(RExC_emit) == 255)
7705 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7706 reg_name[op], OP(RExC_emit));
7707#endif
c277df42 7708 NODE_ALIGN_FILL(ret);
a0d0e21e 7709 ptr = ret;
c277df42 7710 FILL_ADVANCE_NODE(ptr, op);
fac92740 7711 if (RExC_offsets) { /* MJD */
07be1b83 7712 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7713 "reg_node", __LINE__,
7714 reg_name[op],
07be1b83
YO
7715 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7716 ? "Overwriting end of array!\n" : "OK",
7717 (UV)(RExC_emit - RExC_emit_start),
7718 (UV)(RExC_parse - RExC_start),
7719 (UV)RExC_offsets[0]));
ccb2c380 7720 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7721 }
07be1b83 7722
830247a4 7723 RExC_emit = ptr;
a0d0e21e 7724 return(ret);
a687059c
LW
7725}
7726
7727/*
a0d0e21e
LW
7728- reganode - emit a node with an argument
7729*/
76e3520e 7730STATIC regnode * /* Location. */
830247a4 7731S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7732{
97aff369 7733 dVAR;
c277df42 7734 register regnode *ptr;
504618e9 7735 regnode * const ret = RExC_emit;
07be1b83 7736 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7737
c277df42 7738 if (SIZE_ONLY) {
830247a4
IZ
7739 SIZE_ALIGN(RExC_size);
7740 RExC_size += 2;
6bda09f9
YO
7741 /*
7742 We can't do this:
7743
7744 assert(2==regarglen[op]+1);
7745
7746 Anything larger than this has to allocate the extra amount.
7747 If we changed this to be:
7748
7749 RExC_size += (1 + regarglen[op]);
7750
7751 then it wouldn't matter. Its not clear what side effect
7752 might come from that so its not done so far.
7753 -- dmq
7754 */
a0d0e21e
LW
7755 return(ret);
7756 }
e2e6a0f1
YO
7757#ifdef DEBUGGING
7758 if (OP(RExC_emit) == 255)
7759 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7760#endif
c277df42 7761 NODE_ALIGN_FILL(ret);
a0d0e21e 7762 ptr = ret;
c277df42 7763 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7764 if (RExC_offsets) { /* MJD */
07be1b83 7765 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7766 "reganode",
ccb2c380
MP
7767 __LINE__,
7768 reg_name[op],
07be1b83 7769 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7770 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7771 (UV)(RExC_emit - RExC_emit_start),
7772 (UV)(RExC_parse - RExC_start),
7773 (UV)RExC_offsets[0]));
ccb2c380 7774 Set_Cur_Node_Offset;
fac92740
MJD
7775 }
7776
830247a4 7777 RExC_emit = ptr;
a0d0e21e 7778 return(ret);
fe14fcc3
LW
7779}
7780
7781/*
cd439c50 7782- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7783*/
71207a34
AL
7784STATIC STRLEN
7785S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7786{
97aff369 7787 dVAR;
71207a34 7788 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7789}
7790
7791/*
a0d0e21e
LW
7792- reginsert - insert an operator in front of already-emitted operand
7793*
7794* Means relocating the operand.
7795*/
76e3520e 7796STATIC void
6bda09f9 7797S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7798{
97aff369 7799 dVAR;
c277df42
IZ
7800 register regnode *src;
7801 register regnode *dst;
7802 register regnode *place;
504618e9 7803 const int offset = regarglen[(U8)op];
6bda09f9 7804 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7805 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7806/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7807 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7808 if (SIZE_ONLY) {
6bda09f9 7809 RExC_size += size;
a0d0e21e
LW
7810 return;
7811 }
a687059c 7812
830247a4 7813 src = RExC_emit;
6bda09f9 7814 RExC_emit += size;
830247a4 7815 dst = RExC_emit;
40d049e4 7816 if (RExC_open_parens) {
6bda09f9 7817 int paren;
6d99fb9b 7818 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7819 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7820 if ( RExC_open_parens[paren] >= opnd ) {
7821 DEBUG_PARSE_FMT("open"," - %d",size);
7822 RExC_open_parens[paren] += size;
7823 } else {
7824 DEBUG_PARSE_FMT("open"," - %s","ok");
7825 }
7826 if ( RExC_close_parens[paren] >= opnd ) {
7827 DEBUG_PARSE_FMT("close"," - %d",size);
7828 RExC_close_parens[paren] += size;
7829 } else {
7830 DEBUG_PARSE_FMT("close"," - %s","ok");
7831 }
7832 }
6bda09f9 7833 }
40d049e4 7834
fac92740 7835 while (src > opnd) {
c277df42 7836 StructCopy(--src, --dst, regnode);
fac92740 7837 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7838 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7839 "reg_insert",
ccb2c380
MP
7840 __LINE__,
7841 reg_name[op],
07be1b83
YO
7842 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7843 ? "Overwriting end of array!\n" : "OK",
7844 (UV)(src - RExC_emit_start),
7845 (UV)(dst - RExC_emit_start),
7846 (UV)RExC_offsets[0]));
ccb2c380
MP
7847 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7848 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7849 }
7850 }
7851
a0d0e21e
LW
7852
7853 place = opnd; /* Op node, where operand used to be. */
fac92740 7854 if (RExC_offsets) { /* MJD */
07be1b83 7855 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7856 "reginsert",
ccb2c380
MP
7857 __LINE__,
7858 reg_name[op],
07be1b83 7859 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7860 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7861 (UV)(place - RExC_emit_start),
7862 (UV)(RExC_parse - RExC_start),
786e8c11 7863 (UV)RExC_offsets[0]));
ccb2c380 7864 Set_Node_Offset(place, RExC_parse);
45948336 7865 Set_Node_Length(place, 1);
fac92740 7866 }
c277df42
IZ
7867 src = NEXTOPER(place);
7868 FILL_ADVANCE_NODE(place, op);
7869 Zero(src, offset, regnode);
a687059c
LW
7870}
7871
7872/*
c277df42 7873- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7874- SEE ALSO: regtail_study
a0d0e21e 7875*/
097eb12c 7876/* TODO: All three parms should be const */
76e3520e 7877STATIC void
3dab1dad 7878S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7879{
97aff369 7880 dVAR;
c277df42 7881 register regnode *scan;
72f13be8 7882 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7883#ifndef DEBUGGING
7884 PERL_UNUSED_ARG(depth);
7885#endif
a0d0e21e 7886
c277df42 7887 if (SIZE_ONLY)
a0d0e21e
LW
7888 return;
7889
7890 /* Find last node. */
7891 scan = p;
7892 for (;;) {
504618e9 7893 regnode * const temp = regnext(scan);
3dab1dad
YO
7894 DEBUG_PARSE_r({
7895 SV * const mysv=sv_newmortal();
7896 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7897 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7898 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7899 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7900 (temp == NULL ? "->" : ""),
7901 (temp == NULL ? reg_name[OP(val)] : "")
7902 );
3dab1dad
YO
7903 });
7904 if (temp == NULL)
7905 break;
7906 scan = temp;
7907 }
7908
7909 if (reg_off_by_arg[OP(scan)]) {
7910 ARG_SET(scan, val - scan);
7911 }
7912 else {
7913 NEXT_OFF(scan) = val - scan;
7914 }
7915}
7916
07be1b83 7917#ifdef DEBUGGING
3dab1dad
YO
7918/*
7919- regtail_study - set the next-pointer at the end of a node chain of p to val.
7920- Look for optimizable sequences at the same time.
7921- currently only looks for EXACT chains.
07be1b83
YO
7922
7923This is expermental code. The idea is to use this routine to perform
7924in place optimizations on branches and groups as they are constructed,
7925with the long term intention of removing optimization from study_chunk so
7926that it is purely analytical.
7927
7928Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7929to control which is which.
7930
3dab1dad
YO
7931*/
7932/* TODO: All four parms should be const */
07be1b83 7933
3dab1dad
YO
7934STATIC U8
7935S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7936{
7937 dVAR;
7938 register regnode *scan;
07be1b83
YO
7939 U8 exact = PSEUDO;
7940#ifdef EXPERIMENTAL_INPLACESCAN
7941 I32 min = 0;
7942#endif
7943
3dab1dad
YO
7944 GET_RE_DEBUG_FLAGS_DECL;
7945
07be1b83 7946
3dab1dad
YO
7947 if (SIZE_ONLY)
7948 return exact;
7949
7950 /* Find last node. */
7951
7952 scan = p;
7953 for (;;) {
7954 regnode * const temp = regnext(scan);
07be1b83
YO
7955#ifdef EXPERIMENTAL_INPLACESCAN
7956 if (PL_regkind[OP(scan)] == EXACT)
7957 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7958 return EXACT;
7959#endif
3dab1dad
YO
7960 if ( exact ) {
7961 switch (OP(scan)) {
7962 case EXACT:
7963 case EXACTF:
7964 case EXACTFL:
7965 if( exact == PSEUDO )
7966 exact= OP(scan);
07be1b83
YO
7967 else if ( exact != OP(scan) )
7968 exact= 0;
3dab1dad
YO
7969 case NOTHING:
7970 break;
7971 default:
7972 exact= 0;
7973 }
7974 }
7975 DEBUG_PARSE_r({
7976 SV * const mysv=sv_newmortal();
7977 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7978 regprop(RExC_rx, mysv, scan);
eaf3ca90 7979 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7980 SvPV_nolen_const(mysv),
eaf3ca90
YO
7981 REG_NODE_NUM(scan),
7982 reg_name[exact]);
3dab1dad 7983 });
a0d0e21e
LW
7984 if (temp == NULL)
7985 break;
7986 scan = temp;
7987 }
07be1b83
YO
7988 DEBUG_PARSE_r({
7989 SV * const mysv_val=sv_newmortal();
7990 DEBUG_PARSE_MSG("");
7991 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
7992 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
7993 SvPV_nolen_const(mysv_val),
7994 (IV)REG_NODE_NUM(val),
7995 (IV)(val - scan)
07be1b83
YO
7996 );
7997 });
c277df42
IZ
7998 if (reg_off_by_arg[OP(scan)]) {
7999 ARG_SET(scan, val - scan);
a0ed51b3
LW
8000 }
8001 else {
c277df42
IZ
8002 NEXT_OFF(scan) = val - scan;
8003 }
3dab1dad
YO
8004
8005 return exact;
a687059c 8006}
07be1b83 8007#endif
a687059c
LW
8008
8009/*
a687059c
LW
8010 - regcurly - a little FSA that accepts {\d+,?\d*}
8011 */
79072805 8012STATIC I32
5f66b61c 8013S_regcurly(register const char *s)
a687059c
LW
8014{
8015 if (*s++ != '{')
8016 return FALSE;
f0fcb552 8017 if (!isDIGIT(*s))
a687059c 8018 return FALSE;
f0fcb552 8019 while (isDIGIT(*s))
a687059c
LW
8020 s++;
8021 if (*s == ',')
8022 s++;
f0fcb552 8023 while (isDIGIT(*s))
a687059c
LW
8024 s++;
8025 if (*s != '}')
8026 return FALSE;
8027 return TRUE;
8028}
8029
a687059c
LW
8030
8031/*
fd181c75 8032 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8033 */
8034void
097eb12c 8035Perl_regdump(pTHX_ const regexp *r)
a687059c 8036{
35ff7856 8037#ifdef DEBUGGING
97aff369 8038 dVAR;
c445ea15 8039 SV * const sv = sv_newmortal();
ab3bbdeb 8040 SV *dsv= sv_newmortal();
f8fc2ecf 8041 RXi_GET_DECL(r,ri);
a687059c 8042
f8fc2ecf 8043 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8044
8045 /* Header fields of interest. */
ab3bbdeb
YO
8046 if (r->anchored_substr) {
8047 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8048 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8049 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8050 "anchored %s%s at %"IVdf" ",
8051 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8052 (IV)r->anchored_offset);
ab3bbdeb
YO
8053 } else if (r->anchored_utf8) {
8054 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8055 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8056 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8057 "anchored utf8 %s%s at %"IVdf" ",
8058 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8059 (IV)r->anchored_offset);
ab3bbdeb
YO
8060 }
8061 if (r->float_substr) {
8062 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8063 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8064 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8065 "floating %s%s at %"IVdf"..%"UVuf" ",
8066 s, RE_SV_TAIL(r->float_substr),
7b0972df 8067 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8068 } else if (r->float_utf8) {
8069 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8070 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8071 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8072 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8073 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8074 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8075 }
33b8afdf 8076 if (r->check_substr || r->check_utf8)
b81d288d 8077 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8078 (const char *)
8079 (r->check_substr == r->float_substr
8080 && r->check_utf8 == r->float_utf8
8081 ? "(checking floating" : "(checking anchored"));
bbe252da 8082 if (r->extflags & RXf_NOSCAN)
c277df42 8083 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8084 if (r->extflags & RXf_CHECK_ALL)
c277df42 8085 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8086 if (r->check_substr || r->check_utf8)
c277df42
IZ
8087 PerlIO_printf(Perl_debug_log, ") ");
8088
f8fc2ecf
YO
8089 if (ri->regstclass) {
8090 regprop(r, sv, ri->regstclass);
1de06328 8091 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8092 }
bbe252da 8093 if (r->extflags & RXf_ANCH) {
774d564b 8094 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8095 if (r->extflags & RXf_ANCH_BOL)
774d564b 8096 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8097 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8098 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8099 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8100 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8101 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8102 PerlIO_printf(Perl_debug_log, "(GPOS)");
8103 PerlIO_putc(Perl_debug_log, ' ');
8104 }
bbe252da 8105 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8106 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8107 if (r->intflags & PREGf_SKIP)
760ac839 8108 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8109 if (r->intflags & PREGf_IMPLICIT)
760ac839 8110 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8111 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8112 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8113 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8114 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8115#else
96a5add6 8116 PERL_UNUSED_CONTEXT;
65e66c80 8117 PERL_UNUSED_ARG(r);
17c3b450 8118#endif /* DEBUGGING */
a687059c
LW
8119}
8120
8121/*
a0d0e21e
LW
8122- regprop - printable representation of opcode
8123*/
46fc3d4c 8124void
32fc9b6a 8125Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8126{
35ff7856 8127#ifdef DEBUGGING
97aff369 8128 dVAR;
9b155405 8129 register int k;
f8fc2ecf 8130 RXi_GET_DECL(prog,progi);
1de06328 8131 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8132
a0d0e21e 8133
54dc92de 8134 sv_setpvn(sv, "", 0);
8aa23a47 8135
03363afd 8136 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8137 /* It would be nice to FAIL() here, but this may be called from
8138 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8139 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8140 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8141
3dab1dad 8142 k = PL_regkind[OP(o)];
9b155405 8143
2a782b5b 8144 if (k == EXACT) {
396482e1 8145 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8146 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8147 * is a crude hack but it may be the best for now since
8148 * we have no flag "this EXACTish node was UTF-8"
8149 * --jhi */
8150 const char * const s =
ddc5bc0f 8151 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8152 PL_colors[0], PL_colors[1],
8153 PERL_PV_ESCAPE_UNI_DETECT |
8154 PERL_PV_PRETTY_ELIPSES |
8155 PERL_PV_PRETTY_LTGT
8156 );
8157 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8158 } else if (k == TRIE) {
3dab1dad 8159 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8160 * progi->data isn't available here */
1de06328 8161 const char op = OP(o);
647f639f 8162 const U32 n = ARG(o);
1de06328 8163 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8164 (reg_ac_data *)progi->data->data[n] :
1de06328 8165 NULL;
3251b653
NC
8166 const reg_trie_data * const trie
8167 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8168
8169 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8170 DEBUG_TRIE_COMPILE_r(
8171 Perl_sv_catpvf(aTHX_ sv,
8172 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8173 (UV)trie->startstate,
1e2e3d02 8174 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8175 (UV)trie->wordcount,
8176 (UV)trie->minlen,
8177 (UV)trie->maxlen,
8178 (UV)TRIE_CHARCOUNT(trie),
8179 (UV)trie->uniquecharcount
8180 )
8181 );
8182 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8183 int i;
8184 int rangestart = -1;
f46cb337 8185 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8186 Perl_sv_catpvf(aTHX_ sv, "[");
8187 for (i = 0; i <= 256; i++) {
8188 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8189 if (rangestart == -1)
8190 rangestart = i;
8191 } else if (rangestart != -1) {
8192 if (i <= rangestart + 3)
8193 for (; rangestart < i; rangestart++)
8194 put_byte(sv, rangestart);
8195 else {
8196 put_byte(sv, rangestart);
8197 sv_catpvs(sv, "-");
8198 put_byte(sv, i - 1);
8199 }
8200 rangestart = -1;
8201 }
8202 }
8203 Perl_sv_catpvf(aTHX_ sv, "]");
8204 }
8205
a3621e74 8206 } else if (k == CURLY) {
cb434fcc 8207 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8208 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8209 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8210 }
2c2d71f5
JH
8211 else if (k == WHILEM && o->flags) /* Ordinal/of */
8212 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 8213 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 8214 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1f1031fe
YO
8215 if ( prog->paren_names ) {
8216 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8217 SV **name= av_fetch(list, ARG(o), 0 );
8218 if (name)
be2597df 8219 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe
YO
8220 }
8221 } else if (k == NREF) {
8222 if ( prog->paren_names ) {
8223 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8224 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8225 I32 *nums=(I32*)SvPVX(sv_dat);
8226 SV **name= av_fetch(list, nums[0], 0 );
8227 I32 n;
8228 if (name) {
8229 for ( n=0; n<SvIVX(sv_dat); n++ ) {
714c8e96
JH
8230 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8231 (n ? "," : ""), (IV)nums[n]);
1f1031fe 8232 }
be2597df 8233 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe
YO
8234 }
8235 }
8236 } else if (k == GOSUB)
6bda09f9 8237 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8238 else if (k == VERB) {
8239 if (!o->flags)
8240 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
be2597df 8241 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
e2e6a0f1 8242 } else if (k == LOGICAL)
04ebc1ab 8243 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8244 else if (k == ANYOF) {
8245 int i, rangestart = -1;
2d03de9c 8246 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8247
8248 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8249 static const char * const anyofs[] = {
653099ff
GS
8250 "\\w",
8251 "\\W",
8252 "\\s",
8253 "\\S",
8254 "\\d",
8255 "\\D",
8256 "[:alnum:]",
8257 "[:^alnum:]",
8258 "[:alpha:]",
8259 "[:^alpha:]",
8260 "[:ascii:]",
8261 "[:^ascii:]",
8262 "[:ctrl:]",
8263 "[:^ctrl:]",
8264 "[:graph:]",
8265 "[:^graph:]",
8266 "[:lower:]",
8267 "[:^lower:]",
8268 "[:print:]",
8269 "[:^print:]",
8270 "[:punct:]",
8271 "[:^punct:]",
8272 "[:upper:]",
aaa51d5e 8273 "[:^upper:]",
653099ff 8274 "[:xdigit:]",
aaa51d5e
JF
8275 "[:^xdigit:]",
8276 "[:space:]",
8277 "[:^space:]",
8278 "[:blank:]",
8279 "[:^blank:]"
653099ff
GS
8280 };
8281
19860706 8282 if (flags & ANYOF_LOCALE)
396482e1 8283 sv_catpvs(sv, "{loc}");
19860706 8284 if (flags & ANYOF_FOLD)
396482e1 8285 sv_catpvs(sv, "{i}");
653099ff 8286 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8287 if (flags & ANYOF_INVERT)
396482e1 8288 sv_catpvs(sv, "^");
ffc61ed2
JH
8289 for (i = 0; i <= 256; i++) {
8290 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8291 if (rangestart == -1)
8292 rangestart = i;
8293 } else if (rangestart != -1) {
8294 if (i <= rangestart + 3)
8295 for (; rangestart < i; rangestart++)
653099ff 8296 put_byte(sv, rangestart);
ffc61ed2
JH
8297 else {
8298 put_byte(sv, rangestart);
396482e1 8299 sv_catpvs(sv, "-");
ffc61ed2 8300 put_byte(sv, i - 1);
653099ff 8301 }
ffc61ed2 8302 rangestart = -1;
653099ff 8303 }
847a199f 8304 }
ffc61ed2
JH
8305
8306 if (o->flags & ANYOF_CLASS)
bb7a0f54 8307 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8308 if (ANYOF_CLASS_TEST(o,i))
8309 sv_catpv(sv, anyofs[i]);
8310
8311 if (flags & ANYOF_UNICODE)
396482e1 8312 sv_catpvs(sv, "{unicode}");
1aa99e6b 8313 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8314 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8315
8316 {
8317 SV *lv;
32fc9b6a 8318 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8319
ffc61ed2
JH
8320 if (lv) {
8321 if (sw) {
89ebb4a3 8322 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8323
ffc61ed2 8324 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8325 uvchr_to_utf8(s, i);
ffc61ed2 8326
3568d838 8327 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8328 if (rangestart == -1)
8329 rangestart = i;
8330 } else if (rangestart != -1) {
ffc61ed2
JH
8331 if (i <= rangestart + 3)
8332 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8333 const U8 * const e = uvchr_to_utf8(s,rangestart);
8334 U8 *p;
8335 for(p = s; p < e; p++)
ffc61ed2
JH
8336 put_byte(sv, *p);
8337 }
8338 else {
2d03de9c
AL
8339 const U8 *e = uvchr_to_utf8(s,rangestart);
8340 U8 *p;
8341 for (p = s; p < e; p++)
ffc61ed2 8342 put_byte(sv, *p);
396482e1 8343 sv_catpvs(sv, "-");
2d03de9c
AL
8344 e = uvchr_to_utf8(s, i-1);
8345 for (p = s; p < e; p++)
1df70142 8346 put_byte(sv, *p);
ffc61ed2
JH
8347 }
8348 rangestart = -1;
8349 }
19860706 8350 }
ffc61ed2 8351
396482e1 8352 sv_catpvs(sv, "..."); /* et cetera */
19860706 8353 }
fde631ed 8354
ffc61ed2 8355 {
2e0de35c 8356 char *s = savesvpv(lv);
c445ea15 8357 char * const origs = s;
b81d288d 8358
3dab1dad
YO
8359 while (*s && *s != '\n')
8360 s++;
b81d288d 8361
ffc61ed2 8362 if (*s == '\n') {
2d03de9c 8363 const char * const t = ++s;
ffc61ed2
JH
8364
8365 while (*s) {
8366 if (*s == '\n')
8367 *s = ' ';
8368 s++;
8369 }
8370 if (s[-1] == ' ')
8371 s[-1] = 0;
8372
8373 sv_catpv(sv, t);
fde631ed 8374 }
b81d288d 8375
ffc61ed2 8376 Safefree(origs);
fde631ed
JH
8377 }
8378 }
653099ff 8379 }
ffc61ed2 8380
653099ff
GS
8381 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8382 }
9b155405 8383 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8384 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8385#else
96a5add6 8386 PERL_UNUSED_CONTEXT;
65e66c80
SP
8387 PERL_UNUSED_ARG(sv);
8388 PERL_UNUSED_ARG(o);
f9049ba1 8389 PERL_UNUSED_ARG(prog);
17c3b450 8390#endif /* DEBUGGING */
35ff7856 8391}
a687059c 8392
cad2e5aa
JH
8393SV *
8394Perl_re_intuit_string(pTHX_ regexp *prog)
8395{ /* Assume that RE_INTUIT is set */
97aff369 8396 dVAR;
a3621e74 8397 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8398 PERL_UNUSED_CONTEXT;
8399
a3621e74 8400 DEBUG_COMPILE_r(
cfd0369c 8401 {
2d03de9c 8402 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8403 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8404
8405 if (!PL_colorset) reginitcolors();
8406 PerlIO_printf(Perl_debug_log,
a0288114 8407 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8408 PL_colors[4],
8409 prog->check_substr ? "" : "utf8 ",
8410 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8411 s,
8412 PL_colors[1],
8413 (strlen(s) > 60 ? "..." : ""));
8414 } );
8415
33b8afdf 8416 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8417}
8418
84da74a7 8419/*
f8149455 8420 pregfree()
84da74a7 8421
f8149455
YO
8422 handles refcounting and freeing the perl core regexp structure. When
8423 it is necessary to actually free the structure the first thing it
8424 does is call the 'free' method of the regexp_engine associated to to
8425 the regexp, allowing the handling of the void *pprivate; member
8426 first. (This routine is not overridable by extensions, which is why
8427 the extensions free is called first.)
8428
8429 See regdupe and regdupe_internal if you change anything here.
84da74a7 8430*/
f8149455 8431#ifndef PERL_IN_XSUB_RE
2b69d0c2 8432void
864dbfa3 8433Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8434{
27da23d5 8435 dVAR;
fc32ee4a 8436 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8437
7821416a
IZ
8438 if (!r || (--r->refcnt > 0))
8439 return;
f8149455
YO
8440
8441 CALLREGFREE_PVT(r); /* free the private data */
8442
43c5f42d
NC
8443 /* gcov results gave these as non-null 100% of the time, so there's no
8444 optimisation in checking them before calling Safefree */
8445 Safefree(r->precomp);
ed252734 8446 RX_MATCH_COPY_FREE(r);
f8c7b90f 8447#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8448 if (r->saved_copy)
8449 SvREFCNT_dec(r->saved_copy);
8450#endif
a193d654
GS
8451 if (r->substrs) {
8452 if (r->anchored_substr)
8453 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8454 if (r->anchored_utf8)
8455 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8456 if (r->float_substr)
8457 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8458 if (r->float_utf8)
8459 SvREFCNT_dec(r->float_utf8);
2779dcf1 8460 Safefree(r->substrs);
a193d654 8461 }
81714fb9
YO
8462 if (r->paren_names)
8463 SvREFCNT_dec(r->paren_names);
f8149455
YO
8464
8465 Safefree(r->startp);
8466 Safefree(r->endp);
8467 Safefree(r);
8468}
8469#endif
8470
8471/* regfree_internal()
8472
8473 Free the private data in a regexp. This is overloadable by
8474 extensions. Perl takes care of the regexp structure in pregfree(),
8475 this covers the *pprivate pointer which technically perldoesnt
8476 know about, however of course we have to handle the
8477 regexp_internal structure when no extension is in use.
8478
8479 Note this is called before freeing anything in the regexp
8480 structure.
8481 */
8482
8483void
8484Perl_regfree_internal(pTHX_ struct regexp *r)
8485{
8486 dVAR;
8487 RXi_GET_DECL(r,ri);
8488 GET_RE_DEBUG_FLAGS_DECL;
8489
8490 DEBUG_COMPILE_r({
8491 if (!PL_colorset)
8492 reginitcolors();
8493 {
8494 SV *dsv= sv_newmortal();
8495 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8496 dsv, r->precomp, r->prelen, 60);
8497 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8498 PL_colors[4],PL_colors[5],s);
8499 }
8500 });
8501
8502 Safefree(ri->offsets); /* 20010421 MJD */
f8fc2ecf
YO
8503 if (ri->data) {
8504 int n = ri->data->count;
f3548bdc
DM
8505 PAD* new_comppad = NULL;
8506 PAD* old_comppad;
4026c95a 8507 PADOFFSET refcnt;
dfad63ad 8508
c277df42 8509 while (--n >= 0) {
261faec3 8510 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8511 switch (ri->data->what[n]) {
c277df42 8512 case 's':
81714fb9 8513 case 'S':
55eed653 8514 case 'u':
f8fc2ecf 8515 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8516 break;
653099ff 8517 case 'f':
f8fc2ecf 8518 Safefree(ri->data->data[n]);
653099ff 8519 break;
dfad63ad 8520 case 'p':
f8fc2ecf 8521 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8522 break;
c277df42 8523 case 'o':
dfad63ad 8524 if (new_comppad == NULL)
cea2e8a9 8525 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8526 PAD_SAVE_LOCAL(old_comppad,
8527 /* Watch out for global destruction's random ordering. */
c445ea15 8528 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8529 );
b34c0dd4 8530 OP_REFCNT_LOCK;
f8fc2ecf 8531 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8532 OP_REFCNT_UNLOCK;
8533 if (!refcnt)
f8fc2ecf 8534 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8535
f3548bdc 8536 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8537 SvREFCNT_dec((SV*)new_comppad);
8538 new_comppad = NULL;
c277df42
IZ
8539 break;
8540 case 'n':
9e55ce06 8541 break;
07be1b83 8542 case 'T':
be8e71aa
YO
8543 { /* Aho Corasick add-on structure for a trie node.
8544 Used in stclass optimization only */
07be1b83 8545 U32 refcount;
f8fc2ecf 8546 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8547 OP_REFCNT_LOCK;
8548 refcount = --aho->refcount;
8549 OP_REFCNT_UNLOCK;
8550 if ( !refcount ) {
446bd890
NC
8551 PerlMemShared_free(aho->states);
8552 PerlMemShared_free(aho->fail);
446bd890
NC
8553 /* do this last!!!! */
8554 PerlMemShared_free(ri->data->data[n]);
8555 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8556 }
8557 }
8558 break;
a3621e74 8559 case 't':
07be1b83 8560 {
be8e71aa 8561 /* trie structure. */
07be1b83 8562 U32 refcount;
f8fc2ecf 8563 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8564 OP_REFCNT_LOCK;
8565 refcount = --trie->refcount;
8566 OP_REFCNT_UNLOCK;
8567 if ( !refcount ) {
446bd890 8568 PerlMemShared_free(trie->charmap);
446bd890
NC
8569 PerlMemShared_free(trie->states);
8570 PerlMemShared_free(trie->trans);
07be1b83 8571 if (trie->bitmap)
446bd890 8572 PerlMemShared_free(trie->bitmap);
07be1b83 8573 if (trie->wordlen)
446bd890 8574 PerlMemShared_free(trie->wordlen);
786e8c11 8575 if (trie->jump)
446bd890 8576 PerlMemShared_free(trie->jump);
786e8c11 8577 if (trie->nextword)
446bd890 8578 PerlMemShared_free(trie->nextword);
446bd890
NC
8579 /* do this last!!!! */
8580 PerlMemShared_free(ri->data->data[n]);
a3621e74 8581 }
07be1b83
YO
8582 }
8583 break;
c277df42 8584 default:
f8fc2ecf 8585 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8586 }
8587 }
f8fc2ecf
YO
8588 Safefree(ri->data->what);
8589 Safefree(ri->data);
a0d0e21e 8590 }
f8fc2ecf
YO
8591 if (ri->swap) {
8592 Safefree(ri->swap->startp);
8593 Safefree(ri->swap->endp);
8594 Safefree(ri->swap);
c74340f9 8595 }
f8fc2ecf 8596 Safefree(ri);
a687059c 8597}
c277df42 8598
84da74a7
YO
8599#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8600#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8601#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8602#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8603
8604/*
8605 regdupe - duplicate a regexp.
8606
8607 This routine is called by sv.c's re_dup and is expected to clone a
8608 given regexp structure. It is a no-op when not under USE_ITHREADS.
8609 (Originally this *was* re_dup() for change history see sv.c)
8610
f8149455
YO
8611 After all of the core data stored in struct regexp is duplicated
8612 the regexp_engine.dupe method is used to copy any private data
8613 stored in the *pprivate pointer. This allows extensions to handle
8614 any duplication it needs to do.
8615
8616 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8617*/
a3c0e9ca 8618#if defined(USE_ITHREADS)
f8149455 8619#ifndef PERL_IN_XSUB_RE
84da74a7 8620regexp *
f8149455 8621Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8622{
84da74a7 8623 dVAR;
f8fc2ecf 8624 regexp *ret;
f8149455 8625 int i, npar;
84da74a7 8626 struct reg_substr_datum *s;
644c02aa 8627
84da74a7
YO
8628 if (!r)
8629 return (REGEXP *)NULL;
8630
8631 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8632 return ret;
8633
f8149455 8634
84da74a7 8635 npar = r->nparens+1;
f8fc2ecf 8636 Newxz(ret, 1, regexp);
84da74a7
YO
8637 Newx(ret->startp, npar, I32);
8638 Copy(r->startp, ret->startp, npar, I32);
8639 Newx(ret->endp, npar, I32);
f8149455 8640 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8641
c945c181 8642 if (r->substrs) {
785a26d5
YO
8643 Newx(ret->substrs, 1, struct reg_substr_data);
8644 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8645 s->min_offset = r->substrs->data[i].min_offset;
8646 s->max_offset = r->substrs->data[i].max_offset;
8647 s->end_shift = r->substrs->data[i].end_shift;
8648 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8649 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8650 }
8651 } else
8652 ret->substrs = NULL;
f8149455
YO
8653
8654 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8655 ret->refcnt = r->refcnt;
8656 ret->minlen = r->minlen;
8657 ret->minlenret = r->minlenret;
8658 ret->prelen = r->prelen;
8659 ret->nparens = r->nparens;
8660 ret->lastparen = r->lastparen;
8661 ret->lastcloseparen = r->lastcloseparen;
8662 ret->intflags = r->intflags;
8663 ret->extflags = r->extflags;
8664
8665 ret->sublen = r->sublen;
8666
8667 ret->engine = r->engine;
8668
8669 ret->paren_names = hv_dup_inc(r->paren_names, param);
8670
8671 if (RX_MATCH_COPIED(ret))
8672 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8673 else
8674 ret->subbeg = NULL;
8675#ifdef PERL_OLD_COPY_ON_WRITE
8676 ret->saved_copy = NULL;
8677#endif
8678
8679 ret->pprivate = r->pprivate;
785a26d5
YO
8680 if (ret->pprivate)
8681 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8682
8683 ptr_table_store(PL_ptr_table, r, ret);
8684 return ret;
8685}
8686#endif /* PERL_IN_XSUB_RE */
8687
8688/*
8689 regdupe_internal()
8690
8691 This is the internal complement to regdupe() which is used to copy
8692 the structure pointed to by the *pprivate pointer in the regexp.
8693 This is the core version of the extension overridable cloning hook.
8694 The regexp structure being duplicated will be copied by perl prior
8695 to this and will be provided as the regexp *r argument, however
8696 with the /old/ structures pprivate pointer value. Thus this routine
8697 may override any copying normally done by perl.
8698
8699 It returns a pointer to the new regexp_internal structure.
8700*/
8701
8702void *
8703Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8704{
8705 dVAR;
8706 regexp_internal *reti;
8707 int len, npar;
8708 RXi_GET_DECL(r,ri);
8709
8710 npar = r->nparens+1;
8711 len = ri->offsets[0];
8712
8713 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8714 Copy(ri->program, reti->program, len+1, regnode);
8715
8716 if(ri->swap) {
8717 Newx(reti->swap, 1, regexp_paren_ofs);
8718 /* no need to copy these */
8719 Newx(reti->swap->startp, npar, I32);
8720 Newx(reti->swap->endp, npar, I32);
8721 } else {
8722 reti->swap = NULL;
8723 }
8724
84da74a7 8725
f8fc2ecf
YO
8726 reti->regstclass = NULL;
8727 if (ri->data) {
84da74a7 8728 struct reg_data *d;
f8fc2ecf 8729 const int count = ri->data->count;
84da74a7
YO
8730 int i;
8731
8732 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8733 char, struct reg_data);
8734 Newx(d->what, count, U8);
8735
8736 d->count = count;
8737 for (i = 0; i < count; i++) {
f8fc2ecf 8738 d->what[i] = ri->data->what[i];
84da74a7 8739 switch (d->what[i]) {
55eed653 8740 /* legal options are one of: sSfpontTu
84da74a7
YO
8741 see also regcomp.h and pregfree() */
8742 case 's':
81714fb9 8743 case 'S':
0536c0a7 8744 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8745 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8746 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8747 break;
84da74a7
YO
8748 case 'f':
8749 /* This is cheating. */
8750 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8751 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8752 struct regnode_charclass_class);
f8fc2ecf 8753 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8754 break;
8755 case 'o':
bbe252da
YO
8756 /* Compiled op trees are readonly and in shared memory,
8757 and can thus be shared without duplication. */
84da74a7 8758 OP_REFCNT_LOCK;
f8fc2ecf 8759 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8760 OP_REFCNT_UNLOCK;
8761 break;
23eab42c
NC
8762 case 'T':
8763 /* Trie stclasses are readonly and can thus be shared
8764 * without duplication. We free the stclass in pregfree
8765 * when the corresponding reg_ac_data struct is freed.
8766 */
8767 reti->regstclass= ri->regstclass;
8768 /* Fall through */
84da74a7 8769 case 't':
84da74a7 8770 OP_REFCNT_LOCK;
0536c0a7 8771 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8772 OP_REFCNT_UNLOCK;
0536c0a7
NC
8773 /* Fall through */
8774 case 'n':
8775 d->data[i] = ri->data->data[i];
84da74a7 8776 break;
84da74a7 8777 default:
f8fc2ecf 8778 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8779 }
8780 }
8781
f8fc2ecf 8782 reti->data = d;
84da74a7
YO
8783 }
8784 else
f8fc2ecf 8785 reti->data = NULL;
84da74a7 8786
f8fc2ecf
YO
8787 Newx(reti->offsets, 2*len+1, U32);
8788 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
81714fb9 8789
f8149455 8790 return (void*)reti;
84da74a7 8791}
f8149455
YO
8792
8793#endif /* USE_ITHREADS */
84da74a7 8794
de8c5301
YO
8795/*
8796 reg_stringify()
8797
8798 converts a regexp embedded in a MAGIC struct to its stringified form,
8799 caching the converted form in the struct and returns the cached
8800 string.
8801
8802 If lp is nonnull then it is used to return the length of the
8803 resulting string
8804
8805 If flags is nonnull and the returned string contains UTF8 then
f8149455 8806 (*flags & 1) will be true.
de8c5301
YO
8807
8808 If haseval is nonnull then it is used to return whether the pattern
8809 contains evals.
8810
8811 Normally called via macro:
8812
f8149455 8813 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
8814
8815 And internally with
8816
f8149455 8817 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
8818
8819 See sv_2pv_flags() in sv.c for an example of internal usage.
8820
8821 */
f8149455 8822#ifndef PERL_IN_XSUB_RE
de8c5301
YO
8823char *
8824Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8825 dVAR;
8826 const regexp * const re = (regexp *)mg->mg_obj;
f8149455 8827
de8c5301
YO
8828 if (!mg->mg_ptr) {
8829 const char *fptr = "msix";
8830 char reflags[6];
8831 char ch;
8832 int left = 0;
8833 int right = 4;
8834 bool need_newline = 0;
bbe252da 8835 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8836
8837 while((ch = *fptr++)) {
8838 if(reganch & 1) {
8839 reflags[left++] = ch;
8840 }
8841 else {
8842 reflags[right--] = ch;
8843 }
8844 reganch >>= 1;
8845 }
8846 if(left != 4) {
8847 reflags[left] = '-';
8848 left = 5;
8849 }
8850
8851 mg->mg_len = re->prelen + 4 + left;
8852 /*
8853 * If /x was used, we have to worry about a regex ending with a
8854 * comment later being embedded within another regex. If so, we don't
8855 * want this regex's "commentization" to leak out to the right part of
8856 * the enclosing regex, we must cap it with a newline.
8857 *
8858 * So, if /x was used, we scan backwards from the end of the regex. If
8859 * we find a '#' before we find a newline, we need to add a newline
8860 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8861 * we don't need to add anything. -jfriedl
8862 */
bbe252da 8863 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8864 const char *endptr = re->precomp + re->prelen;
8865 while (endptr >= re->precomp) {
8866 const char c = *(endptr--);
8867 if (c == '\n')
8868 break; /* don't need another */
8869 if (c == '#') {
8870 /* we end while in a comment, so we need a newline */
8871 mg->mg_len++; /* save space for it */
8872 need_newline = 1; /* note to add it */
8873 break;
8874 }
8875 }
8876 }
8877
8878 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8879 mg->mg_ptr[0] = '(';
8880 mg->mg_ptr[1] = '?';
8881 Copy(reflags, mg->mg_ptr+2, left, char);
8882 *(mg->mg_ptr+left+2) = ':';
8883 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8884 if (need_newline)
8885 mg->mg_ptr[mg->mg_len - 2] = '\n';
8886 mg->mg_ptr[mg->mg_len - 1] = ')';
8887 mg->mg_ptr[mg->mg_len] = 0;
8888 }
8889 if (haseval)
f8149455 8890 *haseval = re->seen_evals;
de8c5301 8891 if (flags)
bbe252da 8892 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8893
8894 if (lp)
8895 *lp = mg->mg_len;
8896 return mg->mg_ptr;
8897}
8898
c277df42
IZ
8899/*
8900 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8901 */
8902regnode *
864dbfa3 8903Perl_regnext(pTHX_ register regnode *p)
c277df42 8904{
97aff369 8905 dVAR;
c277df42
IZ
8906 register I32 offset;
8907
f8fc2ecf 8908 if (!p)
c277df42
IZ
8909 return(NULL);
8910
8911 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8912 if (offset == 0)
8913 return(NULL);
8914
c277df42 8915 return(p+offset);
c277df42 8916}
76234dfb 8917#endif
c277df42 8918
01f988be 8919STATIC void
cea2e8a9 8920S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8921{
8922 va_list args;
8923 STRLEN l1 = strlen(pat1);
8924 STRLEN l2 = strlen(pat2);
8925 char buf[512];
06bf62c7 8926 SV *msv;
73d840c0 8927 const char *message;
c277df42
IZ
8928
8929 if (l1 > 510)
8930 l1 = 510;
8931 if (l1 + l2 > 510)
8932 l2 = 510 - l1;
8933 Copy(pat1, buf, l1 , char);
8934 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8935 buf[l1 + l2] = '\n';
8936 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8937#ifdef I_STDARG
8938 /* ANSI variant takes additional second argument */
c277df42 8939 va_start(args, pat2);
8736538c
AS
8940#else
8941 va_start(args);
8942#endif
5a844595 8943 msv = vmess(buf, &args);
c277df42 8944 va_end(args);
cfd0369c 8945 message = SvPV_const(msv,l1);
c277df42
IZ
8946 if (l1 > 512)
8947 l1 = 512;
8948 Copy(message, buf, l1 , char);
197cf9b9 8949 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8950 Perl_croak(aTHX_ "%s", buf);
c277df42 8951}
a0ed51b3
LW
8952
8953/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8954
76234dfb 8955#ifndef PERL_IN_XSUB_RE
a0ed51b3 8956void
864dbfa3 8957Perl_save_re_context(pTHX)
b81d288d 8958{
97aff369 8959 dVAR;
1ade1aa1
NC
8960
8961 struct re_save_state *state;
8962
8963 SAVEVPTR(PL_curcop);
8964 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8965
8966 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8967 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8968 SSPUSHINT(SAVEt_RE_STATE);
8969
46ab3289 8970 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8971
a0ed51b3 8972 PL_reg_start_tmp = 0;
a0ed51b3 8973 PL_reg_start_tmpl = 0;
c445ea15 8974 PL_reg_oldsaved = NULL;
a5db57d6 8975 PL_reg_oldsavedlen = 0;
a5db57d6 8976 PL_reg_maxiter = 0;
a5db57d6 8977 PL_reg_leftiter = 0;
c445ea15 8978 PL_reg_poscache = NULL;
a5db57d6 8979 PL_reg_poscache_size = 0;
1ade1aa1
NC
8980#ifdef PERL_OLD_COPY_ON_WRITE
8981 PL_nrs = NULL;
8982#endif
ada6e8a9 8983
c445ea15
AL
8984 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8985 if (PL_curpm) {
8986 const REGEXP * const rx = PM_GETRE(PL_curpm);
8987 if (rx) {
1df70142 8988 U32 i;
ada6e8a9 8989 for (i = 1; i <= rx->nparens; i++) {
1df70142 8990 char digits[TYPE_CHARS(long)];
d9fad198 8991 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8992 GV *const *const gvp
8993 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8994
b37c2d43
AL
8995 if (gvp) {
8996 GV * const gv = *gvp;
8997 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8998 save_scalar(gv);
49f27e4b 8999 }
ada6e8a9
AMS
9000 }
9001 }
9002 }
a0ed51b3 9003}
76234dfb 9004#endif
51371543 9005
51371543 9006static void
acfe0abc 9007clear_re(pTHX_ void *r)
51371543 9008{
97aff369 9009 dVAR;
51371543
GS
9010 ReREFCNT_dec((regexp *)r);
9011}
ffbc6a93 9012
a28509cc
AL
9013#ifdef DEBUGGING
9014
9015STATIC void
9016S_put_byte(pTHX_ SV *sv, int c)
9017{
9018 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9019 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9020 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9021 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9022 else
9023 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9024}
9025
786e8c11 9026
3dab1dad
YO
9027#define CLEAR_OPTSTART \
9028 if (optstart) STMT_START { \
70685ca0 9029 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9030 optstart=NULL; \
9031 } STMT_END
9032
786e8c11 9033#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9034
b5a2f8d8
NC
9035STATIC const regnode *
9036S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9037 const regnode *last, const regnode *plast,
9038 SV* sv, I32 indent, U32 depth)
a28509cc 9039{
97aff369 9040 dVAR;
786e8c11 9041 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9042 register const regnode *next;
3dab1dad 9043 const regnode *optstart= NULL;
1f1031fe 9044
f8fc2ecf 9045 RXi_GET_DECL(r,ri);
3dab1dad 9046 GET_RE_DEBUG_FLAGS_DECL;
1f1031fe 9047
786e8c11
YO
9048#ifdef DEBUG_DUMPUNTIL
9049 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9050 last ? last-start : 0,plast ? plast-start : 0);
9051#endif
9052
9053 if (plast && plast < last)
9054 last= plast;
9055
9056 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9057 /* While that wasn't END last time... */
a28509cc
AL
9058 NODE_ALIGN(node);
9059 op = OP(node);
de734bd5 9060 if (op == CLOSE || op == WHILEM)
786e8c11 9061 indent--;
b5a2f8d8 9062 next = regnext((regnode *)node);
1f1031fe 9063
a28509cc 9064 /* Where, what. */
8e11feef 9065 if (OP(node) == OPTIMIZED) {
e68ec53f 9066 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9067 optstart = node;
3dab1dad 9068 else
8e11feef 9069 goto after_print;
3dab1dad
YO
9070 } else
9071 CLEAR_OPTSTART;
1f1031fe 9072
32fc9b6a 9073 regprop(r, sv, node);
a28509cc 9074 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9075 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
9076
9077 if (OP(node) != OPTIMIZED) {
9078 if (next == NULL) /* Next ptr. */
9079 PerlIO_printf(Perl_debug_log, " (0)");
9080 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9081 PerlIO_printf(Perl_debug_log, " (FAIL)");
9082 else
9083 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9084 (void)PerlIO_putc(Perl_debug_log, '\n');
9085 }
9086
a28509cc
AL
9087 after_print:
9088 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9089 assert(next);
9090 {
9091 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9092 ? regnext((regnode *)next)
9093 : next);
be8e71aa
YO
9094 if (last && nnode > last)
9095 nnode = last;
786e8c11 9096 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9097 }
a28509cc
AL
9098 }
9099 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9100 assert(next);
786e8c11 9101 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9102 }
9103 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9104 const regnode *this_trie = node;
1de06328 9105 const char op = OP(node);
647f639f 9106 const U32 n = ARG(node);
1de06328 9107 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9108 (reg_ac_data *)ri->data->data[n] :
1de06328 9109 NULL;
3251b653
NC
9110 const reg_trie_data * const trie =
9111 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9112#ifdef DEBUGGING
9113 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9114#endif
786e8c11 9115 const regnode *nextbranch= NULL;
a28509cc 9116 I32 word_idx;
1de06328 9117 sv_setpvn(sv, "", 0);
786e8c11 9118 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9119 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9120
9121 PerlIO_printf(Perl_debug_log, "%*s%s ",
9122 (int)(2*(indent+3)), "",
9123 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9124 PL_colors[0], PL_colors[1],
9125 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9126 PERL_PV_PRETTY_ELIPSES |
7f69552c 9127 PERL_PV_PRETTY_LTGT
786e8c11
YO
9128 )
9129 : "???"
9130 );
9131 if (trie->jump) {
40d049e4 9132 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9133 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9134 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9135 if (dist) {
9136 if (!nextbranch)
24b23f37 9137 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9138 DUMPUNTIL(this_trie + dist, nextbranch);
9139 }
786e8c11
YO
9140 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9141 nextbranch= regnext((regnode *)nextbranch);
9142 } else {
9143 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9144 }
786e8c11
YO
9145 }
9146 if (last && next > last)
9147 node= last;
9148 else
9149 node= next;
a28509cc 9150 }
786e8c11
YO
9151 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9152 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9153 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9154 }
9155 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9156 assert(next);
786e8c11 9157 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9158 }
9159 else if ( op == PLUS || op == STAR) {
786e8c11 9160 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9161 }
9162 else if (op == ANYOF) {
9163 /* arglen 1 + class block */
9164 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9165 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9166 node = NEXTOPER(node);
9167 }
9168 else if (PL_regkind[(U8)op] == EXACT) {
9169 /* Literal string, where present. */
9170 node += NODE_SZ_STR(node) - 1;
9171 node = NEXTOPER(node);
9172 }
9173 else {
9174 node = NEXTOPER(node);
9175 node += regarglen[(U8)op];
9176 }
9177 if (op == CURLYX || op == OPEN)
786e8c11 9178 indent++;
a28509cc 9179 }
3dab1dad 9180 CLEAR_OPTSTART;
786e8c11 9181#ifdef DEBUG_DUMPUNTIL
70685ca0 9182 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9183#endif
1de06328 9184 return node;
a28509cc
AL
9185}
9186
9187#endif /* DEBUGGING */
9188
241d1a3b
NC
9189/*
9190 * Local variables:
9191 * c-indentation-style: bsd
9192 * c-basic-offset: 4
9193 * indent-tabs-mode: t
9194 * End:
9195 *
37442d52
RGS
9196 * ex: set ts=8 sts=4 sw=4 noet:
9197 */