This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: prerelease checklist for Perl 5.10
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
154#ifdef RE_TRACK_PATTERN_OFFSETS
155#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
156#endif
830247a4 157#define RExC_emit (pRExC_state->emit)
fac92740 158#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
159#define RExC_naughty (pRExC_state->naughty)
160#define RExC_sawback (pRExC_state->sawback)
161#define RExC_seen (pRExC_state->seen)
162#define RExC_size (pRExC_state->size)
163#define RExC_npar (pRExC_state->npar)
e2e6a0f1 164#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
165#define RExC_extralen (pRExC_state->extralen)
166#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
167#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 168#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 169#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
170#define RExC_open_parens (pRExC_state->open_parens)
171#define RExC_close_parens (pRExC_state->close_parens)
172#define RExC_opend (pRExC_state->opend)
81714fb9 173#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
174#define RExC_recurse (pRExC_state->recurse)
175#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 176
cde0cee5 177
a687059c
LW
178#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
179#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
180 ((*s) == '{' && regcurly(s)))
a687059c 181
35c8bce7
LW
182#ifdef SPSTART
183#undef SPSTART /* dratted cpp namespace... */
184#endif
a687059c
LW
185/*
186 * Flags to be passed up and down.
187 */
a687059c 188#define WORST 0 /* Worst case. */
821b33a5 189#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
190#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
191#define SPSTART 0x4 /* Starts with * or +. */
192#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 193
3dab1dad
YO
194#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
195
07be1b83
YO
196/* whether trie related optimizations are enabled */
197#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
198#define TRIE_STUDY_OPT
786e8c11 199#define FULL_TRIE_STUDY
07be1b83
YO
200#define TRIE_STCLASS
201#endif
1de06328
YO
202
203
40d049e4
YO
204
205#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
206#define PBITVAL(paren) (1 << ((paren) & 7))
207#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
208#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
209#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
210
211
1de06328
YO
212/* About scan_data_t.
213
214 During optimisation we recurse through the regexp program performing
215 various inplace (keyhole style) optimisations. In addition study_chunk
216 and scan_commit populate this data structure with information about
217 what strings MUST appear in the pattern. We look for the longest
218 string that must appear for at a fixed location, and we look for the
219 longest string that may appear at a floating location. So for instance
220 in the pattern:
221
222 /FOO[xX]A.*B[xX]BAR/
223
224 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
225 strings (because they follow a .* construct). study_chunk will identify
226 both FOO and BAR as being the longest fixed and floating strings respectively.
227
228 The strings can be composites, for instance
229
230 /(f)(o)(o)/
231
232 will result in a composite fixed substring 'foo'.
233
234 For each string some basic information is maintained:
235
236 - offset or min_offset
237 This is the position the string must appear at, or not before.
238 It also implicitly (when combined with minlenp) tells us how many
239 character must match before the string we are searching.
240 Likewise when combined with minlenp and the length of the string
241 tells us how many characters must appear after the string we have
242 found.
243
244 - max_offset
245 Only used for floating strings. This is the rightmost point that
246 the string can appear at. Ifset to I32 max it indicates that the
247 string can occur infinitely far to the right.
248
249 - minlenp
250 A pointer to the minimum length of the pattern that the string
251 was found inside. This is important as in the case of positive
252 lookahead or positive lookbehind we can have multiple patterns
253 involved. Consider
254
255 /(?=FOO).*F/
256
257 The minimum length of the pattern overall is 3, the minimum length
258 of the lookahead part is 3, but the minimum length of the part that
259 will actually match is 1. So 'FOO's minimum length is 3, but the
260 minimum length for the F is 1. This is important as the minimum length
261 is used to determine offsets in front of and behind the string being
262 looked for. Since strings can be composites this is the length of the
263 pattern at the time it was commited with a scan_commit. Note that
264 the length is calculated by study_chunk, so that the minimum lengths
265 are not known until the full pattern has been compiled, thus the
266 pointer to the value.
267
268 - lookbehind
269
270 In the case of lookbehind the string being searched for can be
271 offset past the start point of the final matching string.
272 If this value was just blithely removed from the min_offset it would
273 invalidate some of the calculations for how many chars must match
274 before or after (as they are derived from min_offset and minlen and
275 the length of the string being searched for).
276 When the final pattern is compiled and the data is moved from the
277 scan_data_t structure into the regexp structure the information
278 about lookbehind is factored in, with the information that would
279 have been lost precalculated in the end_shift field for the
280 associated string.
281
282 The fields pos_min and pos_delta are used to store the minimum offset
283 and the delta to the maximum offset at the current point in the pattern.
284
285*/
2c2d71f5
JH
286
287typedef struct scan_data_t {
1de06328
YO
288 /*I32 len_min; unused */
289 /*I32 len_delta; unused */
2c2d71f5
JH
290 I32 pos_min;
291 I32 pos_delta;
292 SV *last_found;
1de06328 293 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
294 I32 last_start_min;
295 I32 last_start_max;
1de06328
YO
296 SV **longest; /* Either &l_fixed, or &l_float. */
297 SV *longest_fixed; /* longest fixed string found in pattern */
298 I32 offset_fixed; /* offset where it starts */
299 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
301 SV *longest_float; /* longest floating string found in pattern */
302 I32 offset_float_min; /* earliest point in string it can appear */
303 I32 offset_float_max; /* latest point in string it can appear */
304 I32 *minlen_float; /* pointer to the minlen relevent to the string */
305 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
306 I32 flags;
307 I32 whilem_c;
cb434fcc 308 I32 *last_closep;
653099ff 309 struct regnode_charclass_class *start_class;
2c2d71f5
JH
310} scan_data_t;
311
a687059c 312/*
e50aee73 313 * Forward declarations for pregcomp()'s friends.
a687059c 314 */
a0d0e21e 315
27da23d5 316static const scan_data_t zero_scan_data =
1de06328 317 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
318
319#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
320#define SF_BEFORE_SEOL 0x0001
321#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
322#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
323#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
324
09b7f37c
CB
325#ifdef NO_UNARY_PLUS
326# define SF_FIX_SHIFT_EOL (0+2)
327# define SF_FL_SHIFT_EOL (0+4)
328#else
329# define SF_FIX_SHIFT_EOL (+2)
330# define SF_FL_SHIFT_EOL (+4)
331#endif
c277df42
IZ
332
333#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
334#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
335
336#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
337#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
338#define SF_IS_INF 0x0040
339#define SF_HAS_PAR 0x0080
340#define SF_IN_PAR 0x0100
341#define SF_HAS_EVAL 0x0200
342#define SCF_DO_SUBSTR 0x0400
653099ff
GS
343#define SCF_DO_STCLASS_AND 0x0800
344#define SCF_DO_STCLASS_OR 0x1000
345#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 346#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 347
786e8c11 348#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 349#define SCF_SEEN_ACCEPT 0x8000
07be1b83 350
eb160463 351#define UTF (RExC_utf8 != 0)
bbe252da
YO
352#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
353#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 354
ffc61ed2 355#define OOB_UNICODE 12345678
93733859 356#define OOB_NAMEDCLASS -1
b8c5462f 357
a0ed51b3
LW
358#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
359#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
360
8615cb43 361
b45f050a
JF
362/* length of regex to show in messages that don't mark a position within */
363#define RegexLengthToShowInErrorMessages 127
364
365/*
366 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
367 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
368 * op/pragma/warn/regcomp.
369 */
7253e4e3
RK
370#define MARKER1 "<-- HERE" /* marker as it appears in the description */
371#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 372
7253e4e3 373#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
374
375/*
376 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
377 * arg. Show regex, up to a maximum length. If it's too long, chop and add
378 * "...".
379 */
58e23c8d 380#define _FAIL(code) STMT_START { \
bfed75c6 381 const char *ellipses = ""; \
ccb2c380
MP
382 IV len = RExC_end - RExC_precomp; \
383 \
384 if (!SIZE_ONLY) \
385 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
386 if (len > RegexLengthToShowInErrorMessages) { \
387 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
388 len = RegexLengthToShowInErrorMessages - 10; \
389 ellipses = "..."; \
390 } \
58e23c8d 391 code; \
ccb2c380 392} STMT_END
8615cb43 393
58e23c8d
YO
394#define FAIL(msg) _FAIL( \
395 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
396 msg, (int)len, RExC_precomp, ellipses))
397
398#define FAIL2(msg,arg) _FAIL( \
399 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
400 arg, (int)len, RExC_precomp, ellipses))
401
b45f050a 402/*
b45f050a
JF
403 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
404 */
ccb2c380 405#define Simple_vFAIL(m) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
408 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
413 */
ccb2c380
MP
414#define vFAIL(m) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL(m); \
418} STMT_END
b45f050a
JF
419
420/*
421 * Like Simple_vFAIL(), but accepts two arguments.
422 */
ccb2c380 423#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 424 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
425 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
426 (int)offset, RExC_precomp, RExC_precomp + offset); \
427} STMT_END
b45f050a
JF
428
429/*
430 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
431 */
ccb2c380
MP
432#define vFAIL2(m,a1) STMT_START { \
433 if (!SIZE_ONLY) \
434 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
435 Simple_vFAIL2(m, a1); \
436} STMT_END
b45f050a
JF
437
438
439/*
440 * Like Simple_vFAIL(), but accepts three arguments.
441 */
ccb2c380 442#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a
JF
447
448/*
449 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
450 */
ccb2c380
MP
451#define vFAIL3(m,a1,a2) STMT_START { \
452 if (!SIZE_ONLY) \
453 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
454 Simple_vFAIL3(m, a1, a2); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Like Simple_vFAIL(), but accepts four arguments.
459 */
ccb2c380 460#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 461 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
462 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
463 (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
b45f050a 465
ccb2c380 466#define vWARN(loc,m) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARNdep(loc,m) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
475 "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
478
479
480#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 487 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
488 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
490} STMT_END
491
492#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
497
498#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 499 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
500 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
501 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
9d1d55b5 503
8615cb43 504
cd439c50 505/* Allow for side effects in s */
ccb2c380
MP
506#define REGC(c,s) STMT_START { \
507 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
508} STMT_END
cd439c50 509
fac92740
MJD
510/* Macros for recording node offsets. 20001227 mjd@plover.com
511 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
512 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
513 * Element 0 holds the number n.
07be1b83 514 * Position is 1 indexed.
fac92740 515 */
7122b237
YO
516#ifndef RE_TRACK_PATTERN_OFFSETS
517#define Set_Node_Offset_To_R(node,byte)
518#define Set_Node_Offset(node,byte)
519#define Set_Cur_Node_Offset
520#define Set_Node_Length_To_R(node,len)
521#define Set_Node_Length(node,len)
522#define Set_Node_Cur_Length(node)
523#define Node_Offset(n)
524#define Node_Length(n)
525#define Set_Node_Offset_Length(node,offset,len)
526#define ProgLen(ri) ri->u.proglen
527#define SetProgLen(ri,x) ri->u.proglen = x
528#else
529#define ProgLen(ri) ri->u.offsets[0]
530#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
531#define Set_Node_Offset_To_R(node,byte) STMT_START { \
532 if (! SIZE_ONLY) { \
533 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 534 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 535 if((node) < 0) { \
551405c4 536 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
537 } else { \
538 RExC_offsets[2*(node)-1] = (byte); \
539 } \
540 } \
541} STMT_END
542
543#define Set_Node_Offset(node,byte) \
544 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
545#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
546
547#define Set_Node_Length_To_R(node,len) STMT_START { \
548 if (! SIZE_ONLY) { \
549 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 550 __LINE__, (int)(node), (int)(len))); \
ccb2c380 551 if((node) < 0) { \
551405c4 552 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
553 } else { \
554 RExC_offsets[2*(node)] = (len); \
555 } \
556 } \
557} STMT_END
558
559#define Set_Node_Length(node,len) \
560 Set_Node_Length_To_R((node)-RExC_emit_start, len)
561#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
562#define Set_Node_Cur_Length(node) \
563 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
564
565/* Get offsets and lengths */
566#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
567#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
568
07be1b83
YO
569#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
570 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
571 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
572} STMT_END
7122b237 573#endif
07be1b83
YO
574
575#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
576#define EXPERIMENTAL_INPLACESCAN
7122b237 577#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 578
304ee84b
YO
579#define DEBUG_STUDYDATA(str,data,depth) \
580DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 581 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
582 "%*s" str "Pos:%"IVdf"/%"IVdf \
583 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
584 (int)(depth)*2, "", \
585 (IV)((data)->pos_min), \
586 (IV)((data)->pos_delta), \
304ee84b 587 (UV)((data)->flags), \
1de06328 588 (IV)((data)->whilem_c), \
304ee84b
YO
589 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
590 is_inf ? "INF " : "" \
1de06328
YO
591 ); \
592 if ((data)->last_found) \
593 PerlIO_printf(Perl_debug_log, \
594 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
595 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
596 SvPVX_const((data)->last_found), \
597 (IV)((data)->last_end), \
598 (IV)((data)->last_start_min), \
599 (IV)((data)->last_start_max), \
600 ((data)->longest && \
601 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
602 SvPVX_const((data)->longest_fixed), \
603 (IV)((data)->offset_fixed), \
604 ((data)->longest && \
605 (data)->longest==&((data)->longest_float)) ? "*" : "", \
606 SvPVX_const((data)->longest_float), \
607 (IV)((data)->offset_float_min), \
608 (IV)((data)->offset_float_max) \
609 ); \
610 PerlIO_printf(Perl_debug_log,"\n"); \
611});
612
acfe0abc 613static void clear_re(pTHX_ void *r);
4327152a 614
653099ff 615/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 616 Update the longest found anchored substring and the longest found
653099ff
GS
617 floating substrings if needed. */
618
4327152a 619STATIC void
304ee84b 620S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 621{
e1ec3a88
AL
622 const STRLEN l = CHR_SVLEN(data->last_found);
623 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 624 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 625
c277df42 626 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 627 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
628 if (*data->longest == data->longest_fixed) {
629 data->offset_fixed = l ? data->last_start_min : data->pos_min;
630 if (data->flags & SF_BEFORE_EOL)
b81d288d 631 data->flags
c277df42
IZ
632 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
633 else
634 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
635 data->minlen_fixed=minlenp;
636 data->lookbehind_fixed=0;
a0ed51b3 637 }
304ee84b 638 else { /* *data->longest == data->longest_float */
c277df42 639 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
640 data->offset_float_max = (l
641 ? data->last_start_max
c277df42 642 : data->pos_min + data->pos_delta);
304ee84b 643 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 644 data->offset_float_max = I32_MAX;
c277df42 645 if (data->flags & SF_BEFORE_EOL)
b81d288d 646 data->flags
c277df42
IZ
647 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
648 else
649 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
650 data->minlen_float=minlenp;
651 data->lookbehind_float=0;
c277df42
IZ
652 }
653 }
654 SvCUR_set(data->last_found, 0);
0eda9292 655 {
a28509cc 656 SV * const sv = data->last_found;
097eb12c
AL
657 if (SvUTF8(sv) && SvMAGICAL(sv)) {
658 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
659 if (mg)
660 mg->mg_len = 0;
661 }
0eda9292 662 }
c277df42
IZ
663 data->last_end = -1;
664 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 665 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
666}
667
653099ff
GS
668/* Can match anything (initialization) */
669STATIC void
097eb12c 670S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 671{
653099ff 672 ANYOF_CLASS_ZERO(cl);
f8bef550 673 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 674 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
675 if (LOC)
676 cl->flags |= ANYOF_LOCALE;
677}
678
679/* Can match anything (initialization) */
680STATIC int
5f66b61c 681S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
682{
683 int value;
684
aaa51d5e 685 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
686 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
687 return 1;
1aa99e6b
IH
688 if (!(cl->flags & ANYOF_UNICODE_ALL))
689 return 0;
10edeb5d 690 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 691 return 0;
653099ff
GS
692 return 1;
693}
694
695/* Can match anything (initialization) */
696STATIC void
097eb12c 697S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 698{
8ecf7187 699 Zero(cl, 1, struct regnode_charclass_class);
653099ff 700 cl->type = ANYOF;
830247a4 701 cl_anything(pRExC_state, cl);
653099ff
GS
702}
703
704STATIC void
097eb12c 705S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 706{
8ecf7187 707 Zero(cl, 1, struct regnode_charclass_class);
653099ff 708 cl->type = ANYOF;
830247a4 709 cl_anything(pRExC_state, cl);
653099ff
GS
710 if (LOC)
711 cl->flags |= ANYOF_LOCALE;
712}
713
714/* 'And' a given class with another one. Can create false positives */
715/* We assume that cl is not inverted */
716STATIC void
5f66b61c 717S_cl_and(struct regnode_charclass_class *cl,
a28509cc 718 const struct regnode_charclass_class *and_with)
653099ff 719{
40d049e4
YO
720
721 assert(and_with->type == ANYOF);
653099ff
GS
722 if (!(and_with->flags & ANYOF_CLASS)
723 && !(cl->flags & ANYOF_CLASS)
724 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
725 && !(and_with->flags & ANYOF_FOLD)
726 && !(cl->flags & ANYOF_FOLD)) {
727 int i;
728
729 if (and_with->flags & ANYOF_INVERT)
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] &= ~and_with->bitmap[i];
732 else
733 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
734 cl->bitmap[i] &= and_with->bitmap[i];
735 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
736 if (!(and_with->flags & ANYOF_EOS))
737 cl->flags &= ~ANYOF_EOS;
1aa99e6b 738
14ebb1a2
JH
739 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
740 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
741 cl->flags &= ~ANYOF_UNICODE_ALL;
742 cl->flags |= ANYOF_UNICODE;
743 ARG_SET(cl, ARG(and_with));
744 }
14ebb1a2
JH
745 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
746 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 747 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
748 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
749 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 750 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
751}
752
753/* 'OR' a given class with another one. Can create false positives */
754/* We assume that cl is not inverted */
755STATIC void
097eb12c 756S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 757{
653099ff
GS
758 if (or_with->flags & ANYOF_INVERT) {
759 /* We do not use
760 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
761 * <= (B1 | !B2) | (CL1 | !CL2)
762 * which is wasteful if CL2 is small, but we ignore CL2:
763 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
764 * XXXX Can we handle case-fold? Unclear:
765 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
766 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
767 */
768 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(or_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD) ) {
771 int i;
772
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] |= ~or_with->bitmap[i];
775 } /* XXXX: logic is complicated otherwise */
776 else {
830247a4 777 cl_anything(pRExC_state, cl);
653099ff
GS
778 }
779 } else {
780 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
781 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 782 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
783 || (cl->flags & ANYOF_FOLD)) ) {
784 int i;
785
786 /* OR char bitmap and class bitmap separately */
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] |= or_with->bitmap[i];
789 if (or_with->flags & ANYOF_CLASS) {
790 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
791 cl->classflags[i] |= or_with->classflags[i];
792 cl->flags |= ANYOF_CLASS;
793 }
794 }
795 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 796 cl_anything(pRExC_state, cl);
653099ff
GS
797 }
798 }
799 if (or_with->flags & ANYOF_EOS)
800 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
801
802 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
803 ARG(cl) != ARG(or_with)) {
804 cl->flags |= ANYOF_UNICODE_ALL;
805 cl->flags &= ~ANYOF_UNICODE;
806 }
807 if (or_with->flags & ANYOF_UNICODE_ALL) {
808 cl->flags |= ANYOF_UNICODE_ALL;
809 cl->flags &= ~ANYOF_UNICODE;
810 }
653099ff
GS
811}
812
a3621e74
YO
813#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
814#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
815#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
816#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
817
3dab1dad
YO
818
819#ifdef DEBUGGING
07be1b83 820/*
2b8b4781
NC
821 dump_trie(trie,widecharmap,revcharmap)
822 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
823 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
824
825 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
826 The _interim_ variants are used for debugging the interim
827 tables that are used to generate the final compressed
828 representation which is what dump_trie expects.
829
3dab1dad
YO
830 Part of the reason for their existance is to provide a form
831 of documentation as to how the different representations function.
07be1b83
YO
832
833*/
3dab1dad
YO
834
835/*
3dab1dad
YO
836 Dumps the final compressed table form of the trie to Perl_debug_log.
837 Used for debugging make_trie().
838*/
839
840STATIC void
2b8b4781
NC
841S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
842 AV *revcharmap, U32 depth)
3dab1dad
YO
843{
844 U32 state;
ab3bbdeb 845 SV *sv=sv_newmortal();
55eed653 846 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
847 GET_RE_DEBUG_FLAGS_DECL;
848
ab3bbdeb 849
3dab1dad
YO
850 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
851 (int)depth * 2 + 2,"",
852 "Match","Base","Ofs" );
853
854 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 855 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 856 if ( tmp ) {
ab3bbdeb
YO
857 PerlIO_printf( Perl_debug_log, "%*s",
858 colwidth,
ddc5bc0f 859 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
860 PL_colors[0], PL_colors[1],
861 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
862 PERL_PV_ESCAPE_FIRSTCHAR
863 )
864 );
3dab1dad
YO
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 871 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
872 PerlIO_printf( Perl_debug_log, "\n");
873
1e2e3d02 874 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*"UVXf,
903 colwidth,
3dab1dad
YO
904 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 } else {
ab3bbdeb 906 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
907 }
908 }
909
910 PerlIO_printf( Perl_debug_log, "]");
911
912 }
913 PerlIO_printf( Perl_debug_log, "\n" );
914 }
915}
916/*
3dab1dad
YO
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
55eed653 923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
924 HV *widecharmap, AV *revcharmap, U32 next_alloc,
925 U32 depth)
3dab1dad
YO
926{
927 U32 state;
ab3bbdeb 928 SV *sv=sv_newmortal();
55eed653 929 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
930 GET_RE_DEBUG_FLAGS_DECL;
931 /* print out the table precompression. */
ab3bbdeb
YO
932 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
933 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
934 "------:-----+-----------------\n" );
3dab1dad
YO
935
936 for( state=1 ; state < next_alloc ; state ++ ) {
937 U16 charid;
938
ab3bbdeb 939 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
940 (int)depth * 2 + 2,"", (UV)state );
941 if ( ! trie->states[ state ].wordnum ) {
942 PerlIO_printf( Perl_debug_log, "%5s| ","");
943 } else {
944 PerlIO_printf( Perl_debug_log, "W%4x| ",
945 trie->states[ state ].wordnum
946 );
947 }
948 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 949 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
950 if ( tmp ) {
951 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
952 colwidth,
ddc5bc0f 953 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
954 PL_colors[0], PL_colors[1],
955 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
956 PERL_PV_ESCAPE_FIRSTCHAR
957 ) ,
1e2e3d02
YO
958 TRIE_LIST_ITEM(state,charid).forid,
959 (UV)TRIE_LIST_ITEM(state,charid).newstate
960 );
961 if (!(charid % 10))
664e119d
RGS
962 PerlIO_printf(Perl_debug_log, "\n%*s| ",
963 (int)((depth * 2) + 14), "");
1e2e3d02 964 }
ab3bbdeb
YO
965 }
966 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
967 }
968}
969
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in table form.
972 This is the normal DFA style state transition table, with a few
973 twists to facilitate compression later.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
982 U16 charid;
ab3bbdeb 983 SV *sv=sv_newmortal();
55eed653 984 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
985 GET_RE_DEBUG_FLAGS_DECL;
986
987 /*
988 print out the table precompression so that we can do a visual check
989 that they are identical.
990 */
991
992 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
993
994 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 995 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 996 if ( tmp ) {
ab3bbdeb
YO
997 PerlIO_printf( Perl_debug_log, "%*s",
998 colwidth,
ddc5bc0f 999 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1000 PL_colors[0], PL_colors[1],
1001 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1002 PERL_PV_ESCAPE_FIRSTCHAR
1003 )
1004 );
3dab1dad
YO
1005 }
1006 }
1007
1008 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1009
1010 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1011 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1012 }
1013
1014 PerlIO_printf( Perl_debug_log, "\n" );
1015
1016 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1017
1018 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1019 (int)depth * 2 + 2,"",
1020 (UV)TRIE_NODENUM( state ) );
1021
1022 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1023 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1024 if (v)
1025 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1026 else
1027 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1028 }
1029 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1030 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1031 } else {
1032 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1033 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1034 }
1035 }
07be1b83 1036}
3dab1dad
YO
1037
1038#endif
1039
786e8c11
YO
1040/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1041 startbranch: the first branch in the whole branch sequence
1042 first : start branch of sequence of branch-exact nodes.
1043 May be the same as startbranch
1044 last : Thing following the last branch.
1045 May be the same as tail.
1046 tail : item following the branch sequence
1047 count : words in the sequence
1048 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1049 depth : indent depth
3dab1dad 1050
786e8c11 1051Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1052
786e8c11
YO
1053A trie is an N'ary tree where the branches are determined by digital
1054decomposition of the key. IE, at the root node you look up the 1st character and
1055follow that branch repeat until you find the end of the branches. Nodes can be
1056marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1057
786e8c11 1058 /he|she|his|hers/
72f13be8 1059
786e8c11
YO
1060would convert into the following structure. Numbers represent states, letters
1061following numbers represent valid transitions on the letter from that state, if
1062the number is in square brackets it represents an accepting state, otherwise it
1063will be in parenthesis.
07be1b83 1064
786e8c11
YO
1065 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1066 | |
1067 | (2)
1068 | |
1069 (1) +-i->(6)-+-s->[7]
1070 |
1071 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1072
786e8c11
YO
1073 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1074
1075This shows that when matching against the string 'hers' we will begin at state 1
1076read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1077then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1078is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1079single traverse. We store a mapping from accepting to state to which word was
1080matched, and then when we have multiple possibilities we try to complete the
1081rest of the regex in the order in which they occured in the alternation.
1082
1083The only prior NFA like behaviour that would be changed by the TRIE support is
1084the silent ignoring of duplicate alternations which are of the form:
1085
1086 / (DUPE|DUPE) X? (?{ ... }) Y /x
1087
1088Thus EVAL blocks follwing a trie may be called a different number of times with
1089and without the optimisation. With the optimisations dupes will be silently
1090ignored. This inconsistant behaviour of EVAL type nodes is well established as
1091the following demonstrates:
1092
1093 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1094
1095which prints out 'word' three times, but
1096
1097 'words'=~/(word|word|word)(?{ print $1 })S/
1098
1099which doesnt print it out at all. This is due to other optimisations kicking in.
1100
1101Example of what happens on a structural level:
1102
1103The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1104
1105 1: CURLYM[1] {1,32767}(18)
1106 5: BRANCH(8)
1107 6: EXACT <ac>(16)
1108 8: BRANCH(11)
1109 9: EXACT <ad>(16)
1110 11: BRANCH(14)
1111 12: EXACT <ab>(16)
1112 16: SUCCEED(0)
1113 17: NOTHING(18)
1114 18: END(0)
1115
1116This would be optimizable with startbranch=5, first=5, last=16, tail=16
1117and should turn into:
1118
1119 1: CURLYM[1] {1,32767}(18)
1120 5: TRIE(16)
1121 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1122 <ac>
1123 <ad>
1124 <ab>
1125 16: SUCCEED(0)
1126 17: NOTHING(18)
1127 18: END(0)
1128
1129Cases where tail != last would be like /(?foo|bar)baz/:
1130
1131 1: BRANCH(4)
1132 2: EXACT <foo>(8)
1133 4: BRANCH(7)
1134 5: EXACT <bar>(8)
1135 7: TAIL(8)
1136 8: EXACT <baz>(10)
1137 10: END(0)
1138
1139which would be optimizable with startbranch=1, first=1, last=7, tail=8
1140and would end up looking like:
1141
1142 1: TRIE(8)
1143 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1144 <foo>
1145 <bar>
1146 7: TAIL(8)
1147 8: EXACT <baz>(10)
1148 10: END(0)
1149
1150 d = uvuni_to_utf8_flags(d, uv, 0);
1151
1152is the recommended Unicode-aware way of saying
1153
1154 *(d++) = uv;
1155*/
1156
1e2e3d02 1157#define TRIE_STORE_REVCHAR \
786e8c11 1158 STMT_START { \
1e2e3d02 1159 SV *tmp = newSVpvs(""); \
786e8c11 1160 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1161 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1162 av_push( revcharmap, tmp ); \
786e8c11
YO
1163 } STMT_END
1164
1165#define TRIE_READ_CHAR STMT_START { \
1166 wordlen++; \
1167 if ( UTF ) { \
1168 if ( folder ) { \
1169 if ( foldlen > 0 ) { \
1170 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1171 foldlen -= len; \
1172 scan += len; \
1173 len = 0; \
1174 } else { \
1175 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1176 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1177 foldlen -= UNISKIP( uvc ); \
1178 scan = foldbuf + UNISKIP( uvc ); \
1179 } \
1180 } else { \
1181 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1182 } \
1183 } else { \
1184 uvc = (U32)*uc; \
1185 len = 1; \
1186 } \
1187} STMT_END
1188
1189
1190
1191#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1192 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1193 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1194 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1195 } \
1196 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1197 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1198 TRIE_LIST_CUR( state )++; \
1199} STMT_END
07be1b83 1200
786e8c11
YO
1201#define TRIE_LIST_NEW(state) STMT_START { \
1202 Newxz( trie->states[ state ].trans.list, \
1203 4, reg_trie_trans_le ); \
1204 TRIE_LIST_CUR( state ) = 1; \
1205 TRIE_LIST_LEN( state ) = 4; \
1206} STMT_END
07be1b83 1207
786e8c11
YO
1208#define TRIE_HANDLE_WORD(state) STMT_START { \
1209 U16 dupe= trie->states[ state ].wordnum; \
1210 regnode * const noper_next = regnext( noper ); \
1211 \
1212 if (trie->wordlen) \
1213 trie->wordlen[ curword ] = wordlen; \
1214 DEBUG_r({ \
1215 /* store the word for dumping */ \
1216 SV* tmp; \
1217 if (OP(noper) != NOTHING) \
1218 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1219 else \
1220 tmp = newSVpvn( "", 0 ); \
1221 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1222 av_push( trie_words, tmp ); \
786e8c11
YO
1223 }); \
1224 \
1225 curword++; \
1226 \
1227 if ( noper_next < tail ) { \
1228 if (!trie->jump) \
c944940b 1229 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1230 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1231 if (!jumper) \
1232 jumper = noper_next; \
1233 if (!nextbranch) \
1234 nextbranch= regnext(cur); \
1235 } \
1236 \
1237 if ( dupe ) { \
1238 /* So it's a dupe. This means we need to maintain a */\
1239 /* linked-list from the first to the next. */\
1240 /* we only allocate the nextword buffer when there */\
1241 /* a dupe, so first time we have to do the allocation */\
1242 if (!trie->nextword) \
c944940b 1243 trie->nextword = (U16 *) \
446bd890 1244 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1245 while ( trie->nextword[dupe] ) \
1246 dupe= trie->nextword[dupe]; \
1247 trie->nextword[dupe]= curword; \
1248 } else { \
1249 /* we haven't inserted this word yet. */ \
1250 trie->states[ state ].wordnum = curword; \
1251 } \
1252} STMT_END
07be1b83 1253
3dab1dad 1254
786e8c11
YO
1255#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1256 ( ( base + charid >= ucharcount \
1257 && base + charid < ubound \
1258 && state == trie->trans[ base - ucharcount + charid ].check \
1259 && trie->trans[ base - ucharcount + charid ].next ) \
1260 ? trie->trans[ base - ucharcount + charid ].next \
1261 : ( state==1 ? special : 0 ) \
1262 )
3dab1dad 1263
786e8c11
YO
1264#define MADE_TRIE 1
1265#define MADE_JUMP_TRIE 2
1266#define MADE_EXACT_TRIE 4
3dab1dad 1267
a3621e74 1268STATIC I32
786e8c11 1269S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1270{
27da23d5 1271 dVAR;
a3621e74
YO
1272 /* first pass, loop through and scan words */
1273 reg_trie_data *trie;
55eed653 1274 HV *widecharmap = NULL;
2b8b4781 1275 AV *revcharmap = newAV();
a3621e74 1276 regnode *cur;
9f7f3913 1277 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1278 STRLEN len = 0;
1279 UV uvc = 0;
1280 U16 curword = 0;
1281 U32 next_alloc = 0;
786e8c11
YO
1282 regnode *jumper = NULL;
1283 regnode *nextbranch = NULL;
7f69552c 1284 regnode *convert = NULL;
a3621e74 1285 /* we just use folder as a flag in utf8 */
e1ec3a88 1286 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1287 ? PL_fold
1288 : ( flags == EXACTFL
1289 ? PL_fold_locale
1290 : NULL
1291 )
1292 );
1293
2b8b4781
NC
1294#ifdef DEBUGGING
1295 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1296 AV *trie_words = NULL;
1297 /* along with revcharmap, this only used during construction but both are
1298 * useful during debugging so we store them in the struct when debugging.
8e11feef 1299 */
2b8b4781
NC
1300#else
1301 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1302 STRLEN trie_charcount=0;
3dab1dad 1303#endif
2b8b4781 1304 SV *re_trie_maxbuff;
a3621e74 1305 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1306#ifndef DEBUGGING
1307 PERL_UNUSED_ARG(depth);
1308#endif
a3621e74 1309
c944940b 1310 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1311 trie->refcount = 1;
3dab1dad 1312 trie->startstate = 1;
786e8c11 1313 trie->wordcount = word_count;
f8fc2ecf 1314 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1315 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1316 if (!(UTF && folder))
c944940b 1317 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1318 DEBUG_r({
2b8b4781 1319 trie_words = newAV();
a3621e74 1320 });
a3621e74 1321
0111c4fd 1322 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1323 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1324 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1325 }
3dab1dad
YO
1326 DEBUG_OPTIMISE_r({
1327 PerlIO_printf( Perl_debug_log,
786e8c11 1328 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1329 (int)depth * 2 + 2, "",
1330 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1331 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1332 (int)depth);
3dab1dad 1333 });
7f69552c
YO
1334
1335 /* Find the node we are going to overwrite */
1336 if ( first == startbranch && OP( last ) != BRANCH ) {
1337 /* whole branch chain */
1338 convert = first;
1339 } else {
1340 /* branch sub-chain */
1341 convert = NEXTOPER( first );
1342 }
1343
a3621e74
YO
1344 /* -- First loop and Setup --
1345
1346 We first traverse the branches and scan each word to determine if it
1347 contains widechars, and how many unique chars there are, this is
1348 important as we have to build a table with at least as many columns as we
1349 have unique chars.
1350
1351 We use an array of integers to represent the character codes 0..255
1352 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1353 native representation of the character value as the key and IV's for the
1354 coded index.
1355
1356 *TODO* If we keep track of how many times each character is used we can
1357 remap the columns so that the table compression later on is more
1358 efficient in terms of memory by ensuring most common value is in the
1359 middle and the least common are on the outside. IMO this would be better
1360 than a most to least common mapping as theres a decent chance the most
1361 common letter will share a node with the least common, meaning the node
1362 will not be compressable. With a middle is most common approach the worst
1363 case is when we have the least common nodes twice.
1364
1365 */
1366
a3621e74 1367 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1368 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1369 const U8 *uc = (U8*)STRING( noper );
a28509cc 1370 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1371 STRLEN foldlen = 0;
1372 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1373 const U8 *scan = (U8*)NULL;
07be1b83 1374 U32 wordlen = 0; /* required init */
3dab1dad 1375 STRLEN chars=0;
a3621e74 1376
3dab1dad
YO
1377 if (OP(noper) == NOTHING) {
1378 trie->minlen= 0;
1379 continue;
1380 }
1381 if (trie->bitmap) {
1382 TRIE_BITMAP_SET(trie,*uc);
1383 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1384 }
a3621e74 1385 for ( ; uc < e ; uc += len ) {
3dab1dad 1386 TRIE_CHARCOUNT(trie)++;
a3621e74 1387 TRIE_READ_CHAR;
3dab1dad 1388 chars++;
a3621e74
YO
1389 if ( uvc < 256 ) {
1390 if ( !trie->charmap[ uvc ] ) {
1391 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1392 if ( folder )
1393 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1394 TRIE_STORE_REVCHAR;
a3621e74
YO
1395 }
1396 } else {
1397 SV** svpp;
55eed653
NC
1398 if ( !widecharmap )
1399 widecharmap = newHV();
a3621e74 1400
55eed653 1401 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1402
1403 if ( !svpp )
e4584336 1404 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1405
1406 if ( !SvTRUE( *svpp ) ) {
1407 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1408 TRIE_STORE_REVCHAR;
a3621e74
YO
1409 }
1410 }
1411 }
3dab1dad
YO
1412 if( cur == first ) {
1413 trie->minlen=chars;
1414 trie->maxlen=chars;
1415 } else if (chars < trie->minlen) {
1416 trie->minlen=chars;
1417 } else if (chars > trie->maxlen) {
1418 trie->maxlen=chars;
1419 }
1420
a3621e74
YO
1421 } /* end first pass */
1422 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1423 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1424 (int)depth * 2 + 2,"",
55eed653 1425 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1426 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1427 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1428 );
c944940b 1429 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1430
1431 /*
1432 We now know what we are dealing with in terms of unique chars and
1433 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1434 representation using a flat table will take. If it's over a reasonable
1435 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1436 conservative but potentially much slower representation using an array
1437 of lists.
1438
1439 At the end we convert both representations into the same compressed
1440 form that will be used in regexec.c for matching with. The latter
1441 is a form that cannot be used to construct with but has memory
1442 properties similar to the list form and access properties similar
1443 to the table form making it both suitable for fast searches and
1444 small enough that its feasable to store for the duration of a program.
1445
1446 See the comment in the code where the compressed table is produced
1447 inplace from the flat tabe representation for an explanation of how
1448 the compression works.
1449
1450 */
1451
1452
3dab1dad 1453 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1454 /*
1455 Second Pass -- Array Of Lists Representation
1456
1457 Each state will be represented by a list of charid:state records
1458 (reg_trie_trans_le) the first such element holds the CUR and LEN
1459 points of the allocated array. (See defines above).
1460
1461 We build the initial structure using the lists, and then convert
1462 it into the compressed table form which allows faster lookups
1463 (but cant be modified once converted).
a3621e74
YO
1464 */
1465
a3621e74
YO
1466 STRLEN transcount = 1;
1467
1e2e3d02
YO
1468 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1469 "%*sCompiling trie using list compiler\n",
1470 (int)depth * 2 + 2, ""));
446bd890 1471
c944940b
JH
1472 trie->states = (reg_trie_state *)
1473 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1474 sizeof(reg_trie_state) );
a3621e74
YO
1475 TRIE_LIST_NEW(1);
1476 next_alloc = 2;
1477
1478 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1479
c445ea15
AL
1480 regnode * const noper = NEXTOPER( cur );
1481 U8 *uc = (U8*)STRING( noper );
1482 const U8 * const e = uc + STR_LEN( noper );
1483 U32 state = 1; /* required init */
1484 U16 charid = 0; /* sanity init */
1485 U8 *scan = (U8*)NULL; /* sanity init */
1486 STRLEN foldlen = 0; /* required init */
07be1b83 1487 U32 wordlen = 0; /* required init */
c445ea15
AL
1488 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1489
3dab1dad 1490 if (OP(noper) != NOTHING) {
786e8c11 1491 for ( ; uc < e ; uc += len ) {
c445ea15 1492
786e8c11 1493 TRIE_READ_CHAR;
c445ea15 1494
786e8c11
YO
1495 if ( uvc < 256 ) {
1496 charid = trie->charmap[ uvc ];
c445ea15 1497 } else {
55eed653 1498 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1499 if ( !svpp ) {
1500 charid = 0;
1501 } else {
1502 charid=(U16)SvIV( *svpp );
1503 }
c445ea15 1504 }
786e8c11
YO
1505 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1506 if ( charid ) {
a3621e74 1507
786e8c11
YO
1508 U16 check;
1509 U32 newstate = 0;
a3621e74 1510
786e8c11
YO
1511 charid--;
1512 if ( !trie->states[ state ].trans.list ) {
1513 TRIE_LIST_NEW( state );
c445ea15 1514 }
786e8c11
YO
1515 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1516 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1517 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1518 break;
1519 }
1520 }
1521 if ( ! newstate ) {
1522 newstate = next_alloc++;
1523 TRIE_LIST_PUSH( state, charid, newstate );
1524 transcount++;
1525 }
1526 state = newstate;
1527 } else {
1528 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1529 }
a28509cc 1530 }
c445ea15 1531 }
3dab1dad 1532 TRIE_HANDLE_WORD(state);
a3621e74
YO
1533
1534 } /* end second pass */
1535
1e2e3d02
YO
1536 /* next alloc is the NEXT state to be allocated */
1537 trie->statecount = next_alloc;
c944940b
JH
1538 trie->states = (reg_trie_state *)
1539 PerlMemShared_realloc( trie->states,
1540 next_alloc
1541 * sizeof(reg_trie_state) );
a3621e74 1542
3dab1dad 1543 /* and now dump it out before we compress it */
2b8b4781
NC
1544 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1545 revcharmap, next_alloc,
1546 depth+1)
1e2e3d02 1547 );
a3621e74 1548
c944940b
JH
1549 trie->trans = (reg_trie_trans *)
1550 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1551 {
1552 U32 state;
a3621e74
YO
1553 U32 tp = 0;
1554 U32 zp = 0;
1555
1556
1557 for( state=1 ; state < next_alloc ; state ++ ) {
1558 U32 base=0;
1559
1560 /*
1561 DEBUG_TRIE_COMPILE_MORE_r(
1562 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1563 );
1564 */
1565
1566 if (trie->states[state].trans.list) {
1567 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1568 U16 maxid=minid;
a28509cc 1569 U16 idx;
a3621e74
YO
1570
1571 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1572 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1573 if ( forid < minid ) {
1574 minid=forid;
1575 } else if ( forid > maxid ) {
1576 maxid=forid;
1577 }
a3621e74
YO
1578 }
1579 if ( transcount < tp + maxid - minid + 1) {
1580 transcount *= 2;
c944940b
JH
1581 trie->trans = (reg_trie_trans *)
1582 PerlMemShared_realloc( trie->trans,
446bd890
NC
1583 transcount
1584 * sizeof(reg_trie_trans) );
a3621e74
YO
1585 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1586 }
1587 base = trie->uniquecharcount + tp - minid;
1588 if ( maxid == minid ) {
1589 U32 set = 0;
1590 for ( ; zp < tp ; zp++ ) {
1591 if ( ! trie->trans[ zp ].next ) {
1592 base = trie->uniquecharcount + zp - minid;
1593 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1594 trie->trans[ zp ].check = state;
1595 set = 1;
1596 break;
1597 }
1598 }
1599 if ( !set ) {
1600 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1601 trie->trans[ tp ].check = state;
1602 tp++;
1603 zp = tp;
1604 }
1605 } else {
1606 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1607 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1608 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1609 trie->trans[ tid ].check = state;
1610 }
1611 tp += ( maxid - minid + 1 );
1612 }
1613 Safefree(trie->states[ state ].trans.list);
1614 }
1615 /*
1616 DEBUG_TRIE_COMPILE_MORE_r(
1617 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1618 );
1619 */
1620 trie->states[ state ].trans.base=base;
1621 }
cc601c31 1622 trie->lasttrans = tp + 1;
a3621e74
YO
1623 }
1624 } else {
1625 /*
1626 Second Pass -- Flat Table Representation.
1627
1628 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1629 We know that we will need Charcount+1 trans at most to store the data
1630 (one row per char at worst case) So we preallocate both structures
1631 assuming worst case.
1632
1633 We then construct the trie using only the .next slots of the entry
1634 structs.
1635
1636 We use the .check field of the first entry of the node temporarily to
1637 make compression both faster and easier by keeping track of how many non
1638 zero fields are in the node.
1639
1640 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1641 transition.
1642
1643 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1644 number representing the first entry of the node, and state as a
1645 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1646 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1647 are 2 entrys per node. eg:
1648
1649 A B A B
1650 1. 2 4 1. 3 7
1651 2. 0 3 3. 0 5
1652 3. 0 0 5. 0 0
1653 4. 0 0 7. 0 0
1654
1655 The table is internally in the right hand, idx form. However as we also
1656 have to deal with the states array which is indexed by nodenum we have to
1657 use TRIE_NODENUM() to convert.
1658
1659 */
1e2e3d02
YO
1660 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1661 "%*sCompiling trie using table compiler\n",
1662 (int)depth * 2 + 2, ""));
3dab1dad 1663
c944940b
JH
1664 trie->trans = (reg_trie_trans *)
1665 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1666 * trie->uniquecharcount + 1,
1667 sizeof(reg_trie_trans) );
1668 trie->states = (reg_trie_state *)
1669 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1670 sizeof(reg_trie_state) );
a3621e74
YO
1671 next_alloc = trie->uniquecharcount + 1;
1672
3dab1dad 1673
a3621e74
YO
1674 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1675
c445ea15 1676 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1677 const U8 *uc = (U8*)STRING( noper );
1678 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1679
1680 U32 state = 1; /* required init */
1681
1682 U16 charid = 0; /* sanity init */
1683 U32 accept_state = 0; /* sanity init */
1684 U8 *scan = (U8*)NULL; /* sanity init */
1685
1686 STRLEN foldlen = 0; /* required init */
07be1b83 1687 U32 wordlen = 0; /* required init */
a3621e74
YO
1688 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1689
3dab1dad 1690 if ( OP(noper) != NOTHING ) {
786e8c11 1691 for ( ; uc < e ; uc += len ) {
a3621e74 1692
786e8c11 1693 TRIE_READ_CHAR;
a3621e74 1694
786e8c11
YO
1695 if ( uvc < 256 ) {
1696 charid = trie->charmap[ uvc ];
1697 } else {
55eed653 1698 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1699 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1700 }
786e8c11
YO
1701 if ( charid ) {
1702 charid--;
1703 if ( !trie->trans[ state + charid ].next ) {
1704 trie->trans[ state + charid ].next = next_alloc;
1705 trie->trans[ state ].check++;
1706 next_alloc += trie->uniquecharcount;
1707 }
1708 state = trie->trans[ state + charid ].next;
1709 } else {
1710 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1711 }
1712 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1713 }
a3621e74 1714 }
3dab1dad
YO
1715 accept_state = TRIE_NODENUM( state );
1716 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1717
1718 } /* end second pass */
1719
3dab1dad 1720 /* and now dump it out before we compress it */
2b8b4781
NC
1721 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1722 revcharmap,
1723 next_alloc, depth+1));
a3621e74 1724
a3621e74
YO
1725 {
1726 /*
1727 * Inplace compress the table.*
1728
1729 For sparse data sets the table constructed by the trie algorithm will
1730 be mostly 0/FAIL transitions or to put it another way mostly empty.
1731 (Note that leaf nodes will not contain any transitions.)
1732
1733 This algorithm compresses the tables by eliminating most such
1734 transitions, at the cost of a modest bit of extra work during lookup:
1735
1736 - Each states[] entry contains a .base field which indicates the
1737 index in the state[] array wheres its transition data is stored.
1738
1739 - If .base is 0 there are no valid transitions from that node.
1740
1741 - If .base is nonzero then charid is added to it to find an entry in
1742 the trans array.
1743
1744 -If trans[states[state].base+charid].check!=state then the
1745 transition is taken to be a 0/Fail transition. Thus if there are fail
1746 transitions at the front of the node then the .base offset will point
1747 somewhere inside the previous nodes data (or maybe even into a node
1748 even earlier), but the .check field determines if the transition is
1749 valid.
1750
786e8c11 1751 XXX - wrong maybe?
a3621e74
YO
1752 The following process inplace converts the table to the compressed
1753 table: We first do not compress the root node 1,and mark its all its
1754 .check pointers as 1 and set its .base pointer as 1 as well. This
1755 allows to do a DFA construction from the compressed table later, and
1756 ensures that any .base pointers we calculate later are greater than
1757 0.
1758
1759 - We set 'pos' to indicate the first entry of the second node.
1760
1761 - We then iterate over the columns of the node, finding the first and
1762 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1763 and set the .check pointers accordingly, and advance pos
1764 appropriately and repreat for the next node. Note that when we copy
1765 the next pointers we have to convert them from the original
1766 NODEIDX form to NODENUM form as the former is not valid post
1767 compression.
1768
1769 - If a node has no transitions used we mark its base as 0 and do not
1770 advance the pos pointer.
1771
1772 - If a node only has one transition we use a second pointer into the
1773 structure to fill in allocated fail transitions from other states.
1774 This pointer is independent of the main pointer and scans forward
1775 looking for null transitions that are allocated to a state. When it
1776 finds one it writes the single transition into the "hole". If the
786e8c11 1777 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1778
1779 - Once compressed we can Renew/realloc the structures to release the
1780 excess space.
1781
1782 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1783 specifically Fig 3.47 and the associated pseudocode.
1784
1785 demq
1786 */
a3b680e6 1787 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1788 U32 state, charid;
a3621e74 1789 U32 pos = 0, zp=0;
1e2e3d02 1790 trie->statecount = laststate;
a3621e74
YO
1791
1792 for ( state = 1 ; state < laststate ; state++ ) {
1793 U8 flag = 0;
a28509cc
AL
1794 const U32 stateidx = TRIE_NODEIDX( state );
1795 const U32 o_used = trie->trans[ stateidx ].check;
1796 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1797 trie->trans[ stateidx ].check = 0;
1798
1799 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1800 if ( flag || trie->trans[ stateidx + charid ].next ) {
1801 if ( trie->trans[ stateidx + charid ].next ) {
1802 if (o_used == 1) {
1803 for ( ; zp < pos ; zp++ ) {
1804 if ( ! trie->trans[ zp ].next ) {
1805 break;
1806 }
1807 }
1808 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1809 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1810 trie->trans[ zp ].check = state;
1811 if ( ++zp > pos ) pos = zp;
1812 break;
1813 }
1814 used--;
1815 }
1816 if ( !flag ) {
1817 flag = 1;
1818 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1819 }
1820 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1821 trie->trans[ pos ].check = state;
1822 pos++;
1823 }
1824 }
1825 }
cc601c31 1826 trie->lasttrans = pos + 1;
c944940b
JH
1827 trie->states = (reg_trie_state *)
1828 PerlMemShared_realloc( trie->states, laststate
1829 * sizeof(reg_trie_state) );
a3621e74 1830 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1831 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1832 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1833 (int)depth * 2 + 2,"",
1834 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1835 (IV)next_alloc,
1836 (IV)pos,
a3621e74
YO
1837 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1838 );
1839
1840 } /* end table compress */
1841 }
1e2e3d02
YO
1842 DEBUG_TRIE_COMPILE_MORE_r(
1843 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1844 (int)depth * 2 + 2, "",
1845 (UV)trie->statecount,
1846 (UV)trie->lasttrans)
1847 );
cc601c31 1848 /* resize the trans array to remove unused space */
c944940b
JH
1849 trie->trans = (reg_trie_trans *)
1850 PerlMemShared_realloc( trie->trans, trie->lasttrans
1851 * sizeof(reg_trie_trans) );
a3621e74 1852
3dab1dad 1853 /* and now dump out the compressed format */
2b8b4781 1854 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1855
3dab1dad 1856 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1857 U8 nodetype =(U8)(flags & 0xFF);
1858 char *str=NULL;
786e8c11 1859
07be1b83 1860#ifdef DEBUGGING
e62cc96a 1861 regnode *optimize = NULL;
7122b237
YO
1862#ifdef RE_TRACK_PATTERN_OFFSETS
1863
b57a0404
JH
1864 U32 mjd_offset = 0;
1865 U32 mjd_nodelen = 0;
7122b237
YO
1866#endif /* RE_TRACK_PATTERN_OFFSETS */
1867#endif /* DEBUGGING */
a3621e74 1868 /*
3dab1dad
YO
1869 This means we convert either the first branch or the first Exact,
1870 depending on whether the thing following (in 'last') is a branch
1871 or not and whther first is the startbranch (ie is it a sub part of
1872 the alternation or is it the whole thing.)
1873 Assuming its a sub part we conver the EXACT otherwise we convert
1874 the whole branch sequence, including the first.
a3621e74 1875 */
3dab1dad 1876 /* Find the node we are going to overwrite */
7f69552c 1877 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1878 /* branch sub-chain */
3dab1dad 1879 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1880#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1881 DEBUG_r({
1882 mjd_offset= Node_Offset((convert));
1883 mjd_nodelen= Node_Length((convert));
1884 });
7122b237 1885#endif
7f69552c 1886 /* whole branch chain */
7122b237
YO
1887 }
1888#ifdef RE_TRACK_PATTERN_OFFSETS
1889 else {
7f69552c
YO
1890 DEBUG_r({
1891 const regnode *nop = NEXTOPER( convert );
1892 mjd_offset= Node_Offset((nop));
1893 mjd_nodelen= Node_Length((nop));
1894 });
07be1b83
YO
1895 }
1896 DEBUG_OPTIMISE_r(
1897 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1898 (int)depth * 2 + 2, "",
786e8c11 1899 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1900 );
7122b237 1901#endif
3dab1dad
YO
1902 /* But first we check to see if there is a common prefix we can
1903 split out as an EXACT and put in front of the TRIE node. */
1904 trie->startstate= 1;
55eed653 1905 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1906 U32 state;
1e2e3d02 1907 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1908 U32 ofs = 0;
8e11feef
RGS
1909 I32 idx = -1;
1910 U32 count = 0;
1911 const U32 base = trie->states[ state ].trans.base;
a3621e74 1912
3dab1dad 1913 if ( trie->states[state].wordnum )
8e11feef 1914 count = 1;
a3621e74 1915
8e11feef 1916 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1917 if ( ( base + ofs >= trie->uniquecharcount ) &&
1918 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1919 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1920 {
3dab1dad 1921 if ( ++count > 1 ) {
2b8b4781 1922 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1923 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1924 if ( state == 1 ) break;
3dab1dad
YO
1925 if ( count == 2 ) {
1926 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1927 DEBUG_OPTIMISE_r(
8e11feef
RGS
1928 PerlIO_printf(Perl_debug_log,
1929 "%*sNew Start State=%"UVuf" Class: [",
1930 (int)depth * 2 + 2, "",
786e8c11 1931 (UV)state));
be8e71aa 1932 if (idx >= 0) {
2b8b4781 1933 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1934 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1935
3dab1dad 1936 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1937 if ( folder )
1938 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1939 DEBUG_OPTIMISE_r(
07be1b83 1940 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1941 );
8e11feef
RGS
1942 }
1943 }
1944 TRIE_BITMAP_SET(trie,*ch);
1945 if ( folder )
1946 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1947 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1948 }
1949 idx = ofs;
1950 }
3dab1dad
YO
1951 }
1952 if ( count == 1 ) {
2b8b4781 1953 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1954 char *ch = SvPV_nolen( *tmp );
1955 DEBUG_OPTIMISE_r({
1956 SV *sv=sv_newmortal();
8e11feef
RGS
1957 PerlIO_printf( Perl_debug_log,
1958 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1959 (int)depth * 2 + 2, "",
de734bd5
A
1960 (UV)state, (UV)idx,
1961 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1962 PL_colors[0], PL_colors[1],
1963 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1964 PERL_PV_ESCAPE_FIRSTCHAR
1965 )
1966 );
1967 });
3dab1dad
YO
1968 if ( state==1 ) {
1969 OP( convert ) = nodetype;
1970 str=STRING(convert);
1971 STR_LEN(convert)=0;
1972 }
de734bd5
A
1973 while (*ch) {
1974 *str++ = *ch++;
1975 STR_LEN(convert)++;
1976 }
1977
8e11feef 1978 } else {
f9049ba1 1979#ifdef DEBUGGING
8e11feef
RGS
1980 if (state>1)
1981 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1982#endif
8e11feef
RGS
1983 break;
1984 }
1985 }
3dab1dad 1986 if (str) {
8e11feef 1987 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1988 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1989 trie->startstate = state;
07be1b83
YO
1990 trie->minlen -= (state - 1);
1991 trie->maxlen -= (state - 1);
1992 DEBUG_r({
1993 regnode *fix = convert;
de734bd5 1994 U32 word = trie->wordcount;
07be1b83
YO
1995 mjd_nodelen++;
1996 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1997 while( ++fix < n ) {
1998 Set_Node_Offset_Length(fix, 0, 0);
1999 }
de734bd5 2000 while (word--) {
2b8b4781 2001 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
2002 if (tmp) {
2003 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2004 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2005 else
2006 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2007 }
2008 }
07be1b83 2009 });
8e11feef
RGS
2010 if (trie->maxlen) {
2011 convert = n;
2012 } else {
3dab1dad 2013 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2014 DEBUG_r(optimize= n);
3dab1dad
YO
2015 }
2016 }
2017 }
a5ca303d
YO
2018 if (!jumper)
2019 jumper = last;
3dab1dad 2020 if ( trie->maxlen ) {
8e11feef
RGS
2021 NEXT_OFF( convert ) = (U16)(tail - convert);
2022 ARG_SET( convert, data_slot );
786e8c11
YO
2023 /* Store the offset to the first unabsorbed branch in
2024 jump[0], which is otherwise unused by the jump logic.
2025 We use this when dumping a trie and during optimisation. */
2026 if (trie->jump)
7f69552c 2027 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2028
786e8c11
YO
2029 /* XXXX */
2030 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2031 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2032 {
2033 OP( convert ) = TRIEC;
2034 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2035 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2036 trie->bitmap= NULL;
2037 } else
2038 OP( convert ) = TRIE;
a3621e74 2039
3dab1dad
YO
2040 /* store the type in the flags */
2041 convert->flags = nodetype;
a5ca303d
YO
2042 DEBUG_r({
2043 optimize = convert
2044 + NODE_STEP_REGNODE
2045 + regarglen[ OP( convert ) ];
2046 });
2047 /* XXX We really should free up the resource in trie now,
2048 as we won't use them - (which resources?) dmq */
3dab1dad 2049 }
a3621e74 2050 /* needed for dumping*/
e62cc96a 2051 DEBUG_r(if (optimize) {
07be1b83 2052 regnode *opt = convert;
bcdf7404 2053
e62cc96a 2054 while ( ++opt < optimize) {
07be1b83
YO
2055 Set_Node_Offset_Length(opt,0,0);
2056 }
786e8c11
YO
2057 /*
2058 Try to clean up some of the debris left after the
2059 optimisation.
a3621e74 2060 */
786e8c11 2061 while( optimize < jumper ) {
07be1b83 2062 mjd_nodelen += Node_Length((optimize));
a3621e74 2063 OP( optimize ) = OPTIMIZED;
07be1b83 2064 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2065 optimize++;
2066 }
07be1b83 2067 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2068 });
2069 } /* end node insert */
55eed653 2070 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2071#ifdef DEBUGGING
2072 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2073 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2074#else
2075 SvREFCNT_dec(revcharmap);
07be1b83 2076#endif
786e8c11
YO
2077 return trie->jump
2078 ? MADE_JUMP_TRIE
2079 : trie->startstate>1
2080 ? MADE_EXACT_TRIE
2081 : MADE_TRIE;
2082}
2083
2084STATIC void
2085S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2086{
2087/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2088
2089 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2090 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2091 ISBN 0-201-10088-6
2092
2093 We find the fail state for each state in the trie, this state is the longest proper
2094 suffix of the current states 'word' that is also a proper prefix of another word in our
2095 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2096 the DFA not to have to restart after its tried and failed a word at a given point, it
2097 simply continues as though it had been matching the other word in the first place.
2098 Consider
2099 'abcdgu'=~/abcdefg|cdgu/
2100 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2101 fail, which would bring use to the state representing 'd' in the second word where we would
2102 try 'g' and succeed, prodceding to match 'cdgu'.
2103 */
2104 /* add a fail transition */
3251b653
NC
2105 const U32 trie_offset = ARG(source);
2106 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2107 U32 *q;
2108 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2109 const U32 numstates = trie->statecount;
786e8c11
YO
2110 const U32 ubound = trie->lasttrans + ucharcount;
2111 U32 q_read = 0;
2112 U32 q_write = 0;
2113 U32 charid;
2114 U32 base = trie->states[ 1 ].trans.base;
2115 U32 *fail;
2116 reg_ac_data *aho;
2117 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2118 GET_RE_DEBUG_FLAGS_DECL;
2119#ifndef DEBUGGING
2120 PERL_UNUSED_ARG(depth);
2121#endif
2122
2123
2124 ARG_SET( stclass, data_slot );
c944940b 2125 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2126 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2127 aho->trie=trie_offset;
446bd890
NC
2128 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2129 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2130 Newxz( q, numstates, U32);
c944940b 2131 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2132 aho->refcount = 1;
2133 fail = aho->fail;
2134 /* initialize fail[0..1] to be 1 so that we always have
2135 a valid final fail state */
2136 fail[ 0 ] = fail[ 1 ] = 1;
2137
2138 for ( charid = 0; charid < ucharcount ; charid++ ) {
2139 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2140 if ( newstate ) {
2141 q[ q_write ] = newstate;
2142 /* set to point at the root */
2143 fail[ q[ q_write++ ] ]=1;
2144 }
2145 }
2146 while ( q_read < q_write) {
2147 const U32 cur = q[ q_read++ % numstates ];
2148 base = trie->states[ cur ].trans.base;
2149
2150 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2151 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2152 if (ch_state) {
2153 U32 fail_state = cur;
2154 U32 fail_base;
2155 do {
2156 fail_state = fail[ fail_state ];
2157 fail_base = aho->states[ fail_state ].trans.base;
2158 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2159
2160 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2161 fail[ ch_state ] = fail_state;
2162 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2163 {
2164 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2165 }
2166 q[ q_write++ % numstates] = ch_state;
2167 }
2168 }
2169 }
2170 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2171 when we fail in state 1, this allows us to use the
2172 charclass scan to find a valid start char. This is based on the principle
2173 that theres a good chance the string being searched contains lots of stuff
2174 that cant be a start char.
2175 */
2176 fail[ 0 ] = fail[ 1 ] = 0;
2177 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2178 PerlIO_printf(Perl_debug_log,
2179 "%*sStclass Failtable (%"UVuf" states): 0",
2180 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2181 );
786e8c11
YO
2182 for( q_read=1; q_read<numstates; q_read++ ) {
2183 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2184 }
2185 PerlIO_printf(Perl_debug_log, "\n");
2186 });
2187 Safefree(q);
2188 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2189}
2190
786e8c11 2191
a3621e74 2192/*
5d1c421c
JH
2193 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2194 * These need to be revisited when a newer toolchain becomes available.
2195 */
2196#if defined(__sparc64__) && defined(__GNUC__)
2197# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2198# undef SPARC64_GCC_WORKAROUND
2199# define SPARC64_GCC_WORKAROUND 1
2200# endif
2201#endif
2202
07be1b83 2203#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2204 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2205 SV * const mysv=sv_newmortal(); \
2206 regnode *Next = regnext(scan); \
2207 regprop(RExC_rx, mysv, scan); \
7f69552c 2208 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2209 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2210 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2211 }});
07be1b83 2212
1de06328
YO
2213
2214
2215
2216
07be1b83
YO
2217#define JOIN_EXACT(scan,min,flags) \
2218 if (PL_regkind[OP(scan)] == EXACT) \
2219 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2220
be8e71aa 2221STATIC U32
07be1b83
YO
2222S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2223 /* Merge several consecutive EXACTish nodes into one. */
2224 regnode *n = regnext(scan);
2225 U32 stringok = 1;
2226 regnode *next = scan + NODE_SZ_STR(scan);
2227 U32 merged = 0;
2228 U32 stopnow = 0;
2229#ifdef DEBUGGING
2230 regnode *stop = scan;
72f13be8 2231 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2232#else
d47053eb
RGS
2233 PERL_UNUSED_ARG(depth);
2234#endif
2235#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2236 PERL_UNUSED_ARG(flags);
2237 PERL_UNUSED_ARG(val);
07be1b83 2238#endif
07be1b83
YO
2239 DEBUG_PEEP("join",scan,depth);
2240
2241 /* Skip NOTHING, merge EXACT*. */
2242 while (n &&
2243 ( PL_regkind[OP(n)] == NOTHING ||
2244 (stringok && (OP(n) == OP(scan))))
2245 && NEXT_OFF(n)
2246 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2247
2248 if (OP(n) == TAIL || n > next)
2249 stringok = 0;
2250 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2251 DEBUG_PEEP("skip:",n,depth);
2252 NEXT_OFF(scan) += NEXT_OFF(n);
2253 next = n + NODE_STEP_REGNODE;
2254#ifdef DEBUGGING
2255 if (stringok)
2256 stop = n;
2257#endif
2258 n = regnext(n);
2259 }
2260 else if (stringok) {
786e8c11 2261 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2262 regnode * const nnext = regnext(n);
2263
2264 DEBUG_PEEP("merg",n,depth);
2265
2266 merged++;
2267 if (oldl + STR_LEN(n) > U8_MAX)
2268 break;
2269 NEXT_OFF(scan) += NEXT_OFF(n);
2270 STR_LEN(scan) += STR_LEN(n);
2271 next = n + NODE_SZ_STR(n);
2272 /* Now we can overwrite *n : */
2273 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2274#ifdef DEBUGGING
2275 stop = next - 1;
2276#endif
2277 n = nnext;
2278 if (stopnow) break;
2279 }
2280
d47053eb
RGS
2281#ifdef EXPERIMENTAL_INPLACESCAN
2282 if (flags && !NEXT_OFF(n)) {
2283 DEBUG_PEEP("atch", val, depth);
2284 if (reg_off_by_arg[OP(n)]) {
2285 ARG_SET(n, val - n);
2286 }
2287 else {
2288 NEXT_OFF(n) = val - n;
2289 }
2290 stopnow = 1;
2291 }
07be1b83
YO
2292#endif
2293 }
2294
2295 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2296 /*
2297 Two problematic code points in Unicode casefolding of EXACT nodes:
2298
2299 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2300 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2301
2302 which casefold to
2303
2304 Unicode UTF-8
2305
2306 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2307 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2308
2309 This means that in case-insensitive matching (or "loose matching",
2310 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2311 length of the above casefolded versions) can match a target string
2312 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2313 This would rather mess up the minimum length computation.
2314
2315 What we'll do is to look for the tail four bytes, and then peek
2316 at the preceding two bytes to see whether we need to decrease
2317 the minimum length by four (six minus two).
2318
2319 Thanks to the design of UTF-8, there cannot be false matches:
2320 A sequence of valid UTF-8 bytes cannot be a subsequence of
2321 another valid sequence of UTF-8 bytes.
2322
2323 */
2324 char * const s0 = STRING(scan), *s, *t;
2325 char * const s1 = s0 + STR_LEN(scan) - 1;
2326 char * const s2 = s1 - 4;
e294cc5d
JH
2327#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2328 const char t0[] = "\xaf\x49\xaf\x42";
2329#else
07be1b83 2330 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2331#endif
07be1b83
YO
2332 const char * const t1 = t0 + 3;
2333
2334 for (s = s0 + 2;
2335 s < s2 && (t = ninstr(s, s1, t0, t1));
2336 s = t + 4) {
e294cc5d
JH
2337#ifdef EBCDIC
2338 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2339 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2340#else
07be1b83
YO
2341 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2342 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2343#endif
07be1b83
YO
2344 *min -= 4;
2345 }
2346 }
2347
2348#ifdef DEBUGGING
2349 /* Allow dumping */
2350 n = scan + NODE_SZ_STR(scan);
2351 while (n <= stop) {
2352 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2353 OP(n) = OPTIMIZED;
2354 NEXT_OFF(n) = 0;
2355 }
2356 n++;
2357 }
2358#endif
2359 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2360 return stopnow;
2361}
2362
653099ff
GS
2363/* REx optimizer. Converts nodes into quickier variants "in place".
2364 Finds fixed substrings. */
2365
a0288114 2366/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2367 to the position after last scanned or to NULL. */
2368
40d049e4
YO
2369#define INIT_AND_WITHP \
2370 assert(!and_withp); \
2371 Newx(and_withp,1,struct regnode_charclass_class); \
2372 SAVEFREEPV(and_withp)
07be1b83 2373
b515a41d
YO
2374/* this is a chain of data about sub patterns we are processing that
2375 need to be handled seperately/specially in study_chunk. Its so
2376 we can simulate recursion without losing state. */
2377struct scan_frame;
2378typedef struct scan_frame {
2379 regnode *last; /* last node to process in this frame */
2380 regnode *next; /* next node to process when last is reached */
2381 struct scan_frame *prev; /*previous frame*/
2382 I32 stop; /* what stopparen do we use */
2383} scan_frame;
2384
304ee84b
YO
2385
2386#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2387
76e3520e 2388STATIC I32
40d049e4 2389S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2390 I32 *minlenp, I32 *deltap,
40d049e4
YO
2391 regnode *last,
2392 scan_data_t *data,
2393 I32 stopparen,
2394 U8* recursed,
2395 struct regnode_charclass_class *and_withp,
2396 U32 flags, U32 depth)
c277df42
IZ
2397 /* scanp: Start here (read-write). */
2398 /* deltap: Write maxlen-minlen here. */
2399 /* last: Stop before this one. */
40d049e4
YO
2400 /* data: string data about the pattern */
2401 /* stopparen: treat close N as END */
2402 /* recursed: which subroutines have we recursed into */
2403 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2404{
97aff369 2405 dVAR;
c277df42
IZ
2406 I32 min = 0, pars = 0, code;
2407 regnode *scan = *scanp, *next;
2408 I32 delta = 0;
2409 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2410 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2411 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2412 scan_data_t data_fake;
a3621e74 2413 SV *re_trie_maxbuff = NULL;
786e8c11 2414 regnode *first_non_open = scan;
e2e6a0f1 2415 I32 stopmin = I32_MAX;
8aa23a47
YO
2416 scan_frame *frame = NULL;
2417
a3621e74 2418 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2419
13a24bad 2420#ifdef DEBUGGING
40d049e4 2421 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2422#endif
40d049e4 2423
786e8c11 2424 if ( depth == 0 ) {
40d049e4 2425 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2426 first_non_open=regnext(first_non_open);
2427 }
2428
b81d288d 2429
8aa23a47
YO
2430 fake_study_recurse:
2431 while ( scan && OP(scan) != END && scan < last ){
2432 /* Peephole optimizer: */
304ee84b 2433 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2434 DEBUG_PEEP("Peep",scan,depth);
2435 JOIN_EXACT(scan,&min,0);
2436
2437 /* Follow the next-chain of the current node and optimize
2438 away all the NOTHINGs from it. */
2439 if (OP(scan) != CURLYX) {
2440 const int max = (reg_off_by_arg[OP(scan)]
2441 ? I32_MAX
2442 /* I32 may be smaller than U16 on CRAYs! */
2443 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2444 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2445 int noff;
2446 regnode *n = scan;
2447
2448 /* Skip NOTHING and LONGJMP. */
2449 while ((n = regnext(n))
2450 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2451 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2452 && off + noff < max)
2453 off += noff;
2454 if (reg_off_by_arg[OP(scan)])
2455 ARG(scan) = off;
2456 else
2457 NEXT_OFF(scan) = off;
2458 }
a3621e74 2459
c277df42 2460
8aa23a47
YO
2461
2462 /* The principal pseudo-switch. Cannot be a switch, since we
2463 look into several different things. */
2464 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2465 || OP(scan) == IFTHEN) {
2466 next = regnext(scan);
2467 code = OP(scan);
2468 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2469
2470 if (OP(next) == code || code == IFTHEN) {
2471 /* NOTE - There is similar code to this block below for handling
2472 TRIE nodes on a re-study. If you change stuff here check there
2473 too. */
2474 I32 max1 = 0, min1 = I32_MAX, num = 0;
2475 struct regnode_charclass_class accum;
2476 regnode * const startbranch=scan;
2477
2478 if (flags & SCF_DO_SUBSTR)
304ee84b 2479 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2480 if (flags & SCF_DO_STCLASS)
2481 cl_init_zero(pRExC_state, &accum);
2482
2483 while (OP(scan) == code) {
2484 I32 deltanext, minnext, f = 0, fake;
2485 struct regnode_charclass_class this_class;
2486
2487 num++;
2488 data_fake.flags = 0;
2489 if (data) {
2490 data_fake.whilem_c = data->whilem_c;
2491 data_fake.last_closep = data->last_closep;
2492 }
2493 else
2494 data_fake.last_closep = &fake;
58e23c8d
YO
2495
2496 data_fake.pos_delta = delta;
8aa23a47
YO
2497 next = regnext(scan);
2498 scan = NEXTOPER(scan);
2499 if (code != BRANCH)
c277df42 2500 scan = NEXTOPER(scan);
8aa23a47
YO
2501 if (flags & SCF_DO_STCLASS) {
2502 cl_init(pRExC_state, &this_class);
2503 data_fake.start_class = &this_class;
2504 f = SCF_DO_STCLASS_AND;
58e23c8d 2505 }
8aa23a47
YO
2506 if (flags & SCF_WHILEM_VISITED_POS)
2507 f |= SCF_WHILEM_VISITED_POS;
2508
2509 /* we suppose the run is continuous, last=next...*/
2510 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2511 next, &data_fake,
2512 stopparen, recursed, NULL, f,depth+1);
2513 if (min1 > minnext)
2514 min1 = minnext;
2515 if (max1 < minnext + deltanext)
2516 max1 = minnext + deltanext;
2517 if (deltanext == I32_MAX)
2518 is_inf = is_inf_internal = 1;
2519 scan = next;
2520 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2521 pars++;
2522 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2523 if ( stopmin > minnext)
2524 stopmin = min + min1;
2525 flags &= ~SCF_DO_SUBSTR;
2526 if (data)
2527 data->flags |= SCF_SEEN_ACCEPT;
2528 }
2529 if (data) {
2530 if (data_fake.flags & SF_HAS_EVAL)
2531 data->flags |= SF_HAS_EVAL;
2532 data->whilem_c = data_fake.whilem_c;
3dab1dad 2533 }
8aa23a47
YO
2534 if (flags & SCF_DO_STCLASS)
2535 cl_or(pRExC_state, &accum, &this_class);
2536 }
2537 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2538 min1 = 0;
2539 if (flags & SCF_DO_SUBSTR) {
2540 data->pos_min += min1;
2541 data->pos_delta += max1 - min1;
2542 if (max1 != min1 || is_inf)
2543 data->longest = &(data->longest_float);
2544 }
2545 min += min1;
2546 delta += max1 - min1;
2547 if (flags & SCF_DO_STCLASS_OR) {
2548 cl_or(pRExC_state, data->start_class, &accum);
2549 if (min1) {
2550 cl_and(data->start_class, and_withp);
2551 flags &= ~SCF_DO_STCLASS;
653099ff 2552 }
8aa23a47
YO
2553 }
2554 else if (flags & SCF_DO_STCLASS_AND) {
2555 if (min1) {
2556 cl_and(data->start_class, &accum);
2557 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2558 }
8aa23a47
YO
2559 else {
2560 /* Switch to OR mode: cache the old value of
2561 * data->start_class */
2562 INIT_AND_WITHP;
2563 StructCopy(data->start_class, and_withp,
2564 struct regnode_charclass_class);
2565 flags &= ~SCF_DO_STCLASS_AND;
2566 StructCopy(&accum, data->start_class,
2567 struct regnode_charclass_class);
2568 flags |= SCF_DO_STCLASS_OR;
2569 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2570 }
8aa23a47 2571 }
a3621e74 2572
8aa23a47
YO
2573 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2574 /* demq.
a3621e74 2575
8aa23a47
YO
2576 Assuming this was/is a branch we are dealing with: 'scan' now
2577 points at the item that follows the branch sequence, whatever
2578 it is. We now start at the beginning of the sequence and look
2579 for subsequences of
a3621e74 2580
8aa23a47
YO
2581 BRANCH->EXACT=>x1
2582 BRANCH->EXACT=>x2
2583 tail
a3621e74 2584
8aa23a47 2585 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2586
8aa23a47
YO
2587 If we can find such a subseqence we need to turn the first
2588 element into a trie and then add the subsequent branch exact
2589 strings to the trie.
a3621e74 2590
8aa23a47 2591 We have two cases
a3621e74 2592
8aa23a47 2593 1. patterns where the whole set of branch can be converted.
a3621e74 2594
8aa23a47 2595 2. patterns where only a subset can be converted.
a3621e74 2596
8aa23a47
YO
2597 In case 1 we can replace the whole set with a single regop
2598 for the trie. In case 2 we need to keep the start and end
2599 branchs so
a3621e74 2600
8aa23a47
YO
2601 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2602 becomes BRANCH TRIE; BRANCH X;
786e8c11 2603
8aa23a47
YO
2604 There is an additional case, that being where there is a
2605 common prefix, which gets split out into an EXACT like node
2606 preceding the TRIE node.
a3621e74 2607
8aa23a47
YO
2608 If x(1..n)==tail then we can do a simple trie, if not we make
2609 a "jump" trie, such that when we match the appropriate word
2610 we "jump" to the appopriate tail node. Essentailly we turn
2611 a nested if into a case structure of sorts.
b515a41d 2612
8aa23a47
YO
2613 */
2614
2615 int made=0;
2616 if (!re_trie_maxbuff) {
2617 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2618 if (!SvIOK(re_trie_maxbuff))
2619 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2620 }
2621 if ( SvIV(re_trie_maxbuff)>=0 ) {
2622 regnode *cur;
2623 regnode *first = (regnode *)NULL;
2624 regnode *last = (regnode *)NULL;
2625 regnode *tail = scan;
2626 U8 optype = 0;
2627 U32 count=0;
a3621e74
YO
2628
2629#ifdef DEBUGGING
8aa23a47 2630 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2631#endif
8aa23a47
YO
2632 /* var tail is used because there may be a TAIL
2633 regop in the way. Ie, the exacts will point to the
2634 thing following the TAIL, but the last branch will
2635 point at the TAIL. So we advance tail. If we
2636 have nested (?:) we may have to move through several
2637 tails.
2638 */
2639
2640 while ( OP( tail ) == TAIL ) {
2641 /* this is the TAIL generated by (?:) */
2642 tail = regnext( tail );
2643 }
a3621e74 2644
8aa23a47
YO
2645
2646 DEBUG_OPTIMISE_r({
2647 regprop(RExC_rx, mysv, tail );
2648 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2649 (int)depth * 2 + 2, "",
2650 "Looking for TRIE'able sequences. Tail node is: ",
2651 SvPV_nolen_const( mysv )
2652 );
2653 });
2654
2655 /*
2656
2657 step through the branches, cur represents each
2658 branch, noper is the first thing to be matched
2659 as part of that branch and noper_next is the
2660 regnext() of that node. if noper is an EXACT
2661 and noper_next is the same as scan (our current
2662 position in the regex) then the EXACT branch is
2663 a possible optimization target. Once we have
2664 two or more consequetive such branches we can
2665 create a trie of the EXACT's contents and stich
2666 it in place. If the sequence represents all of
2667 the branches we eliminate the whole thing and
2668 replace it with a single TRIE. If it is a
2669 subsequence then we need to stitch it in. This
2670 means the first branch has to remain, and needs
2671 to be repointed at the item on the branch chain
2672 following the last branch optimized. This could
2673 be either a BRANCH, in which case the
2674 subsequence is internal, or it could be the
2675 item following the branch sequence in which
2676 case the subsequence is at the end.
2677
2678 */
2679
2680 /* dont use tail as the end marker for this traverse */
2681 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2682 regnode * const noper = NEXTOPER( cur );
b515a41d 2683#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2684 regnode * const noper_next = regnext( noper );
b515a41d
YO
2685#endif
2686
8aa23a47
YO
2687 DEBUG_OPTIMISE_r({
2688 regprop(RExC_rx, mysv, cur);
2689 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2690 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2691
2692 regprop(RExC_rx, mysv, noper);
2693 PerlIO_printf( Perl_debug_log, " -> %s",
2694 SvPV_nolen_const(mysv));
2695
2696 if ( noper_next ) {
2697 regprop(RExC_rx, mysv, noper_next );
2698 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2699 SvPV_nolen_const(mysv));
2700 }
2701 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2702 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2703 });
2704 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2705 : PL_regkind[ OP( noper ) ] == EXACT )
2706 || OP(noper) == NOTHING )
786e8c11 2707#ifdef NOJUMPTRIE
8aa23a47 2708 && noper_next == tail
786e8c11 2709#endif
8aa23a47
YO
2710 && count < U16_MAX)
2711 {
2712 count++;
2713 if ( !first || optype == NOTHING ) {
2714 if (!first) first = cur;
2715 optype = OP( noper );
2716 } else {
2717 last = cur;
2718 }
2719 } else {
2720 if ( last ) {
2721 make_trie( pRExC_state,
2722 startbranch, first, cur, tail, count,
2723 optype, depth+1 );
2724 }
2725 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2726#ifdef NOJUMPTRIE
8aa23a47 2727 && noper_next == tail
786e8c11 2728#endif
8aa23a47
YO
2729 ){
2730 count = 1;
2731 first = cur;
2732 optype = OP( noper );
2733 } else {
2734 count = 0;
2735 first = NULL;
2736 optype = 0;
2737 }
2738 last = NULL;
2739 }
2740 }
2741 DEBUG_OPTIMISE_r({
2742 regprop(RExC_rx, mysv, cur);
2743 PerlIO_printf( Perl_debug_log,
2744 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2745 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2746
2747 });
2748 if ( last ) {
2749 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2750#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2751 if ( ((made == MADE_EXACT_TRIE &&
2752 startbranch == first)
2753 || ( first_non_open == first )) &&
2754 depth==0 ) {
2755 flags |= SCF_TRIE_RESTUDY;
2756 if ( startbranch == first
2757 && scan == tail )
2758 {
2759 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2760 }
2761 }
3dab1dad 2762#endif
8aa23a47
YO
2763 }
2764 }
2765
2766 } /* do trie */
2767
653099ff 2768 }
8aa23a47
YO
2769 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2770 scan = NEXTOPER(NEXTOPER(scan));
2771 } else /* single branch is optimized. */
2772 scan = NEXTOPER(scan);
2773 continue;
2774 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2775 scan_frame *newframe = NULL;
2776 I32 paren;
2777 regnode *start;
2778 regnode *end;
2779
2780 if (OP(scan) != SUSPEND) {
2781 /* set the pointer */
2782 if (OP(scan) == GOSUB) {
2783 paren = ARG(scan);
2784 RExC_recurse[ARG2L(scan)] = scan;
2785 start = RExC_open_parens[paren-1];
2786 end = RExC_close_parens[paren-1];
2787 } else {
2788 paren = 0;
f8fc2ecf 2789 start = RExC_rxi->program + 1;
8aa23a47
YO
2790 end = RExC_opend;
2791 }
2792 if (!recursed) {
2793 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2794 SAVEFREEPV(recursed);
2795 }
2796 if (!PAREN_TEST(recursed,paren+1)) {
2797 PAREN_SET(recursed,paren+1);
2798 Newx(newframe,1,scan_frame);
2799 } else {
2800 if (flags & SCF_DO_SUBSTR) {
304ee84b 2801 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2802 data->longest = &(data->longest_float);
2803 }
2804 is_inf = is_inf_internal = 1;
2805 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2806 cl_anything(pRExC_state, data->start_class);
2807 flags &= ~SCF_DO_STCLASS;
2808 }
2809 } else {
2810 Newx(newframe,1,scan_frame);
2811 paren = stopparen;
2812 start = scan+2;
2813 end = regnext(scan);
2814 }
2815 if (newframe) {
2816 assert(start);
2817 assert(end);
2818 SAVEFREEPV(newframe);
2819 newframe->next = regnext(scan);
2820 newframe->last = last;
2821 newframe->stop = stopparen;
2822 newframe->prev = frame;
2823
2824 frame = newframe;
2825 scan = start;
2826 stopparen = paren;
2827 last = end;
2828
2829 continue;
2830 }
2831 }
2832 else if (OP(scan) == EXACT) {
2833 I32 l = STR_LEN(scan);
2834 UV uc;
2835 if (UTF) {
2836 const U8 * const s = (U8*)STRING(scan);
2837 l = utf8_length(s, s + l);
2838 uc = utf8_to_uvchr(s, NULL);
2839 } else {
2840 uc = *((U8*)STRING(scan));
2841 }
2842 min += l;
2843 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2844 /* The code below prefers earlier match for fixed
2845 offset, later match for variable offset. */
2846 if (data->last_end == -1) { /* Update the start info. */
2847 data->last_start_min = data->pos_min;
2848 data->last_start_max = is_inf
2849 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2850 }
8aa23a47
YO
2851 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2852 if (UTF)
2853 SvUTF8_on(data->last_found);
2854 {
2855 SV * const sv = data->last_found;
2856 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2857 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2858 if (mg && mg->mg_len >= 0)
2859 mg->mg_len += utf8_length((U8*)STRING(scan),
2860 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2861 }
8aa23a47
YO
2862 data->last_end = data->pos_min + l;
2863 data->pos_min += l; /* As in the first entry. */
2864 data->flags &= ~SF_BEFORE_EOL;
2865 }
2866 if (flags & SCF_DO_STCLASS_AND) {
2867 /* Check whether it is compatible with what we know already! */
2868 int compat = 1;
2869
2870 if (uc >= 0x100 ||
2871 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2872 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2873 && (!(data->start_class->flags & ANYOF_FOLD)
2874 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2875 )
2876 compat = 0;
2877 ANYOF_CLASS_ZERO(data->start_class);
2878 ANYOF_BITMAP_ZERO(data->start_class);
2879 if (compat)
2880 ANYOF_BITMAP_SET(data->start_class, uc);
2881 data->start_class->flags &= ~ANYOF_EOS;
2882 if (uc < 0x100)
2883 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2884 }
2885 else if (flags & SCF_DO_STCLASS_OR) {
2886 /* false positive possible if the class is case-folded */
2887 if (uc < 0x100)
2888 ANYOF_BITMAP_SET(data->start_class, uc);
2889 else
2890 data->start_class->flags |= ANYOF_UNICODE_ALL;
2891 data->start_class->flags &= ~ANYOF_EOS;
2892 cl_and(data->start_class, and_withp);
2893 }
2894 flags &= ~SCF_DO_STCLASS;
2895 }
2896 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2897 I32 l = STR_LEN(scan);
2898 UV uc = *((U8*)STRING(scan));
2899
2900 /* Search for fixed substrings supports EXACT only. */
2901 if (flags & SCF_DO_SUBSTR) {
2902 assert(data);
304ee84b 2903 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2904 }
2905 if (UTF) {
2906 const U8 * const s = (U8 *)STRING(scan);
2907 l = utf8_length(s, s + l);
2908 uc = utf8_to_uvchr(s, NULL);
2909 }
2910 min += l;
2911 if (flags & SCF_DO_SUBSTR)
2912 data->pos_min += l;
2913 if (flags & SCF_DO_STCLASS_AND) {
2914 /* Check whether it is compatible with what we know already! */
2915 int compat = 1;
2916
2917 if (uc >= 0x100 ||
2918 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2919 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2920 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2921 compat = 0;
2922 ANYOF_CLASS_ZERO(data->start_class);
2923 ANYOF_BITMAP_ZERO(data->start_class);
2924 if (compat) {
2925 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2926 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2927 data->start_class->flags |= ANYOF_FOLD;
2928 if (OP(scan) == EXACTFL)
2929 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2930 }
8aa23a47
YO
2931 }
2932 else if (flags & SCF_DO_STCLASS_OR) {
2933 if (data->start_class->flags & ANYOF_FOLD) {
2934 /* false positive possible if the class is case-folded.
2935 Assume that the locale settings are the same... */
1aa99e6b
IH
2936 if (uc < 0x100)
2937 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2938 data->start_class->flags &= ~ANYOF_EOS;
2939 }
8aa23a47 2940 cl_and(data->start_class, and_withp);
653099ff 2941 }
8aa23a47
YO
2942 flags &= ~SCF_DO_STCLASS;
2943 }
2944 else if (strchr((const char*)PL_varies,OP(scan))) {
2945 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2946 I32 f = flags, pos_before = 0;
2947 regnode * const oscan = scan;
2948 struct regnode_charclass_class this_class;
2949 struct regnode_charclass_class *oclass = NULL;
2950 I32 next_is_eval = 0;
2951
2952 switch (PL_regkind[OP(scan)]) {
2953 case WHILEM: /* End of (?:...)* . */
2954 scan = NEXTOPER(scan);
2955 goto finish;
2956 case PLUS:
2957 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2958 next = NEXTOPER(scan);
2959 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2960 mincount = 1;
2961 maxcount = REG_INFTY;
2962 next = regnext(scan);
2963 scan = NEXTOPER(scan);
2964 goto do_curly;
2965 }
2966 }
2967 if (flags & SCF_DO_SUBSTR)
2968 data->pos_min++;
2969 min++;
2970 /* Fall through. */
2971 case STAR:
2972 if (flags & SCF_DO_STCLASS) {
2973 mincount = 0;
2974 maxcount = REG_INFTY;
2975 next = regnext(scan);
2976 scan = NEXTOPER(scan);
2977 goto do_curly;
2978 }
2979 is_inf = is_inf_internal = 1;
2980 scan = regnext(scan);
c277df42 2981 if (flags & SCF_DO_SUBSTR) {
304ee84b 2982 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2983 data->longest = &(data->longest_float);
c277df42 2984 }
8aa23a47
YO
2985 goto optimize_curly_tail;
2986 case CURLY:
2987 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2988 && (scan->flags == stopparen))
2989 {
2990 mincount = 1;
2991 maxcount = 1;
2992 } else {
2993 mincount = ARG1(scan);
2994 maxcount = ARG2(scan);
653099ff 2995 }
8aa23a47
YO
2996 next = regnext(scan);
2997 if (OP(scan) == CURLYX) {
2998 I32 lp = (data ? *(data->last_closep) : 0);
2999 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3000 }
8aa23a47
YO
3001 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3002 next_is_eval = (OP(scan) == EVAL);
3003 do_curly:
3004 if (flags & SCF_DO_SUBSTR) {
304ee84b 3005 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3006 pos_before = data->pos_min;
b45f050a 3007 }
8aa23a47
YO
3008 if (data) {
3009 fl = data->flags;
3010 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3011 if (is_inf)
3012 data->flags |= SF_IS_INF;
3013 }
3014 if (flags & SCF_DO_STCLASS) {
3015 cl_init(pRExC_state, &this_class);
3016 oclass = data->start_class;
3017 data->start_class = &this_class;
3018 f |= SCF_DO_STCLASS_AND;
3019 f &= ~SCF_DO_STCLASS_OR;
3020 }
3021 /* These are the cases when once a subexpression
3022 fails at a particular position, it cannot succeed
3023 even after backtracking at the enclosing scope.
3024
3025 XXXX what if minimal match and we are at the
3026 initial run of {n,m}? */
3027 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3028 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3029
8aa23a47
YO
3030 /* This will finish on WHILEM, setting scan, or on NULL: */
3031 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3032 last, data, stopparen, recursed, NULL,
3033 (mincount == 0
3034 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3035
8aa23a47
YO
3036 if (flags & SCF_DO_STCLASS)
3037 data->start_class = oclass;
3038 if (mincount == 0 || minnext == 0) {
3039 if (flags & SCF_DO_STCLASS_OR) {
3040 cl_or(pRExC_state, data->start_class, &this_class);
3041 }
3042 else if (flags & SCF_DO_STCLASS_AND) {
3043 /* Switch to OR mode: cache the old value of
3044 * data->start_class */
3045 INIT_AND_WITHP;
3046 StructCopy(data->start_class, and_withp,
3047 struct regnode_charclass_class);
3048 flags &= ~SCF_DO_STCLASS_AND;
3049 StructCopy(&this_class, data->start_class,
3050 struct regnode_charclass_class);
3051 flags |= SCF_DO_STCLASS_OR;
3052 data->start_class->flags |= ANYOF_EOS;
3053 }
3054 } else { /* Non-zero len */
3055 if (flags & SCF_DO_STCLASS_OR) {
3056 cl_or(pRExC_state, data->start_class, &this_class);
3057 cl_and(data->start_class, and_withp);
3058 }
3059 else if (flags & SCF_DO_STCLASS_AND)
3060 cl_and(data->start_class, &this_class);
3061 flags &= ~SCF_DO_STCLASS;
3062 }
3063 if (!scan) /* It was not CURLYX, but CURLY. */
3064 scan = next;
3065 if ( /* ? quantifier ok, except for (?{ ... }) */
3066 (next_is_eval || !(mincount == 0 && maxcount == 1))
3067 && (minnext == 0) && (deltanext == 0)
3068 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3069 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3070 && ckWARN(WARN_REGEXP))
3071 {
3072 vWARN(RExC_parse,
3073 "Quantifier unexpected on zero-length expression");
3074 }
3075
3076 min += minnext * mincount;
3077 is_inf_internal |= ((maxcount == REG_INFTY
3078 && (minnext + deltanext) > 0)
3079 || deltanext == I32_MAX);
3080 is_inf |= is_inf_internal;
3081 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3082
3083 /* Try powerful optimization CURLYX => CURLYN. */
3084 if ( OP(oscan) == CURLYX && data
3085 && data->flags & SF_IN_PAR
3086 && !(data->flags & SF_HAS_EVAL)
3087 && !deltanext && minnext == 1 ) {
3088 /* Try to optimize to CURLYN. */
3089 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3090 regnode * const nxt1 = nxt;
497b47a8 3091#ifdef DEBUGGING
8aa23a47 3092 regnode *nxt2;
497b47a8 3093#endif
c277df42 3094
8aa23a47
YO
3095 /* Skip open. */
3096 nxt = regnext(nxt);
3097 if (!strchr((const char*)PL_simple,OP(nxt))
3098 && !(PL_regkind[OP(nxt)] == EXACT
3099 && STR_LEN(nxt) == 1))
3100 goto nogo;
497b47a8 3101#ifdef DEBUGGING
8aa23a47 3102 nxt2 = nxt;
497b47a8 3103#endif
8aa23a47
YO
3104 nxt = regnext(nxt);
3105 if (OP(nxt) != CLOSE)
3106 goto nogo;
3107 if (RExC_open_parens) {
3108 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3109 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3110 }
3111 /* Now we know that nxt2 is the only contents: */
3112 oscan->flags = (U8)ARG(nxt);
3113 OP(oscan) = CURLYN;
3114 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3115
c277df42 3116#ifdef DEBUGGING
8aa23a47
YO
3117 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3118 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3119 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3120 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3121 OP(nxt + 1) = OPTIMIZED; /* was count. */
3122 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3123#endif
8aa23a47
YO
3124 }
3125 nogo:
3126
3127 /* Try optimization CURLYX => CURLYM. */
3128 if ( OP(oscan) == CURLYX && data
3129 && !(data->flags & SF_HAS_PAR)
3130 && !(data->flags & SF_HAS_EVAL)
3131 && !deltanext /* atom is fixed width */
3132 && minnext != 0 /* CURLYM can't handle zero width */
3133 ) {
3134 /* XXXX How to optimize if data == 0? */
3135 /* Optimize to a simpler form. */
3136 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3137 regnode *nxt2;
3138
3139 OP(oscan) = CURLYM;
3140 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3141 && (OP(nxt2) != WHILEM))
3142 nxt = nxt2;
3143 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3144 /* Need to optimize away parenths. */
3145 if (data->flags & SF_IN_PAR) {
3146 /* Set the parenth number. */
3147 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3148
3149 if (OP(nxt) != CLOSE)
3150 FAIL("Panic opt close");
3151 oscan->flags = (U8)ARG(nxt);
3152 if (RExC_open_parens) {
3153 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3154 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3155 }
8aa23a47
YO
3156 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3157 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3158
c277df42 3159#ifdef DEBUGGING
8aa23a47
YO
3160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3161 OP(nxt + 1) = OPTIMIZED; /* was count. */
3162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3164#endif
c277df42 3165#if 0
8aa23a47
YO
3166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3167 regnode *nnxt = regnext(nxt1);
3168
3169 if (nnxt == nxt) {
3170 if (reg_off_by_arg[OP(nxt1)])
3171 ARG_SET(nxt1, nxt2 - nxt1);
3172 else if (nxt2 - nxt1 < U16_MAX)
3173 NEXT_OFF(nxt1) = nxt2 - nxt1;
3174 else
3175 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3176 }
8aa23a47 3177 nxt1 = nnxt;
c277df42 3178 }
5d1c421c 3179#endif
8aa23a47
YO
3180 /* Optimize again: */
3181 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3182 NULL, stopparen, recursed, NULL, 0,depth+1);
3183 }
3184 else
3185 oscan->flags = 0;
3186 }
3187 else if ((OP(oscan) == CURLYX)
3188 && (flags & SCF_WHILEM_VISITED_POS)
3189 /* See the comment on a similar expression above.
3190 However, this time it not a subexpression
3191 we care about, but the expression itself. */
3192 && (maxcount == REG_INFTY)
3193 && data && ++data->whilem_c < 16) {
3194 /* This stays as CURLYX, we can put the count/of pair. */
3195 /* Find WHILEM (as in regexec.c) */
3196 regnode *nxt = oscan + NEXT_OFF(oscan);
3197
3198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3199 nxt += ARG(nxt);
3200 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3201 | (RExC_whilem_seen << 4)); /* On WHILEM */
3202 }
3203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3204 pars++;
3205 if (flags & SCF_DO_SUBSTR) {
3206 SV *last_str = NULL;
3207 int counted = mincount != 0;
a0ed51b3 3208
8aa23a47
YO
3209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3210#if defined(SPARC64_GCC_WORKAROUND)
3211 I32 b = 0;
3212 STRLEN l = 0;
3213 const char *s = NULL;
3214 I32 old = 0;
b515a41d 3215
8aa23a47
YO
3216 if (pos_before >= data->last_start_min)
3217 b = pos_before;
3218 else
3219 b = data->last_start_min;
b515a41d 3220
8aa23a47
YO
3221 l = 0;
3222 s = SvPV_const(data->last_found, l);
3223 old = b - data->last_start_min;
3224
3225#else
3226 I32 b = pos_before >= data->last_start_min
3227 ? pos_before : data->last_start_min;
3228 STRLEN l;
3229 const char * const s = SvPV_const(data->last_found, l);
3230 I32 old = b - data->last_start_min;
3231#endif
3232
3233 if (UTF)
3234 old = utf8_hop((U8*)s, old) - (U8*)s;
3235
3236 l -= old;
3237 /* Get the added string: */
3238 last_str = newSVpvn(s + old, l);
3239 if (UTF)
3240 SvUTF8_on(last_str);
3241 if (deltanext == 0 && pos_before == b) {
3242 /* What was added is a constant string */
3243 if (mincount > 1) {
3244 SvGROW(last_str, (mincount * l) + 1);
3245 repeatcpy(SvPVX(last_str) + l,
3246 SvPVX_const(last_str), l, mincount - 1);
3247 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3248 /* Add additional parts. */
3249 SvCUR_set(data->last_found,
3250 SvCUR(data->last_found) - l);
3251 sv_catsv(data->last_found, last_str);
3252 {
3253 SV * sv = data->last_found;
3254 MAGIC *mg =
3255 SvUTF8(sv) && SvMAGICAL(sv) ?
3256 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3257 if (mg && mg->mg_len >= 0)
3258 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3259 }
8aa23a47 3260 data->last_end += l * (mincount - 1);
b515a41d 3261 }
8aa23a47
YO
3262 } else {
3263 /* start offset must point into the last copy */
3264 data->last_start_min += minnext * (mincount - 1);
3265 data->last_start_max += is_inf ? I32_MAX
3266 : (maxcount - 1) * (minnext + data->pos_delta);
3267 }
c277df42 3268 }
8aa23a47
YO
3269 /* It is counted once already... */
3270 data->pos_min += minnext * (mincount - counted);
3271 data->pos_delta += - counted * deltanext +
3272 (minnext + deltanext) * maxcount - minnext * mincount;
3273 if (mincount != maxcount) {
3274 /* Cannot extend fixed substrings found inside
3275 the group. */
304ee84b 3276 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3277 if (mincount && last_str) {
3278 SV * const sv = data->last_found;
3279 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3280 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3281
3282 if (mg)
3283 mg->mg_len = -1;
3284 sv_setsv(sv, last_str);
3285 data->last_end = data->pos_min;
3286 data->last_start_min =
3287 data->pos_min - CHR_SVLEN(last_str);
3288 data->last_start_max = is_inf
3289 ? I32_MAX
3290 : data->pos_min + data->pos_delta
3291 - CHR_SVLEN(last_str);
3292 }
3293 data->longest = &(data->longest_float);
3294 }
3295 SvREFCNT_dec(last_str);
c277df42 3296 }
8aa23a47
YO
3297 if (data && (fl & SF_HAS_EVAL))
3298 data->flags |= SF_HAS_EVAL;
3299 optimize_curly_tail:
3300 if (OP(oscan) != CURLYX) {
3301 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3302 && NEXT_OFF(next))
3303 NEXT_OFF(oscan) += NEXT_OFF(next);
3304 }
3305 continue;
3306 default: /* REF and CLUMP only? */
3307 if (flags & SCF_DO_SUBSTR) {
304ee84b 3308 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3309 data->longest = &(data->longest_float);
3310 }
3311 is_inf = is_inf_internal = 1;
3312 if (flags & SCF_DO_STCLASS_OR)
3313 cl_anything(pRExC_state, data->start_class);
3314 flags &= ~SCF_DO_STCLASS;
3315 break;
c277df42 3316 }
8aa23a47
YO
3317 }
3318 else if (strchr((const char*)PL_simple,OP(scan))) {
3319 int value = 0;
653099ff 3320
8aa23a47 3321 if (flags & SCF_DO_SUBSTR) {
304ee84b 3322 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3323 data->pos_min++;
3324 }
3325 min++;
3326 if (flags & SCF_DO_STCLASS) {
3327 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3328
8aa23a47
YO
3329 /* Some of the logic below assumes that switching
3330 locale on will only add false positives. */
3331 switch (PL_regkind[OP(scan)]) {
3332 case SANY:
3333 default:
3334 do_default:
3335 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3336 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3337 cl_anything(pRExC_state, data->start_class);
3338 break;
3339 case REG_ANY:
3340 if (OP(scan) == SANY)
3341 goto do_default;
3342 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3343 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3344 || (data->start_class->flags & ANYOF_CLASS));
3345 cl_anything(pRExC_state, data->start_class);
653099ff 3346 }
8aa23a47
YO
3347 if (flags & SCF_DO_STCLASS_AND || !value)
3348 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3349 break;
3350 case ANYOF:
3351 if (flags & SCF_DO_STCLASS_AND)
3352 cl_and(data->start_class,
3353 (struct regnode_charclass_class*)scan);
653099ff 3354 else
8aa23a47
YO
3355 cl_or(pRExC_state, data->start_class,
3356 (struct regnode_charclass_class*)scan);
3357 break;
3358 case ALNUM:
3359 if (flags & SCF_DO_STCLASS_AND) {
3360 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3362 for (value = 0; value < 256; value++)
3363 if (!isALNUM(value))
3364 ANYOF_BITMAP_CLEAR(data->start_class, value);
3365 }
653099ff 3366 }
8aa23a47
YO
3367 else {
3368 if (data->start_class->flags & ANYOF_LOCALE)
3369 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3370 else {
3371 for (value = 0; value < 256; value++)
3372 if (isALNUM(value))
3373 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3374 }
8aa23a47
YO
3375 }
3376 break;
3377 case ALNUML:
3378 if (flags & SCF_DO_STCLASS_AND) {
3379 if (data->start_class->flags & ANYOF_LOCALE)
3380 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3381 }
3382 else {
3383 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 }
3386 break;
3387 case NALNUM:
3388 if (flags & SCF_DO_STCLASS_AND) {
3389 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3390 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3391 for (value = 0; value < 256; value++)
3392 if (isALNUM(value))
3393 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3394 }
3395 }
8aa23a47
YO
3396 else {
3397 if (data->start_class->flags & ANYOF_LOCALE)
3398 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3399 else {
3400 for (value = 0; value < 256; value++)
3401 if (!isALNUM(value))
3402 ANYOF_BITMAP_SET(data->start_class, value);
3403 }
653099ff 3404 }
8aa23a47
YO
3405 break;
3406 case NALNUML:
3407 if (flags & SCF_DO_STCLASS_AND) {
3408 if (data->start_class->flags & ANYOF_LOCALE)
3409 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3410 }
8aa23a47
YO
3411 else {
3412 data->start_class->flags |= ANYOF_LOCALE;
3413 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3414 }
3415 break;
3416 case SPACE:
3417 if (flags & SCF_DO_STCLASS_AND) {
3418 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3419 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3420 for (value = 0; value < 256; value++)
3421 if (!isSPACE(value))
3422 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3423 }
3424 }
8aa23a47
YO
3425 else {
3426 if (data->start_class->flags & ANYOF_LOCALE)
3427 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3428 else {
3429 for (value = 0; value < 256; value++)
3430 if (isSPACE(value))
3431 ANYOF_BITMAP_SET(data->start_class, value);
3432 }
653099ff 3433 }
8aa23a47
YO
3434 break;
3435 case SPACEL:
3436 if (flags & SCF_DO_STCLASS_AND) {
3437 if (data->start_class->flags & ANYOF_LOCALE)
3438 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3439 }
3440 else {
3441 data->start_class->flags |= ANYOF_LOCALE;
3442 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3443 }
3444 break;
3445 case NSPACE:
3446 if (flags & SCF_DO_STCLASS_AND) {
3447 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3448 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3449 for (value = 0; value < 256; value++)
3450 if (isSPACE(value))
3451 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3452 }
8aa23a47
YO
3453 }
3454 else {
3455 if (data->start_class->flags & ANYOF_LOCALE)
3456 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3457 else {
3458 for (value = 0; value < 256; value++)
3459 if (!isSPACE(value))
3460 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3461 }
3462 }
8aa23a47
YO
3463 break;
3464 case NSPACEL:
3465 if (flags & SCF_DO_STCLASS_AND) {
3466 if (data->start_class->flags & ANYOF_LOCALE) {
3467 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3468 for (value = 0; value < 256; value++)
3469 if (!isSPACE(value))
3470 ANYOF_BITMAP_CLEAR(data->start_class, value);
3471 }
653099ff 3472 }
8aa23a47
YO
3473 else {
3474 data->start_class->flags |= ANYOF_LOCALE;
3475 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3476 }
3477 break;
3478 case DIGIT:
3479 if (flags & SCF_DO_STCLASS_AND) {
3480 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3481 for (value = 0; value < 256; value++)
3482 if (!isDIGIT(value))
3483 ANYOF_BITMAP_CLEAR(data->start_class, value);
3484 }
3485 else {
3486 if (data->start_class->flags & ANYOF_LOCALE)
3487 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3488 else {
3489 for (value = 0; value < 256; value++)
3490 if (isDIGIT(value))
3491 ANYOF_BITMAP_SET(data->start_class, value);
3492 }
3493 }
3494 break;
3495 case NDIGIT:
3496 if (flags & SCF_DO_STCLASS_AND) {
3497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3498 for (value = 0; value < 256; value++)
3499 if (isDIGIT(value))
3500 ANYOF_BITMAP_CLEAR(data->start_class, value);
3501 }
3502 else {
3503 if (data->start_class->flags & ANYOF_LOCALE)
3504 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3505 else {
3506 for (value = 0; value < 256; value++)
3507 if (!isDIGIT(value))
3508 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3509 }
3510 }
8aa23a47
YO
3511 break;
3512 }
3513 if (flags & SCF_DO_STCLASS_OR)
3514 cl_and(data->start_class, and_withp);
3515 flags &= ~SCF_DO_STCLASS;
3516 }
3517 }
3518 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3519 data->flags |= (OP(scan) == MEOL
3520 ? SF_BEFORE_MEOL
3521 : SF_BEFORE_SEOL);
3522 }
3523 else if ( PL_regkind[OP(scan)] == BRANCHJ
3524 /* Lookbehind, or need to calculate parens/evals/stclass: */
3525 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3526 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3527 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3528 || OP(scan) == UNLESSM )
3529 {
3530 /* Negative Lookahead/lookbehind
3531 In this case we can't do fixed string optimisation.
3532 */
1de06328 3533
8aa23a47
YO
3534 I32 deltanext, minnext, fake = 0;
3535 regnode *nscan;
3536 struct regnode_charclass_class intrnl;
3537 int f = 0;
1de06328 3538
8aa23a47
YO
3539 data_fake.flags = 0;
3540 if (data) {
3541 data_fake.whilem_c = data->whilem_c;
3542 data_fake.last_closep = data->last_closep;
c277df42 3543 }
8aa23a47
YO
3544 else
3545 data_fake.last_closep = &fake;
58e23c8d 3546 data_fake.pos_delta = delta;
8aa23a47
YO
3547 if ( flags & SCF_DO_STCLASS && !scan->flags
3548 && OP(scan) == IFMATCH ) { /* Lookahead */
3549 cl_init(pRExC_state, &intrnl);
3550 data_fake.start_class = &intrnl;
3551 f |= SCF_DO_STCLASS_AND;
3552 }
3553 if (flags & SCF_WHILEM_VISITED_POS)
3554 f |= SCF_WHILEM_VISITED_POS;
3555 next = regnext(scan);
3556 nscan = NEXTOPER(NEXTOPER(scan));
3557 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3558 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3559 if (scan->flags) {
3560 if (deltanext) {
58e23c8d 3561 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3562 }
3563 else if (minnext > (I32)U8_MAX) {
58e23c8d 3564 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3565 }
3566 scan->flags = (U8)minnext;
3567 }
3568 if (data) {
3569 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3570 pars++;
3571 if (data_fake.flags & SF_HAS_EVAL)
3572 data->flags |= SF_HAS_EVAL;
3573 data->whilem_c = data_fake.whilem_c;
3574 }
3575 if (f & SCF_DO_STCLASS_AND) {
3576 const int was = (data->start_class->flags & ANYOF_EOS);
3577
3578 cl_and(data->start_class, &intrnl);
3579 if (was)
3580 data->start_class->flags |= ANYOF_EOS;
3581 }
cb434fcc 3582 }
8aa23a47
YO
3583#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3584 else {
3585 /* Positive Lookahead/lookbehind
3586 In this case we can do fixed string optimisation,
3587 but we must be careful about it. Note in the case of
3588 lookbehind the positions will be offset by the minimum
3589 length of the pattern, something we won't know about
3590 until after the recurse.
3591 */
3592 I32 deltanext, fake = 0;
3593 regnode *nscan;
3594 struct regnode_charclass_class intrnl;
3595 int f = 0;
3596 /* We use SAVEFREEPV so that when the full compile
3597 is finished perl will clean up the allocated
3598 minlens when its all done. This was we don't
3599 have to worry about freeing them when we know
3600 they wont be used, which would be a pain.
3601 */
3602 I32 *minnextp;
3603 Newx( minnextp, 1, I32 );
3604 SAVEFREEPV(minnextp);
3605
3606 if (data) {
3607 StructCopy(data, &data_fake, scan_data_t);
3608 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3609 f |= SCF_DO_SUBSTR;
3610 if (scan->flags)
304ee84b 3611 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3612 data_fake.last_found=newSVsv(data->last_found);
3613 }
3614 }
3615 else
3616 data_fake.last_closep = &fake;
3617 data_fake.flags = 0;
58e23c8d 3618 data_fake.pos_delta = delta;
8aa23a47
YO
3619 if (is_inf)
3620 data_fake.flags |= SF_IS_INF;
3621 if ( flags & SCF_DO_STCLASS && !scan->flags
3622 && OP(scan) == IFMATCH ) { /* Lookahead */
3623 cl_init(pRExC_state, &intrnl);
3624 data_fake.start_class = &intrnl;
3625 f |= SCF_DO_STCLASS_AND;
3626 }
3627 if (flags & SCF_WHILEM_VISITED_POS)
3628 f |= SCF_WHILEM_VISITED_POS;
3629 next = regnext(scan);
3630 nscan = NEXTOPER(NEXTOPER(scan));
3631
3632 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3633 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3634 if (scan->flags) {
3635 if (deltanext) {
58e23c8d 3636 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3637 }
3638 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3639 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3640 }
3641 scan->flags = (U8)*minnextp;
3642 }
3643
3644 *minnextp += min;
3645
3646 if (f & SCF_DO_STCLASS_AND) {
3647 const int was = (data->start_class->flags & ANYOF_EOS);
3648
3649 cl_and(data->start_class, &intrnl);
3650 if (was)
3651 data->start_class->flags |= ANYOF_EOS;
3652 }
3653 if (data) {
3654 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3655 pars++;
3656 if (data_fake.flags & SF_HAS_EVAL)
3657 data->flags |= SF_HAS_EVAL;
3658 data->whilem_c = data_fake.whilem_c;
3659 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3660 if (RExC_rx->minlen<*minnextp)
3661 RExC_rx->minlen=*minnextp;
304ee84b 3662 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3663 SvREFCNT_dec(data_fake.last_found);
3664
3665 if ( data_fake.minlen_fixed != minlenp )
3666 {
3667 data->offset_fixed= data_fake.offset_fixed;
3668 data->minlen_fixed= data_fake.minlen_fixed;
3669 data->lookbehind_fixed+= scan->flags;
3670 }
3671 if ( data_fake.minlen_float != minlenp )
3672 {
3673 data->minlen_float= data_fake.minlen_float;
3674 data->offset_float_min=data_fake.offset_float_min;
3675 data->offset_float_max=data_fake.offset_float_max;
3676 data->lookbehind_float+= scan->flags;
3677 }
3678 }
3679 }
3680
3681
40d049e4 3682 }
8aa23a47
YO
3683#endif
3684 }
3685 else if (OP(scan) == OPEN) {
3686 if (stopparen != (I32)ARG(scan))
3687 pars++;
3688 }
3689 else if (OP(scan) == CLOSE) {
3690 if (stopparen == (I32)ARG(scan)) {
3691 break;
3692 }
3693 if ((I32)ARG(scan) == is_par) {
3694 next = regnext(scan);
b515a41d 3695
8aa23a47
YO
3696 if ( next && (OP(next) != WHILEM) && next < last)
3697 is_par = 0; /* Disable optimization */
40d049e4 3698 }
8aa23a47
YO
3699 if (data)
3700 *(data->last_closep) = ARG(scan);
3701 }
3702 else if (OP(scan) == EVAL) {
c277df42
IZ
3703 if (data)
3704 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3705 }
3706 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3707 if (flags & SCF_DO_SUBSTR) {
304ee84b 3708 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3709 flags &= ~SCF_DO_SUBSTR;
40d049e4 3710 }
8aa23a47
YO
3711 if (data && OP(scan)==ACCEPT) {
3712 data->flags |= SCF_SEEN_ACCEPT;
3713 if (stopmin > min)
3714 stopmin = min;
e2e6a0f1 3715 }
8aa23a47
YO
3716 }
3717 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3718 {
0f5d15d6 3719 if (flags & SCF_DO_SUBSTR) {
304ee84b 3720 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3721 data->longest = &(data->longest_float);
3722 }
3723 is_inf = is_inf_internal = 1;
653099ff 3724 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3725 cl_anything(pRExC_state, data->start_class);
96776eda 3726 flags &= ~SCF_DO_STCLASS;
8aa23a47 3727 }
58e23c8d 3728 else if (OP(scan) == GPOS) {
bbe252da 3729 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3730 !(delta || is_inf || (data && data->pos_delta)))
3731 {
bbe252da
YO
3732 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3733 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3734 if (RExC_rx->gofs < (U32)min)
3735 RExC_rx->gofs = min;
3736 } else {
bbe252da 3737 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3738 RExC_rx->gofs = 0;
3739 }
3740 }
786e8c11 3741#ifdef TRIE_STUDY_OPT
40d049e4 3742#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3743 else if (PL_regkind[OP(scan)] == TRIE) {
3744 /* NOTE - There is similar code to this block above for handling
3745 BRANCH nodes on the initial study. If you change stuff here
3746 check there too. */
3747 regnode *trie_node= scan;
3748 regnode *tail= regnext(scan);
f8fc2ecf 3749 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3750 I32 max1 = 0, min1 = I32_MAX;
3751 struct regnode_charclass_class accum;
3752
3753 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3754 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3755 if (flags & SCF_DO_STCLASS)
3756 cl_init_zero(pRExC_state, &accum);
3757
3758 if (!trie->jump) {
3759 min1= trie->minlen;
3760 max1= trie->maxlen;
3761 } else {
3762 const regnode *nextbranch= NULL;
3763 U32 word;
3764
3765 for ( word=1 ; word <= trie->wordcount ; word++)
3766 {
3767 I32 deltanext=0, minnext=0, f = 0, fake;
3768 struct regnode_charclass_class this_class;
3769
3770 data_fake.flags = 0;
3771 if (data) {
3772 data_fake.whilem_c = data->whilem_c;
3773 data_fake.last_closep = data->last_closep;
3774 }
3775 else
3776 data_fake.last_closep = &fake;
58e23c8d 3777 data_fake.pos_delta = delta;
8aa23a47
YO
3778 if (flags & SCF_DO_STCLASS) {
3779 cl_init(pRExC_state, &this_class);
3780 data_fake.start_class = &this_class;
3781 f = SCF_DO_STCLASS_AND;
3782 }
3783 if (flags & SCF_WHILEM_VISITED_POS)
3784 f |= SCF_WHILEM_VISITED_POS;
3785
3786 if (trie->jump[word]) {
3787 if (!nextbranch)
3788 nextbranch = trie_node + trie->jump[0];
3789 scan= trie_node + trie->jump[word];
3790 /* We go from the jump point to the branch that follows
3791 it. Note this means we need the vestigal unused branches
3792 even though they arent otherwise used.
3793 */
3794 minnext = study_chunk(pRExC_state, &scan, minlenp,
3795 &deltanext, (regnode *)nextbranch, &data_fake,
3796 stopparen, recursed, NULL, f,depth+1);
3797 }
3798 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3799 nextbranch= regnext((regnode*)nextbranch);
3800
3801 if (min1 > (I32)(minnext + trie->minlen))
3802 min1 = minnext + trie->minlen;
3803 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3804 max1 = minnext + deltanext + trie->maxlen;
3805 if (deltanext == I32_MAX)
3806 is_inf = is_inf_internal = 1;
3807
3808 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3809 pars++;
3810 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3811 if ( stopmin > min + min1)
3812 stopmin = min + min1;
3813 flags &= ~SCF_DO_SUBSTR;
3814 if (data)
3815 data->flags |= SCF_SEEN_ACCEPT;
3816 }
3817 if (data) {
3818 if (data_fake.flags & SF_HAS_EVAL)
3819 data->flags |= SF_HAS_EVAL;
3820 data->whilem_c = data_fake.whilem_c;
3821 }
3822 if (flags & SCF_DO_STCLASS)
3823 cl_or(pRExC_state, &accum, &this_class);
3824 }
3825 }
3826 if (flags & SCF_DO_SUBSTR) {
3827 data->pos_min += min1;
3828 data->pos_delta += max1 - min1;
3829 if (max1 != min1 || is_inf)
3830 data->longest = &(data->longest_float);
3831 }
3832 min += min1;
3833 delta += max1 - min1;
3834 if (flags & SCF_DO_STCLASS_OR) {
3835 cl_or(pRExC_state, data->start_class, &accum);
3836 if (min1) {
3837 cl_and(data->start_class, and_withp);
3838 flags &= ~SCF_DO_STCLASS;
3839 }
3840 }
3841 else if (flags & SCF_DO_STCLASS_AND) {
3842 if (min1) {
3843 cl_and(data->start_class, &accum);
3844 flags &= ~SCF_DO_STCLASS;
3845 }
3846 else {
3847 /* Switch to OR mode: cache the old value of
3848 * data->start_class */
3849 INIT_AND_WITHP;
3850 StructCopy(data->start_class, and_withp,
3851 struct regnode_charclass_class);
3852 flags &= ~SCF_DO_STCLASS_AND;
3853 StructCopy(&accum, data->start_class,
3854 struct regnode_charclass_class);
3855 flags |= SCF_DO_STCLASS_OR;
3856 data->start_class->flags |= ANYOF_EOS;
3857 }
3858 }
3859 scan= tail;
3860 continue;
3861 }
786e8c11 3862#else
8aa23a47 3863 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3864 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3865 U8*bang=NULL;
3866
3867 min += trie->minlen;
3868 delta += (trie->maxlen - trie->minlen);
3869 flags &= ~SCF_DO_STCLASS; /* xxx */
3870 if (flags & SCF_DO_SUBSTR) {
304ee84b 3871 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3872 data->pos_min += trie->minlen;
3873 data->pos_delta += (trie->maxlen - trie->minlen);
3874 if (trie->maxlen != trie->minlen)
3875 data->longest = &(data->longest_float);
3876 }
3877 if (trie->jump) /* no more substrings -- for now /grr*/
3878 flags &= ~SCF_DO_SUBSTR;
b515a41d 3879 }
8aa23a47
YO
3880#endif /* old or new */
3881#endif /* TRIE_STUDY_OPT */
3882 /* Else: zero-length, ignore. */
3883 scan = regnext(scan);
3884 }
3885 if (frame) {
3886 last = frame->last;
3887 scan = frame->next;
3888 stopparen = frame->stop;
3889 frame = frame->prev;
3890 goto fake_study_recurse;
c277df42
IZ
3891 }
3892
3893 finish:
8aa23a47 3894 assert(!frame);
304ee84b 3895 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3896
c277df42 3897 *scanp = scan;
aca2d497 3898 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3899 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3900 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3901 if (is_par > (I32)U8_MAX)
c277df42
IZ
3902 is_par = 0;
3903 if (is_par && pars==1 && data) {
3904 data->flags |= SF_IN_PAR;
3905 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3906 }
3907 else if (pars && data) {
c277df42
IZ
3908 data->flags |= SF_HAS_PAR;
3909 data->flags &= ~SF_IN_PAR;
3910 }
653099ff 3911 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3912 cl_and(data->start_class, and_withp);
786e8c11
YO
3913 if (flags & SCF_TRIE_RESTUDY)
3914 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3915
304ee84b 3916 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3917
e2e6a0f1 3918 return min < stopmin ? min : stopmin;
c277df42
IZ
3919}
3920
2eccd3b2
NC
3921STATIC U32
3922S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3923{
4a4e7719
NC
3924 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3925
3926 Renewc(RExC_rxi->data,
3927 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3928 char, struct reg_data);
3929 if(count)
f8fc2ecf 3930 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3931 else
f8fc2ecf 3932 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3933 RExC_rxi->data->count = count + n;
3934 Copy(s, RExC_rxi->data->what + count, n, U8);
3935 return count;
c277df42
IZ
3936}
3937
f8149455 3938/*XXX: todo make this not included in a non debugging perl */
76234dfb 3939#ifndef PERL_IN_XSUB_RE
d88dccdf 3940void
864dbfa3 3941Perl_reginitcolors(pTHX)
d88dccdf 3942{
97aff369 3943 dVAR;
1df70142 3944 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3945 if (s) {
1df70142
AL
3946 char *t = savepv(s);
3947 int i = 0;
3948 PL_colors[0] = t;
d88dccdf 3949 while (++i < 6) {
1df70142
AL
3950 t = strchr(t, '\t');
3951 if (t) {
3952 *t = '\0';
3953 PL_colors[i] = ++t;
d88dccdf
IZ
3954 }
3955 else
1df70142 3956 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3957 }
3958 } else {
1df70142 3959 int i = 0;
b81d288d 3960 while (i < 6)
06b5626a 3961 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3962 }
3963 PL_colorset = 1;
3964}
76234dfb 3965#endif
8615cb43 3966
07be1b83 3967
786e8c11
YO
3968#ifdef TRIE_STUDY_OPT
3969#define CHECK_RESTUDY_GOTO \
3970 if ( \
3971 (data.flags & SCF_TRIE_RESTUDY) \
3972 && ! restudied++ \
3973 ) goto reStudy
3974#else
3975#define CHECK_RESTUDY_GOTO
3976#endif
f9f4320a 3977
a687059c 3978/*
e50aee73 3979 - pregcomp - compile a regular expression into internal code
a687059c
LW
3980 *
3981 * We can't allocate space until we know how big the compiled form will be,
3982 * but we can't compile it (and thus know how big it is) until we've got a
3983 * place to put the code. So we cheat: we compile it twice, once with code
3984 * generation turned off and size counting turned on, and once "for real".
3985 * This also means that we don't allocate space until we are sure that the
3986 * thing really will compile successfully, and we never have to move the
3987 * code and thus invalidate pointers into it. (Note that it has to be in
3988 * one piece because free() must be able to free it all.) [NB: not true in perl]
3989 *
3990 * Beware that the optimization-preparation code in here knows about some
3991 * of the structure of the compiled regexp. [I'll say.]
3992 */
b9b4dddf
YO
3993
3994
3995
f9f4320a 3996#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3997#define RE_ENGINE_PTR &PL_core_reg_engine
3998#else
f9f4320a
YO
3999extern const struct regexp_engine my_reg_engine;
4000#define RE_ENGINE_PTR &my_reg_engine
4001#endif
6d5c990f
RGS
4002
4003#ifndef PERL_IN_XSUB_RE
a687059c 4004regexp *
864dbfa3 4005Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 4006{
97aff369 4007 dVAR;
6d5c990f 4008 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
4009 /* Dispatch a request to compile a regexp to correct
4010 regexp engine. */
f9f4320a
YO
4011 if (table) {
4012 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4013 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4014 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4015 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4016 DEBUG_COMPILE_r({
8d8756e7 4017 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4018 SvIV(*ptr));
4019 });
f2f78491 4020 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 4021 }
b9b4dddf 4022 }
2a5d9b1d
RGS
4023 return Perl_re_compile(aTHX_ exp, xend, pm);
4024}
6d5c990f 4025#endif
2a5d9b1d
RGS
4026
4027regexp *
4028Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4029{
4030 dVAR;
a0d0e21e 4031 register regexp *r;
f8fc2ecf 4032 register regexp_internal *ri;
c277df42 4033 regnode *scan;
c277df42 4034 regnode *first;
a0d0e21e 4035 I32 flags;
a0d0e21e
LW
4036 I32 minlen = 0;
4037 I32 sawplus = 0;
4038 I32 sawopen = 0;
2c2d71f5 4039 scan_data_t data;
830247a4 4040 RExC_state_t RExC_state;
be8e71aa 4041 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4042#ifdef TRIE_STUDY_OPT
4043 int restudied= 0;
4044 RExC_state_t copyRExC_state;
4045#endif
2a5d9b1d 4046 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4047 DEBUG_r(if (!PL_colorset) reginitcolors());
4048
a0d0e21e 4049 if (exp == NULL)
c277df42 4050 FAIL("NULL regexp argument");
a0d0e21e 4051
a5961de5 4052 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4053
5cfc7842 4054 RExC_precomp = exp;
a3621e74 4055 DEBUG_COMPILE_r({
ab3bbdeb
YO
4056 SV *dsv= sv_newmortal();
4057 RE_PV_QUOTED_DECL(s, RExC_utf8,
4058 dsv, RExC_precomp, (xend - exp), 60);
4059 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4060 PL_colors[4],PL_colors[5],s);
a5961de5 4061 });
e2509266 4062 RExC_flags = pm->op_pmflags;
830247a4 4063 RExC_sawback = 0;
bbce6d69 4064
830247a4
IZ
4065 RExC_seen = 0;
4066 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4067 RExC_seen_evals = 0;
4068 RExC_extralen = 0;
c277df42 4069
bbce6d69 4070 /* First pass: determine size, legality. */
830247a4 4071 RExC_parse = exp;
fac92740 4072 RExC_start = exp;
830247a4
IZ
4073 RExC_end = xend;
4074 RExC_naughty = 0;
4075 RExC_npar = 1;
e2e6a0f1 4076 RExC_nestroot = 0;
830247a4
IZ
4077 RExC_size = 0L;
4078 RExC_emit = &PL_regdummy;
4079 RExC_whilem_seen = 0;
fc8cd66c 4080 RExC_charnames = NULL;
40d049e4
YO
4081 RExC_open_parens = NULL;
4082 RExC_close_parens = NULL;
4083 RExC_opend = NULL;
81714fb9 4084 RExC_paren_names = NULL;
1f1031fe
YO
4085#ifdef DEBUGGING
4086 RExC_paren_name_list = NULL;
4087#endif
40d049e4
YO
4088 RExC_recurse = NULL;
4089 RExC_recurse_count = 0;
81714fb9 4090
85ddcde9
JH
4091#if 0 /* REGC() is (currently) a NOP at the first pass.
4092 * Clever compilers notice this and complain. --jhi */
830247a4 4093 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4094#endif
3dab1dad
YO
4095 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4096 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4097 RExC_precomp = NULL;
a0d0e21e
LW
4098 return(NULL);
4099 }
07be1b83 4100 DEBUG_PARSE_r({
81714fb9
YO
4101 PerlIO_printf(Perl_debug_log,
4102 "Required size %"IVdf" nodes\n"
4103 "Starting second pass (creation)\n",
4104 (IV)RExC_size);
07be1b83
YO
4105 RExC_lastnum=0;
4106 RExC_lastparse=NULL;
4107 });
c277df42
IZ
4108 /* Small enough for pointer-storage convention?
4109 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4110 if (RExC_size >= 0x10000L && RExC_extralen)
4111 RExC_size += RExC_extralen;
c277df42 4112 else
830247a4
IZ
4113 RExC_extralen = 0;
4114 if (RExC_whilem_seen > 15)
4115 RExC_whilem_seen = 15;
a0d0e21e 4116
e2e6a0f1
YO
4117#ifdef DEBUGGING
4118 /* Make room for a sentinel value at the end of the program */
4119 RExC_size++;
4120#endif
4121
f9f4320a
YO
4122 /* Allocate space and zero-initialize. Note, the two step process
4123 of zeroing when in debug mode, thus anything assigned has to
4124 happen after that */
f8fc2ecf
YO
4125 Newxz(r, 1, regexp);
4126 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4127 char, regexp_internal);
4128 if ( r == NULL || ri == NULL )
b45f050a 4129 FAIL("Regexp out of space");
0f79a09d
GS
4130#ifdef DEBUGGING
4131 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4132 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4133#else
f8fc2ecf
YO
4134 /* bulk initialize base fields with 0. */
4135 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4136#endif
58e23c8d
YO
4137
4138 /* non-zero initialization begins here */
f8fc2ecf 4139 RXi_SET( r, ri );
f9f4320a 4140 r->engine= RE_ENGINE_PTR;
c277df42 4141 r->refcnt = 1;
bbce6d69 4142 r->prelen = xend - exp;
bbe252da 4143 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
bcdf7404
YO
4144 {
4145 bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4146 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4147 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4148 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
4149 const char *fptr = STD_PAT_MODS; /*"msix"*/
4150 char *p;
4151 r->wraplen = r->prelen + has_minus + has_k + has_runon
4152 + (sizeof(STD_PAT_MODS) - 1)
4153 + (sizeof("(?:)") - 1);
4154
4155 Newx(r->wrapped, r->wraplen, char );
4156 p = r->wrapped;
4157 *p++='('; *p++='?';
4158 if (has_k)
4159 *p++ = KEEPCOPY_PAT_MOD; /*'k'*/
4160 {
4161 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4162 char *colon = r + 1;
4163 char ch;
4164
4165 while((ch = *fptr++)) {
4166 if(reganch & 1)
4167 *p++ = ch;
4168 else
4169 *r-- = ch;
4170 reganch >>= 1;
4171 }
4172 if(has_minus) {
4173 *r = '-';
4174 p = colon;
4175 }
4176 }
4177
4178 *p++=':';
4179 Copy(RExC_precomp, p, r->prelen, char);
4180 r->precomp = p;
4181 p += r->prelen;
4182 if (has_runon)
4183 *p++='\n';
4184 *p=')';
4185
4186
4187 if (0)
4188 PerlIO_printf(Perl_debug_log,
4189 "RExC_precomp: %.*s\nr->precomp: %.*s\nr->wrapped:%.*s\n",
4190 r->prelen,
4191 RExC_precomp,
4192 r->prelen,
4193 r->precomp,
4194 r->wraplen,
4195 r->wrapped
4196 );
4197
4198
4199 }
4200
bbe252da 4201 r->intflags = 0;
830247a4 4202 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4203
6bda09f9 4204 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4205 Newxz(RExC_open_parens, RExC_npar,regnode *);
4206 SAVEFREEPV(RExC_open_parens);
4207 Newxz(RExC_close_parens,RExC_npar,regnode *);
4208 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4209 }
4210
4211 /* Useful during FAIL. */
7122b237
YO
4212#ifdef RE_TRACK_PATTERN_OFFSETS
4213 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4214 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4215 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4216 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4217 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4218#endif
4219 SetProgLen(ri,RExC_size);
830247a4 4220 RExC_rx = r;
f8fc2ecf 4221 RExC_rxi = ri;
bbce6d69 4222
4223 /* Second pass: emit code. */
e2509266 4224 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4225 RExC_parse = exp;
4226 RExC_end = xend;
4227 RExC_naughty = 0;
4228 RExC_npar = 1;
f8fc2ecf
YO
4229 RExC_emit_start = ri->program;
4230 RExC_emit = ri->program;
e2e6a0f1
YO
4231#ifdef DEBUGGING
4232 /* put a sentinal on the end of the program so we can check for
4233 overwrites */
f8fc2ecf 4234 ri->program[RExC_size].type = 255;
e2e6a0f1 4235#endif
2cd61cdb 4236 /* Store the count of eval-groups for security checks: */
f8149455 4237 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4238 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4239 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4240 return(NULL);
6bda09f9 4241
07be1b83
YO
4242 /* XXXX To minimize changes to RE engine we always allocate
4243 3-units-long substrs field. */
4244 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4245 if (RExC_recurse_count) {
4246 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4247 SAVEFREEPV(RExC_recurse);
4248 }
a0d0e21e 4249
07be1b83 4250reStudy:
1de06328 4251 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4252 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4253
07be1b83
YO
4254#ifdef TRIE_STUDY_OPT
4255 if ( restudied ) {
5d458dd8 4256 U32 seen=RExC_seen;
07be1b83 4257 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4258
4259 RExC_state = copyRExC_state;
4260 if (seen & REG_TOP_LEVEL_BRANCHES)
4261 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4262 else
4263 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4264 if (data.last_found) {
07be1b83 4265 SvREFCNT_dec(data.longest_fixed);
07be1b83 4266 SvREFCNT_dec(data.longest_float);
07be1b83 4267 SvREFCNT_dec(data.last_found);
1de06328 4268 }
40d049e4 4269 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4270 } else {
40d049e4 4271 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4272 copyRExC_state = RExC_state;
07be1b83 4273 }
40d049e4
YO
4274#else
4275 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4276#endif
fc8cd66c 4277
a0d0e21e 4278 /* Dig out information for optimizations. */
bbe252da 4279 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4280 pm->op_pmflags = RExC_flags;
a0ed51b3 4281 if (UTF)
bbe252da 4282 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4283 ri->regstclass = NULL;
830247a4 4284 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4285 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4286 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4287
1de06328
YO
4288 /* testing for BRANCH here tells us whether there is "must appear"
4289 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4290 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4291 I32 fake;
c5254dd6 4292 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4293 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4294 int stclass_flag;
07be1b83 4295 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4296
4297 first = scan;
c277df42 4298 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4299 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4300 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4301 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4302 /* for now we can't handle lookbehind IFMATCH*/
4303 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4304 (OP(first) == PLUS) ||
4305 (OP(first) == MINMOD) ||
653099ff 4306 /* An {n,m} with n>0 */
07be1b83
YO
4307 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4308 {
786e8c11 4309
a0d0e21e
LW
4310 if (OP(first) == PLUS)
4311 sawplus = 1;
4312 else
3dab1dad 4313 first += regarglen[OP(first)];
07be1b83
YO
4314 if (OP(first) == IFMATCH) {
4315 first = NEXTOPER(first);
4316 first += EXTRA_STEP_2ARGS;
7c167cea 4317 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4318 first = NEXTOPER(first);
a687059c
LW
4319 }
4320
a0d0e21e
LW
4321 /* Starting-point info. */
4322 again:
786e8c11 4323 DEBUG_PEEP("first:",first,0);
07be1b83 4324 /* Ignore EXACT as we deal with it later. */
3dab1dad 4325 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4326 if (OP(first) == EXACT)
6f207bd3 4327 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4328 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4329 ri->regstclass = first;
b3c9acc1 4330 }
07be1b83 4331#ifdef TRIE_STCLASS
786e8c11 4332 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4333 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4334 {
786e8c11 4335 regnode *trie_op;
07be1b83 4336 /* this can happen only on restudy */
786e8c11 4337 if ( OP(first) == TRIE ) {
c944940b 4338 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4339 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4340 StructCopy(first,trieop,struct regnode_1);
4341 trie_op=(regnode *)trieop;
4342 } else {
c944940b 4343 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4344 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4345 StructCopy(first,trieop,struct regnode_charclass);
4346 trie_op=(regnode *)trieop;
4347 }
1de06328 4348 OP(trie_op)+=2;
786e8c11 4349 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4350 ri->regstclass = trie_op;
07be1b83
YO
4351 }
4352#endif
bfed75c6 4353 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4354 ri->regstclass = first;
3dab1dad
YO
4355 else if (PL_regkind[OP(first)] == BOUND ||
4356 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4357 ri->regstclass = first;
3dab1dad 4358 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4359 r->extflags |= (OP(first) == MBOL
4360 ? RXf_ANCH_MBOL
cad2e5aa 4361 : (OP(first) == SBOL
bbe252da
YO
4362 ? RXf_ANCH_SBOL
4363 : RXf_ANCH_BOL));
a0d0e21e 4364 first = NEXTOPER(first);
774d564b 4365 goto again;
4366 }
4367 else if (OP(first) == GPOS) {
bbe252da 4368 r->extflags |= RXf_ANCH_GPOS;
774d564b 4369 first = NEXTOPER(first);
4370 goto again;
a0d0e21e 4371 }
cf2a2b69
YO
4372 else if ((!sawopen || !RExC_sawback) &&
4373 (OP(first) == STAR &&
3dab1dad 4374 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4375 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4376 {
4377 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4378 const int type =
4379 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4380 ? RXf_ANCH_MBOL
4381 : RXf_ANCH_SBOL;
4382 r->extflags |= type;
4383 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4384 first = NEXTOPER(first);
774d564b 4385 goto again;
a0d0e21e 4386 }
b81d288d 4387 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4388 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4389 /* x+ must match at the 1st pos of run of x's */
bbe252da 4390 r->intflags |= PREGf_SKIP;
a0d0e21e 4391
c277df42 4392 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4393#ifdef TRIE_STUDY_OPT
81714fb9 4394 DEBUG_PARSE_r(
be8e71aa
YO
4395 if (!restudied)
4396 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4397 (IV)(first - scan + 1))
4398 );
4399#else
81714fb9 4400 DEBUG_PARSE_r(
be8e71aa
YO
4401 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4402 (IV)(first - scan + 1))
4403 );
4404#endif
4405
4406
a0d0e21e
LW
4407 /*
4408 * If there's something expensive in the r.e., find the
4409 * longest literal string that must appear and make it the
4410 * regmust. Resolve ties in favor of later strings, since
4411 * the regstart check works with the beginning of the r.e.
4412 * and avoiding duplication strengthens checking. Not a
4413 * strong reason, but sufficient in the absence of others.
4414 * [Now we resolve ties in favor of the earlier string if
c277df42 4415 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4416 * earlier string may buy us something the later one won't.]
4417 */
de8c5301 4418
396482e1
GA
4419 data.longest_fixed = newSVpvs("");
4420 data.longest_float = newSVpvs("");
4421 data.last_found = newSVpvs("");
c277df42
IZ
4422 data.longest = &(data.longest_fixed);
4423 first = scan;
f8fc2ecf 4424 if (!ri->regstclass) {
830247a4 4425 cl_init(pRExC_state, &ch_class);
653099ff
GS
4426 data.start_class = &ch_class;
4427 stclass_flag = SCF_DO_STCLASS_AND;
4428 } else /* XXXX Check for BOUND? */
4429 stclass_flag = 0;
cb434fcc 4430 data.last_closep = &last_close;
de8c5301 4431
1de06328 4432 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4433 &data, -1, NULL, NULL,
4434 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4435
07be1b83 4436
786e8c11
YO
4437 CHECK_RESTUDY_GOTO;
4438
4439
830247a4 4440 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4441 && data.last_start_min == 0 && data.last_end > 0
830247a4 4442 && !RExC_seen_zerolen
2bf803e2 4443 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4444 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4445 r->extflags |= RXf_CHECK_ALL;
304ee84b 4446 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4447 SvREFCNT_dec(data.last_found);
4448
1de06328
YO
4449 /* Note that code very similar to this but for anchored string
4450 follows immediately below, changes may need to be made to both.
4451 Be careful.
4452 */
a0ed51b3 4453 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4454 if (longest_float_length
c277df42
IZ
4455 || (data.flags & SF_FL_BEFORE_EOL
4456 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4457 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4458 {
1182767e 4459 I32 t,ml;
cf93c79d 4460
1de06328 4461 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4462 && data.offset_fixed == data.offset_float_min
4463 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4464 goto remove_float; /* As in (a)+. */
4465
1de06328
YO
4466 /* copy the information about the longest float from the reg_scan_data
4467 over to the program. */
33b8afdf
JH
4468 if (SvUTF8(data.longest_float)) {
4469 r->float_utf8 = data.longest_float;
c445ea15 4470 r->float_substr = NULL;
33b8afdf
JH
4471 } else {
4472 r->float_substr = data.longest_float;
c445ea15 4473 r->float_utf8 = NULL;
33b8afdf 4474 }
1de06328
YO
4475 /* float_end_shift is how many chars that must be matched that
4476 follow this item. We calculate it ahead of time as once the
4477 lookbehind offset is added in we lose the ability to correctly
4478 calculate it.*/
4479 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4480 : (I32)longest_float_length;
1de06328
YO
4481 r->float_end_shift = ml - data.offset_float_min
4482 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4483 + data.lookbehind_float;
4484 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4485 r->float_max_offset = data.offset_float_max;
1182767e 4486 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4487 r->float_max_offset -= data.lookbehind_float;
4488
cf93c79d
IZ
4489 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4490 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4491 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4492 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4493 }
4494 else {
aca2d497 4495 remove_float:
c445ea15 4496 r->float_substr = r->float_utf8 = NULL;
c277df42 4497 SvREFCNT_dec(data.longest_float);
c5254dd6 4498 longest_float_length = 0;
a0d0e21e 4499 }
c277df42 4500
1de06328
YO
4501 /* Note that code very similar to this but for floating string
4502 is immediately above, changes may need to be made to both.
4503 Be careful.
4504 */
a0ed51b3 4505 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4506 if (longest_fixed_length
c277df42
IZ
4507 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4508 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4509 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4510 {
1182767e 4511 I32 t,ml;
cf93c79d 4512
1de06328
YO
4513 /* copy the information about the longest fixed
4514 from the reg_scan_data over to the program. */
33b8afdf
JH
4515 if (SvUTF8(data.longest_fixed)) {
4516 r->anchored_utf8 = data.longest_fixed;
c445ea15 4517 r->anchored_substr = NULL;
33b8afdf
JH
4518 } else {
4519 r->anchored_substr = data.longest_fixed;
c445ea15 4520 r->anchored_utf8 = NULL;
33b8afdf 4521 }
1de06328
YO
4522 /* fixed_end_shift is how many chars that must be matched that
4523 follow this item. We calculate it ahead of time as once the
4524 lookbehind offset is added in we lose the ability to correctly
4525 calculate it.*/
4526 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4527 : (I32)longest_fixed_length;
1de06328
YO
4528 r->anchored_end_shift = ml - data.offset_fixed
4529 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4530 + data.lookbehind_fixed;
4531 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4532
cf93c79d
IZ
4533 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4534 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4535 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4536 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4537 }
4538 else {
c445ea15 4539 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4540 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4541 longest_fixed_length = 0;
a0d0e21e 4542 }
f8fc2ecf
YO
4543 if (ri->regstclass
4544 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4545 ri->regstclass = NULL;
33b8afdf
JH
4546 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4547 && stclass_flag
653099ff 4548 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4549 && !cl_is_anything(data.start_class))
4550 {
2eccd3b2 4551 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4552
f8fc2ecf 4553 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4554 struct regnode_charclass_class);
4555 StructCopy(data.start_class,
f8fc2ecf 4556 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4557 struct regnode_charclass_class);
f8fc2ecf 4558 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4559 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4560 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4561 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4562 PerlIO_printf(Perl_debug_log,
a0288114 4563 "synthetic stclass \"%s\".\n",
3f7c398e 4564 SvPVX_const(sv));});
653099ff 4565 }
c277df42
IZ
4566
4567 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4568 if (longest_fixed_length > longest_float_length) {
1de06328 4569 r->check_end_shift = r->anchored_end_shift;
c277df42 4570 r->check_substr = r->anchored_substr;
33b8afdf 4571 r->check_utf8 = r->anchored_utf8;
c277df42 4572 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4573 if (r->extflags & RXf_ANCH_SINGLE)
4574 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4575 }
4576 else {
1de06328 4577 r->check_end_shift = r->float_end_shift;
c277df42 4578 r->check_substr = r->float_substr;
33b8afdf 4579 r->check_utf8 = r->float_utf8;
1de06328
YO
4580 r->check_offset_min = r->float_min_offset;
4581 r->check_offset_max = r->float_max_offset;
a0d0e21e 4582 }
30382c73
IZ
4583 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4584 This should be changed ASAP! */
bbe252da
YO
4585 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4586 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4587 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4588 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4589 }
1de06328
YO
4590 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4591 if ( (STRLEN)minlen < longest_float_length )
4592 minlen= longest_float_length;
4593 if ( (STRLEN)minlen < longest_fixed_length )
4594 minlen= longest_fixed_length;
4595 */
a0ed51b3
LW
4596 }
4597 else {
c277df42
IZ
4598 /* Several toplevels. Best we can is to set minlen. */
4599 I32 fake;
653099ff 4600 struct regnode_charclass_class ch_class;
cb434fcc 4601 I32 last_close = 0;
c277df42 4602
5d458dd8 4603 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4604
f8fc2ecf 4605 scan = ri->program + 1;
830247a4 4606 cl_init(pRExC_state, &ch_class);
653099ff 4607 data.start_class = &ch_class;
cb434fcc 4608 data.last_closep = &last_close;
07be1b83 4609
de8c5301 4610
1de06328 4611 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4612 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4613
786e8c11 4614 CHECK_RESTUDY_GOTO;
07be1b83 4615
33b8afdf 4616 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4617 = r->float_substr = r->float_utf8 = NULL;
653099ff 4618 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4619 && !cl_is_anything(data.start_class))
4620 {
2eccd3b2 4621 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4622
f8fc2ecf 4623 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4624 struct regnode_charclass_class);
4625 StructCopy(data.start_class,
f8fc2ecf 4626 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4627 struct regnode_charclass_class);
f8fc2ecf 4628 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4629 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4630 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4631 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4632 PerlIO_printf(Perl_debug_log,
a0288114 4633 "synthetic stclass \"%s\".\n",
3f7c398e 4634 SvPVX_const(sv));});
653099ff 4635 }
a0d0e21e
LW
4636 }
4637
1de06328
YO
4638 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4639 the "real" pattern. */
cf9788e3
RGS
4640 DEBUG_OPTIMISE_r({
4641 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4642 (IV)minlen, (IV)r->minlen);
cf9788e3 4643 });
de8c5301 4644 r->minlenret = minlen;
1de06328
YO
4645 if (r->minlen < minlen)
4646 r->minlen = minlen;
4647
b81d288d 4648 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4649 r->extflags |= RXf_GPOS_SEEN;
830247a4 4650 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4651 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4652 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4653 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4654 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4655 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4656 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4657 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4658 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4659 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4660 if (RExC_paren_names)
4661 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4662 else
4663 r->paren_names = NULL;
e357fc67
YO
4664 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4665 r->extflags |= RXf_WHITE;
4666 else if (r->prelen == 1 && r->precomp[0] == '^')
4667 r->extflags |= RXf_START_ONLY;
4668
1f1031fe
YO
4669#ifdef DEBUGGING
4670 if (RExC_paren_names) {
4671 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4672 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4673 } else
1f1031fe 4674#endif
cde0cee5 4675 ri->name_list_idx = 0;
1f1031fe 4676
40d049e4
YO
4677 if (RExC_recurse_count) {
4678 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4679 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4680 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4681 }
4682 }
a02a5408
JC
4683 Newxz(r->startp, RExC_npar, I32);
4684 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4685 /* assume we don't need to swap parens around before we match */
4686
be8e71aa
YO
4687 DEBUG_DUMP_r({
4688 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4689 regdump(r);
4690 });
7122b237
YO
4691#ifdef RE_TRACK_PATTERN_OFFSETS
4692 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4693 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4694 U32 i;
4695 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4696 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4697 for (i = 1; i <= len; i++) {
7122b237 4698 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4699 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4700 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4701 }
4702 PerlIO_printf(Perl_debug_log, "\n");
4703 });
7122b237 4704#endif
a0d0e21e 4705 return(r);
a687059c
LW
4706}
4707
f9f4320a 4708#undef CORE_ONLY_BLOCK
f9f4320a 4709#undef RE_ENGINE_PTR
3dab1dad 4710
9af228c6 4711#ifndef PERL_IN_XSUB_RE
81714fb9 4712SV*
44a2ac75 4713Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
81714fb9 4714{
44a2ac75
YO
4715 AV *retarray = NULL;
4716 SV *ret;
4717 if (flags & 1)
4718 retarray=newAV();
4719
4720 if (from_re || PL_curpm) {
4721 const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
81714fb9
YO
4722 if (rx && rx->paren_names) {
4723 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4724 if (he_str) {
4725 IV i;
4726 SV* sv_dat=HeVAL(he_str);
4727 I32 *nums=(I32*)SvPVX(sv_dat);
4728 for ( i=0; i<SvIVX(sv_dat); i++ ) {
ded05c2a
YO
4729 if ((I32)(rx->nparens) >= nums[i]
4730 && rx->startp[nums[i]] != -1
4731 && rx->endp[nums[i]] != -1)
81714fb9 4732 {
44a2ac75
YO
4733 ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
4734 if (!retarray)
4735 return ret;
4736 } else {
4737 ret = newSVsv(&PL_sv_undef);
4738 }
4739 if (retarray) {
4740 SvREFCNT_inc(ret);
4741 av_push(retarray, ret);
81714fb9
YO
4742 }
4743 }
44a2ac75
YO
4744 if (retarray)
4745 return (SV*)retarray;
81714fb9
YO
4746 }
4747 }
4748 }
44a2ac75
YO
4749 return NULL;
4750}
4751
4752SV*
4753Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
4754{
4755 char *s = NULL;
a9d504c3 4756 I32 i = 0;
44a2ac75
YO
4757 I32 s1, t1;
4758 SV *sv = usesv ? usesv : newSVpvs("");
cb5c6874 4759 PERL_UNUSED_ARG(flags);
44a2ac75 4760
cde0cee5
YO
4761 if (!rx->subbeg) {
4762 sv_setsv(sv,&PL_sv_undef);
4763 return sv;
4764 }
4765 else
4766 if (paren == -2 && rx->startp[0] != -1) {
44a2ac75
YO
4767 /* $` */
4768 i = rx->startp[0];
cde0cee5 4769 s = rx->subbeg;
44a2ac75
YO
4770 }
4771 else
cde0cee5 4772 if (paren == -1 && rx->endp[0] != -1) {
44a2ac75
YO
4773 /* $' */
4774 s = rx->subbeg + rx->endp[0];
4775 i = rx->sublen - rx->endp[0];
4776 }
4777 else
4778 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4779 (s1 = rx->startp[paren]) != -1 &&
4780 (t1 = rx->endp[paren]) != -1)
4781 {
4782 /* $& $1 ... */
4783 i = t1 - s1;
4784 s = rx->subbeg + s1;
cde0cee5
YO
4785 } else {
4786 sv_setsv(sv,&PL_sv_undef);
4787 return sv;
4788 }
4789 assert(rx->sublen >= (s - rx->subbeg) + i );
4790 if (i >= 0) {
4791 const int oldtainted = PL_tainted;
4792 TAINT_NOT;
4793 sv_setpvn(sv, s, i);
4794 PL_tainted = oldtainted;
4795 if ( (rx->extflags & RXf_CANY_SEEN)
4796 ? (RX_MATCH_UTF8(rx)
4797 && (!i || is_utf8_string((U8*)s, i)))
4798 : (RX_MATCH_UTF8(rx)) )
4799 {
4800 SvUTF8_on(sv);
4801 }
4802 else
4803 SvUTF8_off(sv);
4804 if (PL_tainting) {
4805 if (RX_MATCH_TAINTED(rx)) {
4806 if (SvTYPE(sv) >= SVt_PVMG) {
4807 MAGIC* const mg = SvMAGIC(sv);
4808 MAGIC* mgt;
4809 PL_tainted = 1;
4810 SvMAGIC_set(sv, mg->mg_moremagic);
4811 SvTAINT(sv);
4812 if ((mgt = SvMAGIC(sv))) {
4813 mg->mg_moremagic = mgt;
4814 SvMAGIC_set(sv, mg);
44a2ac75 4815 }
cde0cee5
YO
4816 } else {
4817 PL_tainted = 1;
4818 SvTAINT(sv);
4819 }
4820 } else
4821 SvTAINTED_off(sv);
44a2ac75 4822 }
81714fb9 4823 } else {
44a2ac75 4824 sv_setsv(sv,&PL_sv_undef);
81714fb9 4825 }
44a2ac75 4826 return sv;
81714fb9 4827}
9af228c6 4828#endif
0a4db386 4829
894be9b7 4830/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4831 * If flags is REG_RSN_RETURN_NULL returns null.
4832 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4833 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4834 * to the parsed name as looked up in the RExC_paren_names hash.
4835 * If there is an error throws a vFAIL().. type exception.
894be9b7 4836 */
0a4db386
YO
4837
4838#define REG_RSN_RETURN_NULL 0
4839#define REG_RSN_RETURN_NAME 1
4840#define REG_RSN_RETURN_DATA 2
4841
894be9b7
YO
4842STATIC SV*
4843S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4844 char *name_start = RExC_parse;
1f1031fe
YO
4845
4846 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4847 /* skip IDFIRST by using do...while */
4848 if (UTF)
4849 do {
4850 RExC_parse += UTF8SKIP(RExC_parse);
4851 } while (isALNUM_utf8((U8*)RExC_parse));
4852 else
4853 do {
4854 RExC_parse++;
4855 } while (isALNUM(*RExC_parse));
894be9b7 4856 }
1f1031fe 4857
0a4db386
YO
4858 if ( flags ) {
4859 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4860 (int)(RExC_parse - name_start)));
894be9b7 4861 if (UTF)
0a4db386
YO
4862 SvUTF8_on(sv_name);
4863 if ( flags == REG_RSN_RETURN_NAME)
4864 return sv_name;
4865 else if (flags==REG_RSN_RETURN_DATA) {
4866 HE *he_str = NULL;
4867 SV *sv_dat = NULL;
4868 if ( ! sv_name ) /* should not happen*/
4869 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4870 if (RExC_paren_names)
4871 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4872 if ( he_str )
4873 sv_dat = HeVAL(he_str);
4874 if ( ! sv_dat )
4875 vFAIL("Reference to nonexistent named group");
4876 return sv_dat;
4877 }
4878 else {
4879 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4880 }
4881 /* NOT REACHED */
894be9b7 4882 }
0a4db386 4883 return NULL;
894be9b7
YO
4884}
4885
3dab1dad
YO
4886#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4887 int rem=(int)(RExC_end - RExC_parse); \
4888 int cut; \
4889 int num; \
4890 int iscut=0; \
4891 if (rem>10) { \
4892 rem=10; \
4893 iscut=1; \
4894 } \
4895 cut=10-rem; \
4896 if (RExC_lastparse!=RExC_parse) \
4897 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4898 rem, RExC_parse, \
4899 cut + 4, \
4900 iscut ? "..." : "<" \
4901 ); \
4902 else \
4903 PerlIO_printf(Perl_debug_log,"%16s",""); \
4904 \
4905 if (SIZE_ONLY) \
4906 num=RExC_size; \
4907 else \
4908 num=REG_NODE_NUM(RExC_emit); \
4909 if (RExC_lastnum!=num) \
0a4db386 4910 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4911 else \
0a4db386 4912 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4913 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4914 (int)((depth*2)), "", \
3dab1dad
YO
4915 (funcname) \
4916 ); \
4917 RExC_lastnum=num; \
4918 RExC_lastparse=RExC_parse; \
4919})
4920
07be1b83
YO
4921
4922
3dab1dad
YO
4923#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4924 DEBUG_PARSE_MSG((funcname)); \
4925 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4926})
6bda09f9
YO
4927#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4928 DEBUG_PARSE_MSG((funcname)); \
4929 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4930})
a687059c
LW
4931/*
4932 - reg - regular expression, i.e. main body or parenthesized thing
4933 *
4934 * Caller must absorb opening parenthesis.
4935 *
4936 * Combining parenthesis handling with the base level of regular expression
4937 * is a trifle forced, but the need to tie the tails of the branches to what
4938 * follows makes it hard to avoid.
4939 */
07be1b83
YO
4940#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4941#ifdef DEBUGGING
4942#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4943#else
4944#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4945#endif
3dab1dad 4946
e2e6a0f1
YO
4947/* this idea is borrowed from STR_WITH_LEN in handy.h */
4948#define CHECK_WORD(s,v,l) \
4949 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4950
76e3520e 4951STATIC regnode *
3dab1dad 4952S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4953 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4954{
27da23d5 4955 dVAR;
c277df42
IZ
4956 register regnode *ret; /* Will be the head of the group. */
4957 register regnode *br;
4958 register regnode *lastbr;
cbbf8932 4959 register regnode *ender = NULL;
a0d0e21e 4960 register I32 parno = 0;
cbbf8932
AL
4961 I32 flags;
4962 const I32 oregflags = RExC_flags;
6136c704
AL
4963 bool have_branch = 0;
4964 bool is_open = 0;
9d1d55b5
JP
4965
4966 /* for (?g), (?gc), and (?o) warnings; warning
4967 about (?c) will warn about (?g) -- japhy */
4968
6136c704
AL
4969#define WASTED_O 0x01
4970#define WASTED_G 0x02
4971#define WASTED_C 0x04
4972#define WASTED_GC (0x02|0x04)
cbbf8932 4973 I32 wastedflags = 0x00;
9d1d55b5 4974
fac92740 4975 char * parse_start = RExC_parse; /* MJD */
a28509cc 4976 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4977
3dab1dad
YO
4978 GET_RE_DEBUG_FLAGS_DECL;
4979 DEBUG_PARSE("reg ");
4980
4981
821b33a5 4982 *flagp = 0; /* Tentatively. */
a0d0e21e 4983
9d1d55b5 4984
a0d0e21e
LW
4985 /* Make an OPEN node, if parenthesized. */
4986 if (paren) {
e2e6a0f1
YO
4987 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4988 char *start_verb = RExC_parse;
4989 STRLEN verb_len = 0;
4990 char *start_arg = NULL;
4991 unsigned char op = 0;
4992 int argok = 1;
4993 int internal_argval = 0; /* internal_argval is only useful if !argok */
4994 while ( *RExC_parse && *RExC_parse != ')' ) {
4995 if ( *RExC_parse == ':' ) {
4996 start_arg = RExC_parse + 1;
4997 break;
4998 }
4999 RExC_parse++;
5000 }
5001 ++start_verb;
5002 verb_len = RExC_parse - start_verb;
5003 if ( start_arg ) {
5004 RExC_parse++;
5005 while ( *RExC_parse && *RExC_parse != ')' )
5006 RExC_parse++;
5007 if ( *RExC_parse != ')' )
5008 vFAIL("Unterminated verb pattern argument");
5009 if ( RExC_parse == start_arg )
5010 start_arg = NULL;
5011 } else {
5012 if ( *RExC_parse != ')' )
5013 vFAIL("Unterminated verb pattern");
5014 }
5d458dd8 5015
e2e6a0f1
YO
5016 switch ( *start_verb ) {
5017 case 'A': /* (*ACCEPT) */
5018 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
5019 op = ACCEPT;
5020 internal_argval = RExC_nestroot;
5021 }
5022 break;
5023 case 'C': /* (*COMMIT) */
5024 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
5025 op = COMMIT;
e2e6a0f1
YO
5026 break;
5027 case 'F': /* (*FAIL) */
5028 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
5029 op = OPFAIL;
5030 argok = 0;
5031 }
5032 break;
5d458dd8
YO
5033 case ':': /* (*:NAME) */
5034 case 'M': /* (*MARK:NAME) */
5035 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 5036 op = MARKPOINT;
5d458dd8
YO
5037 argok = -1;
5038 }
5039 break;
5040 case 'P': /* (*PRUNE) */
5041 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
5042 op = PRUNE;
e2e6a0f1 5043 break;
5d458dd8
YO
5044 case 'S': /* (*SKIP) */
5045 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
5046 op = SKIP;
5047 break;
5048 case 'T': /* (*THEN) */
5049 /* [19:06] <TimToady> :: is then */
5050 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
5051 op = CUTGROUP;
5052 RExC_seen |= REG_SEEN_CUTGROUP;
5053 }
e2e6a0f1
YO
5054 break;
5055 }
5056 if ( ! op ) {
5057 RExC_parse++;
5058 vFAIL3("Unknown verb pattern '%.*s'",
5059 verb_len, start_verb);
5060 }
5061 if ( argok ) {
5062 if ( start_arg && internal_argval ) {
5063 vFAIL3("Verb pattern '%.*s' may not have an argument",
5064 verb_len, start_verb);
5065 } else if ( argok < 0 && !start_arg ) {
5066 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5067 verb_len, start_verb);
5068 } else {
5069 ret = reganode(pRExC_state, op, internal_argval);
5070 if ( ! internal_argval && ! SIZE_ONLY ) {
5071 if (start_arg) {
5072 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5073 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5074 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5075 ret->flags = 0;
5076 } else {
5077 ret->flags = 1;
5078 }
5079 }
5080 }
5081 if (!internal_argval)
5082 RExC_seen |= REG_SEEN_VERBARG;
5083 } else if ( start_arg ) {
5084 vFAIL3("Verb pattern '%.*s' may not have an argument",
5085 verb_len, start_verb);
5086 } else {
5087 ret = reg_node(pRExC_state, op);
5088 }
5089 nextchar(pRExC_state);
5090 return ret;
5091 } else
fac92740 5092 if (*RExC_parse == '?') { /* (?...) */
6136c704 5093 bool is_logical = 0;
a28509cc 5094 const char * const seqstart = RExC_parse;
ca9dfc88 5095
830247a4
IZ
5096 RExC_parse++;
5097 paren = *RExC_parse++;
c277df42 5098 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5099 switch (paren) {
894be9b7 5100
1f1031fe
YO
5101 case 'P': /* (?P...) variants for those used to PCRE/Python */
5102 paren = *RExC_parse++;
5103 if ( paren == '<') /* (?P<...>) named capture */
5104 goto named_capture;
5105 else if (paren == '>') { /* (?P>name) named recursion */
5106 goto named_recursion;
5107 }
5108 else if (paren == '=') { /* (?P=...) named backref */
5109 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5110 you change this make sure you change that */
5111 char* name_start = RExC_parse;
5112 U32 num = 0;
5113 SV *sv_dat = reg_scan_name(pRExC_state,
5114 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5115 if (RExC_parse == name_start || *RExC_parse != ')')
5116 vFAIL2("Sequence %.3s... not terminated",parse_start);
5117
5118 if (!SIZE_ONLY) {
5119 num = add_data( pRExC_state, 1, "S" );
5120 RExC_rxi->data->data[num]=(void*)sv_dat;
5121 SvREFCNT_inc(sv_dat);
5122 }
5123 RExC_sawback = 1;
5124 ret = reganode(pRExC_state,
5125 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5126 num);
5127 *flagp |= HASWIDTH;
5128
5129 Set_Node_Offset(ret, parse_start+1);
5130 Set_Node_Cur_Length(ret); /* MJD */
5131
5132 nextchar(pRExC_state);
5133 return ret;
5134 }
57b84237
YO
5135 RExC_parse++;
5136 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5137 /*NOTREACHED*/
5138 case '<': /* (?<...) */
b81d288d 5139 if (*RExC_parse == '!')
c277df42 5140 paren = ',';
0a4db386 5141 else if (*RExC_parse != '=')
1f1031fe 5142 named_capture:
0a4db386 5143 { /* (?<...>) */
81714fb9 5144 char *name_start;
894be9b7 5145 SV *svname;
81714fb9
YO
5146 paren= '>';
5147 case '\'': /* (?'...') */
5148 name_start= RExC_parse;
0a4db386
YO
5149 svname = reg_scan_name(pRExC_state,
5150 SIZE_ONLY ? /* reverse test from the others */
5151 REG_RSN_RETURN_NAME :
5152 REG_RSN_RETURN_NULL);
57b84237
YO
5153 if (RExC_parse == name_start) {
5154 RExC_parse++;
5155 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5156 /*NOTREACHED*/
5157 }
81714fb9
YO
5158 if (*RExC_parse != paren)
5159 vFAIL2("Sequence (?%c... not terminated",
5160 paren=='>' ? '<' : paren);
5161 if (SIZE_ONLY) {
e62cc96a
YO
5162 HE *he_str;
5163 SV *sv_dat = NULL;
894be9b7
YO
5164 if (!svname) /* shouldnt happen */
5165 Perl_croak(aTHX_
5166 "panic: reg_scan_name returned NULL");
81714fb9
YO
5167 if (!RExC_paren_names) {
5168 RExC_paren_names= newHV();
5169 sv_2mortal((SV*)RExC_paren_names);
1f1031fe
YO
5170#ifdef DEBUGGING
5171 RExC_paren_name_list= newAV();
5172 sv_2mortal((SV*)RExC_paren_name_list);
5173#endif
81714fb9
YO
5174 }
5175 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5176 if ( he_str )
81714fb9 5177 sv_dat = HeVAL(he_str);
e62cc96a 5178 if ( ! sv_dat ) {
81714fb9 5179 /* croak baby croak */
e62cc96a
YO
5180 Perl_croak(aTHX_
5181 "panic: paren_name hash element allocation failed");
5182 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
5183 IV count=SvIV(sv_dat);
5184 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
5185 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
5186 pv[count]=RExC_npar;
5187 SvIVX(sv_dat)++;
5188 } else {
5189 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5190 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5191 SvIOK_on(sv_dat);
5192 SvIVX(sv_dat)= 1;
e62cc96a 5193 }
1f1031fe
YO
5194#ifdef DEBUGGING
5195 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5196 SvREFCNT_dec(svname);
5197#endif
e62cc96a 5198
81714fb9
YO
5199 /*sv_dump(sv_dat);*/
5200 }
5201 nextchar(pRExC_state);
5202 paren = 1;
5203 goto capturing_parens;
5204 }
5205 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5206 RExC_parse++;
fac92740
MJD
5207 case '=': /* (?=...) */
5208 case '!': /* (?!...) */
830247a4 5209 RExC_seen_zerolen++;
e2e6a0f1
YO
5210 if (*RExC_parse == ')') {
5211 ret=reg_node(pRExC_state, OPFAIL);
5212 nextchar(pRExC_state);
5213 return ret;
5214 }
fac92740
MJD
5215 case ':': /* (?:...) */
5216 case '>': /* (?>...) */
a0d0e21e 5217 break;
fac92740
MJD
5218 case '$': /* (?$...) */
5219 case '@': /* (?@...) */
8615cb43 5220 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5221 break;
fac92740 5222 case '#': /* (?#...) */
830247a4
IZ
5223 while (*RExC_parse && *RExC_parse != ')')
5224 RExC_parse++;
5225 if (*RExC_parse != ')')
c277df42 5226 FAIL("Sequence (?#... not terminated");
830247a4 5227 nextchar(pRExC_state);
a0d0e21e
LW
5228 *flagp = TRYAGAIN;
5229 return NULL;
894be9b7
YO
5230 case '0' : /* (?0) */
5231 case 'R' : /* (?R) */
5232 if (*RExC_parse != ')')
6bda09f9 5233 FAIL("Sequence (?R) not terminated");
1a147d38 5234 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
5235 nextchar(pRExC_state);
5236 return ret;
5237 /*notreached*/
894be9b7
YO
5238 { /* named and numeric backreferences */
5239 I32 num;
894be9b7
YO
5240 case '&': /* (?&NAME) */
5241 parse_start = RExC_parse - 1;
1f1031fe 5242 named_recursion:
894be9b7 5243 {
0a4db386
YO
5244 SV *sv_dat = reg_scan_name(pRExC_state,
5245 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5246 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5247 }
5248 goto gen_recurse_regop;
5249 /* NOT REACHED */
542fa716
YO
5250 case '+':
5251 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5252 RExC_parse++;
5253 vFAIL("Illegal pattern");
5254 }
5255 goto parse_recursion;
5256 /* NOT REACHED*/
5257 case '-': /* (?-1) */
5258 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5259 RExC_parse--; /* rewind to let it be handled later */
5260 goto parse_flags;
5261 }
5262 /*FALLTHROUGH */
6bda09f9
YO
5263 case '1': case '2': case '3': case '4': /* (?1) */
5264 case '5': case '6': case '7': case '8': case '9':
5265 RExC_parse--;
542fa716 5266 parse_recursion:
894be9b7
YO
5267 num = atoi(RExC_parse);
5268 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5269 if (*RExC_parse == '-')
5270 RExC_parse++;
6bda09f9
YO
5271 while (isDIGIT(*RExC_parse))
5272 RExC_parse++;
5273 if (*RExC_parse!=')')
5274 vFAIL("Expecting close bracket");
894be9b7
YO
5275
5276 gen_recurse_regop:
542fa716
YO
5277 if ( paren == '-' ) {
5278 /*
5279 Diagram of capture buffer numbering.
5280 Top line is the normal capture buffer numbers
5281 Botton line is the negative indexing as from
5282 the X (the (?-2))
5283
5284 + 1 2 3 4 5 X 6 7
5285 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5286 - 5 4 3 2 1 X x x
5287
5288 */
5289 num = RExC_npar + num;
5290 if (num < 1) {
5291 RExC_parse++;
5292 vFAIL("Reference to nonexistent group");
5293 }
5294 } else if ( paren == '+' ) {
5295 num = RExC_npar + num - 1;
5296 }
5297
1a147d38 5298 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5299 if (!SIZE_ONLY) {
5300 if (num > (I32)RExC_rx->nparens) {
5301 RExC_parse++;
5302 vFAIL("Reference to nonexistent group");
5303 }
40d049e4 5304 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5305 RExC_emit++;
226de585 5306 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5307 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5308 } else {
6bda09f9 5309 RExC_size++;
6bda09f9 5310 }
0a4db386 5311 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5312 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5313 Set_Node_Offset(ret, parse_start); /* MJD */
5314
6bda09f9
YO
5315 nextchar(pRExC_state);
5316 return ret;
894be9b7
YO
5317 } /* named and numeric backreferences */
5318 /* NOT REACHED */
5319
fac92740 5320 case 'p': /* (?p...) */
9014280d 5321 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5322 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5323 /* FALL THROUGH*/
fac92740 5324 case '?': /* (??...) */
6136c704 5325 is_logical = 1;
57b84237
YO
5326 if (*RExC_parse != '{') {
5327 RExC_parse++;
5328 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5329 /*NOTREACHED*/
5330 }
830247a4 5331 paren = *RExC_parse++;
0f5d15d6 5332 /* FALL THROUGH */
fac92740 5333 case '{': /* (?{...}) */
c277df42 5334 {
2eccd3b2
NC
5335 I32 count = 1;
5336 U32 n = 0;
c277df42 5337 char c;
830247a4 5338 char *s = RExC_parse;
c277df42 5339
830247a4
IZ
5340 RExC_seen_zerolen++;
5341 RExC_seen |= REG_SEEN_EVAL;
5342 while (count && (c = *RExC_parse)) {
6136c704
AL
5343 if (c == '\\') {
5344 if (RExC_parse[1])
5345 RExC_parse++;
5346 }
b81d288d 5347 else if (c == '{')
c277df42 5348 count++;
b81d288d 5349 else if (c == '}')
c277df42 5350 count--;
830247a4 5351 RExC_parse++;
c277df42 5352 }
6136c704 5353 if (*RExC_parse != ')') {
b81d288d 5354 RExC_parse = s;
b45f050a
JF
5355 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5356 }
c277df42 5357 if (!SIZE_ONLY) {
f3548bdc 5358 PAD *pad;
6136c704
AL
5359 OP_4tree *sop, *rop;
5360 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5361
569233ed
SB
5362 ENTER;
5363 Perl_save_re_context(aTHX);
f3548bdc 5364 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5365 sop->op_private |= OPpREFCOUNTED;
5366 /* re_dup will OpREFCNT_inc */
5367 OpREFCNT_set(sop, 1);
569233ed 5368 LEAVE;
c277df42 5369
830247a4 5370 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5371 RExC_rxi->data->data[n] = (void*)rop;
5372 RExC_rxi->data->data[n+1] = (void*)sop;
5373 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5374 SvREFCNT_dec(sv);
a0ed51b3 5375 }
e24b16f9 5376 else { /* First pass */
830247a4 5377 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5378 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5379 /* No compiled RE interpolated, has runtime
5380 components ===> unsafe. */
5381 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5382 if (PL_tainting && PL_tainted)
cc6b7395 5383 FAIL("Eval-group in insecure regular expression");
54df2634 5384#if PERL_VERSION > 8
923e4eb5 5385 if (IN_PERL_COMPILETIME)
b5c19bd7 5386 PL_cv_has_eval = 1;
54df2634 5387#endif
c277df42 5388 }
b5c19bd7 5389
830247a4 5390 nextchar(pRExC_state);
6136c704 5391 if (is_logical) {
830247a4 5392 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5393 if (!SIZE_ONLY)
5394 ret->flags = 2;
3dab1dad 5395 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5396 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5397 return ret;
5398 }
ccb2c380
MP
5399 ret = reganode(pRExC_state, EVAL, n);
5400 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5401 Set_Node_Offset(ret, parse_start);
5402 return ret;
c277df42 5403 }
fac92740 5404 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5405 {
0a4db386 5406 int is_define= 0;
fac92740 5407 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5408 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5409 || RExC_parse[1] == '<'
830247a4 5410 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5411 I32 flag;
5412
830247a4 5413 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5414 if (!SIZE_ONLY)
5415 ret->flags = 1;
3dab1dad 5416 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5417 goto insert_if;
b81d288d 5418 }
a0ed51b3 5419 }
0a4db386
YO
5420 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5421 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5422 {
5423 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5424 char *name_start= RExC_parse++;
2eccd3b2 5425 U32 num = 0;
0a4db386
YO
5426 SV *sv_dat=reg_scan_name(pRExC_state,
5427 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5428 if (RExC_parse == name_start || *RExC_parse != ch)
5429 vFAIL2("Sequence (?(%c... not terminated",
5430 (ch == '>' ? '<' : ch));
5431 RExC_parse++;
5432 if (!SIZE_ONLY) {
5433 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5434 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5435 SvREFCNT_inc(sv_dat);
5436 }
5437 ret = reganode(pRExC_state,NGROUPP,num);
5438 goto insert_if_check_paren;
5439 }
5440 else if (RExC_parse[0] == 'D' &&
5441 RExC_parse[1] == 'E' &&
5442 RExC_parse[2] == 'F' &&
5443 RExC_parse[3] == 'I' &&
5444 RExC_parse[4] == 'N' &&
5445 RExC_parse[5] == 'E')
5446 {
5447 ret = reganode(pRExC_state,DEFINEP,0);
5448 RExC_parse +=6 ;
5449 is_define = 1;
5450 goto insert_if_check_paren;
5451 }
5452 else if (RExC_parse[0] == 'R') {
5453 RExC_parse++;
5454 parno = 0;
5455 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5456 parno = atoi(RExC_parse++);
5457 while (isDIGIT(*RExC_parse))
5458 RExC_parse++;
5459 } else if (RExC_parse[0] == '&') {
5460 SV *sv_dat;
5461 RExC_parse++;
5462 sv_dat = reg_scan_name(pRExC_state,
5463 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5464 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5465 }
1a147d38 5466 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5467 goto insert_if_check_paren;
5468 }
830247a4 5469 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5470 /* (?(1)...) */
6136c704 5471 char c;
830247a4 5472 parno = atoi(RExC_parse++);
c277df42 5473
830247a4
IZ
5474 while (isDIGIT(*RExC_parse))
5475 RExC_parse++;
fac92740 5476 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5477
0a4db386 5478 insert_if_check_paren:
830247a4 5479 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5480 vFAIL("Switch condition not recognized");
c277df42 5481 insert_if:
3dab1dad
YO
5482 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5483 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5484 if (br == NULL)
830247a4 5485 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5486 else
3dab1dad 5487 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5488 c = *nextchar(pRExC_state);
d1b80229
IZ
5489 if (flags&HASWIDTH)
5490 *flagp |= HASWIDTH;
c277df42 5491 if (c == '|') {
0a4db386
YO
5492 if (is_define)
5493 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5494 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5495 regbranch(pRExC_state, &flags, 1,depth+1);
5496 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5497 if (flags&HASWIDTH)
5498 *flagp |= HASWIDTH;
830247a4 5499 c = *nextchar(pRExC_state);
a0ed51b3
LW
5500 }
5501 else
c277df42
IZ
5502 lastbr = NULL;
5503 if (c != ')')
8615cb43 5504 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5505 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5506 REGTAIL(pRExC_state, br, ender);
c277df42 5507 if (lastbr) {
3dab1dad
YO
5508 REGTAIL(pRExC_state, lastbr, ender);
5509 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5510 }
5511 else
3dab1dad 5512 REGTAIL(pRExC_state, ret, ender);
c277df42 5513 return ret;
a0ed51b3
LW
5514 }
5515 else {
830247a4 5516 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5517 }
5518 }
1b1626e4 5519 case 0:
830247a4 5520 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5521 vFAIL("Sequence (? incomplete");
1b1626e4 5522 break;
a0d0e21e 5523 default:
cde0cee5
YO
5524 --RExC_parse;
5525 parse_flags: /* (?i) */
5526 {
5527 U32 posflags = 0, negflags = 0;
5528 U32 *flagsp = &posflags;
5529
5530 while (*RExC_parse) {
5531 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
5532 /* (?g), (?gc) and (?o) are useless here
5533 and must be globally applied -- japhy */
cde0cee5
YO
5534 switch (*RExC_parse) {
5535 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5536 case 'o':
5537 case 'g':
9d1d55b5 5538 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5539 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5540 if (! (wastedflags & wflagbit) ) {
5541 wastedflags |= wflagbit;
5542 vWARN5(
5543 RExC_parse + 1,
5544 "Useless (%s%c) - %suse /%c modifier",
5545 flagsp == &negflags ? "?-" : "?",
5546 *RExC_parse,
5547 flagsp == &negflags ? "don't " : "",
5548 *RExC_parse
5549 );
5550 }
5551 }
cde0cee5
YO
5552 break;
5553
5554 case 'c':
9d1d55b5 5555 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5556 if (! (wastedflags & WASTED_C) ) {
5557 wastedflags |= WASTED_GC;
9d1d55b5
JP
5558 vWARN3(
5559 RExC_parse + 1,
5560 "Useless (%sc) - %suse /gc modifier",
5561 flagsp == &negflags ? "?-" : "?",
5562 flagsp == &negflags ? "don't " : ""
5563 );
5564 }
5565 }
cde0cee5
YO
5566 break;
5567 case 'k':
5568 if (flagsp == &negflags) {
5569 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5570 vWARN(RExC_parse + 1,"Useless use of (?-k)");
5571 } else {
5572 *flagsp |= RXf_PMf_KEEPCOPY;
5573 }
5574 break;
5575 case '-':
57b84237
YO
5576 if (flagsp == &negflags) {
5577 RExC_parse++;
5578 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5579 /*NOTREACHED*/
5580 }
cde0cee5
YO
5581 flagsp = &negflags;
5582 wastedflags = 0; /* reset so (?g-c) warns twice */
5583 break;
5584 case ':':
5585 paren = ':';
5586 /*FALLTHROUGH*/
5587 case ')':
5588 RExC_flags |= posflags;
5589 RExC_flags &= ~negflags;
5590 nextchar(pRExC_state);
5591 if (paren != ':') {
5592 *flagp = TRYAGAIN;
5593 return NULL;
5594 } else {
5595 ret = NULL;
5596 goto parse_rest;
5597 }
5598 /*NOTREACHED*/
5599 default:
cde0cee5
YO
5600 RExC_parse++;
5601 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5602 /*NOTREACHED*/
5603 }
830247a4 5604 ++RExC_parse;
48c036b1 5605 }
cde0cee5 5606 }} /* one for the default block, one for the switch */
a0d0e21e 5607 }
fac92740 5608 else { /* (...) */
81714fb9 5609 capturing_parens:
830247a4
IZ
5610 parno = RExC_npar;
5611 RExC_npar++;
e2e6a0f1 5612
830247a4 5613 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5614 if (!SIZE_ONLY ){
5615 if (!RExC_nestroot)
5616 RExC_nestroot = parno;
5617 if (RExC_seen & REG_SEEN_RECURSE) {
5618 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5619 "Setting open paren #%"IVdf" to %d\n",
5620 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5621 RExC_open_parens[parno-1]= ret;
5622 }
6bda09f9 5623 }
fac92740
MJD
5624 Set_Node_Length(ret, 1); /* MJD */
5625 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5626 is_open = 1;
a0d0e21e 5627 }
a0ed51b3 5628 }
fac92740 5629 else /* ! paren */
a0d0e21e 5630 ret = NULL;
cde0cee5
YO
5631
5632 parse_rest:
a0d0e21e 5633 /* Pick up the branches, linking them together. */
fac92740 5634 parse_start = RExC_parse; /* MJD */
3dab1dad 5635 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5636 /* branch_len = (paren != 0); */
2af232bd 5637
a0d0e21e
LW
5638 if (br == NULL)
5639 return(NULL);
830247a4
IZ
5640 if (*RExC_parse == '|') {
5641 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5642 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5643 }
fac92740 5644 else { /* MJD */
6bda09f9 5645 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5646 Set_Node_Length(br, paren != 0);
5647 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5648 }
c277df42
IZ
5649 have_branch = 1;
5650 if (SIZE_ONLY)
830247a4 5651 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5652 }
5653 else if (paren == ':') {
c277df42
IZ
5654 *flagp |= flags&SIMPLE;
5655 }
6136c704 5656 if (is_open) { /* Starts with OPEN. */
3dab1dad 5657 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5658 }
5659 else if (paren != '?') /* Not Conditional */
a0d0e21e 5660 ret = br;
32a0ca98 5661 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5662 lastbr = br;
830247a4
IZ
5663 while (*RExC_parse == '|') {
5664 if (!SIZE_ONLY && RExC_extralen) {
5665 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5666 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5667 }
5668 if (SIZE_ONLY)
830247a4
IZ
5669 RExC_extralen += 2; /* Account for LONGJMP. */
5670 nextchar(pRExC_state);
3dab1dad 5671 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5672
a687059c 5673 if (br == NULL)
a0d0e21e 5674 return(NULL);
3dab1dad 5675 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5676 lastbr = br;
821b33a5
IZ
5677 if (flags&HASWIDTH)
5678 *flagp |= HASWIDTH;
a687059c 5679 *flagp |= flags&SPSTART;
a0d0e21e
LW
5680 }
5681
c277df42
IZ
5682 if (have_branch || paren != ':') {
5683 /* Make a closing node, and hook it on the end. */
5684 switch (paren) {
5685 case ':':
830247a4 5686 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5687 break;
5688 case 1:
830247a4 5689 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5690 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5691 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5692 "Setting close paren #%"IVdf" to %d\n",
5693 (IV)parno, REG_NODE_NUM(ender)));
5694 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5695 if (RExC_nestroot == parno)
5696 RExC_nestroot = 0;
40d049e4 5697 }
fac92740
MJD
5698 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5699 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5700 break;
5701 case '<':
c277df42
IZ
5702 case ',':
5703 case '=':
5704 case '!':
c277df42 5705 *flagp &= ~HASWIDTH;
821b33a5
IZ
5706 /* FALL THROUGH */
5707 case '>':
830247a4 5708 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5709 break;
5710 case 0:
830247a4 5711 ender = reg_node(pRExC_state, END);
40d049e4
YO
5712 if (!SIZE_ONLY) {
5713 assert(!RExC_opend); /* there can only be one! */
5714 RExC_opend = ender;
5715 }
c277df42
IZ
5716 break;
5717 }
eaf3ca90 5718 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5719
9674d46a 5720 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5721 if (depth==1)
5722 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5723
c277df42 5724 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5725 for (br = ret; br; br = regnext(br)) {
5726 const U8 op = PL_regkind[OP(br)];
5727 if (op == BRANCH) {
07be1b83 5728 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5729 }
5730 else if (op == BRANCHJ) {
07be1b83 5731 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5732 }
c277df42
IZ
5733 }
5734 }
a0d0e21e 5735 }
c277df42
IZ
5736
5737 {
e1ec3a88
AL
5738 const char *p;
5739 static const char parens[] = "=!<,>";
c277df42
IZ
5740
5741 if (paren && (p = strchr(parens, paren))) {
eb160463 5742 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5743 int flag = (p - parens) > 1;
5744
5745 if (paren == '>')
5746 node = SUSPEND, flag = 0;
6bda09f9 5747 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5748 Set_Node_Cur_Length(ret);
5749 Set_Node_Offset(ret, parse_start + 1);
c277df42 5750 ret->flags = flag;
07be1b83 5751 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5752 }
a0d0e21e
LW
5753 }
5754
5755 /* Check for proper termination. */
ce3e6498 5756 if (paren) {
e2509266 5757 RExC_flags = oregflags;
830247a4
IZ
5758 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5759 RExC_parse = oregcomp_parse;
380a0633 5760 vFAIL("Unmatched (");
ce3e6498 5761 }
a0ed51b3 5762 }
830247a4
IZ
5763 else if (!paren && RExC_parse < RExC_end) {
5764 if (*RExC_parse == ')') {
5765 RExC_parse++;
380a0633 5766 vFAIL("Unmatched )");
a0ed51b3
LW
5767 }
5768 else
b45f050a 5769 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5770 /* NOTREACHED */
5771 }
a687059c 5772
a0d0e21e 5773 return(ret);
a687059c
LW
5774}
5775
5776/*
5777 - regbranch - one alternative of an | operator
5778 *
5779 * Implements the concatenation operator.
5780 */
76e3520e 5781STATIC regnode *
3dab1dad 5782S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5783{
97aff369 5784 dVAR;
c277df42
IZ
5785 register regnode *ret;
5786 register regnode *chain = NULL;
5787 register regnode *latest;
5788 I32 flags = 0, c = 0;
3dab1dad
YO
5789 GET_RE_DEBUG_FLAGS_DECL;
5790 DEBUG_PARSE("brnc");
b81d288d 5791 if (first)
c277df42
IZ
5792 ret = NULL;
5793 else {
b81d288d 5794 if (!SIZE_ONLY && RExC_extralen)
830247a4 5795 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5796 else {
830247a4 5797 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5798 Set_Node_Length(ret, 1);
5799 }
c277df42
IZ
5800 }
5801
b81d288d 5802 if (!first && SIZE_ONLY)
830247a4 5803 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5804
c277df42 5805 *flagp = WORST; /* Tentatively. */
a0d0e21e 5806
830247a4
IZ
5807 RExC_parse--;
5808 nextchar(pRExC_state);
5809 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5810 flags &= ~TRYAGAIN;
3dab1dad 5811 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5812 if (latest == NULL) {
5813 if (flags & TRYAGAIN)
5814 continue;
5815 return(NULL);
a0ed51b3
LW
5816 }
5817 else if (ret == NULL)
c277df42 5818 ret = latest;
a0d0e21e 5819 *flagp |= flags&HASWIDTH;
c277df42 5820 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5821 *flagp |= flags&SPSTART;
5822 else {
830247a4 5823 RExC_naughty++;
3dab1dad 5824 REGTAIL(pRExC_state, chain, latest);
a687059c 5825 }
a0d0e21e 5826 chain = latest;
c277df42
IZ
5827 c++;
5828 }
5829 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5830 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5831 if (ret == NULL)
5832 ret = chain;
5833 }
5834 if (c == 1) {
5835 *flagp |= flags&SIMPLE;
a0d0e21e 5836 }
a687059c 5837
d4c19fe8 5838 return ret;
a687059c
LW
5839}
5840
5841/*
5842 - regpiece - something followed by possible [*+?]
5843 *
5844 * Note that the branching code sequences used for ? and the general cases
5845 * of * and + are somewhat optimized: they use the same NOTHING node as
5846 * both the endmarker for their branch list and the body of the last branch.
5847 * It might seem that this node could be dispensed with entirely, but the
5848 * endmarker role is not redundant.
5849 */
76e3520e 5850STATIC regnode *
3dab1dad 5851S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5852{
97aff369 5853 dVAR;
c277df42 5854 register regnode *ret;
a0d0e21e
LW
5855 register char op;
5856 register char *next;
5857 I32 flags;
1df70142 5858 const char * const origparse = RExC_parse;
a0d0e21e 5859 I32 min;
c277df42 5860 I32 max = REG_INFTY;
fac92740 5861 char *parse_start;
10edeb5d 5862 const char *maxpos = NULL;
3dab1dad
YO
5863 GET_RE_DEBUG_FLAGS_DECL;
5864 DEBUG_PARSE("piec");
a0d0e21e 5865
3dab1dad 5866 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5867 if (ret == NULL) {
5868 if (flags & TRYAGAIN)
5869 *flagp |= TRYAGAIN;
5870 return(NULL);
5871 }
5872
830247a4 5873 op = *RExC_parse;
a0d0e21e 5874
830247a4 5875 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5876 maxpos = NULL;
fac92740 5877 parse_start = RExC_parse; /* MJD */
830247a4 5878 next = RExC_parse + 1;
a0d0e21e
LW
5879 while (isDIGIT(*next) || *next == ',') {
5880 if (*next == ',') {
5881 if (maxpos)
5882 break;
5883 else
5884 maxpos = next;
a687059c 5885 }
a0d0e21e
LW
5886 next++;
5887 }
5888 if (*next == '}') { /* got one */
5889 if (!maxpos)
5890 maxpos = next;
830247a4
IZ
5891 RExC_parse++;
5892 min = atoi(RExC_parse);
a0d0e21e
LW
5893 if (*maxpos == ',')
5894 maxpos++;
5895 else
830247a4 5896 maxpos = RExC_parse;
a0d0e21e
LW
5897 max = atoi(maxpos);
5898 if (!max && *maxpos != '0')
c277df42
IZ
5899 max = REG_INFTY; /* meaning "infinity" */
5900 else if (max >= REG_INFTY)
8615cb43 5901 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5902 RExC_parse = next;
5903 nextchar(pRExC_state);
a0d0e21e
LW
5904
5905 do_curly:
5906 if ((flags&SIMPLE)) {
830247a4 5907 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5908 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5909 Set_Node_Offset(ret, parse_start+1); /* MJD */
5910 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5911 }
5912 else {
3dab1dad 5913 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5914
5915 w->flags = 0;
3dab1dad 5916 REGTAIL(pRExC_state, ret, w);
830247a4 5917 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5918 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5919 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5920 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5921 }
6bda09f9 5922 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5923 /* MJD hk */
5924 Set_Node_Offset(ret, parse_start+1);
2af232bd 5925 Set_Node_Length(ret,
fac92740 5926 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5927
830247a4 5928 if (!SIZE_ONLY && RExC_extralen)
c277df42 5929 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5930 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5931 if (SIZE_ONLY)
830247a4
IZ
5932 RExC_whilem_seen++, RExC_extralen += 3;
5933 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5934 }
c277df42 5935 ret->flags = 0;
a0d0e21e
LW
5936
5937 if (min > 0)
821b33a5
IZ
5938 *flagp = WORST;
5939 if (max > 0)
5940 *flagp |= HASWIDTH;
a0d0e21e 5941 if (max && max < min)
8615cb43 5942 vFAIL("Can't do {n,m} with n > m");
c277df42 5943 if (!SIZE_ONLY) {
eb160463
GS
5944 ARG1_SET(ret, (U16)min);
5945 ARG2_SET(ret, (U16)max);
a687059c 5946 }
a687059c 5947
a0d0e21e 5948 goto nest_check;
a687059c 5949 }
a0d0e21e 5950 }
a687059c 5951
a0d0e21e
LW
5952 if (!ISMULT1(op)) {
5953 *flagp = flags;
a687059c 5954 return(ret);
a0d0e21e 5955 }
bb20fd44 5956
c277df42 5957#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5958
5959 /* if this is reinstated, don't forget to put this back into perldiag:
5960
5961 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5962
5963 (F) The part of the regexp subject to either the * or + quantifier
5964 could match an empty string. The {#} shows in the regular
5965 expression about where the problem was discovered.
5966
5967 */
5968
bb20fd44 5969 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5970 vFAIL("Regexp *+ operand could be empty");
b81d288d 5971#endif
bb20fd44 5972
fac92740 5973 parse_start = RExC_parse;
830247a4 5974 nextchar(pRExC_state);
a0d0e21e 5975
821b33a5 5976 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5977
5978 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5979 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5980 ret->flags = 0;
830247a4 5981 RExC_naughty += 4;
a0d0e21e
LW
5982 }
5983 else if (op == '*') {
5984 min = 0;
5985 goto do_curly;
a0ed51b3
LW
5986 }
5987 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5988 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5989 ret->flags = 0;
830247a4 5990 RExC_naughty += 3;
a0d0e21e
LW
5991 }
5992 else if (op == '+') {
5993 min = 1;
5994 goto do_curly;
a0ed51b3
LW
5995 }
5996 else if (op == '?') {
a0d0e21e
LW
5997 min = 0; max = 1;
5998 goto do_curly;
5999 }
6000 nest_check:
041457d9 6001 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 6002 vWARN3(RExC_parse,
b45f050a 6003 "%.*s matches null string many times",
afd78fd5 6004 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 6005 origparse);
a0d0e21e
LW
6006 }
6007
b9b4dddf 6008 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6009 nextchar(pRExC_state);
6bda09f9 6010 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6011 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6012 }
b9b4dddf
YO
6013#ifndef REG_ALLOW_MINMOD_SUSPEND
6014 else
6015#endif
6016 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6017 regnode *ender;
6018 nextchar(pRExC_state);
6019 ender = reg_node(pRExC_state, SUCCEED);
6020 REGTAIL(pRExC_state, ret, ender);
6021 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6022 ret->flags = 0;
6023 ender = reg_node(pRExC_state, TAIL);
6024 REGTAIL(pRExC_state, ret, ender);
6025 /*ret= ender;*/
6026 }
6027
6028 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6029 RExC_parse++;
b45f050a
JF
6030 vFAIL("Nested quantifiers");
6031 }
a0d0e21e
LW
6032
6033 return(ret);
a687059c
LW
6034}
6035
fc8cd66c
YO
6036
6037/* reg_namedseq(pRExC_state,UVp)
6038
6039 This is expected to be called by a parser routine that has
6040 recognized'\N' and needs to handle the rest. RExC_parse is
6041 expected to point at the first char following the N at the time
6042 of the call.
6043
6044 If valuep is non-null then it is assumed that we are parsing inside
6045 of a charclass definition and the first codepoint in the resolved
6046 string is returned via *valuep and the routine will return NULL.
6047 In this mode if a multichar string is returned from the charnames
6048 handler a warning will be issued, and only the first char in the
6049 sequence will be examined. If the string returned is zero length
6050 then the value of *valuep is undefined and NON-NULL will
6051 be returned to indicate failure. (This will NOT be a valid pointer
6052 to a regnode.)
6053
6054 If value is null then it is assumed that we are parsing normal text
6055 and inserts a new EXACT node into the program containing the resolved
6056 string and returns a pointer to the new node. If the string is
6057 zerolength a NOTHING node is emitted.
6058
6059 On success RExC_parse is set to the char following the endbrace.
6060 Parsing failures will generate a fatal errorvia vFAIL(...)
6061
6062 NOTE: We cache all results from the charnames handler locally in
6063 the RExC_charnames hash (created on first use) to prevent a charnames
6064 handler from playing silly-buggers and returning a short string and
6065 then a long string for a given pattern. Since the regexp program
6066 size is calculated during an initial parse this would result
6067 in a buffer overrun so we cache to prevent the charname result from
6068 changing during the course of the parse.
6069
6070 */
6071STATIC regnode *
6072S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6073{
6074 char * name; /* start of the content of the name */
6075 char * endbrace; /* endbrace following the name */
6076 SV *sv_str = NULL;
6077 SV *sv_name = NULL;
6078 STRLEN len; /* this has various purposes throughout the code */
6079 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6080 regnode *ret = NULL;
6081
6082 if (*RExC_parse != '{') {
6083 vFAIL("Missing braces on \\N{}");
6084 }
6085 name = RExC_parse+1;
6086 endbrace = strchr(RExC_parse, '}');
6087 if ( ! endbrace ) {
6088 RExC_parse++;
6089 vFAIL("Missing right brace on \\N{}");
6090 }
6091 RExC_parse = endbrace + 1;
6092
6093
6094 /* RExC_parse points at the beginning brace,
6095 endbrace points at the last */
6096 if ( name[0]=='U' && name[1]=='+' ) {
6097 /* its a "unicode hex" notation {U+89AB} */
6098 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6099 | PERL_SCAN_DISALLOW_PREFIX
6100 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6101 UV cp;
196f1508 6102 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6103 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6104 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6105 cp = 0xFFFD;
6106 }
6107 if (cp > 0xff)
6108 RExC_utf8 = 1;
6109 if ( valuep ) {
6110 *valuep = cp;
6111 return NULL;
6112 }
6113 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6114 } else {
6115 /* fetch the charnames handler for this scope */
6116 HV * const table = GvHV(PL_hintgv);
6117 SV **cvp= table ?
6118 hv_fetchs(table, "charnames", FALSE) :
6119 NULL;
6120 SV *cv= cvp ? *cvp : NULL;
6121 HE *he_str;
6122 int count;
6123 /* create an SV with the name as argument */
6124 sv_name = newSVpvn(name, endbrace - name);
6125
6126 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6127 vFAIL2("Constant(\\N{%s}) unknown: "
6128 "(possibly a missing \"use charnames ...\")",
6129 SvPVX(sv_name));
6130 }
6131 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6132 vFAIL2("Constant(\\N{%s}): "
6133 "$^H{charnames} is not defined",SvPVX(sv_name));
6134 }
6135
6136
6137
6138 if (!RExC_charnames) {
6139 /* make sure our cache is allocated */
6140 RExC_charnames = newHV();
6bda09f9 6141 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
6142 }
6143 /* see if we have looked this one up before */
6144 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6145 if ( he_str ) {
6146 sv_str = HeVAL(he_str);
6147 cached = 1;
6148 } else {
6149 dSP ;
6150
6151 ENTER ;
6152 SAVETMPS ;
6153 PUSHMARK(SP) ;
6154
6155 XPUSHs(sv_name);
6156
6157 PUTBACK ;
6158
6159 count= call_sv(cv, G_SCALAR);
6160
6161 if (count == 1) { /* XXXX is this right? dmq */
6162 sv_str = POPs;
6163 SvREFCNT_inc_simple_void(sv_str);
6164 }
6165
6166 SPAGAIN ;
6167 PUTBACK ;
6168 FREETMPS ;
6169 LEAVE ;
6170
6171 if ( !sv_str || !SvOK(sv_str) ) {
6172 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6173 "did not return a defined value",SvPVX(sv_name));
6174 }
6175 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6176 cached = 1;
6177 }
6178 }
6179 if (valuep) {
6180 char *p = SvPV(sv_str, len);
6181 if (len) {
6182 STRLEN numlen = 1;
6183 if ( SvUTF8(sv_str) ) {
196f1508 6184 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6185 if (*valuep > 0x7F)
6186 RExC_utf8 = 1;
6187 /* XXXX
6188 We have to turn on utf8 for high bit chars otherwise
6189 we get failures with
6190
6191 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6192 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6193
6194 This is different from what \x{} would do with the same
6195 codepoint, where the condition is > 0xFF.
6196 - dmq
6197 */
6198
6199
6200 } else {
6201 *valuep = (UV)*p;
6202 /* warn if we havent used the whole string? */
6203 }
6204 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6205 vWARN2(RExC_parse,
6206 "Ignoring excess chars from \\N{%s} in character class",
6207 SvPVX(sv_name)
6208 );
6209 }
6210 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6211 vWARN2(RExC_parse,
6212 "Ignoring zero length \\N{%s} in character class",
6213 SvPVX(sv_name)
6214 );
6215 }
6216 if (sv_name)
6217 SvREFCNT_dec(sv_name);
6218 if (!cached)
6219 SvREFCNT_dec(sv_str);
6220 return len ? NULL : (regnode *)&len;
6221 } else if(SvCUR(sv_str)) {
6222
6223 char *s;
6224 char *p, *pend;
6225 STRLEN charlen = 1;
d008bc60 6226#ifdef DEBUGGING
fc8cd66c 6227 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6228#endif
fc8cd66c
YO
6229 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6230
6231 ret = reg_node(pRExC_state,
6232 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6233 s= STRING(ret);
6234
6235 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6236 sv_utf8_upgrade(sv_str);
6237 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6238 RExC_utf8= 1;
6239 }
6240
6241 p = SvPV(sv_str, len);
6242 pend = p + len;
6243 /* len is the length written, charlen is the size the char read */
6244 for ( len = 0; p < pend; p += charlen ) {
6245 if (UTF) {
196f1508 6246 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6247 if (FOLD) {
6248 STRLEN foldlen,numlen;
6249 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6250 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6251 /* Emit all the Unicode characters. */
6252
6253 for (foldbuf = tmpbuf;
6254 foldlen;
6255 foldlen -= numlen)
6256 {
6257 uvc = utf8_to_uvchr(foldbuf, &numlen);
6258 if (numlen > 0) {
6259 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6260 s += unilen;
6261 len += unilen;
6262 /* In EBCDIC the numlen
6263 * and unilen can differ. */
6264 foldbuf += numlen;
6265 if (numlen >= foldlen)
6266 break;
6267 }
6268 else
6269 break; /* "Can't happen." */
6270 }
6271 } else {
6272 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6273 if (unilen > 0) {
6274 s += unilen;
6275 len += unilen;
6276 }
6277 }
6278 } else {
6279 len++;
6280 REGC(*p, s++);
6281 }
6282 }
6283 if (SIZE_ONLY) {
6284 RExC_size += STR_SZ(len);
6285 } else {
6286 STR_LEN(ret) = len;
6287 RExC_emit += STR_SZ(len);
6288 }
6289 Set_Node_Cur_Length(ret); /* MJD */
6290 RExC_parse--;
6291 nextchar(pRExC_state);
6292 } else {
6293 ret = reg_node(pRExC_state,NOTHING);
6294 }
6295 if (!cached) {
6296 SvREFCNT_dec(sv_str);
6297 }
6298 if (sv_name) {
6299 SvREFCNT_dec(sv_name);
6300 }
6301 return ret;
6302
6303}
6304
6305
9e08bc66
TS
6306/*
6307 * reg_recode
6308 *
6309 * It returns the code point in utf8 for the value in *encp.
6310 * value: a code value in the source encoding
6311 * encp: a pointer to an Encode object
6312 *
6313 * If the result from Encode is not a single character,
6314 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6315 */
6316STATIC UV
6317S_reg_recode(pTHX_ const char value, SV **encp)
6318{
6319 STRLEN numlen = 1;
6320 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6321 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6322 : SvPVX(sv);
6323 const STRLEN newlen = SvCUR(sv);
6324 UV uv = UNICODE_REPLACEMENT;
6325
6326 if (newlen)
6327 uv = SvUTF8(sv)
6328 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6329 : *(U8*)s;
6330
6331 if (!newlen || numlen != newlen) {
6332 uv = UNICODE_REPLACEMENT;
6333 if (encp)
6334 *encp = NULL;
6335 }
6336 return uv;
6337}
6338
fc8cd66c 6339
a687059c
LW
6340/*
6341 - regatom - the lowest level
ee9b8eae
YO
6342
6343 Try to identify anything special at the start of the pattern. If there
6344 is, then handle it as required. This may involve generating a single regop,
6345 such as for an assertion; or it may involve recursing, such as to
6346 handle a () structure.
6347
6348 If the string doesn't start with something special then we gobble up
6349 as much literal text as we can.
6350
6351 Once we have been able to handle whatever type of thing started the
6352 sequence, we return.
6353
6354 Note: we have to be careful with escapes, as they can be both literal
6355 and special, and in the case of \10 and friends can either, depending
6356 on context. Specifically there are two seperate switches for handling
6357 escape sequences, with the one for handling literal escapes requiring
6358 a dummy entry for all of the special escapes that are actually handled
6359 by the other.
6360*/
6361
76e3520e 6362STATIC regnode *
3dab1dad 6363S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6364{
97aff369 6365 dVAR;
cbbf8932 6366 register regnode *ret = NULL;
a0d0e21e 6367 I32 flags;
45948336 6368 char *parse_start = RExC_parse;
3dab1dad
YO
6369 GET_RE_DEBUG_FLAGS_DECL;
6370 DEBUG_PARSE("atom");
a0d0e21e
LW
6371 *flagp = WORST; /* Tentatively. */
6372
ee9b8eae 6373
a0d0e21e 6374tryagain:
830247a4 6375 switch (*RExC_parse) {
a0d0e21e 6376 case '^':
830247a4
IZ
6377 RExC_seen_zerolen++;
6378 nextchar(pRExC_state);
bbe252da 6379 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6380 ret = reg_node(pRExC_state, MBOL);
bbe252da 6381 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6382 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6383 else
830247a4 6384 ret = reg_node(pRExC_state, BOL);
fac92740 6385 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6386 break;
6387 case '$':
830247a4 6388 nextchar(pRExC_state);
b81d288d 6389 if (*RExC_parse)
830247a4 6390 RExC_seen_zerolen++;
bbe252da 6391 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6392 ret = reg_node(pRExC_state, MEOL);
bbe252da 6393 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6394 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6395 else
830247a4 6396 ret = reg_node(pRExC_state, EOL);
fac92740 6397 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6398 break;
6399 case '.':
830247a4 6400 nextchar(pRExC_state);
bbe252da 6401 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6402 ret = reg_node(pRExC_state, SANY);
6403 else
6404 ret = reg_node(pRExC_state, REG_ANY);
6405 *flagp |= HASWIDTH|SIMPLE;
830247a4 6406 RExC_naughty++;
fac92740 6407 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6408 break;
6409 case '[':
b45f050a 6410 {
3dab1dad
YO
6411 char * const oregcomp_parse = ++RExC_parse;
6412 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6413 if (*RExC_parse != ']') {
6414 RExC_parse = oregcomp_parse;
b45f050a
JF
6415 vFAIL("Unmatched [");
6416 }
830247a4 6417 nextchar(pRExC_state);
a0d0e21e 6418 *flagp |= HASWIDTH|SIMPLE;
fac92740 6419 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6420 break;
b45f050a 6421 }
a0d0e21e 6422 case '(':
830247a4 6423 nextchar(pRExC_state);
3dab1dad 6424 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6425 if (ret == NULL) {
bf93d4cc 6426 if (flags & TRYAGAIN) {
830247a4 6427 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6428 /* Make parent create an empty node if needed. */
6429 *flagp |= TRYAGAIN;
6430 return(NULL);
6431 }
a0d0e21e 6432 goto tryagain;
bf93d4cc 6433 }
a0d0e21e
LW
6434 return(NULL);
6435 }
c277df42 6436 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6437 break;
6438 case '|':
6439 case ')':
6440 if (flags & TRYAGAIN) {
6441 *flagp |= TRYAGAIN;
6442 return NULL;
6443 }
b45f050a 6444 vFAIL("Internal urp");
a0d0e21e
LW
6445 /* Supposed to be caught earlier. */
6446 break;
85afd4ae 6447 case '{':
830247a4
IZ
6448 if (!regcurly(RExC_parse)) {
6449 RExC_parse++;
85afd4ae
CS
6450 goto defchar;
6451 }
6452 /* FALL THROUGH */
a0d0e21e
LW
6453 case '?':
6454 case '+':
6455 case '*':
830247a4 6456 RExC_parse++;
b45f050a 6457 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6458 break;
6459 case '\\':
ee9b8eae
YO
6460 /* Special Escapes
6461
6462 This switch handles escape sequences that resolve to some kind
6463 of special regop and not to literal text. Escape sequnces that
6464 resolve to literal text are handled below in the switch marked
6465 "Literal Escapes".
6466
6467 Every entry in this switch *must* have a corresponding entry
6468 in the literal escape switch. However, the opposite is not
6469 required, as the default for this switch is to jump to the
6470 literal text handling code.
6471 */
830247a4 6472 switch (*++RExC_parse) {
ee9b8eae 6473 /* Special Escapes */
a0d0e21e 6474 case 'A':
830247a4
IZ
6475 RExC_seen_zerolen++;
6476 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6477 *flagp |= SIMPLE;
ee9b8eae 6478 goto finish_meta_pat;
a0d0e21e 6479 case 'G':
830247a4
IZ
6480 ret = reg_node(pRExC_state, GPOS);
6481 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6482 *flagp |= SIMPLE;
ee9b8eae
YO
6483 goto finish_meta_pat;
6484 case 'K':
6485 RExC_seen_zerolen++;
6486 ret = reg_node(pRExC_state, KEEPS);
6487 *flagp |= SIMPLE;
6488 goto finish_meta_pat;
a0d0e21e 6489 case 'Z':
830247a4 6490 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6491 *flagp |= SIMPLE;
a1917ab9 6492 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6493 goto finish_meta_pat;
b85d18e9 6494 case 'z':
830247a4 6495 ret = reg_node(pRExC_state, EOS);
b85d18e9 6496 *flagp |= SIMPLE;
830247a4 6497 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6498 goto finish_meta_pat;
4a2d328f 6499 case 'C':
f33976b4
DB
6500 ret = reg_node(pRExC_state, CANY);
6501 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6502 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6503 goto finish_meta_pat;
a0ed51b3 6504 case 'X':
830247a4 6505 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6506 *flagp |= HASWIDTH;
ee9b8eae 6507 goto finish_meta_pat;
a0d0e21e 6508 case 'w':
eb160463 6509 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6510 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6511 goto finish_meta_pat;
a0d0e21e 6512 case 'W':
eb160463 6513 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6514 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6515 goto finish_meta_pat;
a0d0e21e 6516 case 'b':
830247a4
IZ
6517 RExC_seen_zerolen++;
6518 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6519 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6520 *flagp |= SIMPLE;
ee9b8eae 6521 goto finish_meta_pat;
a0d0e21e 6522 case 'B':
830247a4
IZ
6523 RExC_seen_zerolen++;
6524 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6525 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6526 *flagp |= SIMPLE;
ee9b8eae 6527 goto finish_meta_pat;
a0d0e21e 6528 case 's':
eb160463 6529 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6530 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6531 goto finish_meta_pat;
a0d0e21e 6532 case 'S':
eb160463 6533 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6534 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6535 goto finish_meta_pat;
a0d0e21e 6536 case 'd':
ffc61ed2 6537 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6538 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6539 goto finish_meta_pat;
a0d0e21e 6540 case 'D':
ffc61ed2 6541 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6542 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
6543 goto finish_meta_pat;
6544 case 'v':
6545 ret = reganode(pRExC_state, PRUNE, 0);
6546 ret->flags = 1;
6547 *flagp |= SIMPLE;
6548 goto finish_meta_pat;
6549 case 'V':
6550 ret = reganode(pRExC_state, SKIP, 0);
6551 ret->flags = 1;
6552 *flagp |= SIMPLE;
6553 finish_meta_pat:
830247a4 6554 nextchar(pRExC_state);
fac92740 6555 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 6556 break;
a14b48bc
LW
6557 case 'p':
6558 case 'P':
3568d838 6559 {
3dab1dad 6560 char* const oldregxend = RExC_end;
d008bc60 6561#ifdef DEBUGGING
ccb2c380 6562 char* parse_start = RExC_parse - 2;
d008bc60 6563#endif
a14b48bc 6564
830247a4 6565 if (RExC_parse[1] == '{') {
3568d838 6566 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6567 RExC_end = strchr(RExC_parse, '}');
6568 if (!RExC_end) {
3dab1dad 6569 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6570 RExC_parse += 2;
6571 RExC_end = oldregxend;
0da60cf5 6572 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6573 }
830247a4 6574 RExC_end++;
a14b48bc 6575 }
af6f566e 6576 else {
830247a4 6577 RExC_end = RExC_parse + 2;
af6f566e
HS
6578 if (RExC_end > oldregxend)
6579 RExC_end = oldregxend;
6580 }
830247a4 6581 RExC_parse--;
a14b48bc 6582
3dab1dad 6583 ret = regclass(pRExC_state,depth+1);
a14b48bc 6584
830247a4
IZ
6585 RExC_end = oldregxend;
6586 RExC_parse--;
ccb2c380
MP
6587
6588 Set_Node_Offset(ret, parse_start + 2);
6589 Set_Node_Cur_Length(ret);
830247a4 6590 nextchar(pRExC_state);
a14b48bc
LW
6591 *flagp |= HASWIDTH|SIMPLE;
6592 }
6593 break;
fc8cd66c
YO
6594 case 'N':
6595 /* Handle \N{NAME} here and not below because it can be
6596 multicharacter. join_exact() will join them up later on.
6597 Also this makes sure that things like /\N{BLAH}+/ and
6598 \N{BLAH} being multi char Just Happen. dmq*/
6599 ++RExC_parse;
6600 ret= reg_namedseq(pRExC_state, NULL);
6601 break;
0a4db386 6602 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 6603 parse_named_seq:
81714fb9
YO
6604 {
6605 char ch= RExC_parse[1];
1f1031fe
YO
6606 if (ch != '<' && ch != '\'' && ch != '{') {
6607 RExC_parse++;
6608 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 6609 } else {
1f1031fe
YO
6610 /* this pretty much dupes the code for (?P=...) in reg(), if
6611 you change this make sure you change that */
81714fb9 6612 char* name_start = (RExC_parse += 2);
2eccd3b2 6613 U32 num = 0;
0a4db386
YO
6614 SV *sv_dat = reg_scan_name(pRExC_state,
6615 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 6616 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 6617 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
6618 vFAIL2("Sequence %.3s... not terminated",parse_start);
6619
6620 if (!SIZE_ONLY) {
6621 num = add_data( pRExC_state, 1, "S" );
6622 RExC_rxi->data->data[num]=(void*)sv_dat;
6623 SvREFCNT_inc(sv_dat);
6624 }
6625
81714fb9
YO
6626 RExC_sawback = 1;
6627 ret = reganode(pRExC_state,
6628 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6629 num);
6630 *flagp |= HASWIDTH;
1f1031fe 6631
81714fb9
YO
6632 /* override incorrect value set in reganode MJD */
6633 Set_Node_Offset(ret, parse_start+1);
6634 Set_Node_Cur_Length(ret); /* MJD */
6635 nextchar(pRExC_state);
1f1031fe 6636
81714fb9
YO
6637 }
6638 break;
1f1031fe 6639 }
2bf803e2 6640 case 'g':
a0d0e21e
LW
6641 case '1': case '2': case '3': case '4':
6642 case '5': case '6': case '7': case '8': case '9':
6643 {
c74340f9 6644 I32 num;
2bf803e2
YO
6645 bool isg = *RExC_parse == 'g';
6646 bool isrel = 0;
6647 bool hasbrace = 0;
6648 if (isg) {
c74340f9 6649 RExC_parse++;
2bf803e2
YO
6650 if (*RExC_parse == '{') {
6651 RExC_parse++;
6652 hasbrace = 1;
6653 }
6654 if (*RExC_parse == '-') {
6655 RExC_parse++;
6656 isrel = 1;
6657 }
1f1031fe
YO
6658 if (hasbrace && !isDIGIT(*RExC_parse)) {
6659 if (isrel) RExC_parse--;
6660 RExC_parse -= 2;
6661 goto parse_named_seq;
6662 } }
c74340f9
YO
6663 num = atoi(RExC_parse);
6664 if (isrel) {
5624f11d 6665 num = RExC_npar - num;
c74340f9
YO
6666 if (num < 1)
6667 vFAIL("Reference to nonexistent or unclosed group");
6668 }
2bf803e2 6669 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6670 goto defchar;
6671 else {
3dab1dad 6672 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6673 while (isDIGIT(*RExC_parse))
6674 RExC_parse++;
1f1031fe
YO
6675 if (parse_start == RExC_parse - 1)
6676 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
6677 if (hasbrace) {
6678 if (*RExC_parse != '}')
6679 vFAIL("Unterminated \\g{...} pattern");
6680 RExC_parse++;
6681 }
c74340f9
YO
6682 if (!SIZE_ONLY) {
6683 if (num > (I32)RExC_rx->nparens)
6684 vFAIL("Reference to nonexistent group");
c74340f9 6685 }
830247a4 6686 RExC_sawback = 1;
eb160463
GS
6687 ret = reganode(pRExC_state,
6688 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6689 num);
a0d0e21e 6690 *flagp |= HASWIDTH;
2af232bd 6691
fac92740 6692 /* override incorrect value set in reganode MJD */
2af232bd 6693 Set_Node_Offset(ret, parse_start+1);
fac92740 6694 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6695 RExC_parse--;
6696 nextchar(pRExC_state);
a0d0e21e
LW
6697 }
6698 }
6699 break;
6700 case '\0':
830247a4 6701 if (RExC_parse >= RExC_end)
b45f050a 6702 FAIL("Trailing \\");
a0d0e21e
LW
6703 /* FALL THROUGH */
6704 default:
a0288114 6705 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6706 back into the quick-grab loop below */
45948336 6707 parse_start--;
a0d0e21e
LW
6708 goto defchar;
6709 }
6710 break;
4633a7c4
LW
6711
6712 case '#':
bbe252da 6713 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 6714 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
6715 goto tryagain;
6716 }
6717 /* FALL THROUGH */
6718
a0d0e21e 6719 default: {
ba210ebe 6720 register STRLEN len;
58ae7d3f 6721 register UV ender;
a0d0e21e 6722 register char *p;
3dab1dad 6723 char *s;
80aecb99 6724 STRLEN foldlen;
89ebb4a3 6725 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6726
6727 parse_start = RExC_parse - 1;
a0d0e21e 6728
830247a4 6729 RExC_parse++;
a0d0e21e
LW
6730
6731 defchar:
58ae7d3f 6732 ender = 0;
eb160463
GS
6733 ret = reg_node(pRExC_state,
6734 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6735 s = STRING(ret);
830247a4
IZ
6736 for (len = 0, p = RExC_parse - 1;
6737 len < 127 && p < RExC_end;
a0d0e21e
LW
6738 len++)
6739 {
3dab1dad 6740 char * const oldp = p;
5b5a24f7 6741
bbe252da 6742 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 6743 p = regwhite( pRExC_state, p );
a0d0e21e
LW
6744 switch (*p) {
6745 case '^':
6746 case '$':
6747 case '.':
6748 case '[':
6749 case '(':
6750 case ')':
6751 case '|':
6752 goto loopdone;
6753 case '\\':
ee9b8eae
YO
6754 /* Literal Escapes Switch
6755
6756 This switch is meant to handle escape sequences that
6757 resolve to a literal character.
6758
6759 Every escape sequence that represents something
6760 else, like an assertion or a char class, is handled
6761 in the switch marked 'Special Escapes' above in this
6762 routine, but also has an entry here as anything that
6763 isn't explicitly mentioned here will be treated as
6764 an unescaped equivalent literal.
6765 */
6766
a0d0e21e 6767 switch (*++p) {
ee9b8eae
YO
6768 /* These are all the special escapes. */
6769 case 'A': /* Start assertion */
6770 case 'b': case 'B': /* Word-boundary assertion*/
6771 case 'C': /* Single char !DANGEROUS! */
6772 case 'd': case 'D': /* digit class */
6773 case 'g': case 'G': /* generic-backref, pos assertion */
6774 case 'k': case 'K': /* named backref, keep marker */
6775 case 'N': /* named char sequence */
6776 case 'p': case 'P': /* unicode property */
6777 case 's': case 'S': /* space class */
6778 case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
6779 case 'w': case 'W': /* word class */
6780 case 'X': /* eXtended Unicode "combining character sequence" */
6781 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
6782 --p;
6783 goto loopdone;
ee9b8eae
YO
6784
6785 /* Anything after here is an escape that resolves to a
6786 literal. (Except digits, which may or may not)
6787 */
a0d0e21e
LW
6788 case 'n':
6789 ender = '\n';
6790 p++;
a687059c 6791 break;
a0d0e21e
LW
6792 case 'r':
6793 ender = '\r';
6794 p++;
a687059c 6795 break;
a0d0e21e
LW
6796 case 't':
6797 ender = '\t';
6798 p++;
a687059c 6799 break;
a0d0e21e
LW
6800 case 'f':
6801 ender = '\f';
6802 p++;
a687059c 6803 break;
a0d0e21e 6804 case 'e':
c7f1f016 6805 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6806 p++;
a687059c 6807 break;
a0d0e21e 6808 case 'a':
c7f1f016 6809 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6810 p++;
a687059c 6811 break;
a0d0e21e 6812 case 'x':
a0ed51b3 6813 if (*++p == '{') {
1df70142 6814 char* const e = strchr(p, '}');
b81d288d 6815
b45f050a 6816 if (!e) {
830247a4 6817 RExC_parse = p + 1;
b45f050a
JF
6818 vFAIL("Missing right brace on \\x{}");
6819 }
de5f0749 6820 else {
a4c04bdc
NC
6821 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6822 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6823 STRLEN numlen = e - p - 1;
53305cf1 6824 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6825 if (ender > 0xff)
6826 RExC_utf8 = 1;
a0ed51b3
LW
6827 p = e + 1;
6828 }
a0ed51b3
LW
6829 }
6830 else {
a4c04bdc 6831 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6832 STRLEN numlen = 2;
53305cf1 6833 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6834 p += numlen;
6835 }
9e08bc66
TS
6836 if (PL_encoding && ender < 0x100)
6837 goto recode_encoding;
a687059c 6838 break;
a0d0e21e
LW
6839 case 'c':
6840 p++;
bbce6d69 6841 ender = UCHARAT(p++);
6842 ender = toCTRL(ender);
a687059c 6843 break;
a0d0e21e
LW
6844 case '0': case '1': case '2': case '3':case '4':
6845 case '5': case '6': case '7': case '8':case '9':
6846 if (*p == '0' ||
830247a4 6847 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6848 I32 flags = 0;
1df70142 6849 STRLEN numlen = 3;
53305cf1 6850 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6851 p += numlen;
6852 }
6853 else {
6854 --p;
6855 goto loopdone;
a687059c 6856 }
9e08bc66
TS
6857 if (PL_encoding && ender < 0x100)
6858 goto recode_encoding;
6859 break;
6860 recode_encoding:
6861 {
6862 SV* enc = PL_encoding;
6863 ender = reg_recode((const char)(U8)ender, &enc);
6864 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6865 vWARN(p, "Invalid escape in the specified encoding");
6866 RExC_utf8 = 1;
6867 }
a687059c 6868 break;
a0d0e21e 6869 case '\0':
830247a4 6870 if (p >= RExC_end)
b45f050a 6871 FAIL("Trailing \\");
a687059c 6872 /* FALL THROUGH */
a0d0e21e 6873 default:
041457d9 6874 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6875 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6876 goto normal_default;
a0d0e21e
LW
6877 }
6878 break;
a687059c 6879 default:
a0ed51b3 6880 normal_default:
fd400ab9 6881 if (UTF8_IS_START(*p) && UTF) {
1df70142 6882 STRLEN numlen;
5e12f4fb 6883 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6884 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6885 p += numlen;
6886 }
6887 else
6888 ender = *p++;
a0d0e21e 6889 break;
a687059c 6890 }
bcdf7404
YO
6891 if ( RExC_flags & RXf_PMf_EXTENDED)
6892 p = regwhite( pRExC_state, p );
60a8b682
JH
6893 if (UTF && FOLD) {
6894 /* Prime the casefolded buffer. */
ac7e0132 6895 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6896 }
bcdf7404 6897 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
6898 if (len)
6899 p = oldp;
16ea2a2e 6900 else if (UTF) {
80aecb99 6901 if (FOLD) {
60a8b682 6902 /* Emit all the Unicode characters. */
1df70142 6903 STRLEN numlen;
80aecb99
JH
6904 for (foldbuf = tmpbuf;
6905 foldlen;
6906 foldlen -= numlen) {
6907 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6908 if (numlen > 0) {
71207a34 6909 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6910 s += unilen;
6911 len += unilen;
6912 /* In EBCDIC the numlen
6913 * and unilen can differ. */
9dc45d57 6914 foldbuf += numlen;
47654450
JH
6915 if (numlen >= foldlen)
6916 break;
9dc45d57
JH
6917 }
6918 else
6919 break; /* "Can't happen." */
80aecb99
JH
6920 }
6921 }
6922 else {
71207a34 6923 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6924 if (unilen > 0) {
0ebc6274
JH
6925 s += unilen;
6926 len += unilen;
9dc45d57 6927 }
80aecb99 6928 }
a0ed51b3 6929 }
a0d0e21e
LW
6930 else {
6931 len++;
eb160463 6932 REGC((char)ender, s++);
a0d0e21e
LW
6933 }
6934 break;
a687059c 6935 }
16ea2a2e 6936 if (UTF) {
80aecb99 6937 if (FOLD) {
60a8b682 6938 /* Emit all the Unicode characters. */
1df70142 6939 STRLEN numlen;
80aecb99
JH
6940 for (foldbuf = tmpbuf;
6941 foldlen;
6942 foldlen -= numlen) {
6943 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6944 if (numlen > 0) {
71207a34 6945 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6946 len += unilen;
6947 s += unilen;
6948 /* In EBCDIC the numlen
6949 * and unilen can differ. */
9dc45d57 6950 foldbuf += numlen;
47654450
JH
6951 if (numlen >= foldlen)
6952 break;
9dc45d57
JH
6953 }
6954 else
6955 break;
80aecb99
JH
6956 }
6957 }
6958 else {
71207a34 6959 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6960 if (unilen > 0) {
0ebc6274
JH
6961 s += unilen;
6962 len += unilen;
9dc45d57 6963 }
80aecb99
JH
6964 }
6965 len--;
a0ed51b3
LW
6966 }
6967 else
eb160463 6968 REGC((char)ender, s++);
a0d0e21e
LW
6969 }
6970 loopdone:
830247a4 6971 RExC_parse = p - 1;
fac92740 6972 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6973 nextchar(pRExC_state);
793db0cb
JH
6974 {
6975 /* len is STRLEN which is unsigned, need to copy to signed */
6976 IV iv = len;
6977 if (iv < 0)
6978 vFAIL("Internal disaster");
6979 }
a0d0e21e
LW
6980 if (len > 0)
6981 *flagp |= HASWIDTH;
090f7165 6982 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6983 *flagp |= SIMPLE;
3dab1dad 6984
cd439c50 6985 if (SIZE_ONLY)
830247a4 6986 RExC_size += STR_SZ(len);
3dab1dad
YO
6987 else {
6988 STR_LEN(ret) = len;
830247a4 6989 RExC_emit += STR_SZ(len);
07be1b83 6990 }
3dab1dad 6991 }
a0d0e21e
LW
6992 break;
6993 }
a687059c 6994
a0d0e21e 6995 return(ret);
a687059c
LW
6996}
6997
873ef191 6998STATIC char *
bcdf7404 6999S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7000{
bcdf7404 7001 const char *e = RExC_end;
5b5a24f7
CS
7002 while (p < e) {
7003 if (isSPACE(*p))
7004 ++p;
7005 else if (*p == '#') {
bcdf7404 7006 bool ended = 0;
5b5a24f7 7007 do {
bcdf7404
YO
7008 if (*p++ == '\n') {
7009 ended = 1;
7010 break;
7011 }
7012 } while (p < e);
7013 if (!ended)
7014 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7015 }
7016 else
7017 break;
7018 }
7019 return p;
7020}
7021
b8c5462f
JH
7022/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7023 Character classes ([:foo:]) can also be negated ([:^foo:]).
7024 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7025 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7026 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7027
7028#define POSIXCC_DONE(c) ((c) == ':')
7029#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7030#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7031
b8c5462f 7032STATIC I32
830247a4 7033S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7034{
97aff369 7035 dVAR;
936ed897 7036 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7037
830247a4 7038 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7039 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7040 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7041 const char c = UCHARAT(RExC_parse);
097eb12c 7042 char* const s = RExC_parse++;
b81d288d 7043
9a86a77b 7044 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7045 RExC_parse++;
7046 if (RExC_parse == RExC_end)
620e46c5 7047 /* Grandfather lone [:, [=, [. */
830247a4 7048 RExC_parse = s;
620e46c5 7049 else {
3dab1dad 7050 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7051 assert(*t == c);
7052
9a86a77b 7053 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7054 const char *posixcc = s + 1;
830247a4 7055 RExC_parse++; /* skip over the ending ] */
3dab1dad 7056
b8c5462f 7057 if (*s == ':') {
1df70142
AL
7058 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7059 const I32 skip = t - posixcc;
80916619
NC
7060
7061 /* Initially switch on the length of the name. */
7062 switch (skip) {
7063 case 4:
3dab1dad
YO
7064 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7065 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7066 break;
80916619
NC
7067 case 5:
7068 /* Names all of length 5. */
7069 /* alnum alpha ascii blank cntrl digit graph lower
7070 print punct space upper */
7071 /* Offset 4 gives the best switch position. */
7072 switch (posixcc[4]) {
7073 case 'a':
3dab1dad
YO
7074 if (memEQ(posixcc, "alph", 4)) /* alpha */
7075 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7076 break;
7077 case 'e':
3dab1dad
YO
7078 if (memEQ(posixcc, "spac", 4)) /* space */
7079 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7080 break;
7081 case 'h':
3dab1dad
YO
7082 if (memEQ(posixcc, "grap", 4)) /* graph */
7083 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7084 break;
7085 case 'i':
3dab1dad
YO
7086 if (memEQ(posixcc, "asci", 4)) /* ascii */
7087 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7088 break;
7089 case 'k':
3dab1dad
YO
7090 if (memEQ(posixcc, "blan", 4)) /* blank */
7091 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7092 break;
7093 case 'l':
3dab1dad
YO
7094 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7095 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7096 break;
7097 case 'm':
3dab1dad
YO
7098 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7099 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7100 break;
7101 case 'r':
3dab1dad
YO
7102 if (memEQ(posixcc, "lowe", 4)) /* lower */
7103 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7104 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7105 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7106 break;
7107 case 't':
3dab1dad
YO
7108 if (memEQ(posixcc, "digi", 4)) /* digit */
7109 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7110 else if (memEQ(posixcc, "prin", 4)) /* print */
7111 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7112 else if (memEQ(posixcc, "punc", 4)) /* punct */
7113 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7114 break;
b8c5462f
JH
7115 }
7116 break;
80916619 7117 case 6:
3dab1dad
YO
7118 if (memEQ(posixcc, "xdigit", 6))
7119 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7120 break;
7121 }
80916619
NC
7122
7123 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7124 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7125 t - s - 1, s + 1);
80916619
NC
7126 assert (posixcc[skip] == ':');
7127 assert (posixcc[skip+1] == ']');
b45f050a 7128 } else if (!SIZE_ONLY) {
b8c5462f 7129 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7130
830247a4 7131 /* adjust RExC_parse so the warning shows after
b45f050a 7132 the class closes */
9a86a77b 7133 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7134 RExC_parse++;
b45f050a
JF
7135 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7136 }
b8c5462f
JH
7137 } else {
7138 /* Maternal grandfather:
7139 * "[:" ending in ":" but not in ":]" */
830247a4 7140 RExC_parse = s;
767d463e 7141 }
620e46c5
JH
7142 }
7143 }
7144
b8c5462f
JH
7145 return namedclass;
7146}
7147
7148STATIC void
830247a4 7149S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7150{
97aff369 7151 dVAR;
3dab1dad 7152 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7153 const char *s = RExC_parse;
7154 const char c = *s++;
b8c5462f 7155
3dab1dad 7156 while (isALNUM(*s))
b8c5462f
JH
7157 s++;
7158 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7159 if (ckWARN(WARN_REGEXP))
7160 vWARN3(s+2,
7161 "POSIX syntax [%c %c] belongs inside character classes",
7162 c, c);
b45f050a
JF
7163
7164 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7165 if (POSIXCC_NOTYET(c)) {
830247a4 7166 /* adjust RExC_parse so the error shows after
b45f050a 7167 the class closes */
9a86a77b 7168 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7169 NOOP;
b45f050a
JF
7170 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7171 }
b8c5462f
JH
7172 }
7173 }
620e46c5
JH
7174}
7175
7f6f358c 7176
89836f1f
YO
7177#define _C_C_T_(NAME,TEST,WORD) \
7178ANYOF_##NAME: \
7179 if (LOC) \
7180 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7181 else { \
7182 for (value = 0; value < 256; value++) \
7183 if (TEST) \
7184 ANYOF_BITMAP_SET(ret, value); \
7185 } \
7186 yesno = '+'; \
7187 what = WORD; \
7188 break; \
7189case ANYOF_N##NAME: \
7190 if (LOC) \
7191 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7192 else { \
7193 for (value = 0; value < 256; value++) \
7194 if (!TEST) \
7195 ANYOF_BITMAP_SET(ret, value); \
7196 } \
7197 yesno = '!'; \
7198 what = WORD; \
7199 break
7200
7201
7f6f358c
YO
7202/*
7203 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7204 matches the pattern or if the pattern matches a single char only and
7205 that char is < 256 and we are case insensitive then we produce an
7206 EXACT node instead.
7f6f358c 7207*/
89836f1f 7208
76e3520e 7209STATIC regnode *
3dab1dad 7210S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7211{
97aff369 7212 dVAR;
9ef43ace 7213 register UV value = 0;
9a86a77b 7214 register UV nextvalue;
3568d838 7215 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7216 register IV range = 0;
c277df42 7217 register regnode *ret;
ba210ebe 7218 STRLEN numlen;
ffc61ed2 7219 IV namedclass;
cbbf8932 7220 char *rangebegin = NULL;
936ed897 7221 bool need_class = 0;
c445ea15 7222 SV *listsv = NULL;
ffc61ed2 7223 UV n;
9e55ce06 7224 bool optimize_invert = TRUE;
cbbf8932 7225 AV* unicode_alternate = NULL;
1b2d223b
JH
7226#ifdef EBCDIC
7227 UV literal_endpoint = 0;
7228#endif
7f6f358c 7229 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7230
3dab1dad 7231 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7232 case we need to change the emitted regop to an EXACT. */
07be1b83 7233 const char * orig_parse = RExC_parse;
72f13be8 7234 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
7235#ifndef DEBUGGING
7236 PERL_UNUSED_ARG(depth);
7237#endif
72f13be8 7238
3dab1dad 7239 DEBUG_PARSE("clas");
7f6f358c
YO
7240
7241 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7242 ret = reganode(pRExC_state, ANYOF, 0);
7243
7244 if (!SIZE_ONLY)
7245 ANYOF_FLAGS(ret) = 0;
7246
9a86a77b 7247 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7248 RExC_naughty++;
7249 RExC_parse++;
7250 if (!SIZE_ONLY)
7251 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7252 }
a0d0e21e 7253
73060fc4 7254 if (SIZE_ONLY) {
830247a4 7255 RExC_size += ANYOF_SKIP;
73060fc4
JH
7256 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7257 }
936ed897 7258 else {
830247a4 7259 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7260 if (FOLD)
7261 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7262 if (LOC)
7263 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7264 ANYOF_BITMAP_ZERO(ret);
396482e1 7265 listsv = newSVpvs("# comment\n");
a0d0e21e 7266 }
b8c5462f 7267
9a86a77b
JH
7268 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7269
b938889d 7270 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7271 checkposixcc(pRExC_state);
b8c5462f 7272
f064b6ad
HS
7273 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7274 if (UCHARAT(RExC_parse) == ']')
7275 goto charclassloop;
ffc61ed2 7276
fc8cd66c 7277parseit:
9a86a77b 7278 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7279
7280 charclassloop:
7281
7282 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7283
73b437c8 7284 if (!range)
830247a4 7285 rangebegin = RExC_parse;
ffc61ed2 7286 if (UTF) {
5e12f4fb 7287 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7288 RExC_end - RExC_parse,
9f7f3913 7289 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7290 RExC_parse += numlen;
7291 }
7292 else
7293 value = UCHARAT(RExC_parse++);
7f6f358c 7294
9a86a77b
JH
7295 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7296 if (value == '[' && POSIXCC(nextvalue))
830247a4 7297 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7298 else if (value == '\\') {
ffc61ed2 7299 if (UTF) {
5e12f4fb 7300 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7301 RExC_end - RExC_parse,
9f7f3913 7302 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7303 RExC_parse += numlen;
7304 }
7305 else
7306 value = UCHARAT(RExC_parse++);
470c3474 7307 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7308 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7309 * be a problem later if we want switch on Unicode.
7310 * A similar issue a little bit later when switching on
7311 * namedclass. --jhi */
ffc61ed2 7312 switch ((I32)value) {
b8c5462f
JH
7313 case 'w': namedclass = ANYOF_ALNUM; break;
7314 case 'W': namedclass = ANYOF_NALNUM; break;
7315 case 's': namedclass = ANYOF_SPACE; break;
7316 case 'S': namedclass = ANYOF_NSPACE; break;
7317 case 'd': namedclass = ANYOF_DIGIT; break;
7318 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
7319 case 'N': /* Handle \N{NAME} in class */
7320 {
7321 /* We only pay attention to the first char of
7322 multichar strings being returned. I kinda wonder
7323 if this makes sense as it does change the behaviour
7324 from earlier versions, OTOH that behaviour was broken
7325 as well. */
7326 UV v; /* value is register so we cant & it /grrr */
7327 if (reg_namedseq(pRExC_state, &v)) {
7328 goto parseit;
7329 }
7330 value= v;
7331 }
7332 break;
ffc61ed2
JH
7333 case 'p':
7334 case 'P':
3dab1dad
YO
7335 {
7336 char *e;
af6f566e 7337 if (RExC_parse >= RExC_end)
2a4859cd 7338 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7339 if (*RExC_parse == '{') {
1df70142 7340 const U8 c = (U8)value;
ffc61ed2
JH
7341 e = strchr(RExC_parse++, '}');
7342 if (!e)
0da60cf5 7343 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7344 while (isSPACE(UCHARAT(RExC_parse)))
7345 RExC_parse++;
7346 if (e == RExC_parse)
0da60cf5 7347 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7348 n = e - RExC_parse;
ab13f0c7
JH
7349 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7350 n--;
ffc61ed2
JH
7351 }
7352 else {
7353 e = RExC_parse;
7354 n = 1;
7355 }
7356 if (!SIZE_ONLY) {
ab13f0c7
JH
7357 if (UCHARAT(RExC_parse) == '^') {
7358 RExC_parse++;
7359 n--;
7360 value = value == 'p' ? 'P' : 'p'; /* toggle */
7361 while (isSPACE(UCHARAT(RExC_parse))) {
7362 RExC_parse++;
7363 n--;
7364 }
7365 }
097eb12c
AL
7366 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7367 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7368 }
7369 RExC_parse = e + 1;
7370 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7371 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7372 }
f81125e2 7373 break;
b8c5462f
JH
7374 case 'n': value = '\n'; break;
7375 case 'r': value = '\r'; break;
7376 case 't': value = '\t'; break;
7377 case 'f': value = '\f'; break;
7378 case 'b': value = '\b'; break;
c7f1f016
NIS
7379 case 'e': value = ASCII_TO_NATIVE('\033');break;
7380 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7381 case 'x':
ffc61ed2 7382 if (*RExC_parse == '{') {
a4c04bdc
NC
7383 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7384 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7385 char * const e = strchr(RExC_parse++, '}');
b81d288d 7386 if (!e)
ffc61ed2 7387 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7388
7389 numlen = e - RExC_parse;
7390 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7391 RExC_parse = e + 1;
7392 }
7393 else {
a4c04bdc 7394 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7395 numlen = 2;
7396 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7397 RExC_parse += numlen;
7398 }
9e08bc66
TS
7399 if (PL_encoding && value < 0x100)
7400 goto recode_encoding;
b8c5462f
JH
7401 break;
7402 case 'c':
830247a4 7403 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7404 value = toCTRL(value);
7405 break;
7406 case '0': case '1': case '2': case '3': case '4':
7407 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7408 {
7409 I32 flags = 0;
7410 numlen = 3;
7411 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7412 RExC_parse += numlen;
7413 if (PL_encoding && value < 0x100)
7414 goto recode_encoding;
7415 break;
7416 }
7417 recode_encoding:
7418 {
7419 SV* enc = PL_encoding;
7420 value = reg_recode((const char)(U8)value, &enc);
7421 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7422 vWARN(RExC_parse,
7423 "Invalid escape in the specified encoding");
7424 break;
7425 }
1028017a 7426 default:
041457d9 7427 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7428 vWARN2(RExC_parse,
7429 "Unrecognized escape \\%c in character class passed through",
7430 (int)value);
1028017a 7431 break;
b8c5462f 7432 }
ffc61ed2 7433 } /* end of \blah */
1b2d223b
JH
7434#ifdef EBCDIC
7435 else
7436 literal_endpoint++;
7437#endif
ffc61ed2
JH
7438
7439 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7440
7441 if (!SIZE_ONLY && !need_class)
936ed897 7442 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7443
936ed897 7444 need_class = 1;
ffc61ed2
JH
7445
7446 /* a bad range like a-\d, a-[:digit:] ? */
7447 if (range) {
73b437c8 7448 if (!SIZE_ONLY) {
afd78fd5 7449 if (ckWARN(WARN_REGEXP)) {
097eb12c 7450 const int w =
afd78fd5
JH
7451 RExC_parse >= rangebegin ?
7452 RExC_parse - rangebegin : 0;
830247a4 7453 vWARN4(RExC_parse,
b45f050a 7454 "False [] range \"%*.*s\"",
097eb12c 7455 w, w, rangebegin);
afd78fd5 7456 }
3568d838
JH
7457 if (prevvalue < 256) {
7458 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7459 ANYOF_BITMAP_SET(ret, '-');
7460 }
7461 else {
7462 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7463 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7464 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7465 }
b8c5462f 7466 }
ffc61ed2
JH
7467
7468 range = 0; /* this was not a true range */
73b437c8 7469 }
ffc61ed2 7470
89836f1f
YO
7471
7472
73b437c8 7473 if (!SIZE_ONLY) {
c49a72a9
NC
7474 const char *what = NULL;
7475 char yesno = 0;
7476
3568d838
JH
7477 if (namedclass > OOB_NAMEDCLASS)
7478 optimize_invert = FALSE;
e2962f66
JH
7479 /* Possible truncation here but in some 64-bit environments
7480 * the compiler gets heartburn about switch on 64-bit values.
7481 * A similar issue a little earlier when switching on value.
98f323fa 7482 * --jhi */
e2962f66 7483 switch ((I32)namedclass) {
89836f1f
YO
7484 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7485 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7486 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7487 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7488 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7489 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7490 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7491 case _C_C_T_(PRINT, isPRINT(value), "Print");
7492 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7493 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7494 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7495 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7496 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
73b437c8
JH
7497 case ANYOF_ASCII:
7498 if (LOC)
936ed897 7499 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7500 else {
c7f1f016 7501#ifndef EBCDIC
1ba5c669
JH
7502 for (value = 0; value < 128; value++)
7503 ANYOF_BITMAP_SET(ret, value);
7504#else /* EBCDIC */
ffbc6a93 7505 for (value = 0; value < 256; value++) {
3a3c4447
JH
7506 if (isASCII(value))
7507 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7508 }
1ba5c669 7509#endif /* EBCDIC */
73b437c8 7510 }
c49a72a9
NC
7511 yesno = '+';
7512 what = "ASCII";
73b437c8
JH
7513 break;
7514 case ANYOF_NASCII:
7515 if (LOC)
936ed897 7516 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7517 else {
c7f1f016 7518#ifndef EBCDIC
1ba5c669
JH
7519 for (value = 128; value < 256; value++)
7520 ANYOF_BITMAP_SET(ret, value);
7521#else /* EBCDIC */
ffbc6a93 7522 for (value = 0; value < 256; value++) {
3a3c4447
JH
7523 if (!isASCII(value))
7524 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7525 }
1ba5c669 7526#endif /* EBCDIC */
73b437c8 7527 }
c49a72a9
NC
7528 yesno = '!';
7529 what = "ASCII";
89836f1f 7530 break;
ffc61ed2
JH
7531 case ANYOF_DIGIT:
7532 if (LOC)
7533 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7534 else {
7535 /* consecutive digits assumed */
7536 for (value = '0'; value <= '9'; value++)
7537 ANYOF_BITMAP_SET(ret, value);
7538 }
c49a72a9
NC
7539 yesno = '+';
7540 what = "Digit";
ffc61ed2
JH
7541 break;
7542 case ANYOF_NDIGIT:
7543 if (LOC)
7544 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7545 else {
7546 /* consecutive digits assumed */
7547 for (value = 0; value < '0'; value++)
7548 ANYOF_BITMAP_SET(ret, value);
7549 for (value = '9' + 1; value < 256; value++)
7550 ANYOF_BITMAP_SET(ret, value);
7551 }
c49a72a9
NC
7552 yesno = '!';
7553 what = "Digit";
89836f1f 7554 break;
f81125e2
JP
7555 case ANYOF_MAX:
7556 /* this is to handle \p and \P */
7557 break;
73b437c8 7558 default:
b45f050a 7559 vFAIL("Invalid [::] class");
73b437c8 7560 break;
b8c5462f 7561 }
c49a72a9
NC
7562 if (what) {
7563 /* Strings such as "+utf8::isWord\n" */
7564 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7565 }
b8c5462f 7566 if (LOC)
936ed897 7567 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7568 continue;
a0d0e21e 7569 }
ffc61ed2
JH
7570 } /* end of namedclass \blah */
7571
a0d0e21e 7572 if (range) {
eb160463 7573 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7574 const int w = RExC_parse - rangebegin;
7575 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7576 range = 0; /* not a valid range */
73b437c8 7577 }
a0d0e21e
LW
7578 }
7579 else {
3568d838 7580 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7581 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7582 RExC_parse[1] != ']') {
7583 RExC_parse++;
ffc61ed2
JH
7584
7585 /* a bad range like \w-, [:word:]- ? */
7586 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7587 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7588 const int w =
afd78fd5
JH
7589 RExC_parse >= rangebegin ?
7590 RExC_parse - rangebegin : 0;
830247a4 7591 vWARN4(RExC_parse,
b45f050a 7592 "False [] range \"%*.*s\"",
097eb12c 7593 w, w, rangebegin);
afd78fd5 7594 }
73b437c8 7595 if (!SIZE_ONLY)
936ed897 7596 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7597 } else
ffc61ed2
JH
7598 range = 1; /* yeah, it's a range! */
7599 continue; /* but do it the next time */
a0d0e21e 7600 }
a687059c 7601 }
ffc61ed2 7602
93733859 7603 /* now is the next time */
07be1b83 7604 /*stored += (value - prevvalue + 1);*/
ae5c130c 7605 if (!SIZE_ONLY) {
3568d838 7606 if (prevvalue < 256) {
1df70142 7607 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7608 IV i;
3568d838 7609#ifdef EBCDIC
1b2d223b
JH
7610 /* In EBCDIC [\x89-\x91] should include
7611 * the \x8e but [i-j] should not. */
7612 if (literal_endpoint == 2 &&
7613 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7614 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7615 {
3568d838
JH
7616 if (isLOWER(prevvalue)) {
7617 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7618 if (isLOWER(i))
7619 ANYOF_BITMAP_SET(ret, i);
7620 } else {
3568d838 7621 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7622 if (isUPPER(i))
7623 ANYOF_BITMAP_SET(ret, i);
7624 }
8ada0baa 7625 }
ffc61ed2 7626 else
8ada0baa 7627#endif
07be1b83
YO
7628 for (i = prevvalue; i <= ceilvalue; i++) {
7629 if (!ANYOF_BITMAP_TEST(ret,i)) {
7630 stored++;
7631 ANYOF_BITMAP_SET(ret, i);
7632 }
7633 }
3568d838 7634 }
a5961de5 7635 if (value > 255 || UTF) {
1df70142
AL
7636 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7637 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7638 stored+=2; /* can't optimize this class */
ffc61ed2 7639 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7640 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7641 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7642 prevnatvalue, natvalue);
7643 }
7644 else if (prevnatvalue == natvalue) {
7645 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7646 if (FOLD) {
89ebb4a3 7647 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7648 STRLEN foldlen;
1df70142 7649 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7650
e294cc5d
JH
7651#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7652 if (RExC_precomp[0] == ':' &&
7653 RExC_precomp[1] == '[' &&
7654 (f == 0xDF || f == 0x92)) {
7655 f = NATIVE_TO_UNI(f);
7656 }
7657#endif
c840d2a2
JH
7658 /* If folding and foldable and a single
7659 * character, insert also the folded version
7660 * to the charclass. */
9e55ce06 7661 if (f != value) {
e294cc5d
JH
7662#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7663 if ((RExC_precomp[0] == ':' &&
7664 RExC_precomp[1] == '[' &&
7665 (f == 0xA2 &&
7666 (value == 0xFB05 || value == 0xFB06))) ?
7667 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7668 foldlen == (STRLEN)UNISKIP(f) )
7669#else
eb160463 7670 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7671#endif
9e55ce06
JH
7672 Perl_sv_catpvf(aTHX_ listsv,
7673 "%04"UVxf"\n", f);
7674 else {
7675 /* Any multicharacter foldings
7676 * require the following transform:
7677 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7678 * where E folds into "pq" and F folds
7679 * into "rst", all other characters
7680 * fold to single characters. We save
7681 * away these multicharacter foldings,
7682 * to be later saved as part of the
7683 * additional "s" data. */
7684 SV *sv;
7685
7686 if (!unicode_alternate)
7687 unicode_alternate = newAV();
7688 sv = newSVpvn((char*)foldbuf, foldlen);
7689 SvUTF8_on(sv);
7690 av_push(unicode_alternate, sv);
7691 }
7692 }
254ba52a 7693
60a8b682
JH
7694 /* If folding and the value is one of the Greek
7695 * sigmas insert a few more sigmas to make the
7696 * folding rules of the sigmas to work right.
7697 * Note that not all the possible combinations
7698 * are handled here: some of them are handled
9e55ce06
JH
7699 * by the standard folding rules, and some of
7700 * them (literal or EXACTF cases) are handled
7701 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7702 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7703 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7704 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7705 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7706 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7707 }
7708 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7709 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7710 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7711 }
7712 }
ffc61ed2 7713 }
1b2d223b
JH
7714#ifdef EBCDIC
7715 literal_endpoint = 0;
7716#endif
8ada0baa 7717 }
ffc61ed2
JH
7718
7719 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7720 }
ffc61ed2 7721
936ed897 7722 if (need_class) {
4f66b38d 7723 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7724 if (SIZE_ONLY)
830247a4 7725 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7726 else
830247a4 7727 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7728 }
ffc61ed2 7729
7f6f358c
YO
7730
7731 if (SIZE_ONLY)
7732 return ret;
7733 /****** !SIZE_ONLY AFTER HERE *********/
7734
7735 if( stored == 1 && value < 256
7736 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7737 ) {
7738 /* optimize single char class to an EXACT node
7739 but *only* when its not a UTF/high char */
07be1b83
YO
7740 const char * cur_parse= RExC_parse;
7741 RExC_emit = (regnode *)orig_emit;
7742 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7743 ret = reg_node(pRExC_state,
7744 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7745 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7746 *STRING(ret)= (char)value;
7747 STR_LEN(ret)= 1;
7748 RExC_emit += STR_SZ(1);
7749 return ret;
7750 }
ae5c130c 7751 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7752 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7753 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7754 ) {
a0ed51b3 7755 for (value = 0; value < 256; ++value) {
936ed897 7756 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7757 UV fold = PL_fold[value];
ffc61ed2
JH
7758
7759 if (fold != value)
7760 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7761 }
7762 }
936ed897 7763 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7764 }
ffc61ed2 7765
ae5c130c 7766 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7767 if (optimize_invert &&
ffc61ed2
JH
7768 /* If the only flag is inversion. */
7769 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7770 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7771 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7772 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7773 }
7f6f358c 7774 {
097eb12c 7775 AV * const av = newAV();
ffc61ed2 7776 SV *rv;
9e55ce06 7777 /* The 0th element stores the character class description
6a0407ee 7778 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7779 * to initialize the appropriate swash (which gets stored in
7780 * the 1st element), and also useful for dumping the regnode.
7781 * The 2nd element stores the multicharacter foldings,
6a0407ee 7782 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7783 av_store(av, 0, listsv);
7784 av_store(av, 1, NULL);
9e55ce06 7785 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7786 rv = newRV_noinc((SV*)av);
19860706 7787 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7788 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7789 ARG_SET(ret, n);
a0ed51b3 7790 }
a0ed51b3
LW
7791 return ret;
7792}
89836f1f
YO
7793#undef _C_C_T_
7794
a0ed51b3 7795
bcdf7404
YO
7796/* reg_skipcomment()
7797
7798 Absorbs an /x style # comments from the input stream.
7799 Returns true if there is more text remaining in the stream.
7800 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
7801 terminates the pattern without including a newline.
7802
7803 Note its the callers responsibility to ensure that we are
7804 actually in /x mode
7805
7806*/
7807
7808STATIC bool
7809S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
7810{
7811 bool ended = 0;
7812 while (RExC_parse < RExC_end)
7813 if (*RExC_parse++ == '\n') {
7814 ended = 1;
7815 break;
7816 }
7817 if (!ended) {
7818 /* we ran off the end of the pattern without ending
7819 the comment, so we have to add an \n when wrapping */
7820 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7821 return 0;
7822 } else
7823 return 1;
7824}
7825
7826/* nextchar()
7827
7828 Advance that parse position, and optionall absorbs
7829 "whitespace" from the inputstream.
7830
7831 Without /x "whitespace" means (?#...) style comments only,
7832 with /x this means (?#...) and # comments and whitespace proper.
7833
7834 Returns the RExC_parse point from BEFORE the scan occurs.
7835
7836 This is the /x friendly way of saying RExC_parse++.
7837*/
7838
76e3520e 7839STATIC char*
830247a4 7840S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7841{
097eb12c 7842 char* const retval = RExC_parse++;
a0d0e21e 7843
4633a7c4 7844 for (;;) {
830247a4
IZ
7845 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7846 RExC_parse[2] == '#') {
e994fd66
AE
7847 while (*RExC_parse != ')') {
7848 if (RExC_parse == RExC_end)
7849 FAIL("Sequence (?#... not terminated");
830247a4 7850 RExC_parse++;
e994fd66 7851 }
830247a4 7852 RExC_parse++;
4633a7c4
LW
7853 continue;
7854 }
bbe252da 7855 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7856 if (isSPACE(*RExC_parse)) {
7857 RExC_parse++;
748a9306
LW
7858 continue;
7859 }
830247a4 7860 else if (*RExC_parse == '#') {
bcdf7404
YO
7861 if ( reg_skipcomment( pRExC_state ) )
7862 continue;
748a9306 7863 }
748a9306 7864 }
4633a7c4 7865 return retval;
a0d0e21e 7866 }
a687059c
LW
7867}
7868
7869/*
c277df42 7870- reg_node - emit a node
a0d0e21e 7871*/
76e3520e 7872STATIC regnode * /* Location. */
830247a4 7873S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7874{
97aff369 7875 dVAR;
c277df42 7876 register regnode *ptr;
504618e9 7877 regnode * const ret = RExC_emit;
07be1b83 7878 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7879
c277df42 7880 if (SIZE_ONLY) {
830247a4
IZ
7881 SIZE_ALIGN(RExC_size);
7882 RExC_size += 1;
a0d0e21e
LW
7883 return(ret);
7884 }
e2e6a0f1
YO
7885#ifdef DEBUGGING
7886 if (OP(RExC_emit) == 255)
7887 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7888 reg_name[op], OP(RExC_emit));
7889#endif
c277df42 7890 NODE_ALIGN_FILL(ret);
a0d0e21e 7891 ptr = ret;
c277df42 7892 FILL_ADVANCE_NODE(ptr, op);
7122b237 7893#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7894 if (RExC_offsets) { /* MJD */
07be1b83 7895 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7896 "reg_node", __LINE__,
7897 reg_name[op],
07be1b83
YO
7898 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7899 ? "Overwriting end of array!\n" : "OK",
7900 (UV)(RExC_emit - RExC_emit_start),
7901 (UV)(RExC_parse - RExC_start),
7902 (UV)RExC_offsets[0]));
ccb2c380 7903 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7904 }
7122b237 7905#endif
830247a4 7906 RExC_emit = ptr;
a0d0e21e 7907 return(ret);
a687059c
LW
7908}
7909
7910/*
a0d0e21e
LW
7911- reganode - emit a node with an argument
7912*/
76e3520e 7913STATIC regnode * /* Location. */
830247a4 7914S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7915{
97aff369 7916 dVAR;
c277df42 7917 register regnode *ptr;
504618e9 7918 regnode * const ret = RExC_emit;
07be1b83 7919 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7920
c277df42 7921 if (SIZE_ONLY) {
830247a4
IZ
7922 SIZE_ALIGN(RExC_size);
7923 RExC_size += 2;
6bda09f9
YO
7924 /*
7925 We can't do this:
7926
7927 assert(2==regarglen[op]+1);
7928
7929 Anything larger than this has to allocate the extra amount.
7930 If we changed this to be:
7931
7932 RExC_size += (1 + regarglen[op]);
7933
7934 then it wouldn't matter. Its not clear what side effect
7935 might come from that so its not done so far.
7936 -- dmq
7937 */
a0d0e21e
LW
7938 return(ret);
7939 }
e2e6a0f1
YO
7940#ifdef DEBUGGING
7941 if (OP(RExC_emit) == 255)
7942 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7943#endif
c277df42 7944 NODE_ALIGN_FILL(ret);
a0d0e21e 7945 ptr = ret;
c277df42 7946 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 7947#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7948 if (RExC_offsets) { /* MJD */
07be1b83 7949 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7950 "reganode",
ccb2c380
MP
7951 __LINE__,
7952 reg_name[op],
07be1b83 7953 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7954 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7955 (UV)(RExC_emit - RExC_emit_start),
7956 (UV)(RExC_parse - RExC_start),
7957 (UV)RExC_offsets[0]));
ccb2c380 7958 Set_Cur_Node_Offset;
fac92740 7959 }
7122b237 7960#endif
830247a4 7961 RExC_emit = ptr;
a0d0e21e 7962 return(ret);
fe14fcc3
LW
7963}
7964
7965/*
cd439c50 7966- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7967*/
71207a34
AL
7968STATIC STRLEN
7969S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7970{
97aff369 7971 dVAR;
71207a34 7972 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7973}
7974
7975/*
a0d0e21e
LW
7976- reginsert - insert an operator in front of already-emitted operand
7977*
7978* Means relocating the operand.
7979*/
76e3520e 7980STATIC void
6bda09f9 7981S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7982{
97aff369 7983 dVAR;
c277df42
IZ
7984 register regnode *src;
7985 register regnode *dst;
7986 register regnode *place;
504618e9 7987 const int offset = regarglen[(U8)op];
6bda09f9 7988 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7989 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7990/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7991 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7992 if (SIZE_ONLY) {
6bda09f9 7993 RExC_size += size;
a0d0e21e
LW
7994 return;
7995 }
a687059c 7996
830247a4 7997 src = RExC_emit;
6bda09f9 7998 RExC_emit += size;
830247a4 7999 dst = RExC_emit;
40d049e4 8000 if (RExC_open_parens) {
6bda09f9 8001 int paren;
6d99fb9b 8002 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 8003 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
8004 if ( RExC_open_parens[paren] >= opnd ) {
8005 DEBUG_PARSE_FMT("open"," - %d",size);
8006 RExC_open_parens[paren] += size;
8007 } else {
8008 DEBUG_PARSE_FMT("open"," - %s","ok");
8009 }
8010 if ( RExC_close_parens[paren] >= opnd ) {
8011 DEBUG_PARSE_FMT("close"," - %d",size);
8012 RExC_close_parens[paren] += size;
8013 } else {
8014 DEBUG_PARSE_FMT("close"," - %s","ok");
8015 }
8016 }
6bda09f9 8017 }
40d049e4 8018
fac92740 8019 while (src > opnd) {
c277df42 8020 StructCopy(--src, --dst, regnode);
7122b237 8021#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8022 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8023 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8024 "reg_insert",
ccb2c380
MP
8025 __LINE__,
8026 reg_name[op],
07be1b83
YO
8027 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8028 ? "Overwriting end of array!\n" : "OK",
8029 (UV)(src - RExC_emit_start),
8030 (UV)(dst - RExC_emit_start),
8031 (UV)RExC_offsets[0]));
ccb2c380
MP
8032 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8033 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8034 }
7122b237 8035#endif
fac92740
MJD
8036 }
8037
a0d0e21e
LW
8038
8039 place = opnd; /* Op node, where operand used to be. */
7122b237 8040#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8041 if (RExC_offsets) { /* MJD */
07be1b83 8042 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8043 "reginsert",
ccb2c380
MP
8044 __LINE__,
8045 reg_name[op],
07be1b83 8046 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8047 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8048 (UV)(place - RExC_emit_start),
8049 (UV)(RExC_parse - RExC_start),
786e8c11 8050 (UV)RExC_offsets[0]));
ccb2c380 8051 Set_Node_Offset(place, RExC_parse);
45948336 8052 Set_Node_Length(place, 1);
fac92740 8053 }
7122b237 8054#endif
c277df42
IZ
8055 src = NEXTOPER(place);
8056 FILL_ADVANCE_NODE(place, op);
8057 Zero(src, offset, regnode);
a687059c
LW
8058}
8059
8060/*
c277df42 8061- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8062- SEE ALSO: regtail_study
a0d0e21e 8063*/
097eb12c 8064/* TODO: All three parms should be const */
76e3520e 8065STATIC void
3dab1dad 8066S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8067{
97aff369 8068 dVAR;
c277df42 8069 register regnode *scan;
72f13be8 8070 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
8071#ifndef DEBUGGING
8072 PERL_UNUSED_ARG(depth);
8073#endif
a0d0e21e 8074
c277df42 8075 if (SIZE_ONLY)
a0d0e21e
LW
8076 return;
8077
8078 /* Find last node. */
8079 scan = p;
8080 for (;;) {
504618e9 8081 regnode * const temp = regnext(scan);
3dab1dad
YO
8082 DEBUG_PARSE_r({
8083 SV * const mysv=sv_newmortal();
8084 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8085 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8086 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8087 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8088 (temp == NULL ? "->" : ""),
8089 (temp == NULL ? reg_name[OP(val)] : "")
8090 );
3dab1dad
YO
8091 });
8092 if (temp == NULL)
8093 break;
8094 scan = temp;
8095 }
8096
8097 if (reg_off_by_arg[OP(scan)]) {
8098 ARG_SET(scan, val - scan);
8099 }
8100 else {
8101 NEXT_OFF(scan) = val - scan;
8102 }
8103}
8104
07be1b83 8105#ifdef DEBUGGING
3dab1dad
YO
8106/*
8107- regtail_study - set the next-pointer at the end of a node chain of p to val.
8108- Look for optimizable sequences at the same time.
8109- currently only looks for EXACT chains.
07be1b83
YO
8110
8111This is expermental code. The idea is to use this routine to perform
8112in place optimizations on branches and groups as they are constructed,
8113with the long term intention of removing optimization from study_chunk so
8114that it is purely analytical.
8115
8116Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8117to control which is which.
8118
3dab1dad
YO
8119*/
8120/* TODO: All four parms should be const */
07be1b83 8121
3dab1dad
YO
8122STATIC U8
8123S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8124{
8125 dVAR;
8126 register regnode *scan;
07be1b83
YO
8127 U8 exact = PSEUDO;
8128#ifdef EXPERIMENTAL_INPLACESCAN
8129 I32 min = 0;
8130#endif
8131
3dab1dad
YO
8132 GET_RE_DEBUG_FLAGS_DECL;
8133
07be1b83 8134
3dab1dad
YO
8135 if (SIZE_ONLY)
8136 return exact;
8137
8138 /* Find last node. */
8139
8140 scan = p;
8141 for (;;) {
8142 regnode * const temp = regnext(scan);
07be1b83
YO
8143#ifdef EXPERIMENTAL_INPLACESCAN
8144 if (PL_regkind[OP(scan)] == EXACT)
8145 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8146 return EXACT;
8147#endif
3dab1dad
YO
8148 if ( exact ) {
8149 switch (OP(scan)) {
8150 case EXACT:
8151 case EXACTF:
8152 case EXACTFL:
8153 if( exact == PSEUDO )
8154 exact= OP(scan);
07be1b83
YO
8155 else if ( exact != OP(scan) )
8156 exact= 0;
3dab1dad
YO
8157 case NOTHING:
8158 break;
8159 default:
8160 exact= 0;
8161 }
8162 }
8163 DEBUG_PARSE_r({
8164 SV * const mysv=sv_newmortal();
8165 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8166 regprop(RExC_rx, mysv, scan);
eaf3ca90 8167 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8168 SvPV_nolen_const(mysv),
eaf3ca90
YO
8169 REG_NODE_NUM(scan),
8170 reg_name[exact]);
3dab1dad 8171 });
a0d0e21e
LW
8172 if (temp == NULL)
8173 break;
8174 scan = temp;
8175 }
07be1b83
YO
8176 DEBUG_PARSE_r({
8177 SV * const mysv_val=sv_newmortal();
8178 DEBUG_PARSE_MSG("");
8179 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8180 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8181 SvPV_nolen_const(mysv_val),
8182 (IV)REG_NODE_NUM(val),
8183 (IV)(val - scan)
07be1b83
YO
8184 );
8185 });
c277df42
IZ
8186 if (reg_off_by_arg[OP(scan)]) {
8187 ARG_SET(scan, val - scan);
a0ed51b3
LW
8188 }
8189 else {
c277df42
IZ
8190 NEXT_OFF(scan) = val - scan;
8191 }
3dab1dad
YO
8192
8193 return exact;
a687059c 8194}
07be1b83 8195#endif
a687059c
LW
8196
8197/*
a687059c
LW
8198 - regcurly - a little FSA that accepts {\d+,?\d*}
8199 */
79072805 8200STATIC I32
5f66b61c 8201S_regcurly(register const char *s)
a687059c
LW
8202{
8203 if (*s++ != '{')
8204 return FALSE;
f0fcb552 8205 if (!isDIGIT(*s))
a687059c 8206 return FALSE;
f0fcb552 8207 while (isDIGIT(*s))
a687059c
LW
8208 s++;
8209 if (*s == ',')
8210 s++;
f0fcb552 8211 while (isDIGIT(*s))
a687059c
LW
8212 s++;
8213 if (*s != '}')
8214 return FALSE;
8215 return TRUE;
8216}
8217
a687059c
LW
8218
8219/*
fd181c75 8220 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8221 */
8222void
097eb12c 8223Perl_regdump(pTHX_ const regexp *r)
a687059c 8224{
35ff7856 8225#ifdef DEBUGGING
97aff369 8226 dVAR;
c445ea15 8227 SV * const sv = sv_newmortal();
ab3bbdeb 8228 SV *dsv= sv_newmortal();
f8fc2ecf 8229 RXi_GET_DECL(r,ri);
a687059c 8230
f8fc2ecf 8231 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8232
8233 /* Header fields of interest. */
ab3bbdeb
YO
8234 if (r->anchored_substr) {
8235 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8236 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8237 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8238 "anchored %s%s at %"IVdf" ",
8239 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8240 (IV)r->anchored_offset);
ab3bbdeb
YO
8241 } else if (r->anchored_utf8) {
8242 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8243 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8244 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8245 "anchored utf8 %s%s at %"IVdf" ",
8246 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8247 (IV)r->anchored_offset);
ab3bbdeb
YO
8248 }
8249 if (r->float_substr) {
8250 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8251 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8252 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8253 "floating %s%s at %"IVdf"..%"UVuf" ",
8254 s, RE_SV_TAIL(r->float_substr),
7b0972df 8255 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8256 } else if (r->float_utf8) {
8257 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8258 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8259 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8260 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8261 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8262 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8263 }
33b8afdf 8264 if (r->check_substr || r->check_utf8)
b81d288d 8265 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8266 (const char *)
8267 (r->check_substr == r->float_substr
8268 && r->check_utf8 == r->float_utf8
8269 ? "(checking floating" : "(checking anchored"));
bbe252da 8270 if (r->extflags & RXf_NOSCAN)
c277df42 8271 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8272 if (r->extflags & RXf_CHECK_ALL)
c277df42 8273 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8274 if (r->check_substr || r->check_utf8)
c277df42
IZ
8275 PerlIO_printf(Perl_debug_log, ") ");
8276
f8fc2ecf
YO
8277 if (ri->regstclass) {
8278 regprop(r, sv, ri->regstclass);
1de06328 8279 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8280 }
bbe252da 8281 if (r->extflags & RXf_ANCH) {
774d564b 8282 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8283 if (r->extflags & RXf_ANCH_BOL)
774d564b 8284 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8285 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8286 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8287 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8288 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8289 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8290 PerlIO_printf(Perl_debug_log, "(GPOS)");
8291 PerlIO_putc(Perl_debug_log, ' ');
8292 }
bbe252da 8293 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8294 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8295 if (r->intflags & PREGf_SKIP)
760ac839 8296 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8297 if (r->intflags & PREGf_IMPLICIT)
760ac839 8298 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8299 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8300 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8301 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8302 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8303#else
96a5add6 8304 PERL_UNUSED_CONTEXT;
65e66c80 8305 PERL_UNUSED_ARG(r);
17c3b450 8306#endif /* DEBUGGING */
a687059c
LW
8307}
8308
8309/*
a0d0e21e
LW
8310- regprop - printable representation of opcode
8311*/
46fc3d4c 8312void
32fc9b6a 8313Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8314{
35ff7856 8315#ifdef DEBUGGING
97aff369 8316 dVAR;
9b155405 8317 register int k;
f8fc2ecf 8318 RXi_GET_DECL(prog,progi);
1de06328 8319 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8320
a0d0e21e 8321
54dc92de 8322 sv_setpvn(sv, "", 0);
8aa23a47 8323
03363afd 8324 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8325 /* It would be nice to FAIL() here, but this may be called from
8326 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8327 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8328 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8329
3dab1dad 8330 k = PL_regkind[OP(o)];
9b155405 8331
2a782b5b 8332 if (k == EXACT) {
396482e1 8333 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8334 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8335 * is a crude hack but it may be the best for now since
8336 * we have no flag "this EXACTish node was UTF-8"
8337 * --jhi */
8338 const char * const s =
ddc5bc0f 8339 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8340 PL_colors[0], PL_colors[1],
8341 PERL_PV_ESCAPE_UNI_DETECT |
8342 PERL_PV_PRETTY_ELIPSES |
8343 PERL_PV_PRETTY_LTGT
8344 );
8345 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8346 } else if (k == TRIE) {
3dab1dad 8347 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8348 * progi->data isn't available here */
1de06328 8349 const char op = OP(o);
647f639f 8350 const U32 n = ARG(o);
1de06328 8351 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8352 (reg_ac_data *)progi->data->data[n] :
1de06328 8353 NULL;
3251b653
NC
8354 const reg_trie_data * const trie
8355 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8356
8357 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8358 DEBUG_TRIE_COMPILE_r(
8359 Perl_sv_catpvf(aTHX_ sv,
8360 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8361 (UV)trie->startstate,
1e2e3d02 8362 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8363 (UV)trie->wordcount,
8364 (UV)trie->minlen,
8365 (UV)trie->maxlen,
8366 (UV)TRIE_CHARCOUNT(trie),
8367 (UV)trie->uniquecharcount
8368 )
8369 );
8370 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8371 int i;
8372 int rangestart = -1;
f46cb337 8373 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8374 Perl_sv_catpvf(aTHX_ sv, "[");
8375 for (i = 0; i <= 256; i++) {
8376 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8377 if (rangestart == -1)
8378 rangestart = i;
8379 } else if (rangestart != -1) {
8380 if (i <= rangestart + 3)
8381 for (; rangestart < i; rangestart++)
8382 put_byte(sv, rangestart);
8383 else {
8384 put_byte(sv, rangestart);
8385 sv_catpvs(sv, "-");
8386 put_byte(sv, i - 1);
8387 }
8388 rangestart = -1;
8389 }
8390 }
8391 Perl_sv_catpvf(aTHX_ sv, "]");
8392 }
8393
a3621e74 8394 } else if (k == CURLY) {
cb434fcc 8395 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8396 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8397 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8398 }
2c2d71f5
JH
8399 else if (k == WHILEM && o->flags) /* Ordinal/of */
8400 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 8401 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 8402 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1f1031fe 8403 if ( prog->paren_names ) {
ee9b8eae
YO
8404 if ( k != REF || OP(o) < NREF) {
8405 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8406 SV **name= av_fetch(list, ARG(o), 0 );
8407 if (name)
8408 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8409 }
8410 else {
8411 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8412 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8413 I32 *nums=(I32*)SvPVX(sv_dat);
8414 SV **name= av_fetch(list, nums[0], 0 );
8415 I32 n;
8416 if (name) {
8417 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8418 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8419 (n ? "," : ""), (IV)nums[n]);
8420 }
8421 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 8422 }
1f1031fe 8423 }
ee9b8eae 8424 }
1f1031fe 8425 } else if (k == GOSUB)
6bda09f9 8426 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8427 else if (k == VERB) {
8428 if (!o->flags)
8429 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
be2597df 8430 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
e2e6a0f1 8431 } else if (k == LOGICAL)
04ebc1ab 8432 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8433 else if (k == ANYOF) {
8434 int i, rangestart = -1;
2d03de9c 8435 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8436
8437 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8438 static const char * const anyofs[] = {
653099ff
GS
8439 "\\w",
8440 "\\W",
8441 "\\s",
8442 "\\S",
8443 "\\d",
8444 "\\D",
8445 "[:alnum:]",
8446 "[:^alnum:]",
8447 "[:alpha:]",
8448 "[:^alpha:]",
8449 "[:ascii:]",
8450 "[:^ascii:]",
8451 "[:ctrl:]",
8452 "[:^ctrl:]",
8453 "[:graph:]",
8454 "[:^graph:]",
8455 "[:lower:]",
8456 "[:^lower:]",
8457 "[:print:]",
8458 "[:^print:]",
8459 "[:punct:]",
8460 "[:^punct:]",
8461 "[:upper:]",
aaa51d5e 8462 "[:^upper:]",
653099ff 8463 "[:xdigit:]",
aaa51d5e
JF
8464 "[:^xdigit:]",
8465 "[:space:]",
8466 "[:^space:]",
8467 "[:blank:]",
8468 "[:^blank:]"
653099ff
GS
8469 };
8470
19860706 8471 if (flags & ANYOF_LOCALE)
396482e1 8472 sv_catpvs(sv, "{loc}");
19860706 8473 if (flags & ANYOF_FOLD)
396482e1 8474 sv_catpvs(sv, "{i}");
653099ff 8475 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8476 if (flags & ANYOF_INVERT)
396482e1 8477 sv_catpvs(sv, "^");
ffc61ed2
JH
8478 for (i = 0; i <= 256; i++) {
8479 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8480 if (rangestart == -1)
8481 rangestart = i;
8482 } else if (rangestart != -1) {
8483 if (i <= rangestart + 3)
8484 for (; rangestart < i; rangestart++)
653099ff 8485 put_byte(sv, rangestart);
ffc61ed2
JH
8486 else {
8487 put_byte(sv, rangestart);
396482e1 8488 sv_catpvs(sv, "-");
ffc61ed2 8489 put_byte(sv, i - 1);
653099ff 8490 }
ffc61ed2 8491 rangestart = -1;
653099ff 8492 }
847a199f 8493 }
ffc61ed2
JH
8494
8495 if (o->flags & ANYOF_CLASS)
bb7a0f54 8496 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8497 if (ANYOF_CLASS_TEST(o,i))
8498 sv_catpv(sv, anyofs[i]);
8499
8500 if (flags & ANYOF_UNICODE)
396482e1 8501 sv_catpvs(sv, "{unicode}");
1aa99e6b 8502 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8503 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8504
8505 {
8506 SV *lv;
32fc9b6a 8507 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8508
ffc61ed2
JH
8509 if (lv) {
8510 if (sw) {
89ebb4a3 8511 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8512
ffc61ed2 8513 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8514 uvchr_to_utf8(s, i);
ffc61ed2 8515
3568d838 8516 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8517 if (rangestart == -1)
8518 rangestart = i;
8519 } else if (rangestart != -1) {
ffc61ed2
JH
8520 if (i <= rangestart + 3)
8521 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8522 const U8 * const e = uvchr_to_utf8(s,rangestart);
8523 U8 *p;
8524 for(p = s; p < e; p++)
ffc61ed2
JH
8525 put_byte(sv, *p);
8526 }
8527 else {
2d03de9c
AL
8528 const U8 *e = uvchr_to_utf8(s,rangestart);
8529 U8 *p;
8530 for (p = s; p < e; p++)
ffc61ed2 8531 put_byte(sv, *p);
396482e1 8532 sv_catpvs(sv, "-");
2d03de9c
AL
8533 e = uvchr_to_utf8(s, i-1);
8534 for (p = s; p < e; p++)
1df70142 8535 put_byte(sv, *p);
ffc61ed2
JH
8536 }
8537 rangestart = -1;
8538 }
19860706 8539 }
ffc61ed2 8540
396482e1 8541 sv_catpvs(sv, "..."); /* et cetera */
19860706 8542 }
fde631ed 8543
ffc61ed2 8544 {
2e0de35c 8545 char *s = savesvpv(lv);
c445ea15 8546 char * const origs = s;
b81d288d 8547
3dab1dad
YO
8548 while (*s && *s != '\n')
8549 s++;
b81d288d 8550
ffc61ed2 8551 if (*s == '\n') {
2d03de9c 8552 const char * const t = ++s;
ffc61ed2
JH
8553
8554 while (*s) {
8555 if (*s == '\n')
8556 *s = ' ';
8557 s++;
8558 }
8559 if (s[-1] == ' ')
8560 s[-1] = 0;
8561
8562 sv_catpv(sv, t);
fde631ed 8563 }
b81d288d 8564
ffc61ed2 8565 Safefree(origs);
fde631ed
JH
8566 }
8567 }
653099ff 8568 }
ffc61ed2 8569
653099ff
GS
8570 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8571 }
9b155405 8572 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8573 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8574#else
96a5add6 8575 PERL_UNUSED_CONTEXT;
65e66c80
SP
8576 PERL_UNUSED_ARG(sv);
8577 PERL_UNUSED_ARG(o);
f9049ba1 8578 PERL_UNUSED_ARG(prog);
17c3b450 8579#endif /* DEBUGGING */
35ff7856 8580}
a687059c 8581
cad2e5aa
JH
8582SV *
8583Perl_re_intuit_string(pTHX_ regexp *prog)
8584{ /* Assume that RE_INTUIT is set */
97aff369 8585 dVAR;
a3621e74 8586 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8587 PERL_UNUSED_CONTEXT;
8588
a3621e74 8589 DEBUG_COMPILE_r(
cfd0369c 8590 {
2d03de9c 8591 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8592 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8593
8594 if (!PL_colorset) reginitcolors();
8595 PerlIO_printf(Perl_debug_log,
a0288114 8596 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8597 PL_colors[4],
8598 prog->check_substr ? "" : "utf8 ",
8599 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8600 s,
8601 PL_colors[1],
8602 (strlen(s) > 60 ? "..." : ""));
8603 } );
8604
33b8afdf 8605 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8606}
8607
84da74a7 8608/*
f8149455 8609 pregfree()
84da74a7 8610
f8149455
YO
8611 handles refcounting and freeing the perl core regexp structure. When
8612 it is necessary to actually free the structure the first thing it
8613 does is call the 'free' method of the regexp_engine associated to to
8614 the regexp, allowing the handling of the void *pprivate; member
8615 first. (This routine is not overridable by extensions, which is why
8616 the extensions free is called first.)
8617
8618 See regdupe and regdupe_internal if you change anything here.
84da74a7 8619*/
f8149455 8620#ifndef PERL_IN_XSUB_RE
2b69d0c2 8621void
864dbfa3 8622Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8623{
27da23d5 8624 dVAR;
fc32ee4a 8625 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8626
7821416a
IZ
8627 if (!r || (--r->refcnt > 0))
8628 return;
f8149455
YO
8629
8630 CALLREGFREE_PVT(r); /* free the private data */
ed252734 8631 RX_MATCH_COPY_FREE(r);
f8c7b90f 8632#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8633 if (r->saved_copy)
8634 SvREFCNT_dec(r->saved_copy);
8635#endif
a193d654
GS
8636 if (r->substrs) {
8637 if (r->anchored_substr)
8638 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8639 if (r->anchored_utf8)
8640 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8641 if (r->float_substr)
8642 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8643 if (r->float_utf8)
8644 SvREFCNT_dec(r->float_utf8);
2779dcf1 8645 Safefree(r->substrs);
a193d654 8646 }
81714fb9 8647 if (r->paren_names)
bcdf7404
YO
8648 SvREFCNT_dec(r->paren_names);
8649 Safefree(r->wrapped);
f8149455
YO
8650 Safefree(r->startp);
8651 Safefree(r->endp);
8652 Safefree(r);
8653}
8654#endif
8655
8656/* regfree_internal()
8657
8658 Free the private data in a regexp. This is overloadable by
8659 extensions. Perl takes care of the regexp structure in pregfree(),
8660 this covers the *pprivate pointer which technically perldoesnt
8661 know about, however of course we have to handle the
8662 regexp_internal structure when no extension is in use.
8663
8664 Note this is called before freeing anything in the regexp
8665 structure.
8666 */
8667
8668void
8669Perl_regfree_internal(pTHX_ struct regexp *r)
8670{
8671 dVAR;
8672 RXi_GET_DECL(r,ri);
8673 GET_RE_DEBUG_FLAGS_DECL;
8674
8675 DEBUG_COMPILE_r({
8676 if (!PL_colorset)
8677 reginitcolors();
8678 {
8679 SV *dsv= sv_newmortal();
8680 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8681 dsv, r->precomp, r->prelen, 60);
8682 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8683 PL_colors[4],PL_colors[5],s);
8684 }
8685 });
7122b237
YO
8686#ifdef RE_TRACK_PATTERN_OFFSETS
8687 if (ri->u.offsets)
8688 Safefree(ri->u.offsets); /* 20010421 MJD */
8689#endif
f8fc2ecf
YO
8690 if (ri->data) {
8691 int n = ri->data->count;
f3548bdc
DM
8692 PAD* new_comppad = NULL;
8693 PAD* old_comppad;
4026c95a 8694 PADOFFSET refcnt;
dfad63ad 8695
c277df42 8696 while (--n >= 0) {
261faec3 8697 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8698 switch (ri->data->what[n]) {
c277df42 8699 case 's':
81714fb9 8700 case 'S':
55eed653 8701 case 'u':
f8fc2ecf 8702 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8703 break;
653099ff 8704 case 'f':
f8fc2ecf 8705 Safefree(ri->data->data[n]);
653099ff 8706 break;
dfad63ad 8707 case 'p':
f8fc2ecf 8708 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8709 break;
c277df42 8710 case 'o':
dfad63ad 8711 if (new_comppad == NULL)
cea2e8a9 8712 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8713 PAD_SAVE_LOCAL(old_comppad,
8714 /* Watch out for global destruction's random ordering. */
c445ea15 8715 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8716 );
b34c0dd4 8717 OP_REFCNT_LOCK;
f8fc2ecf 8718 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8719 OP_REFCNT_UNLOCK;
8720 if (!refcnt)
f8fc2ecf 8721 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8722
f3548bdc 8723 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8724 SvREFCNT_dec((SV*)new_comppad);
8725 new_comppad = NULL;
c277df42
IZ
8726 break;
8727 case 'n':
9e55ce06 8728 break;
07be1b83 8729 case 'T':
be8e71aa
YO
8730 { /* Aho Corasick add-on structure for a trie node.
8731 Used in stclass optimization only */
07be1b83 8732 U32 refcount;
f8fc2ecf 8733 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8734 OP_REFCNT_LOCK;
8735 refcount = --aho->refcount;
8736 OP_REFCNT_UNLOCK;
8737 if ( !refcount ) {
446bd890
NC
8738 PerlMemShared_free(aho->states);
8739 PerlMemShared_free(aho->fail);
446bd890
NC
8740 /* do this last!!!! */
8741 PerlMemShared_free(ri->data->data[n]);
8742 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8743 }
8744 }
8745 break;
a3621e74 8746 case 't':
07be1b83 8747 {
be8e71aa 8748 /* trie structure. */
07be1b83 8749 U32 refcount;
f8fc2ecf 8750 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8751 OP_REFCNT_LOCK;
8752 refcount = --trie->refcount;
8753 OP_REFCNT_UNLOCK;
8754 if ( !refcount ) {
446bd890 8755 PerlMemShared_free(trie->charmap);
446bd890
NC
8756 PerlMemShared_free(trie->states);
8757 PerlMemShared_free(trie->trans);
07be1b83 8758 if (trie->bitmap)
446bd890 8759 PerlMemShared_free(trie->bitmap);
07be1b83 8760 if (trie->wordlen)
446bd890 8761 PerlMemShared_free(trie->wordlen);
786e8c11 8762 if (trie->jump)
446bd890 8763 PerlMemShared_free(trie->jump);
786e8c11 8764 if (trie->nextword)
446bd890 8765 PerlMemShared_free(trie->nextword);
446bd890
NC
8766 /* do this last!!!! */
8767 PerlMemShared_free(ri->data->data[n]);
a3621e74 8768 }
07be1b83
YO
8769 }
8770 break;
c277df42 8771 default:
f8fc2ecf 8772 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8773 }
8774 }
f8fc2ecf
YO
8775 Safefree(ri->data->what);
8776 Safefree(ri->data);
a0d0e21e 8777 }
f8fc2ecf
YO
8778 if (ri->swap) {
8779 Safefree(ri->swap->startp);
8780 Safefree(ri->swap->endp);
8781 Safefree(ri->swap);
c74340f9 8782 }
f8fc2ecf 8783 Safefree(ri);
a687059c 8784}
c277df42 8785
84da74a7
YO
8786#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8787#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8788#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8789#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8790
8791/*
8792 regdupe - duplicate a regexp.
8793
8794 This routine is called by sv.c's re_dup and is expected to clone a
8795 given regexp structure. It is a no-op when not under USE_ITHREADS.
8796 (Originally this *was* re_dup() for change history see sv.c)
8797
f8149455
YO
8798 After all of the core data stored in struct regexp is duplicated
8799 the regexp_engine.dupe method is used to copy any private data
8800 stored in the *pprivate pointer. This allows extensions to handle
8801 any duplication it needs to do.
8802
8803 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8804*/
a3c0e9ca 8805#if defined(USE_ITHREADS)
f8149455 8806#ifndef PERL_IN_XSUB_RE
84da74a7 8807regexp *
f8149455 8808Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8809{
84da74a7 8810 dVAR;
f8fc2ecf 8811 regexp *ret;
f8149455 8812 int i, npar;
84da74a7 8813 struct reg_substr_datum *s;
644c02aa 8814
84da74a7
YO
8815 if (!r)
8816 return (REGEXP *)NULL;
8817
8818 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8819 return ret;
8820
f8149455 8821
84da74a7 8822 npar = r->nparens+1;
f8fc2ecf 8823 Newxz(ret, 1, regexp);
84da74a7
YO
8824 Newx(ret->startp, npar, I32);
8825 Copy(r->startp, ret->startp, npar, I32);
8826 Newx(ret->endp, npar, I32);
f8149455 8827 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8828
c945c181 8829 if (r->substrs) {
785a26d5
YO
8830 Newx(ret->substrs, 1, struct reg_substr_data);
8831 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8832 s->min_offset = r->substrs->data[i].min_offset;
8833 s->max_offset = r->substrs->data[i].max_offset;
8834 s->end_shift = r->substrs->data[i].end_shift;
8835 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8836 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8837 }
8838 } else
8839 ret->substrs = NULL;
f8149455 8840
bcdf7404
YO
8841 ret->wrapped = SAVEPVN(r->wrapped, r->wraplen);
8842 ret->precomp = ret->wrapped + (r->precomp - r->wrapped);
8843 ret->prelen = r->prelen;
8844 ret->wraplen = r->wraplen;
8845
f8149455
YO
8846 ret->refcnt = r->refcnt;
8847 ret->minlen = r->minlen;
8848 ret->minlenret = r->minlenret;
f8149455
YO
8849 ret->nparens = r->nparens;
8850 ret->lastparen = r->lastparen;
8851 ret->lastcloseparen = r->lastcloseparen;
8852 ret->intflags = r->intflags;
8853 ret->extflags = r->extflags;
8854
8855 ret->sublen = r->sublen;
8856
8857 ret->engine = r->engine;
8858
8859 ret->paren_names = hv_dup_inc(r->paren_names, param);
8860
8861 if (RX_MATCH_COPIED(ret))
8862 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8863 else
8864 ret->subbeg = NULL;
8865#ifdef PERL_OLD_COPY_ON_WRITE
8866 ret->saved_copy = NULL;
8867#endif
8868
8869 ret->pprivate = r->pprivate;
785a26d5
YO
8870 if (ret->pprivate)
8871 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8872
8873 ptr_table_store(PL_ptr_table, r, ret);
8874 return ret;
8875}
8876#endif /* PERL_IN_XSUB_RE */
8877
8878/*
8879 regdupe_internal()
8880
8881 This is the internal complement to regdupe() which is used to copy
8882 the structure pointed to by the *pprivate pointer in the regexp.
8883 This is the core version of the extension overridable cloning hook.
8884 The regexp structure being duplicated will be copied by perl prior
8885 to this and will be provided as the regexp *r argument, however
8886 with the /old/ structures pprivate pointer value. Thus this routine
8887 may override any copying normally done by perl.
8888
8889 It returns a pointer to the new regexp_internal structure.
8890*/
8891
8892void *
8893Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8894{
8895 dVAR;
8896 regexp_internal *reti;
8897 int len, npar;
8898 RXi_GET_DECL(r,ri);
8899
8900 npar = r->nparens+1;
7122b237 8901 len = ProgLen(ri);
f8149455
YO
8902
8903 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8904 Copy(ri->program, reti->program, len+1, regnode);
8905
8906 if(ri->swap) {
8907 Newx(reti->swap, 1, regexp_paren_ofs);
8908 /* no need to copy these */
8909 Newx(reti->swap->startp, npar, I32);
8910 Newx(reti->swap->endp, npar, I32);
8911 } else {
8912 reti->swap = NULL;
8913 }
8914
f8fc2ecf 8915 reti->regstclass = NULL;
bcdf7404 8916
f8fc2ecf 8917 if (ri->data) {
84da74a7 8918 struct reg_data *d;
f8fc2ecf 8919 const int count = ri->data->count;
84da74a7
YO
8920 int i;
8921
8922 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8923 char, struct reg_data);
8924 Newx(d->what, count, U8);
8925
8926 d->count = count;
8927 for (i = 0; i < count; i++) {
f8fc2ecf 8928 d->what[i] = ri->data->what[i];
84da74a7 8929 switch (d->what[i]) {
55eed653 8930 /* legal options are one of: sSfpontTu
84da74a7
YO
8931 see also regcomp.h and pregfree() */
8932 case 's':
81714fb9 8933 case 'S':
0536c0a7 8934 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8935 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8936 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8937 break;
84da74a7
YO
8938 case 'f':
8939 /* This is cheating. */
8940 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8941 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8942 struct regnode_charclass_class);
f8fc2ecf 8943 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8944 break;
8945 case 'o':
bbe252da
YO
8946 /* Compiled op trees are readonly and in shared memory,
8947 and can thus be shared without duplication. */
84da74a7 8948 OP_REFCNT_LOCK;
f8fc2ecf 8949 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8950 OP_REFCNT_UNLOCK;
8951 break;
23eab42c
NC
8952 case 'T':
8953 /* Trie stclasses are readonly and can thus be shared
8954 * without duplication. We free the stclass in pregfree
8955 * when the corresponding reg_ac_data struct is freed.
8956 */
8957 reti->regstclass= ri->regstclass;
8958 /* Fall through */
84da74a7 8959 case 't':
84da74a7 8960 OP_REFCNT_LOCK;
0536c0a7 8961 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8962 OP_REFCNT_UNLOCK;
0536c0a7
NC
8963 /* Fall through */
8964 case 'n':
8965 d->data[i] = ri->data->data[i];
84da74a7 8966 break;
84da74a7 8967 default:
f8fc2ecf 8968 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8969 }
8970 }
8971
f8fc2ecf 8972 reti->data = d;
84da74a7
YO
8973 }
8974 else
f8fc2ecf 8975 reti->data = NULL;
84da74a7 8976
cde0cee5
YO
8977 reti->name_list_idx = ri->name_list_idx;
8978
7122b237
YO
8979#ifdef RE_TRACK_PATTERN_OFFSETS
8980 if (ri->u.offsets) {
8981 Newx(reti->u.offsets, 2*len+1, U32);
8982 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
8983 }
8984#else
8985 SetProgLen(reti,len);
8986#endif
8987
f8149455 8988 return (void*)reti;
84da74a7 8989}
f8149455
YO
8990
8991#endif /* USE_ITHREADS */
84da74a7 8992
de8c5301
YO
8993/*
8994 reg_stringify()
8995
8996 converts a regexp embedded in a MAGIC struct to its stringified form,
8997 caching the converted form in the struct and returns the cached
8998 string.
8999
9000 If lp is nonnull then it is used to return the length of the
9001 resulting string
9002
9003 If flags is nonnull and the returned string contains UTF8 then
f8149455 9004 (*flags & 1) will be true.
de8c5301
YO
9005
9006 If haseval is nonnull then it is used to return whether the pattern
9007 contains evals.
9008
9009 Normally called via macro:
9010
f8149455 9011 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
9012
9013 And internally with
9014
f8149455 9015 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
9016
9017 See sv_2pv_flags() in sv.c for an example of internal usage.
9018
9019 */
f8149455 9020#ifndef PERL_IN_XSUB_RE
bcdf7404 9021
de8c5301
YO
9022char *
9023Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9024 dVAR;
9025 const regexp * const re = (regexp *)mg->mg_obj;
de8c5301 9026 if (haseval)
f8149455 9027 *haseval = re->seen_evals;
de8c5301 9028 if (flags)
bbe252da 9029 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301 9030 if (lp)
bcdf7404
YO
9031 *lp = re->wraplen;
9032 return re->wrapped;
de8c5301
YO
9033}
9034
c277df42
IZ
9035/*
9036 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9037 */
9038regnode *
864dbfa3 9039Perl_regnext(pTHX_ register regnode *p)
c277df42 9040{
97aff369 9041 dVAR;
c277df42
IZ
9042 register I32 offset;
9043
f8fc2ecf 9044 if (!p)
c277df42
IZ
9045 return(NULL);
9046
9047 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9048 if (offset == 0)
9049 return(NULL);
9050
c277df42 9051 return(p+offset);
c277df42 9052}
76234dfb 9053#endif
c277df42 9054
01f988be 9055STATIC void
cea2e8a9 9056S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9057{
9058 va_list args;
9059 STRLEN l1 = strlen(pat1);
9060 STRLEN l2 = strlen(pat2);
9061 char buf[512];
06bf62c7 9062 SV *msv;
73d840c0 9063 const char *message;
c277df42
IZ
9064
9065 if (l1 > 510)
9066 l1 = 510;
9067 if (l1 + l2 > 510)
9068 l2 = 510 - l1;
9069 Copy(pat1, buf, l1 , char);
9070 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9071 buf[l1 + l2] = '\n';
9072 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9073#ifdef I_STDARG
9074 /* ANSI variant takes additional second argument */
c277df42 9075 va_start(args, pat2);
8736538c
AS
9076#else
9077 va_start(args);
9078#endif
5a844595 9079 msv = vmess(buf, &args);
c277df42 9080 va_end(args);
cfd0369c 9081 message = SvPV_const(msv,l1);
c277df42
IZ
9082 if (l1 > 512)
9083 l1 = 512;
9084 Copy(message, buf, l1 , char);
197cf9b9 9085 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9086 Perl_croak(aTHX_ "%s", buf);
c277df42 9087}
a0ed51b3
LW
9088
9089/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9090
76234dfb 9091#ifndef PERL_IN_XSUB_RE
a0ed51b3 9092void
864dbfa3 9093Perl_save_re_context(pTHX)
b81d288d 9094{
97aff369 9095 dVAR;
1ade1aa1
NC
9096
9097 struct re_save_state *state;
9098
9099 SAVEVPTR(PL_curcop);
9100 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9101
9102 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9103 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9104 SSPUSHINT(SAVEt_RE_STATE);
9105
46ab3289 9106 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9107
a0ed51b3 9108 PL_reg_start_tmp = 0;
a0ed51b3 9109 PL_reg_start_tmpl = 0;
c445ea15 9110 PL_reg_oldsaved = NULL;
a5db57d6 9111 PL_reg_oldsavedlen = 0;
a5db57d6 9112 PL_reg_maxiter = 0;
a5db57d6 9113 PL_reg_leftiter = 0;
c445ea15 9114 PL_reg_poscache = NULL;
a5db57d6 9115 PL_reg_poscache_size = 0;
1ade1aa1
NC
9116#ifdef PERL_OLD_COPY_ON_WRITE
9117 PL_nrs = NULL;
9118#endif
ada6e8a9 9119
c445ea15
AL
9120 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9121 if (PL_curpm) {
9122 const REGEXP * const rx = PM_GETRE(PL_curpm);
9123 if (rx) {
1df70142 9124 U32 i;
ada6e8a9 9125 for (i = 1; i <= rx->nparens; i++) {
1df70142 9126 char digits[TYPE_CHARS(long)];
d9fad198 9127 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9128 GV *const *const gvp
9129 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9130
b37c2d43
AL
9131 if (gvp) {
9132 GV * const gv = *gvp;
9133 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9134 save_scalar(gv);
49f27e4b 9135 }
ada6e8a9
AMS
9136 }
9137 }
9138 }
a0ed51b3 9139}
76234dfb 9140#endif
51371543 9141
51371543 9142static void
acfe0abc 9143clear_re(pTHX_ void *r)
51371543 9144{
97aff369 9145 dVAR;
51371543
GS
9146 ReREFCNT_dec((regexp *)r);
9147}
ffbc6a93 9148
a28509cc
AL
9149#ifdef DEBUGGING
9150
9151STATIC void
9152S_put_byte(pTHX_ SV *sv, int c)
9153{
9154 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9155 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9156 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9157 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9158 else
9159 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9160}
9161
786e8c11 9162
3dab1dad
YO
9163#define CLEAR_OPTSTART \
9164 if (optstart) STMT_START { \
70685ca0 9165 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9166 optstart=NULL; \
9167 } STMT_END
9168
786e8c11 9169#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9170
b5a2f8d8
NC
9171STATIC const regnode *
9172S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9173 const regnode *last, const regnode *plast,
9174 SV* sv, I32 indent, U32 depth)
a28509cc 9175{
97aff369 9176 dVAR;
786e8c11 9177 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9178 register const regnode *next;
3dab1dad 9179 const regnode *optstart= NULL;
1f1031fe 9180
f8fc2ecf 9181 RXi_GET_DECL(r,ri);
3dab1dad 9182 GET_RE_DEBUG_FLAGS_DECL;
1f1031fe 9183
786e8c11
YO
9184#ifdef DEBUG_DUMPUNTIL
9185 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9186 last ? last-start : 0,plast ? plast-start : 0);
9187#endif
9188
9189 if (plast && plast < last)
9190 last= plast;
9191
9192 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9193 /* While that wasn't END last time... */
a28509cc
AL
9194 NODE_ALIGN(node);
9195 op = OP(node);
de734bd5 9196 if (op == CLOSE || op == WHILEM)
786e8c11 9197 indent--;
b5a2f8d8 9198 next = regnext((regnode *)node);
1f1031fe 9199
a28509cc 9200 /* Where, what. */
8e11feef 9201 if (OP(node) == OPTIMIZED) {
e68ec53f 9202 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9203 optstart = node;
3dab1dad 9204 else
8e11feef 9205 goto after_print;
3dab1dad
YO
9206 } else
9207 CLEAR_OPTSTART;
1f1031fe 9208
32fc9b6a 9209 regprop(r, sv, node);
a28509cc 9210 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9211 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
9212
9213 if (OP(node) != OPTIMIZED) {
9214 if (next == NULL) /* Next ptr. */
9215 PerlIO_printf(Perl_debug_log, " (0)");
9216 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9217 PerlIO_printf(Perl_debug_log, " (FAIL)");
9218 else
9219 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9220 (void)PerlIO_putc(Perl_debug_log, '\n');
9221 }
9222
a28509cc
AL
9223 after_print:
9224 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9225 assert(next);
9226 {
9227 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9228 ? regnext((regnode *)next)
9229 : next);
be8e71aa
YO
9230 if (last && nnode > last)
9231 nnode = last;
786e8c11 9232 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9233 }
a28509cc
AL
9234 }
9235 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9236 assert(next);
786e8c11 9237 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9238 }
9239 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9240 const regnode *this_trie = node;
1de06328 9241 const char op = OP(node);
647f639f 9242 const U32 n = ARG(node);
1de06328 9243 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9244 (reg_ac_data *)ri->data->data[n] :
1de06328 9245 NULL;
3251b653
NC
9246 const reg_trie_data * const trie =
9247 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9248#ifdef DEBUGGING
9249 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9250#endif
786e8c11 9251 const regnode *nextbranch= NULL;
a28509cc 9252 I32 word_idx;
1de06328 9253 sv_setpvn(sv, "", 0);
786e8c11 9254 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9255 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9256
9257 PerlIO_printf(Perl_debug_log, "%*s%s ",
9258 (int)(2*(indent+3)), "",
9259 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9260 PL_colors[0], PL_colors[1],
9261 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9262 PERL_PV_PRETTY_ELIPSES |
7f69552c 9263 PERL_PV_PRETTY_LTGT
786e8c11
YO
9264 )
9265 : "???"
9266 );
9267 if (trie->jump) {
40d049e4 9268 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9269 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9270 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9271 if (dist) {
9272 if (!nextbranch)
24b23f37 9273 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9274 DUMPUNTIL(this_trie + dist, nextbranch);
9275 }
786e8c11
YO
9276 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9277 nextbranch= regnext((regnode *)nextbranch);
9278 } else {
9279 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9280 }
786e8c11
YO
9281 }
9282 if (last && next > last)
9283 node= last;
9284 else
9285 node= next;
a28509cc 9286 }
786e8c11
YO
9287 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9288 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9289 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9290 }
9291 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9292 assert(next);
786e8c11 9293 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9294 }
9295 else if ( op == PLUS || op == STAR) {
786e8c11 9296 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9297 }
9298 else if (op == ANYOF) {
9299 /* arglen 1 + class block */
9300 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9301 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9302 node = NEXTOPER(node);
9303 }
9304 else if (PL_regkind[(U8)op] == EXACT) {
9305 /* Literal string, where present. */
9306 node += NODE_SZ_STR(node) - 1;
9307 node = NEXTOPER(node);
9308 }
9309 else {
9310 node = NEXTOPER(node);
9311 node += regarglen[(U8)op];
9312 }
9313 if (op == CURLYX || op == OPEN)
786e8c11 9314 indent++;
a28509cc 9315 }
3dab1dad 9316 CLEAR_OPTSTART;
786e8c11 9317#ifdef DEBUG_DUMPUNTIL
70685ca0 9318 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9319#endif
1de06328 9320 return node;
a28509cc
AL
9321}
9322
9323#endif /* DEBUGGING */
9324
241d1a3b
NC
9325/*
9326 * Local variables:
9327 * c-indentation-style: bsd
9328 * c-basic-offset: 4
9329 * indent-tabs-mode: t
9330 * End:
9331 *
37442d52
RGS
9332 * ex: set ts=8 sts=4 sw=4 noet:
9333 */