This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
More safety in free()s
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
58e23c8d 373#define _FAIL(code) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
58e23c8d 384 code; \
ccb2c380 385} STMT_END
8615cb43 386
58e23c8d
YO
387#define FAIL(msg) _FAIL( \
388 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
389 msg, (int)len, RExC_precomp, ellipses))
390
391#define FAIL2(msg,arg) _FAIL( \
392 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
393 arg, (int)len, RExC_precomp, ellipses))
394
b45f050a 395/*
b45f050a
JF
396 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
397 */
ccb2c380 398#define Simple_vFAIL(m) STMT_START { \
a28509cc 399 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
400 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
406 */
ccb2c380
MP
407#define vFAIL(m) STMT_START { \
408 if (!SIZE_ONLY) \
409 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
410 Simple_vFAIL(m); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Like Simple_vFAIL(), but accepts two arguments.
415 */
ccb2c380 416#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 417 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
418 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
419 (int)offset, RExC_precomp, RExC_precomp + offset); \
420} STMT_END
b45f050a
JF
421
422/*
423 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
424 */
ccb2c380
MP
425#define vFAIL2(m,a1) STMT_START { \
426 if (!SIZE_ONLY) \
427 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 Simple_vFAIL2(m, a1); \
429} STMT_END
b45f050a
JF
430
431
432/*
433 * Like Simple_vFAIL(), but accepts three arguments.
434 */
ccb2c380 435#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 436 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
443 */
ccb2c380
MP
444#define vFAIL3(m,a1,a2) STMT_START { \
445 if (!SIZE_ONLY) \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
447 Simple_vFAIL3(m, a1, a2); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Like Simple_vFAIL(), but accepts four arguments.
452 */
ccb2c380 453#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 454 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
455 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
456 (int)offset, RExC_precomp, RExC_precomp + offset); \
457} STMT_END
b45f050a 458
ccb2c380 459#define vWARN(loc,m) STMT_START { \
a28509cc 460 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
461 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465#define vWARNdep(loc,m) STMT_START { \
a28509cc 466 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
467 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
468 "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472
473#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 474 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
476 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
478
479#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 480 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
481 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
482 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
483} STMT_END
484
485#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 486 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
487 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
488 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
489} STMT_END
490
491#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 492 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
493 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
9d1d55b5 496
8615cb43 497
cd439c50 498/* Allow for side effects in s */
ccb2c380
MP
499#define REGC(c,s) STMT_START { \
500 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
501} STMT_END
cd439c50 502
fac92740
MJD
503/* Macros for recording node offsets. 20001227 mjd@plover.com
504 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
505 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
506 * Element 0 holds the number n.
07be1b83 507 * Position is 1 indexed.
fac92740
MJD
508 */
509
ccb2c380
MP
510#define Set_Node_Offset_To_R(node,byte) STMT_START { \
511 if (! SIZE_ONLY) { \
512 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 513 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 514 if((node) < 0) { \
551405c4 515 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
516 } else { \
517 RExC_offsets[2*(node)-1] = (byte); \
518 } \
519 } \
520} STMT_END
521
522#define Set_Node_Offset(node,byte) \
523 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
524#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
525
526#define Set_Node_Length_To_R(node,len) STMT_START { \
527 if (! SIZE_ONLY) { \
528 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 529 __LINE__, (int)(node), (int)(len))); \
ccb2c380 530 if((node) < 0) { \
551405c4 531 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
532 } else { \
533 RExC_offsets[2*(node)] = (len); \
534 } \
535 } \
536} STMT_END
537
538#define Set_Node_Length(node,len) \
539 Set_Node_Length_To_R((node)-RExC_emit_start, len)
540#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
541#define Set_Node_Cur_Length(node) \
542 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
543
544/* Get offsets and lengths */
545#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
546#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
547
07be1b83
YO
548#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
549 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
550 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
551} STMT_END
552
553
554#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
555#define EXPERIMENTAL_INPLACESCAN
556#endif
557
1de06328 558#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 559DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 560 PerlIO_printf(Perl_debug_log, \
b515a41d 561 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
562 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
563 (int)(depth)*2, "", \
564 (IV)((data)->pos_min), \
565 (IV)((data)->pos_delta), \
566 (IV)((data)->flags), \
567 (IV)((data)->whilem_c), \
568 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
569 ); \
570 if ((data)->last_found) \
571 PerlIO_printf(Perl_debug_log, \
572 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
573 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
574 SvPVX_const((data)->last_found), \
575 (IV)((data)->last_end), \
576 (IV)((data)->last_start_min), \
577 (IV)((data)->last_start_max), \
578 ((data)->longest && \
579 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
580 SvPVX_const((data)->longest_fixed), \
581 (IV)((data)->offset_fixed), \
582 ((data)->longest && \
583 (data)->longest==&((data)->longest_float)) ? "*" : "", \
584 SvPVX_const((data)->longest_float), \
585 (IV)((data)->offset_float_min), \
586 (IV)((data)->offset_float_max) \
587 ); \
588 PerlIO_printf(Perl_debug_log,"\n"); \
589});
590
acfe0abc 591static void clear_re(pTHX_ void *r);
4327152a 592
653099ff 593/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 594 Update the longest found anchored substring and the longest found
653099ff
GS
595 floating substrings if needed. */
596
4327152a 597STATIC void
1de06328 598S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 599{
e1ec3a88
AL
600 const STRLEN l = CHR_SVLEN(data->last_found);
601 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 602 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 603
c277df42 604 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 605 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
606 if (*data->longest == data->longest_fixed) {
607 data->offset_fixed = l ? data->last_start_min : data->pos_min;
608 if (data->flags & SF_BEFORE_EOL)
b81d288d 609 data->flags
c277df42
IZ
610 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
611 else
612 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
613 data->minlen_fixed=minlenp;
614 data->lookbehind_fixed=0;
a0ed51b3
LW
615 }
616 else {
c277df42 617 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
618 data->offset_float_max = (l
619 ? data->last_start_max
c277df42 620 : data->pos_min + data->pos_delta);
9051bda5
HS
621 if ((U32)data->offset_float_max > (U32)I32_MAX)
622 data->offset_float_max = I32_MAX;
c277df42 623 if (data->flags & SF_BEFORE_EOL)
b81d288d 624 data->flags
c277df42
IZ
625 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
626 else
627 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
628 data->minlen_float=minlenp;
629 data->lookbehind_float=0;
c277df42
IZ
630 }
631 }
632 SvCUR_set(data->last_found, 0);
0eda9292 633 {
a28509cc 634 SV * const sv = data->last_found;
097eb12c
AL
635 if (SvUTF8(sv) && SvMAGICAL(sv)) {
636 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
637 if (mg)
638 mg->mg_len = 0;
639 }
0eda9292 640 }
c277df42
IZ
641 data->last_end = -1;
642 data->flags &= ~SF_BEFORE_EOL;
1de06328 643 DEBUG_STUDYDATA(data,0);
c277df42
IZ
644}
645
653099ff
GS
646/* Can match anything (initialization) */
647STATIC void
097eb12c 648S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 649{
653099ff 650 ANYOF_CLASS_ZERO(cl);
f8bef550 651 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 652 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
653 if (LOC)
654 cl->flags |= ANYOF_LOCALE;
655}
656
657/* Can match anything (initialization) */
658STATIC int
5f66b61c 659S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
660{
661 int value;
662
aaa51d5e 663 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
664 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
665 return 1;
1aa99e6b
IH
666 if (!(cl->flags & ANYOF_UNICODE_ALL))
667 return 0;
10edeb5d 668 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 669 return 0;
653099ff
GS
670 return 1;
671}
672
673/* Can match anything (initialization) */
674STATIC void
097eb12c 675S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 676{
8ecf7187 677 Zero(cl, 1, struct regnode_charclass_class);
653099ff 678 cl->type = ANYOF;
830247a4 679 cl_anything(pRExC_state, cl);
653099ff
GS
680}
681
682STATIC void
097eb12c 683S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 684{
8ecf7187 685 Zero(cl, 1, struct regnode_charclass_class);
653099ff 686 cl->type = ANYOF;
830247a4 687 cl_anything(pRExC_state, cl);
653099ff
GS
688 if (LOC)
689 cl->flags |= ANYOF_LOCALE;
690}
691
692/* 'And' a given class with another one. Can create false positives */
693/* We assume that cl is not inverted */
694STATIC void
5f66b61c 695S_cl_and(struct regnode_charclass_class *cl,
a28509cc 696 const struct regnode_charclass_class *and_with)
653099ff 697{
40d049e4
YO
698
699 assert(and_with->type == ANYOF);
653099ff
GS
700 if (!(and_with->flags & ANYOF_CLASS)
701 && !(cl->flags & ANYOF_CLASS)
702 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
703 && !(and_with->flags & ANYOF_FOLD)
704 && !(cl->flags & ANYOF_FOLD)) {
705 int i;
706
707 if (and_with->flags & ANYOF_INVERT)
708 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
709 cl->bitmap[i] &= ~and_with->bitmap[i];
710 else
711 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
712 cl->bitmap[i] &= and_with->bitmap[i];
713 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
714 if (!(and_with->flags & ANYOF_EOS))
715 cl->flags &= ~ANYOF_EOS;
1aa99e6b 716
14ebb1a2
JH
717 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
718 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
719 cl->flags &= ~ANYOF_UNICODE_ALL;
720 cl->flags |= ANYOF_UNICODE;
721 ARG_SET(cl, ARG(and_with));
722 }
14ebb1a2
JH
723 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
724 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 725 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
726 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
727 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 728 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
729}
730
731/* 'OR' a given class with another one. Can create false positives */
732/* We assume that cl is not inverted */
733STATIC void
097eb12c 734S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 735{
653099ff
GS
736 if (or_with->flags & ANYOF_INVERT) {
737 /* We do not use
738 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
739 * <= (B1 | !B2) | (CL1 | !CL2)
740 * which is wasteful if CL2 is small, but we ignore CL2:
741 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
742 * XXXX Can we handle case-fold? Unclear:
743 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
744 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
745 */
746 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
747 && !(or_with->flags & ANYOF_FOLD)
748 && !(cl->flags & ANYOF_FOLD) ) {
749 int i;
750
751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 cl->bitmap[i] |= ~or_with->bitmap[i];
753 } /* XXXX: logic is complicated otherwise */
754 else {
830247a4 755 cl_anything(pRExC_state, cl);
653099ff
GS
756 }
757 } else {
758 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
759 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 760 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
761 || (cl->flags & ANYOF_FOLD)) ) {
762 int i;
763
764 /* OR char bitmap and class bitmap separately */
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] |= or_with->bitmap[i];
767 if (or_with->flags & ANYOF_CLASS) {
768 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
769 cl->classflags[i] |= or_with->classflags[i];
770 cl->flags |= ANYOF_CLASS;
771 }
772 }
773 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 774 cl_anything(pRExC_state, cl);
653099ff
GS
775 }
776 }
777 if (or_with->flags & ANYOF_EOS)
778 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
779
780 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
781 ARG(cl) != ARG(or_with)) {
782 cl->flags |= ANYOF_UNICODE_ALL;
783 cl->flags &= ~ANYOF_UNICODE;
784 }
785 if (or_with->flags & ANYOF_UNICODE_ALL) {
786 cl->flags |= ANYOF_UNICODE_ALL;
787 cl->flags &= ~ANYOF_UNICODE;
788 }
653099ff
GS
789}
790
a3621e74
YO
791#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
792#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
793#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
794#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
795
3dab1dad
YO
796
797#ifdef DEBUGGING
07be1b83 798/*
3dab1dad
YO
799 dump_trie(trie)
800 dump_trie_interim_list(trie,next_alloc)
801 dump_trie_interim_table(trie,next_alloc)
802
803 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
804 The _interim_ variants are used for debugging the interim
805 tables that are used to generate the final compressed
806 representation which is what dump_trie expects.
807
3dab1dad
YO
808 Part of the reason for their existance is to provide a form
809 of documentation as to how the different representations function.
07be1b83
YO
810
811*/
3dab1dad
YO
812
813/*
814 dump_trie(trie)
815 Dumps the final compressed table form of the trie to Perl_debug_log.
816 Used for debugging make_trie().
817*/
818
819STATIC void
820S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
821{
822 U32 state;
ab3bbdeb
YO
823 SV *sv=sv_newmortal();
824 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
825 GET_RE_DEBUG_FLAGS_DECL;
826
ab3bbdeb 827
3dab1dad
YO
828 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
829 (int)depth * 2 + 2,"",
830 "Match","Base","Ofs" );
831
832 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 833 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 834 if ( tmp ) {
ab3bbdeb
YO
835 PerlIO_printf( Perl_debug_log, "%*s",
836 colwidth,
ddc5bc0f 837 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
838 PL_colors[0], PL_colors[1],
839 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
840 PERL_PV_ESCAPE_FIRSTCHAR
841 )
842 );
3dab1dad
YO
843 }
844 }
845 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
846 (int)depth * 2 + 2,"");
847
848 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 849 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
850 PerlIO_printf( Perl_debug_log, "\n");
851
1e2e3d02 852 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 853 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
854
855 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
856
857 if ( trie->states[ state ].wordnum ) {
858 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
859 } else {
860 PerlIO_printf( Perl_debug_log, "%6s", "" );
861 }
862
863 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
864
865 if ( base ) {
866 U32 ofs = 0;
867
868 while( ( base + ofs < trie->uniquecharcount ) ||
869 ( base + ofs - trie->uniquecharcount < trie->lasttrans
870 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
871 ofs++;
872
873 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
874
875 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
876 if ( ( base + ofs >= trie->uniquecharcount ) &&
877 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
878 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
879 {
ab3bbdeb
YO
880 PerlIO_printf( Perl_debug_log, "%*"UVXf,
881 colwidth,
3dab1dad
YO
882 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
883 } else {
ab3bbdeb 884 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
885 }
886 }
887
888 PerlIO_printf( Perl_debug_log, "]");
889
890 }
891 PerlIO_printf( Perl_debug_log, "\n" );
892 }
893}
894/*
895 dump_trie_interim_list(trie,next_alloc)
896 Dumps a fully constructed but uncompressed trie in list form.
897 List tries normally only are used for construction when the number of
898 possible chars (trie->uniquecharcount) is very high.
899 Used for debugging make_trie().
900*/
901STATIC void
902S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
903{
904 U32 state;
ab3bbdeb
YO
905 SV *sv=sv_newmortal();
906 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
907 GET_RE_DEBUG_FLAGS_DECL;
908 /* print out the table precompression. */
ab3bbdeb
YO
909 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
910 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
911 "------:-----+-----------------\n" );
3dab1dad
YO
912
913 for( state=1 ; state < next_alloc ; state ++ ) {
914 U16 charid;
915
ab3bbdeb 916 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
917 (int)depth * 2 + 2,"", (UV)state );
918 if ( ! trie->states[ state ].wordnum ) {
919 PerlIO_printf( Perl_debug_log, "%5s| ","");
920 } else {
921 PerlIO_printf( Perl_debug_log, "W%4x| ",
922 trie->states[ state ].wordnum
923 );
924 }
925 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 926 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
927 if ( tmp ) {
928 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
929 colwidth,
ddc5bc0f 930 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
931 PL_colors[0], PL_colors[1],
932 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
933 PERL_PV_ESCAPE_FIRSTCHAR
934 ) ,
1e2e3d02
YO
935 TRIE_LIST_ITEM(state,charid).forid,
936 (UV)TRIE_LIST_ITEM(state,charid).newstate
937 );
938 if (!(charid % 10))
664e119d
RGS
939 PerlIO_printf(Perl_debug_log, "\n%*s| ",
940 (int)((depth * 2) + 14), "");
1e2e3d02 941 }
ab3bbdeb
YO
942 }
943 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
944 }
945}
946
947/*
948 dump_trie_interim_table(trie,next_alloc)
949 Dumps a fully constructed but uncompressed trie in table form.
950 This is the normal DFA style state transition table, with a few
951 twists to facilitate compression later.
952 Used for debugging make_trie().
953*/
954STATIC void
955S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
956{
957 U32 state;
958 U16 charid;
ab3bbdeb
YO
959 SV *sv=sv_newmortal();
960 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
961 GET_RE_DEBUG_FLAGS_DECL;
962
963 /*
964 print out the table precompression so that we can do a visual check
965 that they are identical.
966 */
967
968 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
969
970 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 971 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 972 if ( tmp ) {
ab3bbdeb
YO
973 PerlIO_printf( Perl_debug_log, "%*s",
974 colwidth,
ddc5bc0f 975 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
976 PL_colors[0], PL_colors[1],
977 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
978 PERL_PV_ESCAPE_FIRSTCHAR
979 )
980 );
3dab1dad
YO
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 987 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
999 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1000 if (v)
1001 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1002 else
1003 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1004 }
1005 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1006 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1007 } else {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1009 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1010 }
1011 }
07be1b83 1012}
3dab1dad
YO
1013
1014#endif
1015
786e8c11
YO
1016/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1017 startbranch: the first branch in the whole branch sequence
1018 first : start branch of sequence of branch-exact nodes.
1019 May be the same as startbranch
1020 last : Thing following the last branch.
1021 May be the same as tail.
1022 tail : item following the branch sequence
1023 count : words in the sequence
1024 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1025 depth : indent depth
3dab1dad 1026
786e8c11 1027Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1028
786e8c11
YO
1029A trie is an N'ary tree where the branches are determined by digital
1030decomposition of the key. IE, at the root node you look up the 1st character and
1031follow that branch repeat until you find the end of the branches. Nodes can be
1032marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1033
786e8c11 1034 /he|she|his|hers/
72f13be8 1035
786e8c11
YO
1036would convert into the following structure. Numbers represent states, letters
1037following numbers represent valid transitions on the letter from that state, if
1038the number is in square brackets it represents an accepting state, otherwise it
1039will be in parenthesis.
07be1b83 1040
786e8c11
YO
1041 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1042 | |
1043 | (2)
1044 | |
1045 (1) +-i->(6)-+-s->[7]
1046 |
1047 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1048
786e8c11
YO
1049 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1050
1051This shows that when matching against the string 'hers' we will begin at state 1
1052read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1053then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1054is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1055single traverse. We store a mapping from accepting to state to which word was
1056matched, and then when we have multiple possibilities we try to complete the
1057rest of the regex in the order in which they occured in the alternation.
1058
1059The only prior NFA like behaviour that would be changed by the TRIE support is
1060the silent ignoring of duplicate alternations which are of the form:
1061
1062 / (DUPE|DUPE) X? (?{ ... }) Y /x
1063
1064Thus EVAL blocks follwing a trie may be called a different number of times with
1065and without the optimisation. With the optimisations dupes will be silently
1066ignored. This inconsistant behaviour of EVAL type nodes is well established as
1067the following demonstrates:
1068
1069 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1070
1071which prints out 'word' three times, but
1072
1073 'words'=~/(word|word|word)(?{ print $1 })S/
1074
1075which doesnt print it out at all. This is due to other optimisations kicking in.
1076
1077Example of what happens on a structural level:
1078
1079The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1080
1081 1: CURLYM[1] {1,32767}(18)
1082 5: BRANCH(8)
1083 6: EXACT <ac>(16)
1084 8: BRANCH(11)
1085 9: EXACT <ad>(16)
1086 11: BRANCH(14)
1087 12: EXACT <ab>(16)
1088 16: SUCCEED(0)
1089 17: NOTHING(18)
1090 18: END(0)
1091
1092This would be optimizable with startbranch=5, first=5, last=16, tail=16
1093and should turn into:
1094
1095 1: CURLYM[1] {1,32767}(18)
1096 5: TRIE(16)
1097 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1098 <ac>
1099 <ad>
1100 <ab>
1101 16: SUCCEED(0)
1102 17: NOTHING(18)
1103 18: END(0)
1104
1105Cases where tail != last would be like /(?foo|bar)baz/:
1106
1107 1: BRANCH(4)
1108 2: EXACT <foo>(8)
1109 4: BRANCH(7)
1110 5: EXACT <bar>(8)
1111 7: TAIL(8)
1112 8: EXACT <baz>(10)
1113 10: END(0)
1114
1115which would be optimizable with startbranch=1, first=1, last=7, tail=8
1116and would end up looking like:
1117
1118 1: TRIE(8)
1119 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1120 <foo>
1121 <bar>
1122 7: TAIL(8)
1123 8: EXACT <baz>(10)
1124 10: END(0)
1125
1126 d = uvuni_to_utf8_flags(d, uv, 0);
1127
1128is the recommended Unicode-aware way of saying
1129
1130 *(d++) = uv;
1131*/
1132
1e2e3d02 1133#define TRIE_STORE_REVCHAR \
786e8c11 1134 STMT_START { \
1e2e3d02 1135 SV *tmp = newSVpvs(""); \
786e8c11 1136 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1137 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1138 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1139 } STMT_END
1140
1141#define TRIE_READ_CHAR STMT_START { \
1142 wordlen++; \
1143 if ( UTF ) { \
1144 if ( folder ) { \
1145 if ( foldlen > 0 ) { \
1146 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1147 foldlen -= len; \
1148 scan += len; \
1149 len = 0; \
1150 } else { \
1151 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1152 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1153 foldlen -= UNISKIP( uvc ); \
1154 scan = foldbuf + UNISKIP( uvc ); \
1155 } \
1156 } else { \
1157 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1158 } \
1159 } else { \
1160 uvc = (U32)*uc; \
1161 len = 1; \
1162 } \
1163} STMT_END
1164
1165
1166
1167#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1168 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1169 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1170 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1171 } \
1172 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1173 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1174 TRIE_LIST_CUR( state )++; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_LIST_NEW(state) STMT_START { \
1178 Newxz( trie->states[ state ].trans.list, \
1179 4, reg_trie_trans_le ); \
1180 TRIE_LIST_CUR( state ) = 1; \
1181 TRIE_LIST_LEN( state ) = 4; \
1182} STMT_END
07be1b83 1183
786e8c11
YO
1184#define TRIE_HANDLE_WORD(state) STMT_START { \
1185 U16 dupe= trie->states[ state ].wordnum; \
1186 regnode * const noper_next = regnext( noper ); \
1187 \
1188 if (trie->wordlen) \
1189 trie->wordlen[ curword ] = wordlen; \
1190 DEBUG_r({ \
1191 /* store the word for dumping */ \
1192 SV* tmp; \
1193 if (OP(noper) != NOTHING) \
1194 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1195 else \
1196 tmp = newSVpvn( "", 0 ); \
1197 if ( UTF ) SvUTF8_on( tmp ); \
1198 av_push( trie->words, tmp ); \
1199 }); \
1200 \
1201 curword++; \
1202 \
1203 if ( noper_next < tail ) { \
1204 if (!trie->jump) \
1205 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1206 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1207 if (!jumper) \
1208 jumper = noper_next; \
1209 if (!nextbranch) \
1210 nextbranch= regnext(cur); \
1211 } \
1212 \
1213 if ( dupe ) { \
1214 /* So it's a dupe. This means we need to maintain a */\
1215 /* linked-list from the first to the next. */\
1216 /* we only allocate the nextword buffer when there */\
1217 /* a dupe, so first time we have to do the allocation */\
1218 if (!trie->nextword) \
1219 Newxz( trie->nextword, word_count + 1, U16); \
1220 while ( trie->nextword[dupe] ) \
1221 dupe= trie->nextword[dupe]; \
1222 trie->nextword[dupe]= curword; \
1223 } else { \
1224 /* we haven't inserted this word yet. */ \
1225 trie->states[ state ].wordnum = curword; \
1226 } \
1227} STMT_END
07be1b83 1228
3dab1dad 1229
786e8c11
YO
1230#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1231 ( ( base + charid >= ucharcount \
1232 && base + charid < ubound \
1233 && state == trie->trans[ base - ucharcount + charid ].check \
1234 && trie->trans[ base - ucharcount + charid ].next ) \
1235 ? trie->trans[ base - ucharcount + charid ].next \
1236 : ( state==1 ? special : 0 ) \
1237 )
3dab1dad 1238
786e8c11
YO
1239#define MADE_TRIE 1
1240#define MADE_JUMP_TRIE 2
1241#define MADE_EXACT_TRIE 4
3dab1dad 1242
a3621e74 1243STATIC I32
786e8c11 1244S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1245{
27da23d5 1246 dVAR;
a3621e74
YO
1247 /* first pass, loop through and scan words */
1248 reg_trie_data *trie;
1249 regnode *cur;
9f7f3913 1250 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1251 STRLEN len = 0;
1252 UV uvc = 0;
1253 U16 curword = 0;
1254 U32 next_alloc = 0;
786e8c11
YO
1255 regnode *jumper = NULL;
1256 regnode *nextbranch = NULL;
7f69552c 1257 regnode *convert = NULL;
a3621e74 1258 /* we just use folder as a flag in utf8 */
e1ec3a88 1259 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1260 ? PL_fold
1261 : ( flags == EXACTFL
1262 ? PL_fold_locale
1263 : NULL
1264 )
1265 );
1266
e1ec3a88 1267 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1268 SV *re_trie_maxbuff;
3dab1dad
YO
1269#ifndef DEBUGGING
1270 /* these are only used during construction but are useful during
8e11feef 1271 * debugging so we store them in the struct when debugging.
8e11feef 1272 */
3dab1dad 1273 STRLEN trie_charcount=0;
3dab1dad
YO
1274 AV *trie_revcharmap;
1275#endif
a3621e74 1276 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1277#ifndef DEBUGGING
1278 PERL_UNUSED_ARG(depth);
1279#endif
a3621e74 1280
a02a5408 1281 Newxz( trie, 1, reg_trie_data );
a3621e74 1282 trie->refcount = 1;
3dab1dad 1283 trie->startstate = 1;
786e8c11 1284 trie->wordcount = word_count;
a3621e74 1285 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1286 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1287 if (!(UTF && folder))
1288 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1289 DEBUG_r({
1290 trie->words = newAV();
a3621e74 1291 });
3dab1dad 1292 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1293
0111c4fd 1294 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1295 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1296 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1297 }
3dab1dad
YO
1298 DEBUG_OPTIMISE_r({
1299 PerlIO_printf( Perl_debug_log,
786e8c11 1300 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1301 (int)depth * 2 + 2, "",
1302 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1303 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1304 (int)depth);
3dab1dad 1305 });
7f69552c
YO
1306
1307 /* Find the node we are going to overwrite */
1308 if ( first == startbranch && OP( last ) != BRANCH ) {
1309 /* whole branch chain */
1310 convert = first;
1311 } else {
1312 /* branch sub-chain */
1313 convert = NEXTOPER( first );
1314 }
1315
a3621e74
YO
1316 /* -- First loop and Setup --
1317
1318 We first traverse the branches and scan each word to determine if it
1319 contains widechars, and how many unique chars there are, this is
1320 important as we have to build a table with at least as many columns as we
1321 have unique chars.
1322
1323 We use an array of integers to represent the character codes 0..255
1324 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1325 native representation of the character value as the key and IV's for the
1326 coded index.
1327
1328 *TODO* If we keep track of how many times each character is used we can
1329 remap the columns so that the table compression later on is more
1330 efficient in terms of memory by ensuring most common value is in the
1331 middle and the least common are on the outside. IMO this would be better
1332 than a most to least common mapping as theres a decent chance the most
1333 common letter will share a node with the least common, meaning the node
1334 will not be compressable. With a middle is most common approach the worst
1335 case is when we have the least common nodes twice.
1336
1337 */
1338
a3621e74 1339 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1340 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1341 const U8 *uc = (U8*)STRING( noper );
a28509cc 1342 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1343 STRLEN foldlen = 0;
1344 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1345 const U8 *scan = (U8*)NULL;
07be1b83 1346 U32 wordlen = 0; /* required init */
3dab1dad 1347 STRLEN chars=0;
a3621e74 1348
3dab1dad
YO
1349 if (OP(noper) == NOTHING) {
1350 trie->minlen= 0;
1351 continue;
1352 }
1353 if (trie->bitmap) {
1354 TRIE_BITMAP_SET(trie,*uc);
1355 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1356 }
a3621e74 1357 for ( ; uc < e ; uc += len ) {
3dab1dad 1358 TRIE_CHARCOUNT(trie)++;
a3621e74 1359 TRIE_READ_CHAR;
3dab1dad 1360 chars++;
a3621e74
YO
1361 if ( uvc < 256 ) {
1362 if ( !trie->charmap[ uvc ] ) {
1363 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1364 if ( folder )
1365 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1366 TRIE_STORE_REVCHAR;
a3621e74
YO
1367 }
1368 } else {
1369 SV** svpp;
1370 if ( !trie->widecharmap )
1371 trie->widecharmap = newHV();
1372
1373 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1374
1375 if ( !svpp )
e4584336 1376 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1377
1378 if ( !SvTRUE( *svpp ) ) {
1379 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1380 TRIE_STORE_REVCHAR;
a3621e74
YO
1381 }
1382 }
1383 }
3dab1dad
YO
1384 if( cur == first ) {
1385 trie->minlen=chars;
1386 trie->maxlen=chars;
1387 } else if (chars < trie->minlen) {
1388 trie->minlen=chars;
1389 } else if (chars > trie->maxlen) {
1390 trie->maxlen=chars;
1391 }
1392
a3621e74
YO
1393 } /* end first pass */
1394 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1395 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1396 (int)depth * 2 + 2,"",
85c3142d 1397 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1398 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1399 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1400 );
786e8c11 1401 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1402
1403 /*
1404 We now know what we are dealing with in terms of unique chars and
1405 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1406 representation using a flat table will take. If it's over a reasonable
1407 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1408 conservative but potentially much slower representation using an array
1409 of lists.
1410
1411 At the end we convert both representations into the same compressed
1412 form that will be used in regexec.c for matching with. The latter
1413 is a form that cannot be used to construct with but has memory
1414 properties similar to the list form and access properties similar
1415 to the table form making it both suitable for fast searches and
1416 small enough that its feasable to store for the duration of a program.
1417
1418 See the comment in the code where the compressed table is produced
1419 inplace from the flat tabe representation for an explanation of how
1420 the compression works.
1421
1422 */
1423
1424
3dab1dad 1425 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1426 /*
1427 Second Pass -- Array Of Lists Representation
1428
1429 Each state will be represented by a list of charid:state records
1430 (reg_trie_trans_le) the first such element holds the CUR and LEN
1431 points of the allocated array. (See defines above).
1432
1433 We build the initial structure using the lists, and then convert
1434 it into the compressed table form which allows faster lookups
1435 (but cant be modified once converted).
a3621e74
YO
1436 */
1437
a3621e74
YO
1438 STRLEN transcount = 1;
1439
1e2e3d02
YO
1440 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1441 "%*sCompiling trie using list compiler\n",
1442 (int)depth * 2 + 2, ""));
1443
3dab1dad 1444 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1445 TRIE_LIST_NEW(1);
1446 next_alloc = 2;
1447
1448 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1449
c445ea15
AL
1450 regnode * const noper = NEXTOPER( cur );
1451 U8 *uc = (U8*)STRING( noper );
1452 const U8 * const e = uc + STR_LEN( noper );
1453 U32 state = 1; /* required init */
1454 U16 charid = 0; /* sanity init */
1455 U8 *scan = (U8*)NULL; /* sanity init */
1456 STRLEN foldlen = 0; /* required init */
07be1b83 1457 U32 wordlen = 0; /* required init */
c445ea15
AL
1458 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459
3dab1dad 1460 if (OP(noper) != NOTHING) {
786e8c11 1461 for ( ; uc < e ; uc += len ) {
c445ea15 1462
786e8c11 1463 TRIE_READ_CHAR;
c445ea15 1464
786e8c11
YO
1465 if ( uvc < 256 ) {
1466 charid = trie->charmap[ uvc ];
c445ea15 1467 } else {
786e8c11
YO
1468 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1469 if ( !svpp ) {
1470 charid = 0;
1471 } else {
1472 charid=(U16)SvIV( *svpp );
1473 }
c445ea15 1474 }
786e8c11
YO
1475 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1476 if ( charid ) {
a3621e74 1477
786e8c11
YO
1478 U16 check;
1479 U32 newstate = 0;
a3621e74 1480
786e8c11
YO
1481 charid--;
1482 if ( !trie->states[ state ].trans.list ) {
1483 TRIE_LIST_NEW( state );
c445ea15 1484 }
786e8c11
YO
1485 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1486 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1487 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1488 break;
1489 }
1490 }
1491 if ( ! newstate ) {
1492 newstate = next_alloc++;
1493 TRIE_LIST_PUSH( state, charid, newstate );
1494 transcount++;
1495 }
1496 state = newstate;
1497 } else {
1498 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1499 }
a28509cc 1500 }
c445ea15 1501 }
3dab1dad 1502 TRIE_HANDLE_WORD(state);
a3621e74
YO
1503
1504 } /* end second pass */
1505
1e2e3d02
YO
1506 /* next alloc is the NEXT state to be allocated */
1507 trie->statecount = next_alloc;
a3621e74
YO
1508 Renew( trie->states, next_alloc, reg_trie_state );
1509
3dab1dad
YO
1510 /* and now dump it out before we compress it */
1511 DEBUG_TRIE_COMPILE_MORE_r(
1512 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1513 );
a3621e74 1514
a02a5408 1515 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1516 {
1517 U32 state;
a3621e74
YO
1518 U32 tp = 0;
1519 U32 zp = 0;
1520
1521
1522 for( state=1 ; state < next_alloc ; state ++ ) {
1523 U32 base=0;
1524
1525 /*
1526 DEBUG_TRIE_COMPILE_MORE_r(
1527 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1528 );
1529 */
1530
1531 if (trie->states[state].trans.list) {
1532 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1533 U16 maxid=minid;
a28509cc 1534 U16 idx;
a3621e74
YO
1535
1536 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1537 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1538 if ( forid < minid ) {
1539 minid=forid;
1540 } else if ( forid > maxid ) {
1541 maxid=forid;
1542 }
a3621e74
YO
1543 }
1544 if ( transcount < tp + maxid - minid + 1) {
1545 transcount *= 2;
1546 Renew( trie->trans, transcount, reg_trie_trans );
1547 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1548 }
1549 base = trie->uniquecharcount + tp - minid;
1550 if ( maxid == minid ) {
1551 U32 set = 0;
1552 for ( ; zp < tp ; zp++ ) {
1553 if ( ! trie->trans[ zp ].next ) {
1554 base = trie->uniquecharcount + zp - minid;
1555 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ zp ].check = state;
1557 set = 1;
1558 break;
1559 }
1560 }
1561 if ( !set ) {
1562 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1563 trie->trans[ tp ].check = state;
1564 tp++;
1565 zp = tp;
1566 }
1567 } else {
1568 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1569 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1570 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1571 trie->trans[ tid ].check = state;
1572 }
1573 tp += ( maxid - minid + 1 );
1574 }
1575 Safefree(trie->states[ state ].trans.list);
1576 }
1577 /*
1578 DEBUG_TRIE_COMPILE_MORE_r(
1579 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1580 );
1581 */
1582 trie->states[ state ].trans.base=base;
1583 }
cc601c31 1584 trie->lasttrans = tp + 1;
a3621e74
YO
1585 }
1586 } else {
1587 /*
1588 Second Pass -- Flat Table Representation.
1589
1590 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1591 We know that we will need Charcount+1 trans at most to store the data
1592 (one row per char at worst case) So we preallocate both structures
1593 assuming worst case.
1594
1595 We then construct the trie using only the .next slots of the entry
1596 structs.
1597
1598 We use the .check field of the first entry of the node temporarily to
1599 make compression both faster and easier by keeping track of how many non
1600 zero fields are in the node.
1601
1602 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1603 transition.
1604
1605 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1606 number representing the first entry of the node, and state as a
1607 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1608 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1609 are 2 entrys per node. eg:
1610
1611 A B A B
1612 1. 2 4 1. 3 7
1613 2. 0 3 3. 0 5
1614 3. 0 0 5. 0 0
1615 4. 0 0 7. 0 0
1616
1617 The table is internally in the right hand, idx form. However as we also
1618 have to deal with the states array which is indexed by nodenum we have to
1619 use TRIE_NODENUM() to convert.
1620
1621 */
1e2e3d02
YO
1622 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1623 "%*sCompiling trie using table compiler\n",
1624 (int)depth * 2 + 2, ""));
3dab1dad
YO
1625
1626 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1627 reg_trie_trans );
3dab1dad 1628 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1629 next_alloc = trie->uniquecharcount + 1;
1630
3dab1dad 1631
a3621e74
YO
1632 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1633
c445ea15 1634 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1635 const U8 *uc = (U8*)STRING( noper );
1636 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1637
1638 U32 state = 1; /* required init */
1639
1640 U16 charid = 0; /* sanity init */
1641 U32 accept_state = 0; /* sanity init */
1642 U8 *scan = (U8*)NULL; /* sanity init */
1643
1644 STRLEN foldlen = 0; /* required init */
07be1b83 1645 U32 wordlen = 0; /* required init */
a3621e74
YO
1646 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1647
3dab1dad 1648 if ( OP(noper) != NOTHING ) {
786e8c11 1649 for ( ; uc < e ; uc += len ) {
a3621e74 1650
786e8c11 1651 TRIE_READ_CHAR;
a3621e74 1652
786e8c11
YO
1653 if ( uvc < 256 ) {
1654 charid = trie->charmap[ uvc ];
1655 } else {
1656 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1657 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1658 }
786e8c11
YO
1659 if ( charid ) {
1660 charid--;
1661 if ( !trie->trans[ state + charid ].next ) {
1662 trie->trans[ state + charid ].next = next_alloc;
1663 trie->trans[ state ].check++;
1664 next_alloc += trie->uniquecharcount;
1665 }
1666 state = trie->trans[ state + charid ].next;
1667 } else {
1668 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1669 }
1670 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1671 }
a3621e74 1672 }
3dab1dad
YO
1673 accept_state = TRIE_NODENUM( state );
1674 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1675
1676 } /* end second pass */
1677
3dab1dad
YO
1678 /* and now dump it out before we compress it */
1679 DEBUG_TRIE_COMPILE_MORE_r(
1680 dump_trie_interim_table(trie,next_alloc,depth+1)
1681 );
a3621e74 1682
a3621e74
YO
1683 {
1684 /*
1685 * Inplace compress the table.*
1686
1687 For sparse data sets the table constructed by the trie algorithm will
1688 be mostly 0/FAIL transitions or to put it another way mostly empty.
1689 (Note that leaf nodes will not contain any transitions.)
1690
1691 This algorithm compresses the tables by eliminating most such
1692 transitions, at the cost of a modest bit of extra work during lookup:
1693
1694 - Each states[] entry contains a .base field which indicates the
1695 index in the state[] array wheres its transition data is stored.
1696
1697 - If .base is 0 there are no valid transitions from that node.
1698
1699 - If .base is nonzero then charid is added to it to find an entry in
1700 the trans array.
1701
1702 -If trans[states[state].base+charid].check!=state then the
1703 transition is taken to be a 0/Fail transition. Thus if there are fail
1704 transitions at the front of the node then the .base offset will point
1705 somewhere inside the previous nodes data (or maybe even into a node
1706 even earlier), but the .check field determines if the transition is
1707 valid.
1708
786e8c11 1709 XXX - wrong maybe?
a3621e74
YO
1710 The following process inplace converts the table to the compressed
1711 table: We first do not compress the root node 1,and mark its all its
1712 .check pointers as 1 and set its .base pointer as 1 as well. This
1713 allows to do a DFA construction from the compressed table later, and
1714 ensures that any .base pointers we calculate later are greater than
1715 0.
1716
1717 - We set 'pos' to indicate the first entry of the second node.
1718
1719 - We then iterate over the columns of the node, finding the first and
1720 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1721 and set the .check pointers accordingly, and advance pos
1722 appropriately and repreat for the next node. Note that when we copy
1723 the next pointers we have to convert them from the original
1724 NODEIDX form to NODENUM form as the former is not valid post
1725 compression.
1726
1727 - If a node has no transitions used we mark its base as 0 and do not
1728 advance the pos pointer.
1729
1730 - If a node only has one transition we use a second pointer into the
1731 structure to fill in allocated fail transitions from other states.
1732 This pointer is independent of the main pointer and scans forward
1733 looking for null transitions that are allocated to a state. When it
1734 finds one it writes the single transition into the "hole". If the
786e8c11 1735 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1736
1737 - Once compressed we can Renew/realloc the structures to release the
1738 excess space.
1739
1740 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1741 specifically Fig 3.47 and the associated pseudocode.
1742
1743 demq
1744 */
a3b680e6 1745 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1746 U32 state, charid;
a3621e74 1747 U32 pos = 0, zp=0;
1e2e3d02 1748 trie->statecount = laststate;
a3621e74
YO
1749
1750 for ( state = 1 ; state < laststate ; state++ ) {
1751 U8 flag = 0;
a28509cc
AL
1752 const U32 stateidx = TRIE_NODEIDX( state );
1753 const U32 o_used = trie->trans[ stateidx ].check;
1754 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1755 trie->trans[ stateidx ].check = 0;
1756
1757 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1758 if ( flag || trie->trans[ stateidx + charid ].next ) {
1759 if ( trie->trans[ stateidx + charid ].next ) {
1760 if (o_used == 1) {
1761 for ( ; zp < pos ; zp++ ) {
1762 if ( ! trie->trans[ zp ].next ) {
1763 break;
1764 }
1765 }
1766 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1767 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ zp ].check = state;
1769 if ( ++zp > pos ) pos = zp;
1770 break;
1771 }
1772 used--;
1773 }
1774 if ( !flag ) {
1775 flag = 1;
1776 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1777 }
1778 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ pos ].check = state;
1780 pos++;
1781 }
1782 }
1783 }
cc601c31 1784 trie->lasttrans = pos + 1;
1e2e3d02 1785 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1786 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1787 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1788 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1789 (int)depth * 2 + 2,"",
1790 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1791 (IV)next_alloc,
1792 (IV)pos,
a3621e74
YO
1793 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1794 );
1795
1796 } /* end table compress */
1797 }
1e2e3d02
YO
1798 DEBUG_TRIE_COMPILE_MORE_r(
1799 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1800 (int)depth * 2 + 2, "",
1801 (UV)trie->statecount,
1802 (UV)trie->lasttrans)
1803 );
cc601c31
YO
1804 /* resize the trans array to remove unused space */
1805 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1806
3dab1dad
YO
1807 /* and now dump out the compressed format */
1808 DEBUG_TRIE_COMPILE_r(
1809 dump_trie(trie,depth+1)
1810 );
07be1b83 1811
3dab1dad 1812 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1813 U8 nodetype =(U8)(flags & 0xFF);
1814 char *str=NULL;
786e8c11 1815
07be1b83 1816#ifdef DEBUGGING
e62cc96a 1817 regnode *optimize = NULL;
b57a0404
JH
1818 U32 mjd_offset = 0;
1819 U32 mjd_nodelen = 0;
07be1b83 1820#endif
a3621e74 1821 /*
3dab1dad
YO
1822 This means we convert either the first branch or the first Exact,
1823 depending on whether the thing following (in 'last') is a branch
1824 or not and whther first is the startbranch (ie is it a sub part of
1825 the alternation or is it the whole thing.)
1826 Assuming its a sub part we conver the EXACT otherwise we convert
1827 the whole branch sequence, including the first.
a3621e74 1828 */
3dab1dad 1829 /* Find the node we are going to overwrite */
7f69552c 1830 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1831 /* branch sub-chain */
3dab1dad 1832 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1833 DEBUG_r({
1834 mjd_offset= Node_Offset((convert));
1835 mjd_nodelen= Node_Length((convert));
1836 });
7f69552c
YO
1837 /* whole branch chain */
1838 } else {
1839 DEBUG_r({
1840 const regnode *nop = NEXTOPER( convert );
1841 mjd_offset= Node_Offset((nop));
1842 mjd_nodelen= Node_Length((nop));
1843 });
07be1b83 1844 }
7f69552c 1845
07be1b83
YO
1846 DEBUG_OPTIMISE_r(
1847 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1848 (int)depth * 2 + 2, "",
786e8c11 1849 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1850 );
a3621e74 1851
3dab1dad
YO
1852 /* But first we check to see if there is a common prefix we can
1853 split out as an EXACT and put in front of the TRIE node. */
1854 trie->startstate= 1;
786e8c11 1855 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1856 U32 state;
1e2e3d02 1857 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1858 U32 ofs = 0;
8e11feef
RGS
1859 I32 idx = -1;
1860 U32 count = 0;
1861 const U32 base = trie->states[ state ].trans.base;
a3621e74 1862
3dab1dad 1863 if ( trie->states[state].wordnum )
8e11feef 1864 count = 1;
a3621e74 1865
8e11feef 1866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1867 if ( ( base + ofs >= trie->uniquecharcount ) &&
1868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1870 {
3dab1dad 1871 if ( ++count > 1 ) {
8e11feef 1872 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1873 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1874 if ( state == 1 ) break;
3dab1dad
YO
1875 if ( count == 2 ) {
1876 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1877 DEBUG_OPTIMISE_r(
8e11feef
RGS
1878 PerlIO_printf(Perl_debug_log,
1879 "%*sNew Start State=%"UVuf" Class: [",
1880 (int)depth * 2 + 2, "",
786e8c11 1881 (UV)state));
be8e71aa
YO
1882 if (idx >= 0) {
1883 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1884 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1885
3dab1dad 1886 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1887 if ( folder )
1888 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1889 DEBUG_OPTIMISE_r(
07be1b83 1890 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1891 );
8e11feef
RGS
1892 }
1893 }
1894 TRIE_BITMAP_SET(trie,*ch);
1895 if ( folder )
1896 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1897 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1898 }
1899 idx = ofs;
1900 }
3dab1dad
YO
1901 }
1902 if ( count == 1 ) {
1903 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1904 char *ch = SvPV_nolen( *tmp );
1905 DEBUG_OPTIMISE_r({
1906 SV *sv=sv_newmortal();
8e11feef
RGS
1907 PerlIO_printf( Perl_debug_log,
1908 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1909 (int)depth * 2 + 2, "",
de734bd5
A
1910 (UV)state, (UV)idx,
1911 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1912 PL_colors[0], PL_colors[1],
1913 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1914 PERL_PV_ESCAPE_FIRSTCHAR
1915 )
1916 );
1917 });
3dab1dad
YO
1918 if ( state==1 ) {
1919 OP( convert ) = nodetype;
1920 str=STRING(convert);
1921 STR_LEN(convert)=0;
1922 }
de734bd5
A
1923 while (*ch) {
1924 *str++ = *ch++;
1925 STR_LEN(convert)++;
1926 }
1927
8e11feef 1928 } else {
f9049ba1 1929#ifdef DEBUGGING
8e11feef
RGS
1930 if (state>1)
1931 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1932#endif
8e11feef
RGS
1933 break;
1934 }
1935 }
3dab1dad 1936 if (str) {
8e11feef 1937 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1938 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1939 trie->startstate = state;
07be1b83
YO
1940 trie->minlen -= (state - 1);
1941 trie->maxlen -= (state - 1);
1942 DEBUG_r({
1943 regnode *fix = convert;
de734bd5 1944 U32 word = trie->wordcount;
07be1b83
YO
1945 mjd_nodelen++;
1946 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1947 while( ++fix < n ) {
1948 Set_Node_Offset_Length(fix, 0, 0);
1949 }
de734bd5
A
1950 while (word--) {
1951 SV ** const tmp = av_fetch( trie->words, word, 0 );
1952 if (tmp) {
1953 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1954 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1955 else
1956 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1957 }
1958 }
07be1b83 1959 });
8e11feef
RGS
1960 if (trie->maxlen) {
1961 convert = n;
1962 } else {
3dab1dad 1963 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1964 DEBUG_r(optimize= n);
3dab1dad
YO
1965 }
1966 }
1967 }
a5ca303d
YO
1968 if (!jumper)
1969 jumper = last;
3dab1dad 1970 if ( trie->maxlen ) {
8e11feef
RGS
1971 NEXT_OFF( convert ) = (U16)(tail - convert);
1972 ARG_SET( convert, data_slot );
786e8c11
YO
1973 /* Store the offset to the first unabsorbed branch in
1974 jump[0], which is otherwise unused by the jump logic.
1975 We use this when dumping a trie and during optimisation. */
1976 if (trie->jump)
7f69552c 1977 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1978
786e8c11
YO
1979 /* XXXX */
1980 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1981 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1982 {
1983 OP( convert ) = TRIEC;
1984 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1985 Safefree(trie->bitmap);
1986 trie->bitmap= NULL;
1987 } else
1988 OP( convert ) = TRIE;
a3621e74 1989
3dab1dad
YO
1990 /* store the type in the flags */
1991 convert->flags = nodetype;
a5ca303d
YO
1992 DEBUG_r({
1993 optimize = convert
1994 + NODE_STEP_REGNODE
1995 + regarglen[ OP( convert ) ];
1996 });
1997 /* XXX We really should free up the resource in trie now,
1998 as we won't use them - (which resources?) dmq */
3dab1dad 1999 }
a3621e74 2000 /* needed for dumping*/
e62cc96a 2001 DEBUG_r(if (optimize) {
07be1b83 2002 regnode *opt = convert;
e62cc96a 2003 while ( ++opt < optimize) {
07be1b83
YO
2004 Set_Node_Offset_Length(opt,0,0);
2005 }
786e8c11
YO
2006 /*
2007 Try to clean up some of the debris left after the
2008 optimisation.
a3621e74 2009 */
786e8c11 2010 while( optimize < jumper ) {
07be1b83 2011 mjd_nodelen += Node_Length((optimize));
a3621e74 2012 OP( optimize ) = OPTIMIZED;
07be1b83 2013 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2014 optimize++;
2015 }
07be1b83 2016 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2017 });
2018 } /* end node insert */
07be1b83 2019#ifndef DEBUGGING
6e8b4190 2020 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2021#endif
786e8c11
YO
2022 return trie->jump
2023 ? MADE_JUMP_TRIE
2024 : trie->startstate>1
2025 ? MADE_EXACT_TRIE
2026 : MADE_TRIE;
2027}
2028
2029STATIC void
2030S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2031{
2032/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2033
2034 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2035 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2036 ISBN 0-201-10088-6
2037
2038 We find the fail state for each state in the trie, this state is the longest proper
2039 suffix of the current states 'word' that is also a proper prefix of another word in our
2040 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2041 the DFA not to have to restart after its tried and failed a word at a given point, it
2042 simply continues as though it had been matching the other word in the first place.
2043 Consider
2044 'abcdgu'=~/abcdefg|cdgu/
2045 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2046 fail, which would bring use to the state representing 'd' in the second word where we would
2047 try 'g' and succeed, prodceding to match 'cdgu'.
2048 */
2049 /* add a fail transition */
2050 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2051 U32 *q;
2052 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2053 const U32 numstates = trie->statecount;
786e8c11
YO
2054 const U32 ubound = trie->lasttrans + ucharcount;
2055 U32 q_read = 0;
2056 U32 q_write = 0;
2057 U32 charid;
2058 U32 base = trie->states[ 1 ].trans.base;
2059 U32 *fail;
2060 reg_ac_data *aho;
2061 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2062 GET_RE_DEBUG_FLAGS_DECL;
2063#ifndef DEBUGGING
2064 PERL_UNUSED_ARG(depth);
2065#endif
2066
2067
2068 ARG_SET( stclass, data_slot );
2069 Newxz( aho, 1, reg_ac_data );
2070 RExC_rx->data->data[ data_slot ] = (void*)aho;
2071 aho->trie=trie;
2072 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2073 numstates * sizeof(reg_trie_state));
786e8c11
YO
2074 Newxz( q, numstates, U32);
2075 Newxz( aho->fail, numstates, U32 );
2076 aho->refcount = 1;
2077 fail = aho->fail;
2078 /* initialize fail[0..1] to be 1 so that we always have
2079 a valid final fail state */
2080 fail[ 0 ] = fail[ 1 ] = 1;
2081
2082 for ( charid = 0; charid < ucharcount ; charid++ ) {
2083 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2084 if ( newstate ) {
2085 q[ q_write ] = newstate;
2086 /* set to point at the root */
2087 fail[ q[ q_write++ ] ]=1;
2088 }
2089 }
2090 while ( q_read < q_write) {
2091 const U32 cur = q[ q_read++ % numstates ];
2092 base = trie->states[ cur ].trans.base;
2093
2094 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2095 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2096 if (ch_state) {
2097 U32 fail_state = cur;
2098 U32 fail_base;
2099 do {
2100 fail_state = fail[ fail_state ];
2101 fail_base = aho->states[ fail_state ].trans.base;
2102 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2103
2104 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2105 fail[ ch_state ] = fail_state;
2106 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2107 {
2108 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2109 }
2110 q[ q_write++ % numstates] = ch_state;
2111 }
2112 }
2113 }
2114 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2115 when we fail in state 1, this allows us to use the
2116 charclass scan to find a valid start char. This is based on the principle
2117 that theres a good chance the string being searched contains lots of stuff
2118 that cant be a start char.
2119 */
2120 fail[ 0 ] = fail[ 1 ] = 0;
2121 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2122 PerlIO_printf(Perl_debug_log,
2123 "%*sStclass Failtable (%"UVuf" states): 0",
2124 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2125 );
786e8c11
YO
2126 for( q_read=1; q_read<numstates; q_read++ ) {
2127 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2128 }
2129 PerlIO_printf(Perl_debug_log, "\n");
2130 });
2131 Safefree(q);
2132 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2133}
2134
786e8c11 2135
a3621e74 2136/*
5d1c421c
JH
2137 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2138 * These need to be revisited when a newer toolchain becomes available.
2139 */
2140#if defined(__sparc64__) && defined(__GNUC__)
2141# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2142# undef SPARC64_GCC_WORKAROUND
2143# define SPARC64_GCC_WORKAROUND 1
2144# endif
2145#endif
2146
07be1b83 2147#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2148 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2149 SV * const mysv=sv_newmortal(); \
2150 regnode *Next = regnext(scan); \
2151 regprop(RExC_rx, mysv, scan); \
7f69552c 2152 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2153 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2154 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2155 }});
07be1b83 2156
1de06328
YO
2157
2158
2159
2160
07be1b83
YO
2161#define JOIN_EXACT(scan,min,flags) \
2162 if (PL_regkind[OP(scan)] == EXACT) \
2163 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2164
be8e71aa 2165STATIC U32
07be1b83
YO
2166S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2167 /* Merge several consecutive EXACTish nodes into one. */
2168 regnode *n = regnext(scan);
2169 U32 stringok = 1;
2170 regnode *next = scan + NODE_SZ_STR(scan);
2171 U32 merged = 0;
2172 U32 stopnow = 0;
2173#ifdef DEBUGGING
2174 regnode *stop = scan;
72f13be8 2175 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2176#else
d47053eb
RGS
2177 PERL_UNUSED_ARG(depth);
2178#endif
2179#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2180 PERL_UNUSED_ARG(flags);
2181 PERL_UNUSED_ARG(val);
07be1b83 2182#endif
07be1b83
YO
2183 DEBUG_PEEP("join",scan,depth);
2184
2185 /* Skip NOTHING, merge EXACT*. */
2186 while (n &&
2187 ( PL_regkind[OP(n)] == NOTHING ||
2188 (stringok && (OP(n) == OP(scan))))
2189 && NEXT_OFF(n)
2190 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2191
2192 if (OP(n) == TAIL || n > next)
2193 stringok = 0;
2194 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2195 DEBUG_PEEP("skip:",n,depth);
2196 NEXT_OFF(scan) += NEXT_OFF(n);
2197 next = n + NODE_STEP_REGNODE;
2198#ifdef DEBUGGING
2199 if (stringok)
2200 stop = n;
2201#endif
2202 n = regnext(n);
2203 }
2204 else if (stringok) {
786e8c11 2205 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2206 regnode * const nnext = regnext(n);
2207
2208 DEBUG_PEEP("merg",n,depth);
2209
2210 merged++;
2211 if (oldl + STR_LEN(n) > U8_MAX)
2212 break;
2213 NEXT_OFF(scan) += NEXT_OFF(n);
2214 STR_LEN(scan) += STR_LEN(n);
2215 next = n + NODE_SZ_STR(n);
2216 /* Now we can overwrite *n : */
2217 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2218#ifdef DEBUGGING
2219 stop = next - 1;
2220#endif
2221 n = nnext;
2222 if (stopnow) break;
2223 }
2224
d47053eb
RGS
2225#ifdef EXPERIMENTAL_INPLACESCAN
2226 if (flags && !NEXT_OFF(n)) {
2227 DEBUG_PEEP("atch", val, depth);
2228 if (reg_off_by_arg[OP(n)]) {
2229 ARG_SET(n, val - n);
2230 }
2231 else {
2232 NEXT_OFF(n) = val - n;
2233 }
2234 stopnow = 1;
2235 }
07be1b83
YO
2236#endif
2237 }
2238
2239 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2240 /*
2241 Two problematic code points in Unicode casefolding of EXACT nodes:
2242
2243 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2244 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2245
2246 which casefold to
2247
2248 Unicode UTF-8
2249
2250 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2251 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2252
2253 This means that in case-insensitive matching (or "loose matching",
2254 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2255 length of the above casefolded versions) can match a target string
2256 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2257 This would rather mess up the minimum length computation.
2258
2259 What we'll do is to look for the tail four bytes, and then peek
2260 at the preceding two bytes to see whether we need to decrease
2261 the minimum length by four (six minus two).
2262
2263 Thanks to the design of UTF-8, there cannot be false matches:
2264 A sequence of valid UTF-8 bytes cannot be a subsequence of
2265 another valid sequence of UTF-8 bytes.
2266
2267 */
2268 char * const s0 = STRING(scan), *s, *t;
2269 char * const s1 = s0 + STR_LEN(scan) - 1;
2270 char * const s2 = s1 - 4;
e294cc5d
JH
2271#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2272 const char t0[] = "\xaf\x49\xaf\x42";
2273#else
07be1b83 2274 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2275#endif
07be1b83
YO
2276 const char * const t1 = t0 + 3;
2277
2278 for (s = s0 + 2;
2279 s < s2 && (t = ninstr(s, s1, t0, t1));
2280 s = t + 4) {
e294cc5d
JH
2281#ifdef EBCDIC
2282 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2283 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2284#else
07be1b83
YO
2285 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2286 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2287#endif
07be1b83
YO
2288 *min -= 4;
2289 }
2290 }
2291
2292#ifdef DEBUGGING
2293 /* Allow dumping */
2294 n = scan + NODE_SZ_STR(scan);
2295 while (n <= stop) {
2296 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2297 OP(n) = OPTIMIZED;
2298 NEXT_OFF(n) = 0;
2299 }
2300 n++;
2301 }
2302#endif
2303 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2304 return stopnow;
2305}
2306
653099ff
GS
2307/* REx optimizer. Converts nodes into quickier variants "in place".
2308 Finds fixed substrings. */
2309
a0288114 2310/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2311 to the position after last scanned or to NULL. */
2312
40d049e4
YO
2313#define INIT_AND_WITHP \
2314 assert(!and_withp); \
2315 Newx(and_withp,1,struct regnode_charclass_class); \
2316 SAVEFREEPV(and_withp)
07be1b83 2317
b515a41d
YO
2318/* this is a chain of data about sub patterns we are processing that
2319 need to be handled seperately/specially in study_chunk. Its so
2320 we can simulate recursion without losing state. */
2321struct scan_frame;
2322typedef struct scan_frame {
2323 regnode *last; /* last node to process in this frame */
2324 regnode *next; /* next node to process when last is reached */
2325 struct scan_frame *prev; /*previous frame*/
2326 I32 stop; /* what stopparen do we use */
2327} scan_frame;
2328
76e3520e 2329STATIC I32
40d049e4 2330S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2331 I32 *minlenp, I32 *deltap,
40d049e4
YO
2332 regnode *last,
2333 scan_data_t *data,
2334 I32 stopparen,
2335 U8* recursed,
2336 struct regnode_charclass_class *and_withp,
2337 U32 flags, U32 depth)
c277df42
IZ
2338 /* scanp: Start here (read-write). */
2339 /* deltap: Write maxlen-minlen here. */
2340 /* last: Stop before this one. */
40d049e4
YO
2341 /* data: string data about the pattern */
2342 /* stopparen: treat close N as END */
2343 /* recursed: which subroutines have we recursed into */
2344 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2345{
97aff369 2346 dVAR;
c277df42
IZ
2347 I32 min = 0, pars = 0, code;
2348 regnode *scan = *scanp, *next;
2349 I32 delta = 0;
2350 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2351 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2352 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2353 scan_data_t data_fake;
a3621e74 2354 SV *re_trie_maxbuff = NULL;
786e8c11 2355 regnode *first_non_open = scan;
e2e6a0f1 2356 I32 stopmin = I32_MAX;
8aa23a47
YO
2357 scan_frame *frame = NULL;
2358
a3621e74 2359 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2360
13a24bad 2361#ifdef DEBUGGING
40d049e4 2362 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2363#endif
40d049e4 2364
786e8c11 2365 if ( depth == 0 ) {
40d049e4 2366 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2367 first_non_open=regnext(first_non_open);
2368 }
2369
b81d288d 2370
8aa23a47
YO
2371 fake_study_recurse:
2372 while ( scan && OP(scan) != END && scan < last ){
2373 /* Peephole optimizer: */
2374 DEBUG_STUDYDATA(data,depth);
2375 DEBUG_PEEP("Peep",scan,depth);
2376 JOIN_EXACT(scan,&min,0);
2377
2378 /* Follow the next-chain of the current node and optimize
2379 away all the NOTHINGs from it. */
2380 if (OP(scan) != CURLYX) {
2381 const int max = (reg_off_by_arg[OP(scan)]
2382 ? I32_MAX
2383 /* I32 may be smaller than U16 on CRAYs! */
2384 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2385 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2386 int noff;
2387 regnode *n = scan;
2388
2389 /* Skip NOTHING and LONGJMP. */
2390 while ((n = regnext(n))
2391 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2392 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2393 && off + noff < max)
2394 off += noff;
2395 if (reg_off_by_arg[OP(scan)])
2396 ARG(scan) = off;
2397 else
2398 NEXT_OFF(scan) = off;
2399 }
a3621e74 2400
c277df42 2401
8aa23a47
YO
2402
2403 /* The principal pseudo-switch. Cannot be a switch, since we
2404 look into several different things. */
2405 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2406 || OP(scan) == IFTHEN) {
2407 next = regnext(scan);
2408 code = OP(scan);
2409 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2410
2411 if (OP(next) == code || code == IFTHEN) {
2412 /* NOTE - There is similar code to this block below for handling
2413 TRIE nodes on a re-study. If you change stuff here check there
2414 too. */
2415 I32 max1 = 0, min1 = I32_MAX, num = 0;
2416 struct regnode_charclass_class accum;
2417 regnode * const startbranch=scan;
2418
2419 if (flags & SCF_DO_SUBSTR)
2420 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2421 if (flags & SCF_DO_STCLASS)
2422 cl_init_zero(pRExC_state, &accum);
2423
2424 while (OP(scan) == code) {
2425 I32 deltanext, minnext, f = 0, fake;
2426 struct regnode_charclass_class this_class;
2427
2428 num++;
2429 data_fake.flags = 0;
2430 if (data) {
2431 data_fake.whilem_c = data->whilem_c;
2432 data_fake.last_closep = data->last_closep;
2433 }
2434 else
2435 data_fake.last_closep = &fake;
58e23c8d
YO
2436
2437 data_fake.pos_delta = delta;
8aa23a47
YO
2438 next = regnext(scan);
2439 scan = NEXTOPER(scan);
2440 if (code != BRANCH)
c277df42 2441 scan = NEXTOPER(scan);
8aa23a47
YO
2442 if (flags & SCF_DO_STCLASS) {
2443 cl_init(pRExC_state, &this_class);
2444 data_fake.start_class = &this_class;
2445 f = SCF_DO_STCLASS_AND;
58e23c8d 2446 }
8aa23a47
YO
2447 if (flags & SCF_WHILEM_VISITED_POS)
2448 f |= SCF_WHILEM_VISITED_POS;
2449
2450 /* we suppose the run is continuous, last=next...*/
2451 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2452 next, &data_fake,
2453 stopparen, recursed, NULL, f,depth+1);
2454 if (min1 > minnext)
2455 min1 = minnext;
2456 if (max1 < minnext + deltanext)
2457 max1 = minnext + deltanext;
2458 if (deltanext == I32_MAX)
2459 is_inf = is_inf_internal = 1;
2460 scan = next;
2461 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2462 pars++;
2463 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2464 if ( stopmin > minnext)
2465 stopmin = min + min1;
2466 flags &= ~SCF_DO_SUBSTR;
2467 if (data)
2468 data->flags |= SCF_SEEN_ACCEPT;
2469 }
2470 if (data) {
2471 if (data_fake.flags & SF_HAS_EVAL)
2472 data->flags |= SF_HAS_EVAL;
2473 data->whilem_c = data_fake.whilem_c;
3dab1dad 2474 }
8aa23a47
YO
2475 if (flags & SCF_DO_STCLASS)
2476 cl_or(pRExC_state, &accum, &this_class);
2477 }
2478 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2479 min1 = 0;
2480 if (flags & SCF_DO_SUBSTR) {
2481 data->pos_min += min1;
2482 data->pos_delta += max1 - min1;
2483 if (max1 != min1 || is_inf)
2484 data->longest = &(data->longest_float);
2485 }
2486 min += min1;
2487 delta += max1 - min1;
2488 if (flags & SCF_DO_STCLASS_OR) {
2489 cl_or(pRExC_state, data->start_class, &accum);
2490 if (min1) {
2491 cl_and(data->start_class, and_withp);
2492 flags &= ~SCF_DO_STCLASS;
653099ff 2493 }
8aa23a47
YO
2494 }
2495 else if (flags & SCF_DO_STCLASS_AND) {
2496 if (min1) {
2497 cl_and(data->start_class, &accum);
2498 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2499 }
8aa23a47
YO
2500 else {
2501 /* Switch to OR mode: cache the old value of
2502 * data->start_class */
2503 INIT_AND_WITHP;
2504 StructCopy(data->start_class, and_withp,
2505 struct regnode_charclass_class);
2506 flags &= ~SCF_DO_STCLASS_AND;
2507 StructCopy(&accum, data->start_class,
2508 struct regnode_charclass_class);
2509 flags |= SCF_DO_STCLASS_OR;
2510 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2511 }
8aa23a47 2512 }
a3621e74 2513
8aa23a47
YO
2514 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2515 /* demq.
a3621e74 2516
8aa23a47
YO
2517 Assuming this was/is a branch we are dealing with: 'scan' now
2518 points at the item that follows the branch sequence, whatever
2519 it is. We now start at the beginning of the sequence and look
2520 for subsequences of
a3621e74 2521
8aa23a47
YO
2522 BRANCH->EXACT=>x1
2523 BRANCH->EXACT=>x2
2524 tail
a3621e74 2525
8aa23a47 2526 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2527
8aa23a47
YO
2528 If we can find such a subseqence we need to turn the first
2529 element into a trie and then add the subsequent branch exact
2530 strings to the trie.
a3621e74 2531
8aa23a47 2532 We have two cases
a3621e74 2533
8aa23a47 2534 1. patterns where the whole set of branch can be converted.
a3621e74 2535
8aa23a47 2536 2. patterns where only a subset can be converted.
a3621e74 2537
8aa23a47
YO
2538 In case 1 we can replace the whole set with a single regop
2539 for the trie. In case 2 we need to keep the start and end
2540 branchs so
a3621e74 2541
8aa23a47
YO
2542 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2543 becomes BRANCH TRIE; BRANCH X;
786e8c11 2544
8aa23a47
YO
2545 There is an additional case, that being where there is a
2546 common prefix, which gets split out into an EXACT like node
2547 preceding the TRIE node.
a3621e74 2548
8aa23a47
YO
2549 If x(1..n)==tail then we can do a simple trie, if not we make
2550 a "jump" trie, such that when we match the appropriate word
2551 we "jump" to the appopriate tail node. Essentailly we turn
2552 a nested if into a case structure of sorts.
b515a41d 2553
8aa23a47
YO
2554 */
2555
2556 int made=0;
2557 if (!re_trie_maxbuff) {
2558 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2559 if (!SvIOK(re_trie_maxbuff))
2560 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2561 }
2562 if ( SvIV(re_trie_maxbuff)>=0 ) {
2563 regnode *cur;
2564 regnode *first = (regnode *)NULL;
2565 regnode *last = (regnode *)NULL;
2566 regnode *tail = scan;
2567 U8 optype = 0;
2568 U32 count=0;
a3621e74
YO
2569
2570#ifdef DEBUGGING
8aa23a47 2571 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2572#endif
8aa23a47
YO
2573 /* var tail is used because there may be a TAIL
2574 regop in the way. Ie, the exacts will point to the
2575 thing following the TAIL, but the last branch will
2576 point at the TAIL. So we advance tail. If we
2577 have nested (?:) we may have to move through several
2578 tails.
2579 */
2580
2581 while ( OP( tail ) == TAIL ) {
2582 /* this is the TAIL generated by (?:) */
2583 tail = regnext( tail );
2584 }
a3621e74 2585
8aa23a47
YO
2586
2587 DEBUG_OPTIMISE_r({
2588 regprop(RExC_rx, mysv, tail );
2589 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2590 (int)depth * 2 + 2, "",
2591 "Looking for TRIE'able sequences. Tail node is: ",
2592 SvPV_nolen_const( mysv )
2593 );
2594 });
2595
2596 /*
2597
2598 step through the branches, cur represents each
2599 branch, noper is the first thing to be matched
2600 as part of that branch and noper_next is the
2601 regnext() of that node. if noper is an EXACT
2602 and noper_next is the same as scan (our current
2603 position in the regex) then the EXACT branch is
2604 a possible optimization target. Once we have
2605 two or more consequetive such branches we can
2606 create a trie of the EXACT's contents and stich
2607 it in place. If the sequence represents all of
2608 the branches we eliminate the whole thing and
2609 replace it with a single TRIE. If it is a
2610 subsequence then we need to stitch it in. This
2611 means the first branch has to remain, and needs
2612 to be repointed at the item on the branch chain
2613 following the last branch optimized. This could
2614 be either a BRANCH, in which case the
2615 subsequence is internal, or it could be the
2616 item following the branch sequence in which
2617 case the subsequence is at the end.
2618
2619 */
2620
2621 /* dont use tail as the end marker for this traverse */
2622 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2623 regnode * const noper = NEXTOPER( cur );
b515a41d 2624#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2625 regnode * const noper_next = regnext( noper );
b515a41d
YO
2626#endif
2627
8aa23a47
YO
2628 DEBUG_OPTIMISE_r({
2629 regprop(RExC_rx, mysv, cur);
2630 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2631 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2632
2633 regprop(RExC_rx, mysv, noper);
2634 PerlIO_printf( Perl_debug_log, " -> %s",
2635 SvPV_nolen_const(mysv));
2636
2637 if ( noper_next ) {
2638 regprop(RExC_rx, mysv, noper_next );
2639 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2640 SvPV_nolen_const(mysv));
2641 }
2642 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2643 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2644 });
2645 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2646 : PL_regkind[ OP( noper ) ] == EXACT )
2647 || OP(noper) == NOTHING )
786e8c11 2648#ifdef NOJUMPTRIE
8aa23a47 2649 && noper_next == tail
786e8c11 2650#endif
8aa23a47
YO
2651 && count < U16_MAX)
2652 {
2653 count++;
2654 if ( !first || optype == NOTHING ) {
2655 if (!first) first = cur;
2656 optype = OP( noper );
2657 } else {
2658 last = cur;
2659 }
2660 } else {
2661 if ( last ) {
2662 make_trie( pRExC_state,
2663 startbranch, first, cur, tail, count,
2664 optype, depth+1 );
2665 }
2666 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2667#ifdef NOJUMPTRIE
8aa23a47 2668 && noper_next == tail
786e8c11 2669#endif
8aa23a47
YO
2670 ){
2671 count = 1;
2672 first = cur;
2673 optype = OP( noper );
2674 } else {
2675 count = 0;
2676 first = NULL;
2677 optype = 0;
2678 }
2679 last = NULL;
2680 }
2681 }
2682 DEBUG_OPTIMISE_r({
2683 regprop(RExC_rx, mysv, cur);
2684 PerlIO_printf( Perl_debug_log,
2685 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2686 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2687
2688 });
2689 if ( last ) {
2690 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2691#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2692 if ( ((made == MADE_EXACT_TRIE &&
2693 startbranch == first)
2694 || ( first_non_open == first )) &&
2695 depth==0 ) {
2696 flags |= SCF_TRIE_RESTUDY;
2697 if ( startbranch == first
2698 && scan == tail )
2699 {
2700 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2701 }
2702 }
3dab1dad 2703#endif
8aa23a47
YO
2704 }
2705 }
2706
2707 } /* do trie */
2708
653099ff 2709 }
8aa23a47
YO
2710 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2711 scan = NEXTOPER(NEXTOPER(scan));
2712 } else /* single branch is optimized. */
2713 scan = NEXTOPER(scan);
2714 continue;
2715 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2716 scan_frame *newframe = NULL;
2717 I32 paren;
2718 regnode *start;
2719 regnode *end;
2720
2721 if (OP(scan) != SUSPEND) {
2722 /* set the pointer */
2723 if (OP(scan) == GOSUB) {
2724 paren = ARG(scan);
2725 RExC_recurse[ARG2L(scan)] = scan;
2726 start = RExC_open_parens[paren-1];
2727 end = RExC_close_parens[paren-1];
2728 } else {
2729 paren = 0;
2730 start = RExC_rx->program + 1;
2731 end = RExC_opend;
2732 }
2733 if (!recursed) {
2734 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2735 SAVEFREEPV(recursed);
2736 }
2737 if (!PAREN_TEST(recursed,paren+1)) {
2738 PAREN_SET(recursed,paren+1);
2739 Newx(newframe,1,scan_frame);
2740 } else {
2741 if (flags & SCF_DO_SUBSTR) {
2742 scan_commit(pRExC_state,data,minlenp);
2743 data->longest = &(data->longest_float);
2744 }
2745 is_inf = is_inf_internal = 1;
2746 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2747 cl_anything(pRExC_state, data->start_class);
2748 flags &= ~SCF_DO_STCLASS;
2749 }
2750 } else {
2751 Newx(newframe,1,scan_frame);
2752 paren = stopparen;
2753 start = scan+2;
2754 end = regnext(scan);
2755 }
2756 if (newframe) {
2757 assert(start);
2758 assert(end);
2759 SAVEFREEPV(newframe);
2760 newframe->next = regnext(scan);
2761 newframe->last = last;
2762 newframe->stop = stopparen;
2763 newframe->prev = frame;
2764
2765 frame = newframe;
2766 scan = start;
2767 stopparen = paren;
2768 last = end;
2769
2770 continue;
2771 }
2772 }
2773 else if (OP(scan) == EXACT) {
2774 I32 l = STR_LEN(scan);
2775 UV uc;
2776 if (UTF) {
2777 const U8 * const s = (U8*)STRING(scan);
2778 l = utf8_length(s, s + l);
2779 uc = utf8_to_uvchr(s, NULL);
2780 } else {
2781 uc = *((U8*)STRING(scan));
2782 }
2783 min += l;
2784 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2785 /* The code below prefers earlier match for fixed
2786 offset, later match for variable offset. */
2787 if (data->last_end == -1) { /* Update the start info. */
2788 data->last_start_min = data->pos_min;
2789 data->last_start_max = is_inf
2790 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2791 }
8aa23a47
YO
2792 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2793 if (UTF)
2794 SvUTF8_on(data->last_found);
2795 {
2796 SV * const sv = data->last_found;
2797 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2798 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2799 if (mg && mg->mg_len >= 0)
2800 mg->mg_len += utf8_length((U8*)STRING(scan),
2801 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2802 }
8aa23a47
YO
2803 data->last_end = data->pos_min + l;
2804 data->pos_min += l; /* As in the first entry. */
2805 data->flags &= ~SF_BEFORE_EOL;
2806 }
2807 if (flags & SCF_DO_STCLASS_AND) {
2808 /* Check whether it is compatible with what we know already! */
2809 int compat = 1;
2810
2811 if (uc >= 0x100 ||
2812 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2813 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2814 && (!(data->start_class->flags & ANYOF_FOLD)
2815 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2816 )
2817 compat = 0;
2818 ANYOF_CLASS_ZERO(data->start_class);
2819 ANYOF_BITMAP_ZERO(data->start_class);
2820 if (compat)
2821 ANYOF_BITMAP_SET(data->start_class, uc);
2822 data->start_class->flags &= ~ANYOF_EOS;
2823 if (uc < 0x100)
2824 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2825 }
2826 else if (flags & SCF_DO_STCLASS_OR) {
2827 /* false positive possible if the class is case-folded */
2828 if (uc < 0x100)
2829 ANYOF_BITMAP_SET(data->start_class, uc);
2830 else
2831 data->start_class->flags |= ANYOF_UNICODE_ALL;
2832 data->start_class->flags &= ~ANYOF_EOS;
2833 cl_and(data->start_class, and_withp);
2834 }
2835 flags &= ~SCF_DO_STCLASS;
2836 }
2837 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2838 I32 l = STR_LEN(scan);
2839 UV uc = *((U8*)STRING(scan));
2840
2841 /* Search for fixed substrings supports EXACT only. */
2842 if (flags & SCF_DO_SUBSTR) {
2843 assert(data);
2844 scan_commit(pRExC_state, data, minlenp);
2845 }
2846 if (UTF) {
2847 const U8 * const s = (U8 *)STRING(scan);
2848 l = utf8_length(s, s + l);
2849 uc = utf8_to_uvchr(s, NULL);
2850 }
2851 min += l;
2852 if (flags & SCF_DO_SUBSTR)
2853 data->pos_min += l;
2854 if (flags & SCF_DO_STCLASS_AND) {
2855 /* Check whether it is compatible with what we know already! */
2856 int compat = 1;
2857
2858 if (uc >= 0x100 ||
2859 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2860 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2861 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2862 compat = 0;
2863 ANYOF_CLASS_ZERO(data->start_class);
2864 ANYOF_BITMAP_ZERO(data->start_class);
2865 if (compat) {
2866 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2867 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2868 data->start_class->flags |= ANYOF_FOLD;
2869 if (OP(scan) == EXACTFL)
2870 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2871 }
8aa23a47
YO
2872 }
2873 else if (flags & SCF_DO_STCLASS_OR) {
2874 if (data->start_class->flags & ANYOF_FOLD) {
2875 /* false positive possible if the class is case-folded.
2876 Assume that the locale settings are the same... */
1aa99e6b
IH
2877 if (uc < 0x100)
2878 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2879 data->start_class->flags &= ~ANYOF_EOS;
2880 }
8aa23a47 2881 cl_and(data->start_class, and_withp);
653099ff 2882 }
8aa23a47
YO
2883 flags &= ~SCF_DO_STCLASS;
2884 }
2885 else if (strchr((const char*)PL_varies,OP(scan))) {
2886 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2887 I32 f = flags, pos_before = 0;
2888 regnode * const oscan = scan;
2889 struct regnode_charclass_class this_class;
2890 struct regnode_charclass_class *oclass = NULL;
2891 I32 next_is_eval = 0;
2892
2893 switch (PL_regkind[OP(scan)]) {
2894 case WHILEM: /* End of (?:...)* . */
2895 scan = NEXTOPER(scan);
2896 goto finish;
2897 case PLUS:
2898 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2899 next = NEXTOPER(scan);
2900 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2901 mincount = 1;
2902 maxcount = REG_INFTY;
2903 next = regnext(scan);
2904 scan = NEXTOPER(scan);
2905 goto do_curly;
2906 }
2907 }
2908 if (flags & SCF_DO_SUBSTR)
2909 data->pos_min++;
2910 min++;
2911 /* Fall through. */
2912 case STAR:
2913 if (flags & SCF_DO_STCLASS) {
2914 mincount = 0;
2915 maxcount = REG_INFTY;
2916 next = regnext(scan);
2917 scan = NEXTOPER(scan);
2918 goto do_curly;
2919 }
2920 is_inf = is_inf_internal = 1;
2921 scan = regnext(scan);
c277df42 2922 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2923 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2924 data->longest = &(data->longest_float);
c277df42 2925 }
8aa23a47
YO
2926 goto optimize_curly_tail;
2927 case CURLY:
2928 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2929 && (scan->flags == stopparen))
2930 {
2931 mincount = 1;
2932 maxcount = 1;
2933 } else {
2934 mincount = ARG1(scan);
2935 maxcount = ARG2(scan);
653099ff 2936 }
8aa23a47
YO
2937 next = regnext(scan);
2938 if (OP(scan) == CURLYX) {
2939 I32 lp = (data ? *(data->last_closep) : 0);
2940 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2941 }
8aa23a47
YO
2942 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2943 next_is_eval = (OP(scan) == EVAL);
2944 do_curly:
2945 if (flags & SCF_DO_SUBSTR) {
2946 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2947 pos_before = data->pos_min;
b45f050a 2948 }
8aa23a47
YO
2949 if (data) {
2950 fl = data->flags;
2951 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2952 if (is_inf)
2953 data->flags |= SF_IS_INF;
2954 }
2955 if (flags & SCF_DO_STCLASS) {
2956 cl_init(pRExC_state, &this_class);
2957 oclass = data->start_class;
2958 data->start_class = &this_class;
2959 f |= SCF_DO_STCLASS_AND;
2960 f &= ~SCF_DO_STCLASS_OR;
2961 }
2962 /* These are the cases when once a subexpression
2963 fails at a particular position, it cannot succeed
2964 even after backtracking at the enclosing scope.
2965
2966 XXXX what if minimal match and we are at the
2967 initial run of {n,m}? */
2968 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2969 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2970
8aa23a47
YO
2971 /* This will finish on WHILEM, setting scan, or on NULL: */
2972 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2973 last, data, stopparen, recursed, NULL,
2974 (mincount == 0
2975 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2976
8aa23a47
YO
2977 if (flags & SCF_DO_STCLASS)
2978 data->start_class = oclass;
2979 if (mincount == 0 || minnext == 0) {
2980 if (flags & SCF_DO_STCLASS_OR) {
2981 cl_or(pRExC_state, data->start_class, &this_class);
2982 }
2983 else if (flags & SCF_DO_STCLASS_AND) {
2984 /* Switch to OR mode: cache the old value of
2985 * data->start_class */
2986 INIT_AND_WITHP;
2987 StructCopy(data->start_class, and_withp,
2988 struct regnode_charclass_class);
2989 flags &= ~SCF_DO_STCLASS_AND;
2990 StructCopy(&this_class, data->start_class,
2991 struct regnode_charclass_class);
2992 flags |= SCF_DO_STCLASS_OR;
2993 data->start_class->flags |= ANYOF_EOS;
2994 }
2995 } else { /* Non-zero len */
2996 if (flags & SCF_DO_STCLASS_OR) {
2997 cl_or(pRExC_state, data->start_class, &this_class);
2998 cl_and(data->start_class, and_withp);
2999 }
3000 else if (flags & SCF_DO_STCLASS_AND)
3001 cl_and(data->start_class, &this_class);
3002 flags &= ~SCF_DO_STCLASS;
3003 }
3004 if (!scan) /* It was not CURLYX, but CURLY. */
3005 scan = next;
3006 if ( /* ? quantifier ok, except for (?{ ... }) */
3007 (next_is_eval || !(mincount == 0 && maxcount == 1))
3008 && (minnext == 0) && (deltanext == 0)
3009 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3010 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3011 && ckWARN(WARN_REGEXP))
3012 {
3013 vWARN(RExC_parse,
3014 "Quantifier unexpected on zero-length expression");
3015 }
3016
3017 min += minnext * mincount;
3018 is_inf_internal |= ((maxcount == REG_INFTY
3019 && (minnext + deltanext) > 0)
3020 || deltanext == I32_MAX);
3021 is_inf |= is_inf_internal;
3022 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3023
3024 /* Try powerful optimization CURLYX => CURLYN. */
3025 if ( OP(oscan) == CURLYX && data
3026 && data->flags & SF_IN_PAR
3027 && !(data->flags & SF_HAS_EVAL)
3028 && !deltanext && minnext == 1 ) {
3029 /* Try to optimize to CURLYN. */
3030 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3031 regnode * const nxt1 = nxt;
497b47a8 3032#ifdef DEBUGGING
8aa23a47 3033 regnode *nxt2;
497b47a8 3034#endif
c277df42 3035
8aa23a47
YO
3036 /* Skip open. */
3037 nxt = regnext(nxt);
3038 if (!strchr((const char*)PL_simple,OP(nxt))
3039 && !(PL_regkind[OP(nxt)] == EXACT
3040 && STR_LEN(nxt) == 1))
3041 goto nogo;
497b47a8 3042#ifdef DEBUGGING
8aa23a47 3043 nxt2 = nxt;
497b47a8 3044#endif
8aa23a47
YO
3045 nxt = regnext(nxt);
3046 if (OP(nxt) != CLOSE)
3047 goto nogo;
3048 if (RExC_open_parens) {
3049 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3050 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3051 }
3052 /* Now we know that nxt2 is the only contents: */
3053 oscan->flags = (U8)ARG(nxt);
3054 OP(oscan) = CURLYN;
3055 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3056
c277df42 3057#ifdef DEBUGGING
8aa23a47
YO
3058 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3059 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3060 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3061 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3062 OP(nxt + 1) = OPTIMIZED; /* was count. */
3063 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3064#endif
8aa23a47
YO
3065 }
3066 nogo:
3067
3068 /* Try optimization CURLYX => CURLYM. */
3069 if ( OP(oscan) == CURLYX && data
3070 && !(data->flags & SF_HAS_PAR)
3071 && !(data->flags & SF_HAS_EVAL)
3072 && !deltanext /* atom is fixed width */
3073 && minnext != 0 /* CURLYM can't handle zero width */
3074 ) {
3075 /* XXXX How to optimize if data == 0? */
3076 /* Optimize to a simpler form. */
3077 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3078 regnode *nxt2;
3079
3080 OP(oscan) = CURLYM;
3081 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3082 && (OP(nxt2) != WHILEM))
3083 nxt = nxt2;
3084 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3085 /* Need to optimize away parenths. */
3086 if (data->flags & SF_IN_PAR) {
3087 /* Set the parenth number. */
3088 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3089
3090 if (OP(nxt) != CLOSE)
3091 FAIL("Panic opt close");
3092 oscan->flags = (U8)ARG(nxt);
3093 if (RExC_open_parens) {
3094 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3095 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3096 }
8aa23a47
YO
3097 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3098 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3099
c277df42 3100#ifdef DEBUGGING
8aa23a47
YO
3101 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3102 OP(nxt + 1) = OPTIMIZED; /* was count. */
3103 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3104 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3105#endif
c277df42 3106#if 0
8aa23a47
YO
3107 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3108 regnode *nnxt = regnext(nxt1);
3109
3110 if (nnxt == nxt) {
3111 if (reg_off_by_arg[OP(nxt1)])
3112 ARG_SET(nxt1, nxt2 - nxt1);
3113 else if (nxt2 - nxt1 < U16_MAX)
3114 NEXT_OFF(nxt1) = nxt2 - nxt1;
3115 else
3116 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3117 }
8aa23a47 3118 nxt1 = nnxt;
c277df42 3119 }
5d1c421c 3120#endif
8aa23a47
YO
3121 /* Optimize again: */
3122 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3123 NULL, stopparen, recursed, NULL, 0,depth+1);
3124 }
3125 else
3126 oscan->flags = 0;
3127 }
3128 else if ((OP(oscan) == CURLYX)
3129 && (flags & SCF_WHILEM_VISITED_POS)
3130 /* See the comment on a similar expression above.
3131 However, this time it not a subexpression
3132 we care about, but the expression itself. */
3133 && (maxcount == REG_INFTY)
3134 && data && ++data->whilem_c < 16) {
3135 /* This stays as CURLYX, we can put the count/of pair. */
3136 /* Find WHILEM (as in regexec.c) */
3137 regnode *nxt = oscan + NEXT_OFF(oscan);
3138
3139 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3140 nxt += ARG(nxt);
3141 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3142 | (RExC_whilem_seen << 4)); /* On WHILEM */
3143 }
3144 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3145 pars++;
3146 if (flags & SCF_DO_SUBSTR) {
3147 SV *last_str = NULL;
3148 int counted = mincount != 0;
a0ed51b3 3149
8aa23a47
YO
3150 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3151#if defined(SPARC64_GCC_WORKAROUND)
3152 I32 b = 0;
3153 STRLEN l = 0;
3154 const char *s = NULL;
3155 I32 old = 0;
b515a41d 3156
8aa23a47
YO
3157 if (pos_before >= data->last_start_min)
3158 b = pos_before;
3159 else
3160 b = data->last_start_min;
b515a41d 3161
8aa23a47
YO
3162 l = 0;
3163 s = SvPV_const(data->last_found, l);
3164 old = b - data->last_start_min;
3165
3166#else
3167 I32 b = pos_before >= data->last_start_min
3168 ? pos_before : data->last_start_min;
3169 STRLEN l;
3170 const char * const s = SvPV_const(data->last_found, l);
3171 I32 old = b - data->last_start_min;
3172#endif
3173
3174 if (UTF)
3175 old = utf8_hop((U8*)s, old) - (U8*)s;
3176
3177 l -= old;
3178 /* Get the added string: */
3179 last_str = newSVpvn(s + old, l);
3180 if (UTF)
3181 SvUTF8_on(last_str);
3182 if (deltanext == 0 && pos_before == b) {
3183 /* What was added is a constant string */
3184 if (mincount > 1) {
3185 SvGROW(last_str, (mincount * l) + 1);
3186 repeatcpy(SvPVX(last_str) + l,
3187 SvPVX_const(last_str), l, mincount - 1);
3188 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3189 /* Add additional parts. */
3190 SvCUR_set(data->last_found,
3191 SvCUR(data->last_found) - l);
3192 sv_catsv(data->last_found, last_str);
3193 {
3194 SV * sv = data->last_found;
3195 MAGIC *mg =
3196 SvUTF8(sv) && SvMAGICAL(sv) ?
3197 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3198 if (mg && mg->mg_len >= 0)
3199 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3200 }
8aa23a47 3201 data->last_end += l * (mincount - 1);
b515a41d 3202 }
8aa23a47
YO
3203 } else {
3204 /* start offset must point into the last copy */
3205 data->last_start_min += minnext * (mincount - 1);
3206 data->last_start_max += is_inf ? I32_MAX
3207 : (maxcount - 1) * (minnext + data->pos_delta);
3208 }
c277df42 3209 }
8aa23a47
YO
3210 /* It is counted once already... */
3211 data->pos_min += minnext * (mincount - counted);
3212 data->pos_delta += - counted * deltanext +
3213 (minnext + deltanext) * maxcount - minnext * mincount;
3214 if (mincount != maxcount) {
3215 /* Cannot extend fixed substrings found inside
3216 the group. */
3217 scan_commit(pRExC_state,data,minlenp);
3218 if (mincount && last_str) {
3219 SV * const sv = data->last_found;
3220 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3221 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3222
3223 if (mg)
3224 mg->mg_len = -1;
3225 sv_setsv(sv, last_str);
3226 data->last_end = data->pos_min;
3227 data->last_start_min =
3228 data->pos_min - CHR_SVLEN(last_str);
3229 data->last_start_max = is_inf
3230 ? I32_MAX
3231 : data->pos_min + data->pos_delta
3232 - CHR_SVLEN(last_str);
3233 }
3234 data->longest = &(data->longest_float);
3235 }
3236 SvREFCNT_dec(last_str);
c277df42 3237 }
8aa23a47
YO
3238 if (data && (fl & SF_HAS_EVAL))
3239 data->flags |= SF_HAS_EVAL;
3240 optimize_curly_tail:
3241 if (OP(oscan) != CURLYX) {
3242 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3243 && NEXT_OFF(next))
3244 NEXT_OFF(oscan) += NEXT_OFF(next);
3245 }
3246 continue;
3247 default: /* REF and CLUMP only? */
3248 if (flags & SCF_DO_SUBSTR) {
3249 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3250 data->longest = &(data->longest_float);
3251 }
3252 is_inf = is_inf_internal = 1;
3253 if (flags & SCF_DO_STCLASS_OR)
3254 cl_anything(pRExC_state, data->start_class);
3255 flags &= ~SCF_DO_STCLASS;
3256 break;
c277df42 3257 }
8aa23a47
YO
3258 }
3259 else if (strchr((const char*)PL_simple,OP(scan))) {
3260 int value = 0;
653099ff 3261
8aa23a47
YO
3262 if (flags & SCF_DO_SUBSTR) {
3263 scan_commit(pRExC_state,data,minlenp);
3264 data->pos_min++;
3265 }
3266 min++;
3267 if (flags & SCF_DO_STCLASS) {
3268 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3269
8aa23a47
YO
3270 /* Some of the logic below assumes that switching
3271 locale on will only add false positives. */
3272 switch (PL_regkind[OP(scan)]) {
3273 case SANY:
3274 default:
3275 do_default:
3276 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3277 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3278 cl_anything(pRExC_state, data->start_class);
3279 break;
3280 case REG_ANY:
3281 if (OP(scan) == SANY)
3282 goto do_default;
3283 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3284 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3285 || (data->start_class->flags & ANYOF_CLASS));
3286 cl_anything(pRExC_state, data->start_class);
653099ff 3287 }
8aa23a47
YO
3288 if (flags & SCF_DO_STCLASS_AND || !value)
3289 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3290 break;
3291 case ANYOF:
3292 if (flags & SCF_DO_STCLASS_AND)
3293 cl_and(data->start_class,
3294 (struct regnode_charclass_class*)scan);
653099ff 3295 else
8aa23a47
YO
3296 cl_or(pRExC_state, data->start_class,
3297 (struct regnode_charclass_class*)scan);
3298 break;
3299 case ALNUM:
3300 if (flags & SCF_DO_STCLASS_AND) {
3301 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3302 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3303 for (value = 0; value < 256; value++)
3304 if (!isALNUM(value))
3305 ANYOF_BITMAP_CLEAR(data->start_class, value);
3306 }
653099ff 3307 }
8aa23a47
YO
3308 else {
3309 if (data->start_class->flags & ANYOF_LOCALE)
3310 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3311 else {
3312 for (value = 0; value < 256; value++)
3313 if (isALNUM(value))
3314 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3315 }
8aa23a47
YO
3316 }
3317 break;
3318 case ALNUML:
3319 if (flags & SCF_DO_STCLASS_AND) {
3320 if (data->start_class->flags & ANYOF_LOCALE)
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3322 }
3323 else {
3324 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3325 data->start_class->flags |= ANYOF_LOCALE;
3326 }
3327 break;
3328 case NALNUM:
3329 if (flags & SCF_DO_STCLASS_AND) {
3330 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3331 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3332 for (value = 0; value < 256; value++)
3333 if (isALNUM(value))
3334 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3335 }
3336 }
8aa23a47
YO
3337 else {
3338 if (data->start_class->flags & ANYOF_LOCALE)
3339 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3340 else {
3341 for (value = 0; value < 256; value++)
3342 if (!isALNUM(value))
3343 ANYOF_BITMAP_SET(data->start_class, value);
3344 }
653099ff 3345 }
8aa23a47
YO
3346 break;
3347 case NALNUML:
3348 if (flags & SCF_DO_STCLASS_AND) {
3349 if (data->start_class->flags & ANYOF_LOCALE)
3350 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3351 }
8aa23a47
YO
3352 else {
3353 data->start_class->flags |= ANYOF_LOCALE;
3354 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3355 }
3356 break;
3357 case SPACE:
3358 if (flags & SCF_DO_STCLASS_AND) {
3359 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3361 for (value = 0; value < 256; value++)
3362 if (!isSPACE(value))
3363 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3364 }
3365 }
8aa23a47
YO
3366 else {
3367 if (data->start_class->flags & ANYOF_LOCALE)
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3369 else {
3370 for (value = 0; value < 256; value++)
3371 if (isSPACE(value))
3372 ANYOF_BITMAP_SET(data->start_class, value);
3373 }
653099ff 3374 }
8aa23a47
YO
3375 break;
3376 case SPACEL:
3377 if (flags & SCF_DO_STCLASS_AND) {
3378 if (data->start_class->flags & ANYOF_LOCALE)
3379 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3380 }
3381 else {
3382 data->start_class->flags |= ANYOF_LOCALE;
3383 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3384 }
3385 break;
3386 case NSPACE:
3387 if (flags & SCF_DO_STCLASS_AND) {
3388 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3389 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3390 for (value = 0; value < 256; value++)
3391 if (isSPACE(value))
3392 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3393 }
8aa23a47
YO
3394 }
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (!isSPACE(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3402 }
3403 }
8aa23a47
YO
3404 break;
3405 case NSPACEL:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 if (data->start_class->flags & ANYOF_LOCALE) {
3408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3409 for (value = 0; value < 256; value++)
3410 if (!isSPACE(value))
3411 ANYOF_BITMAP_CLEAR(data->start_class, value);
3412 }
653099ff 3413 }
8aa23a47
YO
3414 else {
3415 data->start_class->flags |= ANYOF_LOCALE;
3416 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3417 }
3418 break;
3419 case DIGIT:
3420 if (flags & SCF_DO_STCLASS_AND) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3422 for (value = 0; value < 256; value++)
3423 if (!isDIGIT(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
3425 }
3426 else {
3427 if (data->start_class->flags & ANYOF_LOCALE)
3428 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3429 else {
3430 for (value = 0; value < 256; value++)
3431 if (isDIGIT(value))
3432 ANYOF_BITMAP_SET(data->start_class, value);
3433 }
3434 }
3435 break;
3436 case NDIGIT:
3437 if (flags & SCF_DO_STCLASS_AND) {
3438 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3439 for (value = 0; value < 256; value++)
3440 if (isDIGIT(value))
3441 ANYOF_BITMAP_CLEAR(data->start_class, value);
3442 }
3443 else {
3444 if (data->start_class->flags & ANYOF_LOCALE)
3445 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3446 else {
3447 for (value = 0; value < 256; value++)
3448 if (!isDIGIT(value))
3449 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3450 }
3451 }
8aa23a47
YO
3452 break;
3453 }
3454 if (flags & SCF_DO_STCLASS_OR)
3455 cl_and(data->start_class, and_withp);
3456 flags &= ~SCF_DO_STCLASS;
3457 }
3458 }
3459 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3460 data->flags |= (OP(scan) == MEOL
3461 ? SF_BEFORE_MEOL
3462 : SF_BEFORE_SEOL);
3463 }
3464 else if ( PL_regkind[OP(scan)] == BRANCHJ
3465 /* Lookbehind, or need to calculate parens/evals/stclass: */
3466 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3467 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3468 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3469 || OP(scan) == UNLESSM )
3470 {
3471 /* Negative Lookahead/lookbehind
3472 In this case we can't do fixed string optimisation.
3473 */
1de06328 3474
8aa23a47
YO
3475 I32 deltanext, minnext, fake = 0;
3476 regnode *nscan;
3477 struct regnode_charclass_class intrnl;
3478 int f = 0;
1de06328 3479
8aa23a47
YO
3480 data_fake.flags = 0;
3481 if (data) {
3482 data_fake.whilem_c = data->whilem_c;
3483 data_fake.last_closep = data->last_closep;
c277df42 3484 }
8aa23a47
YO
3485 else
3486 data_fake.last_closep = &fake;
58e23c8d 3487 data_fake.pos_delta = delta;
8aa23a47
YO
3488 if ( flags & SCF_DO_STCLASS && !scan->flags
3489 && OP(scan) == IFMATCH ) { /* Lookahead */
3490 cl_init(pRExC_state, &intrnl);
3491 data_fake.start_class = &intrnl;
3492 f |= SCF_DO_STCLASS_AND;
3493 }
3494 if (flags & SCF_WHILEM_VISITED_POS)
3495 f |= SCF_WHILEM_VISITED_POS;
3496 next = regnext(scan);
3497 nscan = NEXTOPER(NEXTOPER(scan));
3498 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3499 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3500 if (scan->flags) {
3501 if (deltanext) {
58e23c8d 3502 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3503 }
3504 else if (minnext > (I32)U8_MAX) {
58e23c8d 3505 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3506 }
3507 scan->flags = (U8)minnext;
3508 }
3509 if (data) {
3510 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3511 pars++;
3512 if (data_fake.flags & SF_HAS_EVAL)
3513 data->flags |= SF_HAS_EVAL;
3514 data->whilem_c = data_fake.whilem_c;
3515 }
3516 if (f & SCF_DO_STCLASS_AND) {
3517 const int was = (data->start_class->flags & ANYOF_EOS);
3518
3519 cl_and(data->start_class, &intrnl);
3520 if (was)
3521 data->start_class->flags |= ANYOF_EOS;
3522 }
cb434fcc 3523 }
8aa23a47
YO
3524#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3525 else {
3526 /* Positive Lookahead/lookbehind
3527 In this case we can do fixed string optimisation,
3528 but we must be careful about it. Note in the case of
3529 lookbehind the positions will be offset by the minimum
3530 length of the pattern, something we won't know about
3531 until after the recurse.
3532 */
3533 I32 deltanext, fake = 0;
3534 regnode *nscan;
3535 struct regnode_charclass_class intrnl;
3536 int f = 0;
3537 /* We use SAVEFREEPV so that when the full compile
3538 is finished perl will clean up the allocated
3539 minlens when its all done. This was we don't
3540 have to worry about freeing them when we know
3541 they wont be used, which would be a pain.
3542 */
3543 I32 *minnextp;
3544 Newx( minnextp, 1, I32 );
3545 SAVEFREEPV(minnextp);
3546
3547 if (data) {
3548 StructCopy(data, &data_fake, scan_data_t);
3549 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3550 f |= SCF_DO_SUBSTR;
3551 if (scan->flags)
3552 scan_commit(pRExC_state, &data_fake,minlenp);
3553 data_fake.last_found=newSVsv(data->last_found);
3554 }
3555 }
3556 else
3557 data_fake.last_closep = &fake;
3558 data_fake.flags = 0;
58e23c8d 3559 data_fake.pos_delta = delta;
8aa23a47
YO
3560 if (is_inf)
3561 data_fake.flags |= SF_IS_INF;
3562 if ( flags & SCF_DO_STCLASS && !scan->flags
3563 && OP(scan) == IFMATCH ) { /* Lookahead */
3564 cl_init(pRExC_state, &intrnl);
3565 data_fake.start_class = &intrnl;
3566 f |= SCF_DO_STCLASS_AND;
3567 }
3568 if (flags & SCF_WHILEM_VISITED_POS)
3569 f |= SCF_WHILEM_VISITED_POS;
3570 next = regnext(scan);
3571 nscan = NEXTOPER(NEXTOPER(scan));
3572
3573 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3574 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3575 if (scan->flags) {
3576 if (deltanext) {
58e23c8d 3577 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3578 }
3579 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3580 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3581 }
3582 scan->flags = (U8)*minnextp;
3583 }
3584
3585 *minnextp += min;
3586
3587 if (f & SCF_DO_STCLASS_AND) {
3588 const int was = (data->start_class->flags & ANYOF_EOS);
3589
3590 cl_and(data->start_class, &intrnl);
3591 if (was)
3592 data->start_class->flags |= ANYOF_EOS;
3593 }
3594 if (data) {
3595 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3596 pars++;
3597 if (data_fake.flags & SF_HAS_EVAL)
3598 data->flags |= SF_HAS_EVAL;
3599 data->whilem_c = data_fake.whilem_c;
3600 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3601 if (RExC_rx->minlen<*minnextp)
3602 RExC_rx->minlen=*minnextp;
3603 scan_commit(pRExC_state, &data_fake, minnextp);
3604 SvREFCNT_dec(data_fake.last_found);
3605
3606 if ( data_fake.minlen_fixed != minlenp )
3607 {
3608 data->offset_fixed= data_fake.offset_fixed;
3609 data->minlen_fixed= data_fake.minlen_fixed;
3610 data->lookbehind_fixed+= scan->flags;
3611 }
3612 if ( data_fake.minlen_float != minlenp )
3613 {
3614 data->minlen_float= data_fake.minlen_float;
3615 data->offset_float_min=data_fake.offset_float_min;
3616 data->offset_float_max=data_fake.offset_float_max;
3617 data->lookbehind_float+= scan->flags;
3618 }
3619 }
3620 }
3621
3622
40d049e4 3623 }
8aa23a47
YO
3624#endif
3625 }
3626 else if (OP(scan) == OPEN) {
3627 if (stopparen != (I32)ARG(scan))
3628 pars++;
3629 }
3630 else if (OP(scan) == CLOSE) {
3631 if (stopparen == (I32)ARG(scan)) {
3632 break;
3633 }
3634 if ((I32)ARG(scan) == is_par) {
3635 next = regnext(scan);
b515a41d 3636
8aa23a47
YO
3637 if ( next && (OP(next) != WHILEM) && next < last)
3638 is_par = 0; /* Disable optimization */
40d049e4 3639 }
8aa23a47
YO
3640 if (data)
3641 *(data->last_closep) = ARG(scan);
3642 }
3643 else if (OP(scan) == EVAL) {
c277df42
IZ
3644 if (data)
3645 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3646 }
3647 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3648 if (flags & SCF_DO_SUBSTR) {
3649 scan_commit(pRExC_state,data,minlenp);
3650 flags &= ~SCF_DO_SUBSTR;
40d049e4 3651 }
8aa23a47
YO
3652 if (data && OP(scan)==ACCEPT) {
3653 data->flags |= SCF_SEEN_ACCEPT;
3654 if (stopmin > min)
3655 stopmin = min;
e2e6a0f1 3656 }
8aa23a47
YO
3657 }
3658 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3659 {
0f5d15d6 3660 if (flags & SCF_DO_SUBSTR) {
1de06328 3661 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3662 data->longest = &(data->longest_float);
3663 }
3664 is_inf = is_inf_internal = 1;
653099ff 3665 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3666 cl_anything(pRExC_state, data->start_class);
96776eda 3667 flags &= ~SCF_DO_STCLASS;
8aa23a47 3668 }
58e23c8d
YO
3669 else if (OP(scan) == GPOS) {
3670 if (!(RExC_rx->reganch & ROPT_GPOS_FLOAT) &&
3671 !(delta || is_inf || (data && data->pos_delta)))
3672 {
3673 if (!(RExC_rx->reganch & ROPT_ANCH) && (flags & SCF_DO_SUBSTR))
3674 RExC_rx->reganch |= ROPT_ANCH_GPOS;
3675 if (RExC_rx->gofs < (U32)min)
3676 RExC_rx->gofs = min;
3677 } else {
3678 RExC_rx->reganch |= ROPT_GPOS_FLOAT;
3679 RExC_rx->gofs = 0;
3680 }
3681 }
786e8c11 3682#ifdef TRIE_STUDY_OPT
40d049e4 3683#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3684 else if (PL_regkind[OP(scan)] == TRIE) {
3685 /* NOTE - There is similar code to this block above for handling
3686 BRANCH nodes on the initial study. If you change stuff here
3687 check there too. */
3688 regnode *trie_node= scan;
3689 regnode *tail= regnext(scan);
3690 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3691 I32 max1 = 0, min1 = I32_MAX;
3692 struct regnode_charclass_class accum;
3693
3694 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3695 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3696 if (flags & SCF_DO_STCLASS)
3697 cl_init_zero(pRExC_state, &accum);
3698
3699 if (!trie->jump) {
3700 min1= trie->minlen;
3701 max1= trie->maxlen;
3702 } else {
3703 const regnode *nextbranch= NULL;
3704 U32 word;
3705
3706 for ( word=1 ; word <= trie->wordcount ; word++)
3707 {
3708 I32 deltanext=0, minnext=0, f = 0, fake;
3709 struct regnode_charclass_class this_class;
3710
3711 data_fake.flags = 0;
3712 if (data) {
3713 data_fake.whilem_c = data->whilem_c;
3714 data_fake.last_closep = data->last_closep;
3715 }
3716 else
3717 data_fake.last_closep = &fake;
58e23c8d 3718 data_fake.pos_delta = delta;
8aa23a47
YO
3719 if (flags & SCF_DO_STCLASS) {
3720 cl_init(pRExC_state, &this_class);
3721 data_fake.start_class = &this_class;
3722 f = SCF_DO_STCLASS_AND;
3723 }
3724 if (flags & SCF_WHILEM_VISITED_POS)
3725 f |= SCF_WHILEM_VISITED_POS;
3726
3727 if (trie->jump[word]) {
3728 if (!nextbranch)
3729 nextbranch = trie_node + trie->jump[0];
3730 scan= trie_node + trie->jump[word];
3731 /* We go from the jump point to the branch that follows
3732 it. Note this means we need the vestigal unused branches
3733 even though they arent otherwise used.
3734 */
3735 minnext = study_chunk(pRExC_state, &scan, minlenp,
3736 &deltanext, (regnode *)nextbranch, &data_fake,
3737 stopparen, recursed, NULL, f,depth+1);
3738 }
3739 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3740 nextbranch= regnext((regnode*)nextbranch);
3741
3742 if (min1 > (I32)(minnext + trie->minlen))
3743 min1 = minnext + trie->minlen;
3744 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3745 max1 = minnext + deltanext + trie->maxlen;
3746 if (deltanext == I32_MAX)
3747 is_inf = is_inf_internal = 1;
3748
3749 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3750 pars++;
3751 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3752 if ( stopmin > min + min1)
3753 stopmin = min + min1;
3754 flags &= ~SCF_DO_SUBSTR;
3755 if (data)
3756 data->flags |= SCF_SEEN_ACCEPT;
3757 }
3758 if (data) {
3759 if (data_fake.flags & SF_HAS_EVAL)
3760 data->flags |= SF_HAS_EVAL;
3761 data->whilem_c = data_fake.whilem_c;
3762 }
3763 if (flags & SCF_DO_STCLASS)
3764 cl_or(pRExC_state, &accum, &this_class);
3765 }
3766 }
3767 if (flags & SCF_DO_SUBSTR) {
3768 data->pos_min += min1;
3769 data->pos_delta += max1 - min1;
3770 if (max1 != min1 || is_inf)
3771 data->longest = &(data->longest_float);
3772 }
3773 min += min1;
3774 delta += max1 - min1;
3775 if (flags & SCF_DO_STCLASS_OR) {
3776 cl_or(pRExC_state, data->start_class, &accum);
3777 if (min1) {
3778 cl_and(data->start_class, and_withp);
3779 flags &= ~SCF_DO_STCLASS;
3780 }
3781 }
3782 else if (flags & SCF_DO_STCLASS_AND) {
3783 if (min1) {
3784 cl_and(data->start_class, &accum);
3785 flags &= ~SCF_DO_STCLASS;
3786 }
3787 else {
3788 /* Switch to OR mode: cache the old value of
3789 * data->start_class */
3790 INIT_AND_WITHP;
3791 StructCopy(data->start_class, and_withp,
3792 struct regnode_charclass_class);
3793 flags &= ~SCF_DO_STCLASS_AND;
3794 StructCopy(&accum, data->start_class,
3795 struct regnode_charclass_class);
3796 flags |= SCF_DO_STCLASS_OR;
3797 data->start_class->flags |= ANYOF_EOS;
3798 }
3799 }
3800 scan= tail;
3801 continue;
3802 }
786e8c11 3803#else
8aa23a47
YO
3804 else if (PL_regkind[OP(scan)] == TRIE) {
3805 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3806 U8*bang=NULL;
3807
3808 min += trie->minlen;
3809 delta += (trie->maxlen - trie->minlen);
3810 flags &= ~SCF_DO_STCLASS; /* xxx */
3811 if (flags & SCF_DO_SUBSTR) {
3812 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3813 data->pos_min += trie->minlen;
3814 data->pos_delta += (trie->maxlen - trie->minlen);
3815 if (trie->maxlen != trie->minlen)
3816 data->longest = &(data->longest_float);
3817 }
3818 if (trie->jump) /* no more substrings -- for now /grr*/
3819 flags &= ~SCF_DO_SUBSTR;
b515a41d 3820 }
8aa23a47
YO
3821#endif /* old or new */
3822#endif /* TRIE_STUDY_OPT */
3823 /* Else: zero-length, ignore. */
3824 scan = regnext(scan);
3825 }
3826 if (frame) {
3827 last = frame->last;
3828 scan = frame->next;
3829 stopparen = frame->stop;
3830 frame = frame->prev;
3831 goto fake_study_recurse;
c277df42
IZ
3832 }
3833
3834 finish:
8aa23a47
YO
3835 assert(!frame);
3836
c277df42 3837 *scanp = scan;
aca2d497 3838 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3839 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3840 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3841 if (is_par > (I32)U8_MAX)
c277df42
IZ
3842 is_par = 0;
3843 if (is_par && pars==1 && data) {
3844 data->flags |= SF_IN_PAR;
3845 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3846 }
3847 else if (pars && data) {
c277df42
IZ
3848 data->flags |= SF_HAS_PAR;
3849 data->flags &= ~SF_IN_PAR;
3850 }
653099ff 3851 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3852 cl_and(data->start_class, and_withp);
786e8c11
YO
3853 if (flags & SCF_TRIE_RESTUDY)
3854 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3855
3856 DEBUG_STUDYDATA(data,depth);
3857
e2e6a0f1 3858 return min < stopmin ? min : stopmin;
c277df42
IZ
3859}
3860
76e3520e 3861STATIC I32
5f66b61c 3862S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3863{
830247a4 3864 if (RExC_rx->data) {
2eb97020 3865 const U32 count = RExC_rx->data->count;
b81d288d 3866 Renewc(RExC_rx->data,
2eb97020 3867 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3868 char, struct reg_data);
2eb97020 3869 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3870 RExC_rx->data->count += n;
a0ed51b3
LW
3871 }
3872 else {
a02a5408 3873 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3874 char, struct reg_data);
a02a5408 3875 Newx(RExC_rx->data->what, n, U8);
830247a4 3876 RExC_rx->data->count = n;
c277df42 3877 }
830247a4
IZ
3878 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3879 return RExC_rx->data->count - n;
c277df42
IZ
3880}
3881
76234dfb 3882#ifndef PERL_IN_XSUB_RE
d88dccdf 3883void
864dbfa3 3884Perl_reginitcolors(pTHX)
d88dccdf 3885{
97aff369 3886 dVAR;
1df70142 3887 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3888 if (s) {
1df70142
AL
3889 char *t = savepv(s);
3890 int i = 0;
3891 PL_colors[0] = t;
d88dccdf 3892 while (++i < 6) {
1df70142
AL
3893 t = strchr(t, '\t');
3894 if (t) {
3895 *t = '\0';
3896 PL_colors[i] = ++t;
d88dccdf
IZ
3897 }
3898 else
1df70142 3899 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3900 }
3901 } else {
1df70142 3902 int i = 0;
b81d288d 3903 while (i < 6)
06b5626a 3904 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3905 }
3906 PL_colorset = 1;
3907}
76234dfb 3908#endif
8615cb43 3909
07be1b83 3910
786e8c11
YO
3911#ifdef TRIE_STUDY_OPT
3912#define CHECK_RESTUDY_GOTO \
3913 if ( \
3914 (data.flags & SCF_TRIE_RESTUDY) \
3915 && ! restudied++ \
3916 ) goto reStudy
3917#else
3918#define CHECK_RESTUDY_GOTO
3919#endif
f9f4320a 3920
a687059c 3921/*
e50aee73 3922 - pregcomp - compile a regular expression into internal code
a687059c
LW
3923 *
3924 * We can't allocate space until we know how big the compiled form will be,
3925 * but we can't compile it (and thus know how big it is) until we've got a
3926 * place to put the code. So we cheat: we compile it twice, once with code
3927 * generation turned off and size counting turned on, and once "for real".
3928 * This also means that we don't allocate space until we are sure that the
3929 * thing really will compile successfully, and we never have to move the
3930 * code and thus invalidate pointers into it. (Note that it has to be in
3931 * one piece because free() must be able to free it all.) [NB: not true in perl]
3932 *
3933 * Beware that the optimization-preparation code in here knows about some
3934 * of the structure of the compiled regexp. [I'll say.]
3935 */
b9b4dddf
YO
3936
3937
3938
f9f4320a 3939#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3940#define RE_ENGINE_PTR &PL_core_reg_engine
3941#else
f9f4320a
YO
3942extern const struct regexp_engine my_reg_engine;
3943#define RE_ENGINE_PTR &my_reg_engine
3944#endif
b9b4dddf
YO
3945/* these make a few things look better, to avoid indentation */
3946#define BEGIN_BLOCK {
f9f4320a
YO
3947#define END_BLOCK }
3948
a687059c 3949regexp *
864dbfa3 3950Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3951{
97aff369 3952 dVAR;
f9f4320a
YO
3953 GET_RE_DEBUG_FLAGS_DECL;
3954 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3955#ifndef PERL_IN_XSUB_RE
3956 BEGIN_BLOCK
f9f4320a
YO
3957 /* Dispatch a request to compile a regexp to correct
3958 regexp engine. */
3959 HV * const table = GvHV(PL_hintgv);
3960 if (table) {
3961 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3962 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3963 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3964 DEBUG_COMPILE_r({
8d8756e7 3965 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3966 SvIV(*ptr));
3967 });
f2f78491 3968 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3969 }
b9b4dddf
YO
3970 }
3971 END_BLOCK
3972#endif
3973 BEGIN_BLOCK
a0d0e21e 3974 register regexp *r;
c277df42 3975 regnode *scan;
c277df42 3976 regnode *first;
a0d0e21e 3977 I32 flags;
a0d0e21e
LW
3978 I32 minlen = 0;
3979 I32 sawplus = 0;
3980 I32 sawopen = 0;
2c2d71f5 3981 scan_data_t data;
830247a4 3982 RExC_state_t RExC_state;
be8e71aa 3983 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3984#ifdef TRIE_STUDY_OPT
3985 int restudied= 0;
3986 RExC_state_t copyRExC_state;
3987#endif
a0d0e21e 3988 if (exp == NULL)
c277df42 3989 FAIL("NULL regexp argument");
a0d0e21e 3990
a5961de5 3991 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3992
5cfc7842 3993 RExC_precomp = exp;
a3621e74 3994 DEBUG_COMPILE_r({
ab3bbdeb
YO
3995 SV *dsv= sv_newmortal();
3996 RE_PV_QUOTED_DECL(s, RExC_utf8,
3997 dsv, RExC_precomp, (xend - exp), 60);
3998 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3999 PL_colors[4],PL_colors[5],s);
a5961de5 4000 });
e2509266 4001 RExC_flags = pm->op_pmflags;
830247a4 4002 RExC_sawback = 0;
bbce6d69 4003
830247a4
IZ
4004 RExC_seen = 0;
4005 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4006 RExC_seen_evals = 0;
4007 RExC_extralen = 0;
c277df42 4008
bbce6d69 4009 /* First pass: determine size, legality. */
830247a4 4010 RExC_parse = exp;
fac92740 4011 RExC_start = exp;
830247a4
IZ
4012 RExC_end = xend;
4013 RExC_naughty = 0;
4014 RExC_npar = 1;
c74340f9 4015 RExC_cpar = 1;
e2e6a0f1 4016 RExC_nestroot = 0;
830247a4
IZ
4017 RExC_size = 0L;
4018 RExC_emit = &PL_regdummy;
4019 RExC_whilem_seen = 0;
fc8cd66c 4020 RExC_charnames = NULL;
40d049e4
YO
4021 RExC_open_parens = NULL;
4022 RExC_close_parens = NULL;
4023 RExC_opend = NULL;
81714fb9 4024 RExC_paren_names = NULL;
40d049e4
YO
4025 RExC_recurse = NULL;
4026 RExC_recurse_count = 0;
81714fb9 4027
85ddcde9
JH
4028#if 0 /* REGC() is (currently) a NOP at the first pass.
4029 * Clever compilers notice this and complain. --jhi */
830247a4 4030 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4031#endif
3dab1dad
YO
4032 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4033 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4034 RExC_precomp = NULL;
a0d0e21e
LW
4035 return(NULL);
4036 }
07be1b83 4037 DEBUG_PARSE_r({
81714fb9
YO
4038 PerlIO_printf(Perl_debug_log,
4039 "Required size %"IVdf" nodes\n"
4040 "Starting second pass (creation)\n",
4041 (IV)RExC_size);
07be1b83
YO
4042 RExC_lastnum=0;
4043 RExC_lastparse=NULL;
4044 });
c277df42
IZ
4045 /* Small enough for pointer-storage convention?
4046 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4047 if (RExC_size >= 0x10000L && RExC_extralen)
4048 RExC_size += RExC_extralen;
c277df42 4049 else
830247a4
IZ
4050 RExC_extralen = 0;
4051 if (RExC_whilem_seen > 15)
4052 RExC_whilem_seen = 15;
a0d0e21e 4053
e2e6a0f1
YO
4054#ifdef DEBUGGING
4055 /* Make room for a sentinel value at the end of the program */
4056 RExC_size++;
4057#endif
4058
f9f4320a
YO
4059 /* Allocate space and zero-initialize. Note, the two step process
4060 of zeroing when in debug mode, thus anything assigned has to
4061 happen after that */
a02a5408 4062 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 4063 char, regexp);
a0d0e21e 4064 if (r == NULL)
b45f050a 4065 FAIL("Regexp out of space");
0f79a09d
GS
4066#ifdef DEBUGGING
4067 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 4068 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d
YO
4069#else
4070 /* bulk initialize fields with 0. */
4071 Zero(r, sizeof(regexp), char);
0f79a09d 4072#endif
58e23c8d
YO
4073
4074 /* non-zero initialization begins here */
f9f4320a 4075 r->engine= RE_ENGINE_PTR;
c277df42 4076 r->refcnt = 1;
bbce6d69 4077 r->prelen = xend - exp;
5cfc7842 4078 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 4079 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4080 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4081
6bda09f9 4082 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4083 Newxz(RExC_open_parens, RExC_npar,regnode *);
4084 SAVEFREEPV(RExC_open_parens);
4085 Newxz(RExC_close_parens,RExC_npar,regnode *);
4086 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4087 }
4088
4089 /* Useful during FAIL. */
a02a5408 4090 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4091 if (r->offsets) {
2af232bd 4092 r->offsets[0] = RExC_size;
fac92740 4093 }
a3621e74 4094 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4095 "%s %"UVuf" bytes for offset annotations.\n",
4096 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4097 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4098
830247a4 4099 RExC_rx = r;
bbce6d69 4100
4101 /* Second pass: emit code. */
e2509266 4102 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4103 RExC_parse = exp;
4104 RExC_end = xend;
4105 RExC_naughty = 0;
4106 RExC_npar = 1;
c74340f9 4107 RExC_cpar = 1;
fac92740 4108 RExC_emit_start = r->program;
830247a4 4109 RExC_emit = r->program;
e2e6a0f1
YO
4110#ifdef DEBUGGING
4111 /* put a sentinal on the end of the program so we can check for
4112 overwrites */
4113 r->program[RExC_size].type = 255;
4114#endif
2cd61cdb 4115 /* Store the count of eval-groups for security checks: */
786e8c11 4116 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4117 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4118 r->data = 0;
3dab1dad 4119 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4120 return(NULL);
6bda09f9 4121
07be1b83
YO
4122 /* XXXX To minimize changes to RE engine we always allocate
4123 3-units-long substrs field. */
4124 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4125 if (RExC_recurse_count) {
4126 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4127 SAVEFREEPV(RExC_recurse);
4128 }
a0d0e21e 4129
07be1b83 4130reStudy:
1de06328 4131 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4132 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4133
07be1b83
YO
4134#ifdef TRIE_STUDY_OPT
4135 if ( restudied ) {
5d458dd8 4136 U32 seen=RExC_seen;
07be1b83 4137 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4138
4139 RExC_state = copyRExC_state;
4140 if (seen & REG_TOP_LEVEL_BRANCHES)
4141 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4142 else
4143 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4144 if (data.last_found) {
07be1b83 4145 SvREFCNT_dec(data.longest_fixed);
07be1b83 4146 SvREFCNT_dec(data.longest_float);
07be1b83 4147 SvREFCNT_dec(data.last_found);
1de06328 4148 }
40d049e4 4149 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4150 } else {
40d049e4 4151 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4152 copyRExC_state = RExC_state;
07be1b83 4153 }
40d049e4
YO
4154#else
4155 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4156#endif
fc8cd66c 4157
a0d0e21e 4158 /* Dig out information for optimizations. */
cf93c79d 4159 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4160 pm->op_pmflags = RExC_flags;
a0ed51b3 4161 if (UTF)
5ff6fc6d 4162 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4163 r->regstclass = NULL;
830247a4 4164 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4165 r->reganch |= ROPT_NAUGHTY;
c277df42 4166 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4167
1de06328
YO
4168 /* testing for BRANCH here tells us whether there is "must appear"
4169 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4170 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4171 I32 fake;
c5254dd6 4172 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4173 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4174 int stclass_flag;
07be1b83 4175 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4176
4177 first = scan;
c277df42 4178 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4179 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4180 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4181 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4182 /* for now we can't handle lookbehind IFMATCH*/
4183 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4184 (OP(first) == PLUS) ||
4185 (OP(first) == MINMOD) ||
653099ff 4186 /* An {n,m} with n>0 */
07be1b83
YO
4187 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4188 {
786e8c11 4189
a0d0e21e
LW
4190 if (OP(first) == PLUS)
4191 sawplus = 1;
4192 else
3dab1dad 4193 first += regarglen[OP(first)];
07be1b83
YO
4194 if (OP(first) == IFMATCH) {
4195 first = NEXTOPER(first);
4196 first += EXTRA_STEP_2ARGS;
7c167cea 4197 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4198 first = NEXTOPER(first);
a687059c
LW
4199 }
4200
a0d0e21e
LW
4201 /* Starting-point info. */
4202 again:
786e8c11 4203 DEBUG_PEEP("first:",first,0);
07be1b83 4204 /* Ignore EXACT as we deal with it later. */
3dab1dad 4205 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4206 if (OP(first) == EXACT)
6f207bd3 4207 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4208 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4209 r->regstclass = first;
4210 }
07be1b83 4211#ifdef TRIE_STCLASS
786e8c11 4212 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4213 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4214 {
786e8c11 4215 regnode *trie_op;
07be1b83 4216 /* this can happen only on restudy */
786e8c11
YO
4217 if ( OP(first) == TRIE ) {
4218 struct regnode_1 *trieop;
4219 Newxz(trieop,1,struct regnode_1);
4220 StructCopy(first,trieop,struct regnode_1);
4221 trie_op=(regnode *)trieop;
4222 } else {
4223 struct regnode_charclass *trieop;
4224 Newxz(trieop,1,struct regnode_charclass);
4225 StructCopy(first,trieop,struct regnode_charclass);
4226 trie_op=(regnode *)trieop;
4227 }
1de06328 4228 OP(trie_op)+=2;
786e8c11
YO
4229 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4230 r->regstclass = trie_op;
07be1b83
YO
4231 }
4232#endif
bfed75c6 4233 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4234 r->regstclass = first;
3dab1dad
YO
4235 else if (PL_regkind[OP(first)] == BOUND ||
4236 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4237 r->regstclass = first;
3dab1dad 4238 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4239 r->reganch |= (OP(first) == MBOL
4240 ? ROPT_ANCH_MBOL
4241 : (OP(first) == SBOL
4242 ? ROPT_ANCH_SBOL
4243 : ROPT_ANCH_BOL));
a0d0e21e 4244 first = NEXTOPER(first);
774d564b 4245 goto again;
4246 }
4247 else if (OP(first) == GPOS) {
4248 r->reganch |= ROPT_ANCH_GPOS;
4249 first = NEXTOPER(first);
4250 goto again;
a0d0e21e 4251 }
cf2a2b69
YO
4252 else if ((!sawopen || !RExC_sawback) &&
4253 (OP(first) == STAR &&
3dab1dad 4254 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
58e23c8d 4255 !(r->reganch & ROPT_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4256 {
4257 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4258 const int type =
4259 (OP(NEXTOPER(first)) == REG_ANY)
4260 ? ROPT_ANCH_MBOL
4261 : ROPT_ANCH_SBOL;
cad2e5aa 4262 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4263 first = NEXTOPER(first);
774d564b 4264 goto again;
a0d0e21e 4265 }
b81d288d 4266 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4267 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4268 /* x+ must match at the 1st pos of run of x's */
4269 r->reganch |= ROPT_SKIP;
a0d0e21e 4270
c277df42 4271 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4272#ifdef TRIE_STUDY_OPT
81714fb9 4273 DEBUG_PARSE_r(
be8e71aa
YO
4274 if (!restudied)
4275 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4276 (IV)(first - scan + 1))
4277 );
4278#else
81714fb9 4279 DEBUG_PARSE_r(
be8e71aa
YO
4280 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4281 (IV)(first - scan + 1))
4282 );
4283#endif
4284
4285
a0d0e21e
LW
4286 /*
4287 * If there's something expensive in the r.e., find the
4288 * longest literal string that must appear and make it the
4289 * regmust. Resolve ties in favor of later strings, since
4290 * the regstart check works with the beginning of the r.e.
4291 * and avoiding duplication strengthens checking. Not a
4292 * strong reason, but sufficient in the absence of others.
4293 * [Now we resolve ties in favor of the earlier string if
c277df42 4294 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4295 * earlier string may buy us something the later one won't.]
4296 */
de8c5301 4297
396482e1
GA
4298 data.longest_fixed = newSVpvs("");
4299 data.longest_float = newSVpvs("");
4300 data.last_found = newSVpvs("");
c277df42
IZ
4301 data.longest = &(data.longest_fixed);
4302 first = scan;
653099ff 4303 if (!r->regstclass) {
830247a4 4304 cl_init(pRExC_state, &ch_class);
653099ff
GS
4305 data.start_class = &ch_class;
4306 stclass_flag = SCF_DO_STCLASS_AND;
4307 } else /* XXXX Check for BOUND? */
4308 stclass_flag = 0;
cb434fcc 4309 data.last_closep = &last_close;
de8c5301 4310
1de06328 4311 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4312 &data, -1, NULL, NULL,
4313 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4314
07be1b83 4315
786e8c11
YO
4316 CHECK_RESTUDY_GOTO;
4317
4318
830247a4 4319 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4320 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4321 && !RExC_seen_zerolen
4322 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4323 r->reganch |= ROPT_CHECK_ALL;
1de06328 4324 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4325 SvREFCNT_dec(data.last_found);
4326
1de06328
YO
4327 /* Note that code very similar to this but for anchored string
4328 follows immediately below, changes may need to be made to both.
4329 Be careful.
4330 */
a0ed51b3 4331 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4332 if (longest_float_length
c277df42
IZ
4333 || (data.flags & SF_FL_BEFORE_EOL
4334 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4335 || (RExC_flags & PMf_MULTILINE))))
4336 {
1182767e 4337 I32 t,ml;
cf93c79d 4338
1de06328 4339 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4340 && data.offset_fixed == data.offset_float_min
4341 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4342 goto remove_float; /* As in (a)+. */
4343
1de06328
YO
4344 /* copy the information about the longest float from the reg_scan_data
4345 over to the program. */
33b8afdf
JH
4346 if (SvUTF8(data.longest_float)) {
4347 r->float_utf8 = data.longest_float;
c445ea15 4348 r->float_substr = NULL;
33b8afdf
JH
4349 } else {
4350 r->float_substr = data.longest_float;
c445ea15 4351 r->float_utf8 = NULL;
33b8afdf 4352 }
1de06328
YO
4353 /* float_end_shift is how many chars that must be matched that
4354 follow this item. We calculate it ahead of time as once the
4355 lookbehind offset is added in we lose the ability to correctly
4356 calculate it.*/
4357 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4358 : (I32)longest_float_length;
1de06328
YO
4359 r->float_end_shift = ml - data.offset_float_min
4360 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4361 + data.lookbehind_float;
4362 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4363 r->float_max_offset = data.offset_float_max;
1182767e 4364 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4365 r->float_max_offset -= data.lookbehind_float;
4366
cf93c79d
IZ
4367 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4368 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4369 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4370 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4371 }
4372 else {
aca2d497 4373 remove_float:
c445ea15 4374 r->float_substr = r->float_utf8 = NULL;
c277df42 4375 SvREFCNT_dec(data.longest_float);
c5254dd6 4376 longest_float_length = 0;
a0d0e21e 4377 }
c277df42 4378
1de06328
YO
4379 /* Note that code very similar to this but for floating string
4380 is immediately above, changes may need to be made to both.
4381 Be careful.
4382 */
a0ed51b3 4383 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4384 if (longest_fixed_length
c277df42
IZ
4385 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4386 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4387 || (RExC_flags & PMf_MULTILINE))))
4388 {
1182767e 4389 I32 t,ml;
cf93c79d 4390
1de06328
YO
4391 /* copy the information about the longest fixed
4392 from the reg_scan_data over to the program. */
33b8afdf
JH
4393 if (SvUTF8(data.longest_fixed)) {
4394 r->anchored_utf8 = data.longest_fixed;
c445ea15 4395 r->anchored_substr = NULL;
33b8afdf
JH
4396 } else {
4397 r->anchored_substr = data.longest_fixed;
c445ea15 4398 r->anchored_utf8 = NULL;
33b8afdf 4399 }
1de06328
YO
4400 /* fixed_end_shift is how many chars that must be matched that
4401 follow this item. We calculate it ahead of time as once the
4402 lookbehind offset is added in we lose the ability to correctly
4403 calculate it.*/
4404 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4405 : (I32)longest_fixed_length;
1de06328
YO
4406 r->anchored_end_shift = ml - data.offset_fixed
4407 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4408 + data.lookbehind_fixed;
4409 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4410
cf93c79d
IZ
4411 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4412 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4413 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4414 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4415 }
4416 else {
c445ea15 4417 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4418 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4419 longest_fixed_length = 0;
a0d0e21e 4420 }
b81d288d 4421 if (r->regstclass
ffc61ed2 4422 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4423 r->regstclass = NULL;
33b8afdf
JH
4424 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4425 && stclass_flag
653099ff 4426 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4427 && !cl_is_anything(data.start_class))
4428 {
1df70142 4429 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4430
a02a5408 4431 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4432 struct regnode_charclass_class);
4433 StructCopy(data.start_class,
830247a4 4434 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4435 struct regnode_charclass_class);
830247a4 4436 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4437 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4438 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4439 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4440 PerlIO_printf(Perl_debug_log,
a0288114 4441 "synthetic stclass \"%s\".\n",
3f7c398e 4442 SvPVX_const(sv));});
653099ff 4443 }
c277df42
IZ
4444
4445 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4446 if (longest_fixed_length > longest_float_length) {
1de06328 4447 r->check_end_shift = r->anchored_end_shift;
c277df42 4448 r->check_substr = r->anchored_substr;
33b8afdf 4449 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4450 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4451 if (r->reganch & ROPT_ANCH_SINGLE)
4452 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4453 }
4454 else {
1de06328 4455 r->check_end_shift = r->float_end_shift;
c277df42 4456 r->check_substr = r->float_substr;
33b8afdf 4457 r->check_utf8 = r->float_utf8;
1de06328
YO
4458 r->check_offset_min = r->float_min_offset;
4459 r->check_offset_max = r->float_max_offset;
a0d0e21e 4460 }
30382c73
IZ
4461 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4462 This should be changed ASAP! */
33b8afdf 4463 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4464 r->reganch |= RE_USE_INTUIT;
33b8afdf 4465 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4466 r->reganch |= RE_INTUIT_TAIL;
4467 }
1de06328
YO
4468 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4469 if ( (STRLEN)minlen < longest_float_length )
4470 minlen= longest_float_length;
4471 if ( (STRLEN)minlen < longest_fixed_length )
4472 minlen= longest_fixed_length;
4473 */
a0ed51b3
LW
4474 }
4475 else {
c277df42
IZ
4476 /* Several toplevels. Best we can is to set minlen. */
4477 I32 fake;
653099ff 4478 struct regnode_charclass_class ch_class;
cb434fcc 4479 I32 last_close = 0;
c277df42 4480
5d458dd8 4481 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4482
c277df42 4483 scan = r->program + 1;
830247a4 4484 cl_init(pRExC_state, &ch_class);
653099ff 4485 data.start_class = &ch_class;
cb434fcc 4486 data.last_closep = &last_close;
07be1b83 4487
de8c5301 4488
1de06328 4489 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4490 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4491
786e8c11 4492 CHECK_RESTUDY_GOTO;
07be1b83 4493
33b8afdf 4494 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4495 = r->float_substr = r->float_utf8 = NULL;
653099ff 4496 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4497 && !cl_is_anything(data.start_class))
4498 {
1df70142 4499 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4500
a02a5408 4501 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4502 struct regnode_charclass_class);
4503 StructCopy(data.start_class,
830247a4 4504 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4505 struct regnode_charclass_class);
830247a4 4506 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4507 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4508 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4509 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4510 PerlIO_printf(Perl_debug_log,
a0288114 4511 "synthetic stclass \"%s\".\n",
3f7c398e 4512 SvPVX_const(sv));});
653099ff 4513 }
a0d0e21e
LW
4514 }
4515
1de06328
YO
4516 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4517 the "real" pattern. */
cf9788e3
RGS
4518 DEBUG_OPTIMISE_r({
4519 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4520 minlen, r->minlen);
4521 });
de8c5301 4522 r->minlenret = minlen;
1de06328
YO
4523 if (r->minlen < minlen)
4524 r->minlen = minlen;
4525
b81d288d 4526 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4527 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4528 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4529 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4530 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4531 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4532 if (RExC_seen & REG_SEEN_CANY)
4533 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4534 if (RExC_seen & REG_SEEN_VERBARG)
4535 r->reganch |= ROPT_VERBARG_SEEN;
5d458dd8
YO
4536 if (RExC_seen & REG_SEEN_CUTGROUP)
4537 r->reganch |= ROPT_CUTGROUP_SEEN;
81714fb9
YO
4538 if (RExC_paren_names)
4539 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4540 else
4541 r->paren_names = NULL;
4542
40d049e4
YO
4543 if (RExC_recurse_count) {
4544 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4545 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4546 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4547 }
4548 }
a02a5408
JC
4549 Newxz(r->startp, RExC_npar, I32);
4550 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4551 /* assume we don't need to swap parens around before we match */
4552
be8e71aa
YO
4553 DEBUG_DUMP_r({
4554 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4555 regdump(r);
4556 });
8e9a8a48
YO
4557 DEBUG_OFFSETS_r(if (r->offsets) {
4558 const U32 len = r->offsets[0];
4559 U32 i;
4560 GET_RE_DEBUG_FLAGS_DECL;
4561 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4562 for (i = 1; i <= len; i++) {
4563 if (r->offsets[i*2-1] || r->offsets[i*2])
4564 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4565 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4566 }
4567 PerlIO_printf(Perl_debug_log, "\n");
4568 });
a0d0e21e 4569 return(r);
f9f4320a 4570 END_BLOCK
a687059c
LW
4571}
4572
f9f4320a
YO
4573#undef CORE_ONLY_BLOCK
4574#undef END_BLOCK
4575#undef RE_ENGINE_PTR
3dab1dad 4576
9af228c6 4577#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4578SV*
4579Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4580{
4581 I32 parno = 0; /* no match */
4582 if (PL_curpm) {
4583 const REGEXP * const rx = PM_GETRE(PL_curpm);
4584 if (rx && rx->paren_names) {
4585 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4586 if (he_str) {
4587 IV i;
4588 SV* sv_dat=HeVAL(he_str);
4589 I32 *nums=(I32*)SvPVX(sv_dat);
4590 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4591 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4592 rx->endp[nums[i]] != -1)
4593 {
4594 parno = nums[i];
4595 break;
4596 }
4597 }
4598 }
4599 }
4600 }
4601 if ( !parno ) {
4602 return 0;
4603 } else {
4604 GV *gv_paren;
4605 SV *sv= sv_newmortal();
4606 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4607 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4608 return GvSVn(gv_paren);
4609 }
4610}
9af228c6 4611#endif
0a4db386 4612
894be9b7 4613/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4614 * If flags is REG_RSN_RETURN_NULL returns null.
4615 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4616 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4617 * to the parsed name as looked up in the RExC_paren_names hash.
4618 * If there is an error throws a vFAIL().. type exception.
894be9b7 4619 */
0a4db386
YO
4620
4621#define REG_RSN_RETURN_NULL 0
4622#define REG_RSN_RETURN_NAME 1
4623#define REG_RSN_RETURN_DATA 2
4624
894be9b7
YO
4625STATIC SV*
4626S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4627 char *name_start = RExC_parse;
0a4db386 4628 if ( UTF ) {
894be9b7 4629 STRLEN numlen;
0a4db386
YO
4630 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4631 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4632 {
4633 RExC_parse += numlen;
4634 }
4635 } else {
4636 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4637 RExC_parse++;
4638 }
0a4db386
YO
4639 if ( flags ) {
4640 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4641 (int)(RExC_parse - name_start)));
894be9b7 4642 if (UTF)
0a4db386
YO
4643 SvUTF8_on(sv_name);
4644 if ( flags == REG_RSN_RETURN_NAME)
4645 return sv_name;
4646 else if (flags==REG_RSN_RETURN_DATA) {
4647 HE *he_str = NULL;
4648 SV *sv_dat = NULL;
4649 if ( ! sv_name ) /* should not happen*/
4650 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4651 if (RExC_paren_names)
4652 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4653 if ( he_str )
4654 sv_dat = HeVAL(he_str);
4655 if ( ! sv_dat )
4656 vFAIL("Reference to nonexistent named group");
4657 return sv_dat;
4658 }
4659 else {
4660 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4661 }
4662 /* NOT REACHED */
894be9b7 4663 }
0a4db386 4664 return NULL;
894be9b7
YO
4665}
4666
3dab1dad
YO
4667#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4668 int rem=(int)(RExC_end - RExC_parse); \
4669 int cut; \
4670 int num; \
4671 int iscut=0; \
4672 if (rem>10) { \
4673 rem=10; \
4674 iscut=1; \
4675 } \
4676 cut=10-rem; \
4677 if (RExC_lastparse!=RExC_parse) \
4678 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4679 rem, RExC_parse, \
4680 cut + 4, \
4681 iscut ? "..." : "<" \
4682 ); \
4683 else \
4684 PerlIO_printf(Perl_debug_log,"%16s",""); \
4685 \
4686 if (SIZE_ONLY) \
4687 num=RExC_size; \
4688 else \
4689 num=REG_NODE_NUM(RExC_emit); \
4690 if (RExC_lastnum!=num) \
0a4db386 4691 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4692 else \
0a4db386 4693 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4694 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4695 (int)((depth*2)), "", \
3dab1dad
YO
4696 (funcname) \
4697 ); \
4698 RExC_lastnum=num; \
4699 RExC_lastparse=RExC_parse; \
4700})
4701
07be1b83
YO
4702
4703
3dab1dad
YO
4704#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4705 DEBUG_PARSE_MSG((funcname)); \
4706 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4707})
6bda09f9
YO
4708#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4709 DEBUG_PARSE_MSG((funcname)); \
4710 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4711})
a687059c
LW
4712/*
4713 - reg - regular expression, i.e. main body or parenthesized thing
4714 *
4715 * Caller must absorb opening parenthesis.
4716 *
4717 * Combining parenthesis handling with the base level of regular expression
4718 * is a trifle forced, but the need to tie the tails of the branches to what
4719 * follows makes it hard to avoid.
4720 */
07be1b83
YO
4721#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4722#ifdef DEBUGGING
4723#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4724#else
4725#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4726#endif
3dab1dad 4727
e2e6a0f1
YO
4728/* this idea is borrowed from STR_WITH_LEN in handy.h */
4729#define CHECK_WORD(s,v,l) \
4730 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4731
76e3520e 4732STATIC regnode *
3dab1dad 4733S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4734 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4735{
27da23d5 4736 dVAR;
c277df42
IZ
4737 register regnode *ret; /* Will be the head of the group. */
4738 register regnode *br;
4739 register regnode *lastbr;
cbbf8932 4740 register regnode *ender = NULL;
a0d0e21e 4741 register I32 parno = 0;
cbbf8932
AL
4742 I32 flags;
4743 const I32 oregflags = RExC_flags;
6136c704
AL
4744 bool have_branch = 0;
4745 bool is_open = 0;
9d1d55b5
JP
4746
4747 /* for (?g), (?gc), and (?o) warnings; warning
4748 about (?c) will warn about (?g) -- japhy */
4749
6136c704
AL
4750#define WASTED_O 0x01
4751#define WASTED_G 0x02
4752#define WASTED_C 0x04
4753#define WASTED_GC (0x02|0x04)
cbbf8932 4754 I32 wastedflags = 0x00;
9d1d55b5 4755
fac92740 4756 char * parse_start = RExC_parse; /* MJD */
a28509cc 4757 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4758
3dab1dad
YO
4759 GET_RE_DEBUG_FLAGS_DECL;
4760 DEBUG_PARSE("reg ");
4761
4762
821b33a5 4763 *flagp = 0; /* Tentatively. */
a0d0e21e 4764
9d1d55b5 4765
a0d0e21e
LW
4766 /* Make an OPEN node, if parenthesized. */
4767 if (paren) {
e2e6a0f1
YO
4768 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4769 char *start_verb = RExC_parse;
4770 STRLEN verb_len = 0;
4771 char *start_arg = NULL;
4772 unsigned char op = 0;
4773 int argok = 1;
4774 int internal_argval = 0; /* internal_argval is only useful if !argok */
4775 while ( *RExC_parse && *RExC_parse != ')' ) {
4776 if ( *RExC_parse == ':' ) {
4777 start_arg = RExC_parse + 1;
4778 break;
4779 }
4780 RExC_parse++;
4781 }
4782 ++start_verb;
4783 verb_len = RExC_parse - start_verb;
4784 if ( start_arg ) {
4785 RExC_parse++;
4786 while ( *RExC_parse && *RExC_parse != ')' )
4787 RExC_parse++;
4788 if ( *RExC_parse != ')' )
4789 vFAIL("Unterminated verb pattern argument");
4790 if ( RExC_parse == start_arg )
4791 start_arg = NULL;
4792 } else {
4793 if ( *RExC_parse != ')' )
4794 vFAIL("Unterminated verb pattern");
4795 }
5d458dd8 4796
e2e6a0f1
YO
4797 switch ( *start_verb ) {
4798 case 'A': /* (*ACCEPT) */
4799 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4800 op = ACCEPT;
4801 internal_argval = RExC_nestroot;
4802 }
4803 break;
4804 case 'C': /* (*COMMIT) */
4805 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4806 op = COMMIT;
e2e6a0f1
YO
4807 break;
4808 case 'F': /* (*FAIL) */
4809 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4810 op = OPFAIL;
4811 argok = 0;
4812 }
4813 break;
5d458dd8
YO
4814 case ':': /* (*:NAME) */
4815 case 'M': /* (*MARK:NAME) */
4816 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4817 op = MARKPOINT;
5d458dd8
YO
4818 argok = -1;
4819 }
4820 break;
4821 case 'P': /* (*PRUNE) */
4822 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4823 op = PRUNE;
e2e6a0f1 4824 break;
5d458dd8
YO
4825 case 'S': /* (*SKIP) */
4826 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4827 op = SKIP;
4828 break;
4829 case 'T': /* (*THEN) */
4830 /* [19:06] <TimToady> :: is then */
4831 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4832 op = CUTGROUP;
4833 RExC_seen |= REG_SEEN_CUTGROUP;
4834 }
e2e6a0f1
YO
4835 break;
4836 }
4837 if ( ! op ) {
4838 RExC_parse++;
4839 vFAIL3("Unknown verb pattern '%.*s'",
4840 verb_len, start_verb);
4841 }
4842 if ( argok ) {
4843 if ( start_arg && internal_argval ) {
4844 vFAIL3("Verb pattern '%.*s' may not have an argument",
4845 verb_len, start_verb);
4846 } else if ( argok < 0 && !start_arg ) {
4847 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4848 verb_len, start_verb);
4849 } else {
4850 ret = reganode(pRExC_state, op, internal_argval);
4851 if ( ! internal_argval && ! SIZE_ONLY ) {
4852 if (start_arg) {
4853 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4854 ARG(ret) = add_data( pRExC_state, 1, "S" );
4855 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4856 ret->flags = 0;
4857 } else {
4858 ret->flags = 1;
4859 }
4860 }
4861 }
4862 if (!internal_argval)
4863 RExC_seen |= REG_SEEN_VERBARG;
4864 } else if ( start_arg ) {
4865 vFAIL3("Verb pattern '%.*s' may not have an argument",
4866 verb_len, start_verb);
4867 } else {
4868 ret = reg_node(pRExC_state, op);
4869 }
4870 nextchar(pRExC_state);
4871 return ret;
4872 } else
fac92740 4873 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4874 U32 posflags = 0, negflags = 0;
4875 U32 *flagsp = &posflags;
6136c704 4876 bool is_logical = 0;
a28509cc 4877 const char * const seqstart = RExC_parse;
ca9dfc88 4878
830247a4
IZ
4879 RExC_parse++;
4880 paren = *RExC_parse++;
c277df42 4881 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4882 switch (paren) {
894be9b7 4883
fac92740 4884 case '<': /* (?<...) */
b81d288d 4885 if (*RExC_parse == '!')
c277df42 4886 paren = ',';
0a4db386
YO
4887 else if (*RExC_parse != '=')
4888 { /* (?<...>) */
81714fb9 4889 char *name_start;
894be9b7 4890 SV *svname;
81714fb9
YO
4891 paren= '>';
4892 case '\'': /* (?'...') */
4893 name_start= RExC_parse;
0a4db386
YO
4894 svname = reg_scan_name(pRExC_state,
4895 SIZE_ONLY ? /* reverse test from the others */
4896 REG_RSN_RETURN_NAME :
4897 REG_RSN_RETURN_NULL);
81714fb9
YO
4898 if (RExC_parse == name_start)
4899 goto unknown;
4900 if (*RExC_parse != paren)
4901 vFAIL2("Sequence (?%c... not terminated",
4902 paren=='>' ? '<' : paren);
4903 if (SIZE_ONLY) {
e62cc96a
YO
4904 HE *he_str;
4905 SV *sv_dat = NULL;
894be9b7
YO
4906 if (!svname) /* shouldnt happen */
4907 Perl_croak(aTHX_
4908 "panic: reg_scan_name returned NULL");
81714fb9
YO
4909 if (!RExC_paren_names) {
4910 RExC_paren_names= newHV();
4911 sv_2mortal((SV*)RExC_paren_names);
4912 }
4913 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4914 if ( he_str )
81714fb9 4915 sv_dat = HeVAL(he_str);
e62cc96a 4916 if ( ! sv_dat ) {
81714fb9 4917 /* croak baby croak */
e62cc96a
YO
4918 Perl_croak(aTHX_
4919 "panic: paren_name hash element allocation failed");
4920 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4921 IV count=SvIV(sv_dat);
4922 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4923 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4924 pv[count]=RExC_npar;
4925 SvIVX(sv_dat)++;
4926 } else {
4927 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4928 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4929 SvIOK_on(sv_dat);
4930 SvIVX(sv_dat)= 1;
e62cc96a
YO
4931 }
4932
81714fb9
YO
4933 /*sv_dump(sv_dat);*/
4934 }
4935 nextchar(pRExC_state);
4936 paren = 1;
4937 goto capturing_parens;
4938 }
4939 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4940 RExC_parse++;
fac92740
MJD
4941 case '=': /* (?=...) */
4942 case '!': /* (?!...) */
830247a4 4943 RExC_seen_zerolen++;
e2e6a0f1
YO
4944 if (*RExC_parse == ')') {
4945 ret=reg_node(pRExC_state, OPFAIL);
4946 nextchar(pRExC_state);
4947 return ret;
4948 }
fac92740
MJD
4949 case ':': /* (?:...) */
4950 case '>': /* (?>...) */
a0d0e21e 4951 break;
fac92740
MJD
4952 case '$': /* (?$...) */
4953 case '@': /* (?@...) */
8615cb43 4954 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4955 break;
fac92740 4956 case '#': /* (?#...) */
830247a4
IZ
4957 while (*RExC_parse && *RExC_parse != ')')
4958 RExC_parse++;
4959 if (*RExC_parse != ')')
c277df42 4960 FAIL("Sequence (?#... not terminated");
830247a4 4961 nextchar(pRExC_state);
a0d0e21e
LW
4962 *flagp = TRYAGAIN;
4963 return NULL;
894be9b7
YO
4964 case '0' : /* (?0) */
4965 case 'R' : /* (?R) */
4966 if (*RExC_parse != ')')
6bda09f9 4967 FAIL("Sequence (?R) not terminated");
1a147d38 4968 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4969 nextchar(pRExC_state);
4970 return ret;
4971 /*notreached*/
894be9b7
YO
4972 { /* named and numeric backreferences */
4973 I32 num;
4974 char * parse_start;
4975 case '&': /* (?&NAME) */
4976 parse_start = RExC_parse - 1;
4977 {
0a4db386
YO
4978 SV *sv_dat = reg_scan_name(pRExC_state,
4979 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4980 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4981 }
4982 goto gen_recurse_regop;
4983 /* NOT REACHED */
542fa716
YO
4984 case '+':
4985 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4986 RExC_parse++;
4987 vFAIL("Illegal pattern");
4988 }
4989 goto parse_recursion;
4990 /* NOT REACHED*/
4991 case '-': /* (?-1) */
4992 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4993 RExC_parse--; /* rewind to let it be handled later */
4994 goto parse_flags;
4995 }
4996 /*FALLTHROUGH */
6bda09f9
YO
4997 case '1': case '2': case '3': case '4': /* (?1) */
4998 case '5': case '6': case '7': case '8': case '9':
4999 RExC_parse--;
542fa716 5000 parse_recursion:
894be9b7
YO
5001 num = atoi(RExC_parse);
5002 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5003 if (*RExC_parse == '-')
5004 RExC_parse++;
6bda09f9
YO
5005 while (isDIGIT(*RExC_parse))
5006 RExC_parse++;
5007 if (*RExC_parse!=')')
5008 vFAIL("Expecting close bracket");
894be9b7
YO
5009
5010 gen_recurse_regop:
542fa716
YO
5011 if ( paren == '-' ) {
5012 /*
5013 Diagram of capture buffer numbering.
5014 Top line is the normal capture buffer numbers
5015 Botton line is the negative indexing as from
5016 the X (the (?-2))
5017
5018 + 1 2 3 4 5 X 6 7
5019 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5020 - 5 4 3 2 1 X x x
5021
5022 */
5023 num = RExC_npar + num;
5024 if (num < 1) {
5025 RExC_parse++;
5026 vFAIL("Reference to nonexistent group");
5027 }
5028 } else if ( paren == '+' ) {
5029 num = RExC_npar + num - 1;
5030 }
5031
1a147d38 5032 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5033 if (!SIZE_ONLY) {
5034 if (num > (I32)RExC_rx->nparens) {
5035 RExC_parse++;
5036 vFAIL("Reference to nonexistent group");
5037 }
40d049e4 5038 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5039 RExC_emit++;
226de585 5040 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5041 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5042 } else {
6bda09f9 5043 RExC_size++;
6bda09f9 5044 }
0a4db386 5045 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5046 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5047 Set_Node_Offset(ret, parse_start); /* MJD */
5048
6bda09f9
YO
5049 nextchar(pRExC_state);
5050 return ret;
894be9b7
YO
5051 } /* named and numeric backreferences */
5052 /* NOT REACHED */
5053
fac92740 5054 case 'p': /* (?p...) */
9014280d 5055 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5056 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5057 /* FALL THROUGH*/
fac92740 5058 case '?': /* (??...) */
6136c704 5059 is_logical = 1;
438a3801
YST
5060 if (*RExC_parse != '{')
5061 goto unknown;
830247a4 5062 paren = *RExC_parse++;
0f5d15d6 5063 /* FALL THROUGH */
fac92740 5064 case '{': /* (?{...}) */
c277df42 5065 {
c277df42
IZ
5066 I32 count = 1, n = 0;
5067 char c;
830247a4 5068 char *s = RExC_parse;
c277df42 5069
830247a4
IZ
5070 RExC_seen_zerolen++;
5071 RExC_seen |= REG_SEEN_EVAL;
5072 while (count && (c = *RExC_parse)) {
6136c704
AL
5073 if (c == '\\') {
5074 if (RExC_parse[1])
5075 RExC_parse++;
5076 }
b81d288d 5077 else if (c == '{')
c277df42 5078 count++;
b81d288d 5079 else if (c == '}')
c277df42 5080 count--;
830247a4 5081 RExC_parse++;
c277df42 5082 }
6136c704 5083 if (*RExC_parse != ')') {
b81d288d 5084 RExC_parse = s;
b45f050a
JF
5085 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5086 }
c277df42 5087 if (!SIZE_ONLY) {
f3548bdc 5088 PAD *pad;
6136c704
AL
5089 OP_4tree *sop, *rop;
5090 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5091
569233ed
SB
5092 ENTER;
5093 Perl_save_re_context(aTHX);
f3548bdc 5094 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5095 sop->op_private |= OPpREFCOUNTED;
5096 /* re_dup will OpREFCNT_inc */
5097 OpREFCNT_set(sop, 1);
569233ed 5098 LEAVE;
c277df42 5099
830247a4
IZ
5100 n = add_data(pRExC_state, 3, "nop");
5101 RExC_rx->data->data[n] = (void*)rop;
5102 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 5103 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 5104 SvREFCNT_dec(sv);
a0ed51b3 5105 }
e24b16f9 5106 else { /* First pass */
830247a4 5107 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5108 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5109 /* No compiled RE interpolated, has runtime
5110 components ===> unsafe. */
5111 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5112 if (PL_tainting && PL_tainted)
cc6b7395 5113 FAIL("Eval-group in insecure regular expression");
54df2634 5114#if PERL_VERSION > 8
923e4eb5 5115 if (IN_PERL_COMPILETIME)
b5c19bd7 5116 PL_cv_has_eval = 1;
54df2634 5117#endif
c277df42 5118 }
b5c19bd7 5119
830247a4 5120 nextchar(pRExC_state);
6136c704 5121 if (is_logical) {
830247a4 5122 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5123 if (!SIZE_ONLY)
5124 ret->flags = 2;
3dab1dad 5125 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5126 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5127 return ret;
5128 }
ccb2c380
MP
5129 ret = reganode(pRExC_state, EVAL, n);
5130 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5131 Set_Node_Offset(ret, parse_start);
5132 return ret;
c277df42 5133 }
fac92740 5134 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5135 {
0a4db386 5136 int is_define= 0;
fac92740 5137 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5138 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5139 || RExC_parse[1] == '<'
830247a4 5140 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5141 I32 flag;
5142
830247a4 5143 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5144 if (!SIZE_ONLY)
5145 ret->flags = 1;
3dab1dad 5146 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5147 goto insert_if;
b81d288d 5148 }
a0ed51b3 5149 }
0a4db386
YO
5150 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5151 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5152 {
5153 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5154 char *name_start= RExC_parse++;
5155 I32 num = 0;
5156 SV *sv_dat=reg_scan_name(pRExC_state,
5157 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5158 if (RExC_parse == name_start || *RExC_parse != ch)
5159 vFAIL2("Sequence (?(%c... not terminated",
5160 (ch == '>' ? '<' : ch));
5161 RExC_parse++;
5162 if (!SIZE_ONLY) {
5163 num = add_data( pRExC_state, 1, "S" );
5164 RExC_rx->data->data[num]=(void*)sv_dat;
5165 SvREFCNT_inc(sv_dat);
5166 }
5167 ret = reganode(pRExC_state,NGROUPP,num);
5168 goto insert_if_check_paren;
5169 }
5170 else if (RExC_parse[0] == 'D' &&
5171 RExC_parse[1] == 'E' &&
5172 RExC_parse[2] == 'F' &&
5173 RExC_parse[3] == 'I' &&
5174 RExC_parse[4] == 'N' &&
5175 RExC_parse[5] == 'E')
5176 {
5177 ret = reganode(pRExC_state,DEFINEP,0);
5178 RExC_parse +=6 ;
5179 is_define = 1;
5180 goto insert_if_check_paren;
5181 }
5182 else if (RExC_parse[0] == 'R') {
5183 RExC_parse++;
5184 parno = 0;
5185 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5186 parno = atoi(RExC_parse++);
5187 while (isDIGIT(*RExC_parse))
5188 RExC_parse++;
5189 } else if (RExC_parse[0] == '&') {
5190 SV *sv_dat;
5191 RExC_parse++;
5192 sv_dat = reg_scan_name(pRExC_state,
5193 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5194 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5195 }
1a147d38 5196 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5197 goto insert_if_check_paren;
5198 }
830247a4 5199 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5200 /* (?(1)...) */
6136c704 5201 char c;
830247a4 5202 parno = atoi(RExC_parse++);
c277df42 5203
830247a4
IZ
5204 while (isDIGIT(*RExC_parse))
5205 RExC_parse++;
fac92740 5206 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5207
0a4db386 5208 insert_if_check_paren:
830247a4 5209 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5210 vFAIL("Switch condition not recognized");
c277df42 5211 insert_if:
3dab1dad
YO
5212 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5213 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5214 if (br == NULL)
830247a4 5215 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5216 else
3dab1dad 5217 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5218 c = *nextchar(pRExC_state);
d1b80229
IZ
5219 if (flags&HASWIDTH)
5220 *flagp |= HASWIDTH;
c277df42 5221 if (c == '|') {
0a4db386
YO
5222 if (is_define)
5223 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5224 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5225 regbranch(pRExC_state, &flags, 1,depth+1);
5226 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5227 if (flags&HASWIDTH)
5228 *flagp |= HASWIDTH;
830247a4 5229 c = *nextchar(pRExC_state);
a0ed51b3
LW
5230 }
5231 else
c277df42
IZ
5232 lastbr = NULL;
5233 if (c != ')')
8615cb43 5234 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5235 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5236 REGTAIL(pRExC_state, br, ender);
c277df42 5237 if (lastbr) {
3dab1dad
YO
5238 REGTAIL(pRExC_state, lastbr, ender);
5239 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5240 }
5241 else
3dab1dad 5242 REGTAIL(pRExC_state, ret, ender);
c277df42 5243 return ret;
a0ed51b3
LW
5244 }
5245 else {
830247a4 5246 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5247 }
5248 }
1b1626e4 5249 case 0:
830247a4 5250 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5251 vFAIL("Sequence (? incomplete");
1b1626e4 5252 break;
a0d0e21e 5253 default:
830247a4 5254 --RExC_parse;
fac92740 5255 parse_flags: /* (?i) */
830247a4 5256 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5257 /* (?g), (?gc) and (?o) are useless here
5258 and must be globally applied -- japhy */
5259
5260 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5261 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5262 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5263 if (! (wastedflags & wflagbit) ) {
5264 wastedflags |= wflagbit;
5265 vWARN5(
5266 RExC_parse + 1,
5267 "Useless (%s%c) - %suse /%c modifier",
5268 flagsp == &negflags ? "?-" : "?",
5269 *RExC_parse,
5270 flagsp == &negflags ? "don't " : "",
5271 *RExC_parse
5272 );
5273 }
5274 }
5275 }
5276 else if (*RExC_parse == 'c') {
5277 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5278 if (! (wastedflags & WASTED_C) ) {
5279 wastedflags |= WASTED_GC;
9d1d55b5
JP
5280 vWARN3(
5281 RExC_parse + 1,
5282 "Useless (%sc) - %suse /gc modifier",
5283 flagsp == &negflags ? "?-" : "?",
5284 flagsp == &negflags ? "don't " : ""
5285 );
5286 }
5287 }
5288 }
5289 else { pmflag(flagsp, *RExC_parse); }
5290
830247a4 5291 ++RExC_parse;
ca9dfc88 5292 }
830247a4 5293 if (*RExC_parse == '-') {
ca9dfc88 5294 flagsp = &negflags;
9d1d55b5 5295 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5296 ++RExC_parse;
ca9dfc88 5297 goto parse_flags;
48c036b1 5298 }
e2509266
JH
5299 RExC_flags |= posflags;
5300 RExC_flags &= ~negflags;
830247a4
IZ
5301 if (*RExC_parse == ':') {
5302 RExC_parse++;
ca9dfc88
IZ
5303 paren = ':';
5304 break;
5305 }
c277df42 5306 unknown:
830247a4
IZ
5307 if (*RExC_parse != ')') {
5308 RExC_parse++;
5309 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5310 }
830247a4 5311 nextchar(pRExC_state);
a0d0e21e
LW
5312 *flagp = TRYAGAIN;
5313 return NULL;
5314 }
5315 }
fac92740 5316 else { /* (...) */
81714fb9 5317 capturing_parens:
830247a4
IZ
5318 parno = RExC_npar;
5319 RExC_npar++;
e2e6a0f1 5320
830247a4 5321 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5322 if (!SIZE_ONLY ){
5323 if (!RExC_nestroot)
5324 RExC_nestroot = parno;
5325 if (RExC_seen & REG_SEEN_RECURSE) {
5326 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5327 "Setting open paren #%"IVdf" to %d\n",
5328 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5329 RExC_open_parens[parno-1]= ret;
5330 }
6bda09f9 5331 }
fac92740
MJD
5332 Set_Node_Length(ret, 1); /* MJD */
5333 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5334 is_open = 1;
a0d0e21e 5335 }
a0ed51b3 5336 }
fac92740 5337 else /* ! paren */
a0d0e21e
LW
5338 ret = NULL;
5339
5340 /* Pick up the branches, linking them together. */
fac92740 5341 parse_start = RExC_parse; /* MJD */
3dab1dad 5342 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5343 /* branch_len = (paren != 0); */
2af232bd 5344
a0d0e21e
LW
5345 if (br == NULL)
5346 return(NULL);
830247a4
IZ
5347 if (*RExC_parse == '|') {
5348 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5349 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5350 }
fac92740 5351 else { /* MJD */
6bda09f9 5352 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5353 Set_Node_Length(br, paren != 0);
5354 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5355 }
c277df42
IZ
5356 have_branch = 1;
5357 if (SIZE_ONLY)
830247a4 5358 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5359 }
5360 else if (paren == ':') {
c277df42
IZ
5361 *flagp |= flags&SIMPLE;
5362 }
6136c704 5363 if (is_open) { /* Starts with OPEN. */
3dab1dad 5364 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5365 }
5366 else if (paren != '?') /* Not Conditional */
a0d0e21e 5367 ret = br;
32a0ca98 5368 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5369 lastbr = br;
830247a4
IZ
5370 while (*RExC_parse == '|') {
5371 if (!SIZE_ONLY && RExC_extralen) {
5372 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5373 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5374 }
5375 if (SIZE_ONLY)
830247a4
IZ
5376 RExC_extralen += 2; /* Account for LONGJMP. */
5377 nextchar(pRExC_state);
3dab1dad 5378 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5379
a687059c 5380 if (br == NULL)
a0d0e21e 5381 return(NULL);
3dab1dad 5382 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5383 lastbr = br;
821b33a5
IZ
5384 if (flags&HASWIDTH)
5385 *flagp |= HASWIDTH;
a687059c 5386 *flagp |= flags&SPSTART;
a0d0e21e
LW
5387 }
5388
c277df42
IZ
5389 if (have_branch || paren != ':') {
5390 /* Make a closing node, and hook it on the end. */
5391 switch (paren) {
5392 case ':':
830247a4 5393 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5394 break;
5395 case 1:
c74340f9 5396 RExC_cpar++;
830247a4 5397 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5398 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5399 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5400 "Setting close paren #%"IVdf" to %d\n",
5401 (IV)parno, REG_NODE_NUM(ender)));
5402 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5403 if (RExC_nestroot == parno)
5404 RExC_nestroot = 0;
40d049e4 5405 }
fac92740
MJD
5406 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5407 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5408 break;
5409 case '<':
c277df42
IZ
5410 case ',':
5411 case '=':
5412 case '!':
c277df42 5413 *flagp &= ~HASWIDTH;
821b33a5
IZ
5414 /* FALL THROUGH */
5415 case '>':
830247a4 5416 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5417 break;
5418 case 0:
830247a4 5419 ender = reg_node(pRExC_state, END);
40d049e4
YO
5420 if (!SIZE_ONLY) {
5421 assert(!RExC_opend); /* there can only be one! */
5422 RExC_opend = ender;
5423 }
c277df42
IZ
5424 break;
5425 }
eaf3ca90 5426 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5427
9674d46a 5428 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5429 if (depth==1)
5430 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5431
c277df42 5432 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5433 for (br = ret; br; br = regnext(br)) {
5434 const U8 op = PL_regkind[OP(br)];
5435 if (op == BRANCH) {
07be1b83 5436 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5437 }
5438 else if (op == BRANCHJ) {
07be1b83 5439 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5440 }
c277df42
IZ
5441 }
5442 }
a0d0e21e 5443 }
c277df42
IZ
5444
5445 {
e1ec3a88
AL
5446 const char *p;
5447 static const char parens[] = "=!<,>";
c277df42
IZ
5448
5449 if (paren && (p = strchr(parens, paren))) {
eb160463 5450 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5451 int flag = (p - parens) > 1;
5452
5453 if (paren == '>')
5454 node = SUSPEND, flag = 0;
6bda09f9 5455 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5456 Set_Node_Cur_Length(ret);
5457 Set_Node_Offset(ret, parse_start + 1);
c277df42 5458 ret->flags = flag;
07be1b83 5459 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5460 }
a0d0e21e
LW
5461 }
5462
5463 /* Check for proper termination. */
ce3e6498 5464 if (paren) {
e2509266 5465 RExC_flags = oregflags;
830247a4
IZ
5466 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5467 RExC_parse = oregcomp_parse;
380a0633 5468 vFAIL("Unmatched (");
ce3e6498 5469 }
a0ed51b3 5470 }
830247a4
IZ
5471 else if (!paren && RExC_parse < RExC_end) {
5472 if (*RExC_parse == ')') {
5473 RExC_parse++;
380a0633 5474 vFAIL("Unmatched )");
a0ed51b3
LW
5475 }
5476 else
b45f050a 5477 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5478 /* NOTREACHED */
5479 }
a687059c 5480
a0d0e21e 5481 return(ret);
a687059c
LW
5482}
5483
5484/*
5485 - regbranch - one alternative of an | operator
5486 *
5487 * Implements the concatenation operator.
5488 */
76e3520e 5489STATIC regnode *
3dab1dad 5490S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5491{
97aff369 5492 dVAR;
c277df42
IZ
5493 register regnode *ret;
5494 register regnode *chain = NULL;
5495 register regnode *latest;
5496 I32 flags = 0, c = 0;
3dab1dad
YO
5497 GET_RE_DEBUG_FLAGS_DECL;
5498 DEBUG_PARSE("brnc");
b81d288d 5499 if (first)
c277df42
IZ
5500 ret = NULL;
5501 else {
b81d288d 5502 if (!SIZE_ONLY && RExC_extralen)
830247a4 5503 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5504 else {
830247a4 5505 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5506 Set_Node_Length(ret, 1);
5507 }
c277df42
IZ
5508 }
5509
b81d288d 5510 if (!first && SIZE_ONLY)
830247a4 5511 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5512
c277df42 5513 *flagp = WORST; /* Tentatively. */
a0d0e21e 5514
830247a4
IZ
5515 RExC_parse--;
5516 nextchar(pRExC_state);
5517 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5518 flags &= ~TRYAGAIN;
3dab1dad 5519 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5520 if (latest == NULL) {
5521 if (flags & TRYAGAIN)
5522 continue;
5523 return(NULL);
a0ed51b3
LW
5524 }
5525 else if (ret == NULL)
c277df42 5526 ret = latest;
a0d0e21e 5527 *flagp |= flags&HASWIDTH;
c277df42 5528 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5529 *flagp |= flags&SPSTART;
5530 else {
830247a4 5531 RExC_naughty++;
3dab1dad 5532 REGTAIL(pRExC_state, chain, latest);
a687059c 5533 }
a0d0e21e 5534 chain = latest;
c277df42
IZ
5535 c++;
5536 }
5537 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5538 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5539 if (ret == NULL)
5540 ret = chain;
5541 }
5542 if (c == 1) {
5543 *flagp |= flags&SIMPLE;
a0d0e21e 5544 }
a687059c 5545
d4c19fe8 5546 return ret;
a687059c
LW
5547}
5548
5549/*
5550 - regpiece - something followed by possible [*+?]
5551 *
5552 * Note that the branching code sequences used for ? and the general cases
5553 * of * and + are somewhat optimized: they use the same NOTHING node as
5554 * both the endmarker for their branch list and the body of the last branch.
5555 * It might seem that this node could be dispensed with entirely, but the
5556 * endmarker role is not redundant.
5557 */
76e3520e 5558STATIC regnode *
3dab1dad 5559S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5560{
97aff369 5561 dVAR;
c277df42 5562 register regnode *ret;
a0d0e21e
LW
5563 register char op;
5564 register char *next;
5565 I32 flags;
1df70142 5566 const char * const origparse = RExC_parse;
a0d0e21e 5567 I32 min;
c277df42 5568 I32 max = REG_INFTY;
fac92740 5569 char *parse_start;
10edeb5d 5570 const char *maxpos = NULL;
3dab1dad
YO
5571 GET_RE_DEBUG_FLAGS_DECL;
5572 DEBUG_PARSE("piec");
a0d0e21e 5573
3dab1dad 5574 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5575 if (ret == NULL) {
5576 if (flags & TRYAGAIN)
5577 *flagp |= TRYAGAIN;
5578 return(NULL);
5579 }
5580
830247a4 5581 op = *RExC_parse;
a0d0e21e 5582
830247a4 5583 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5584 maxpos = NULL;
fac92740 5585 parse_start = RExC_parse; /* MJD */
830247a4 5586 next = RExC_parse + 1;
a0d0e21e
LW
5587 while (isDIGIT(*next) || *next == ',') {
5588 if (*next == ',') {
5589 if (maxpos)
5590 break;
5591 else
5592 maxpos = next;
a687059c 5593 }
a0d0e21e
LW
5594 next++;
5595 }
5596 if (*next == '}') { /* got one */
5597 if (!maxpos)
5598 maxpos = next;
830247a4
IZ
5599 RExC_parse++;
5600 min = atoi(RExC_parse);
a0d0e21e
LW
5601 if (*maxpos == ',')
5602 maxpos++;
5603 else
830247a4 5604 maxpos = RExC_parse;
a0d0e21e
LW
5605 max = atoi(maxpos);
5606 if (!max && *maxpos != '0')
c277df42
IZ
5607 max = REG_INFTY; /* meaning "infinity" */
5608 else if (max >= REG_INFTY)
8615cb43 5609 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5610 RExC_parse = next;
5611 nextchar(pRExC_state);
a0d0e21e
LW
5612
5613 do_curly:
5614 if ((flags&SIMPLE)) {
830247a4 5615 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5616 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5617 Set_Node_Offset(ret, parse_start+1); /* MJD */
5618 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5619 }
5620 else {
3dab1dad 5621 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5622
5623 w->flags = 0;
3dab1dad 5624 REGTAIL(pRExC_state, ret, w);
830247a4 5625 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5626 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5627 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5628 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5629 }
6bda09f9 5630 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5631 /* MJD hk */
5632 Set_Node_Offset(ret, parse_start+1);
2af232bd 5633 Set_Node_Length(ret,
fac92740 5634 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5635
830247a4 5636 if (!SIZE_ONLY && RExC_extralen)
c277df42 5637 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5638 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5639 if (SIZE_ONLY)
830247a4
IZ
5640 RExC_whilem_seen++, RExC_extralen += 3;
5641 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5642 }
c277df42 5643 ret->flags = 0;
a0d0e21e
LW
5644
5645 if (min > 0)
821b33a5
IZ
5646 *flagp = WORST;
5647 if (max > 0)
5648 *flagp |= HASWIDTH;
a0d0e21e 5649 if (max && max < min)
8615cb43 5650 vFAIL("Can't do {n,m} with n > m");
c277df42 5651 if (!SIZE_ONLY) {
eb160463
GS
5652 ARG1_SET(ret, (U16)min);
5653 ARG2_SET(ret, (U16)max);
a687059c 5654 }
a687059c 5655
a0d0e21e 5656 goto nest_check;
a687059c 5657 }
a0d0e21e 5658 }
a687059c 5659
a0d0e21e
LW
5660 if (!ISMULT1(op)) {
5661 *flagp = flags;
a687059c 5662 return(ret);
a0d0e21e 5663 }
bb20fd44 5664
c277df42 5665#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5666
5667 /* if this is reinstated, don't forget to put this back into perldiag:
5668
5669 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5670
5671 (F) The part of the regexp subject to either the * or + quantifier
5672 could match an empty string. The {#} shows in the regular
5673 expression about where the problem was discovered.
5674
5675 */
5676
bb20fd44 5677 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5678 vFAIL("Regexp *+ operand could be empty");
b81d288d 5679#endif
bb20fd44 5680
fac92740 5681 parse_start = RExC_parse;
830247a4 5682 nextchar(pRExC_state);
a0d0e21e 5683
821b33a5 5684 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5685
5686 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5687 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5688 ret->flags = 0;
830247a4 5689 RExC_naughty += 4;
a0d0e21e
LW
5690 }
5691 else if (op == '*') {
5692 min = 0;
5693 goto do_curly;
a0ed51b3
LW
5694 }
5695 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5696 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5697 ret->flags = 0;
830247a4 5698 RExC_naughty += 3;
a0d0e21e
LW
5699 }
5700 else if (op == '+') {
5701 min = 1;
5702 goto do_curly;
a0ed51b3
LW
5703 }
5704 else if (op == '?') {
a0d0e21e
LW
5705 min = 0; max = 1;
5706 goto do_curly;
5707 }
5708 nest_check:
041457d9 5709 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5710 vWARN3(RExC_parse,
b45f050a 5711 "%.*s matches null string many times",
afd78fd5 5712 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5713 origparse);
a0d0e21e
LW
5714 }
5715
b9b4dddf 5716 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5717 nextchar(pRExC_state);
6bda09f9 5718 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5719 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5720 }
b9b4dddf
YO
5721#ifndef REG_ALLOW_MINMOD_SUSPEND
5722 else
5723#endif
5724 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5725 regnode *ender;
5726 nextchar(pRExC_state);
5727 ender = reg_node(pRExC_state, SUCCEED);
5728 REGTAIL(pRExC_state, ret, ender);
5729 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5730 ret->flags = 0;
5731 ender = reg_node(pRExC_state, TAIL);
5732 REGTAIL(pRExC_state, ret, ender);
5733 /*ret= ender;*/
5734 }
5735
5736 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5737 RExC_parse++;
b45f050a
JF
5738 vFAIL("Nested quantifiers");
5739 }
a0d0e21e
LW
5740
5741 return(ret);
a687059c
LW
5742}
5743
fc8cd66c
YO
5744
5745/* reg_namedseq(pRExC_state,UVp)
5746
5747 This is expected to be called by a parser routine that has
5748 recognized'\N' and needs to handle the rest. RExC_parse is
5749 expected to point at the first char following the N at the time
5750 of the call.
5751
5752 If valuep is non-null then it is assumed that we are parsing inside
5753 of a charclass definition and the first codepoint in the resolved
5754 string is returned via *valuep and the routine will return NULL.
5755 In this mode if a multichar string is returned from the charnames
5756 handler a warning will be issued, and only the first char in the
5757 sequence will be examined. If the string returned is zero length
5758 then the value of *valuep is undefined and NON-NULL will
5759 be returned to indicate failure. (This will NOT be a valid pointer
5760 to a regnode.)
5761
5762 If value is null then it is assumed that we are parsing normal text
5763 and inserts a new EXACT node into the program containing the resolved
5764 string and returns a pointer to the new node. If the string is
5765 zerolength a NOTHING node is emitted.
5766
5767 On success RExC_parse is set to the char following the endbrace.
5768 Parsing failures will generate a fatal errorvia vFAIL(...)
5769
5770 NOTE: We cache all results from the charnames handler locally in
5771 the RExC_charnames hash (created on first use) to prevent a charnames
5772 handler from playing silly-buggers and returning a short string and
5773 then a long string for a given pattern. Since the regexp program
5774 size is calculated during an initial parse this would result
5775 in a buffer overrun so we cache to prevent the charname result from
5776 changing during the course of the parse.
5777
5778 */
5779STATIC regnode *
5780S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5781{
5782 char * name; /* start of the content of the name */
5783 char * endbrace; /* endbrace following the name */
5784 SV *sv_str = NULL;
5785 SV *sv_name = NULL;
5786 STRLEN len; /* this has various purposes throughout the code */
5787 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5788 regnode *ret = NULL;
5789
5790 if (*RExC_parse != '{') {
5791 vFAIL("Missing braces on \\N{}");
5792 }
5793 name = RExC_parse+1;
5794 endbrace = strchr(RExC_parse, '}');
5795 if ( ! endbrace ) {
5796 RExC_parse++;
5797 vFAIL("Missing right brace on \\N{}");
5798 }
5799 RExC_parse = endbrace + 1;
5800
5801
5802 /* RExC_parse points at the beginning brace,
5803 endbrace points at the last */
5804 if ( name[0]=='U' && name[1]=='+' ) {
5805 /* its a "unicode hex" notation {U+89AB} */
5806 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5807 | PERL_SCAN_DISALLOW_PREFIX
5808 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5809 UV cp;
196f1508 5810 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5811 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5812 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5813 cp = 0xFFFD;
5814 }
5815 if (cp > 0xff)
5816 RExC_utf8 = 1;
5817 if ( valuep ) {
5818 *valuep = cp;
5819 return NULL;
5820 }
5821 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5822 } else {
5823 /* fetch the charnames handler for this scope */
5824 HV * const table = GvHV(PL_hintgv);
5825 SV **cvp= table ?
5826 hv_fetchs(table, "charnames", FALSE) :
5827 NULL;
5828 SV *cv= cvp ? *cvp : NULL;
5829 HE *he_str;
5830 int count;
5831 /* create an SV with the name as argument */
5832 sv_name = newSVpvn(name, endbrace - name);
5833
5834 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5835 vFAIL2("Constant(\\N{%s}) unknown: "
5836 "(possibly a missing \"use charnames ...\")",
5837 SvPVX(sv_name));
5838 }
5839 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5840 vFAIL2("Constant(\\N{%s}): "
5841 "$^H{charnames} is not defined",SvPVX(sv_name));
5842 }
5843
5844
5845
5846 if (!RExC_charnames) {
5847 /* make sure our cache is allocated */
5848 RExC_charnames = newHV();
6bda09f9 5849 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5850 }
5851 /* see if we have looked this one up before */
5852 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5853 if ( he_str ) {
5854 sv_str = HeVAL(he_str);
5855 cached = 1;
5856 } else {
5857 dSP ;
5858
5859 ENTER ;
5860 SAVETMPS ;
5861 PUSHMARK(SP) ;
5862
5863 XPUSHs(sv_name);
5864
5865 PUTBACK ;
5866
5867 count= call_sv(cv, G_SCALAR);
5868
5869 if (count == 1) { /* XXXX is this right? dmq */
5870 sv_str = POPs;
5871 SvREFCNT_inc_simple_void(sv_str);
5872 }
5873
5874 SPAGAIN ;
5875 PUTBACK ;
5876 FREETMPS ;
5877 LEAVE ;
5878
5879 if ( !sv_str || !SvOK(sv_str) ) {
5880 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5881 "did not return a defined value",SvPVX(sv_name));
5882 }
5883 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5884 cached = 1;
5885 }
5886 }
5887 if (valuep) {
5888 char *p = SvPV(sv_str, len);
5889 if (len) {
5890 STRLEN numlen = 1;
5891 if ( SvUTF8(sv_str) ) {
196f1508 5892 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5893 if (*valuep > 0x7F)
5894 RExC_utf8 = 1;
5895 /* XXXX
5896 We have to turn on utf8 for high bit chars otherwise
5897 we get failures with
5898
5899 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5900 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5901
5902 This is different from what \x{} would do with the same
5903 codepoint, where the condition is > 0xFF.
5904 - dmq
5905 */
5906
5907
5908 } else {
5909 *valuep = (UV)*p;
5910 /* warn if we havent used the whole string? */
5911 }
5912 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5913 vWARN2(RExC_parse,
5914 "Ignoring excess chars from \\N{%s} in character class",
5915 SvPVX(sv_name)
5916 );
5917 }
5918 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5919 vWARN2(RExC_parse,
5920 "Ignoring zero length \\N{%s} in character class",
5921 SvPVX(sv_name)
5922 );
5923 }
5924 if (sv_name)
5925 SvREFCNT_dec(sv_name);
5926 if (!cached)
5927 SvREFCNT_dec(sv_str);
5928 return len ? NULL : (regnode *)&len;
5929 } else if(SvCUR(sv_str)) {
5930
5931 char *s;
5932 char *p, *pend;
5933 STRLEN charlen = 1;
5934 char * parse_start = name-3; /* needed for the offsets */
5935 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5936
5937 ret = reg_node(pRExC_state,
5938 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5939 s= STRING(ret);
5940
5941 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5942 sv_utf8_upgrade(sv_str);
5943 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5944 RExC_utf8= 1;
5945 }
5946
5947 p = SvPV(sv_str, len);
5948 pend = p + len;
5949 /* len is the length written, charlen is the size the char read */
5950 for ( len = 0; p < pend; p += charlen ) {
5951 if (UTF) {
196f1508 5952 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5953 if (FOLD) {
5954 STRLEN foldlen,numlen;
5955 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5956 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5957 /* Emit all the Unicode characters. */
5958
5959 for (foldbuf = tmpbuf;
5960 foldlen;
5961 foldlen -= numlen)
5962 {
5963 uvc = utf8_to_uvchr(foldbuf, &numlen);
5964 if (numlen > 0) {
5965 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5966 s += unilen;
5967 len += unilen;
5968 /* In EBCDIC the numlen
5969 * and unilen can differ. */
5970 foldbuf += numlen;
5971 if (numlen >= foldlen)
5972 break;
5973 }
5974 else
5975 break; /* "Can't happen." */
5976 }
5977 } else {
5978 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5979 if (unilen > 0) {
5980 s += unilen;
5981 len += unilen;
5982 }
5983 }
5984 } else {
5985 len++;
5986 REGC(*p, s++);
5987 }
5988 }
5989 if (SIZE_ONLY) {
5990 RExC_size += STR_SZ(len);
5991 } else {
5992 STR_LEN(ret) = len;
5993 RExC_emit += STR_SZ(len);
5994 }
5995 Set_Node_Cur_Length(ret); /* MJD */
5996 RExC_parse--;
5997 nextchar(pRExC_state);
5998 } else {
5999 ret = reg_node(pRExC_state,NOTHING);
6000 }
6001 if (!cached) {
6002 SvREFCNT_dec(sv_str);
6003 }
6004 if (sv_name) {
6005 SvREFCNT_dec(sv_name);
6006 }
6007 return ret;
6008
6009}
6010
6011
9e08bc66
TS
6012/*
6013 * reg_recode
6014 *
6015 * It returns the code point in utf8 for the value in *encp.
6016 * value: a code value in the source encoding
6017 * encp: a pointer to an Encode object
6018 *
6019 * If the result from Encode is not a single character,
6020 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6021 */
6022STATIC UV
6023S_reg_recode(pTHX_ const char value, SV **encp)
6024{
6025 STRLEN numlen = 1;
6026 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6027 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6028 : SvPVX(sv);
6029 const STRLEN newlen = SvCUR(sv);
6030 UV uv = UNICODE_REPLACEMENT;
6031
6032 if (newlen)
6033 uv = SvUTF8(sv)
6034 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6035 : *(U8*)s;
6036
6037 if (!newlen || numlen != newlen) {
6038 uv = UNICODE_REPLACEMENT;
6039 if (encp)
6040 *encp = NULL;
6041 }
6042 return uv;
6043}
6044
fc8cd66c 6045
a687059c
LW
6046/*
6047 - regatom - the lowest level
6048 *
6049 * Optimization: gobbles an entire sequence of ordinary characters so that
6050 * it can turn them into a single node, which is smaller to store and
6051 * faster to run. Backslashed characters are exceptions, each becoming a
6052 * separate node; the code is simpler that way and it's not worth fixing.
6053 *
7f6f358c
YO
6054 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6055 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6056 */
76e3520e 6057STATIC regnode *
3dab1dad 6058S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6059{
97aff369 6060 dVAR;
cbbf8932 6061 register regnode *ret = NULL;
a0d0e21e 6062 I32 flags;
45948336 6063 char *parse_start = RExC_parse;
3dab1dad
YO
6064 GET_RE_DEBUG_FLAGS_DECL;
6065 DEBUG_PARSE("atom");
a0d0e21e
LW
6066 *flagp = WORST; /* Tentatively. */
6067
6068tryagain:
830247a4 6069 switch (*RExC_parse) {
a0d0e21e 6070 case '^':
830247a4
IZ
6071 RExC_seen_zerolen++;
6072 nextchar(pRExC_state);
e2509266 6073 if (RExC_flags & PMf_MULTILINE)
830247a4 6074 ret = reg_node(pRExC_state, MBOL);
e2509266 6075 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6076 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6077 else
830247a4 6078 ret = reg_node(pRExC_state, BOL);
fac92740 6079 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6080 break;
6081 case '$':
830247a4 6082 nextchar(pRExC_state);
b81d288d 6083 if (*RExC_parse)
830247a4 6084 RExC_seen_zerolen++;
e2509266 6085 if (RExC_flags & PMf_MULTILINE)
830247a4 6086 ret = reg_node(pRExC_state, MEOL);
e2509266 6087 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6088 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6089 else
830247a4 6090 ret = reg_node(pRExC_state, EOL);
fac92740 6091 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6092 break;
6093 case '.':
830247a4 6094 nextchar(pRExC_state);
e2509266 6095 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
6096 ret = reg_node(pRExC_state, SANY);
6097 else
6098 ret = reg_node(pRExC_state, REG_ANY);
6099 *flagp |= HASWIDTH|SIMPLE;
830247a4 6100 RExC_naughty++;
fac92740 6101 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6102 break;
6103 case '[':
b45f050a 6104 {
3dab1dad
YO
6105 char * const oregcomp_parse = ++RExC_parse;
6106 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6107 if (*RExC_parse != ']') {
6108 RExC_parse = oregcomp_parse;
b45f050a
JF
6109 vFAIL("Unmatched [");
6110 }
830247a4 6111 nextchar(pRExC_state);
a0d0e21e 6112 *flagp |= HASWIDTH|SIMPLE;
fac92740 6113 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6114 break;
b45f050a 6115 }
a0d0e21e 6116 case '(':
830247a4 6117 nextchar(pRExC_state);
3dab1dad 6118 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6119 if (ret == NULL) {
bf93d4cc 6120 if (flags & TRYAGAIN) {
830247a4 6121 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6122 /* Make parent create an empty node if needed. */
6123 *flagp |= TRYAGAIN;
6124 return(NULL);
6125 }
a0d0e21e 6126 goto tryagain;
bf93d4cc 6127 }
a0d0e21e
LW
6128 return(NULL);
6129 }
c277df42 6130 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6131 break;
6132 case '|':
6133 case ')':
6134 if (flags & TRYAGAIN) {
6135 *flagp |= TRYAGAIN;
6136 return NULL;
6137 }
b45f050a 6138 vFAIL("Internal urp");
a0d0e21e
LW
6139 /* Supposed to be caught earlier. */
6140 break;
85afd4ae 6141 case '{':
830247a4
IZ
6142 if (!regcurly(RExC_parse)) {
6143 RExC_parse++;
85afd4ae
CS
6144 goto defchar;
6145 }
6146 /* FALL THROUGH */
a0d0e21e
LW
6147 case '?':
6148 case '+':
6149 case '*':
830247a4 6150 RExC_parse++;
b45f050a 6151 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6152 break;
6153 case '\\':
830247a4 6154 switch (*++RExC_parse) {
a0d0e21e 6155 case 'A':
830247a4
IZ
6156 RExC_seen_zerolen++;
6157 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6158 *flagp |= SIMPLE;
830247a4 6159 nextchar(pRExC_state);
fac92740 6160 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6161 break;
6162 case 'G':
830247a4
IZ
6163 ret = reg_node(pRExC_state, GPOS);
6164 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6165 *flagp |= SIMPLE;
830247a4 6166 nextchar(pRExC_state);
fac92740 6167 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6168 break;
6169 case 'Z':
830247a4 6170 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6171 *flagp |= SIMPLE;
a1917ab9 6172 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6173 nextchar(pRExC_state);
a0d0e21e 6174 break;
b85d18e9 6175 case 'z':
830247a4 6176 ret = reg_node(pRExC_state, EOS);
b85d18e9 6177 *flagp |= SIMPLE;
830247a4
IZ
6178 RExC_seen_zerolen++; /* Do not optimize RE away */
6179 nextchar(pRExC_state);
fac92740 6180 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6181 break;
4a2d328f 6182 case 'C':
f33976b4
DB
6183 ret = reg_node(pRExC_state, CANY);
6184 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6185 *flagp |= HASWIDTH|SIMPLE;
830247a4 6186 nextchar(pRExC_state);
fac92740 6187 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6188 break;
6189 case 'X':
830247a4 6190 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6191 *flagp |= HASWIDTH;
830247a4 6192 nextchar(pRExC_state);
fac92740 6193 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6194 break;
a0d0e21e 6195 case 'w':
eb160463 6196 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6197 *flagp |= HASWIDTH|SIMPLE;
830247a4 6198 nextchar(pRExC_state);
fac92740 6199 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6200 break;
6201 case 'W':
eb160463 6202 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6203 *flagp |= HASWIDTH|SIMPLE;
830247a4 6204 nextchar(pRExC_state);
fac92740 6205 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6206 break;
6207 case 'b':
830247a4
IZ
6208 RExC_seen_zerolen++;
6209 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6210 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6211 *flagp |= SIMPLE;
830247a4 6212 nextchar(pRExC_state);
fac92740 6213 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6214 break;
6215 case 'B':
830247a4
IZ
6216 RExC_seen_zerolen++;
6217 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6218 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6219 *flagp |= SIMPLE;
830247a4 6220 nextchar(pRExC_state);
fac92740 6221 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6222 break;
6223 case 's':
eb160463 6224 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6225 *flagp |= HASWIDTH|SIMPLE;
830247a4 6226 nextchar(pRExC_state);
fac92740 6227 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6228 break;
6229 case 'S':
eb160463 6230 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6231 *flagp |= HASWIDTH|SIMPLE;
830247a4 6232 nextchar(pRExC_state);
fac92740 6233 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6234 break;
6235 case 'd':
ffc61ed2 6236 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6237 *flagp |= HASWIDTH|SIMPLE;
830247a4 6238 nextchar(pRExC_state);
fac92740 6239 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6240 break;
6241 case 'D':
ffc61ed2 6242 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6243 *flagp |= HASWIDTH|SIMPLE;
830247a4 6244 nextchar(pRExC_state);
fac92740 6245 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6246 break;
a14b48bc
LW
6247 case 'p':
6248 case 'P':
3568d838 6249 {
3dab1dad 6250 char* const oldregxend = RExC_end;
ccb2c380 6251 char* parse_start = RExC_parse - 2;
a14b48bc 6252
830247a4 6253 if (RExC_parse[1] == '{') {
3568d838 6254 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6255 RExC_end = strchr(RExC_parse, '}');
6256 if (!RExC_end) {
3dab1dad 6257 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6258 RExC_parse += 2;
6259 RExC_end = oldregxend;
0da60cf5 6260 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6261 }
830247a4 6262 RExC_end++;
a14b48bc 6263 }
af6f566e 6264 else {
830247a4 6265 RExC_end = RExC_parse + 2;
af6f566e
HS
6266 if (RExC_end > oldregxend)
6267 RExC_end = oldregxend;
6268 }
830247a4 6269 RExC_parse--;
a14b48bc 6270
3dab1dad 6271 ret = regclass(pRExC_state,depth+1);
a14b48bc 6272
830247a4
IZ
6273 RExC_end = oldregxend;
6274 RExC_parse--;
ccb2c380
MP
6275
6276 Set_Node_Offset(ret, parse_start + 2);
6277 Set_Node_Cur_Length(ret);
830247a4 6278 nextchar(pRExC_state);
a14b48bc
LW
6279 *flagp |= HASWIDTH|SIMPLE;
6280 }
6281 break;
fc8cd66c
YO
6282 case 'N':
6283 /* Handle \N{NAME} here and not below because it can be
6284 multicharacter. join_exact() will join them up later on.
6285 Also this makes sure that things like /\N{BLAH}+/ and
6286 \N{BLAH} being multi char Just Happen. dmq*/
6287 ++RExC_parse;
6288 ret= reg_namedseq(pRExC_state, NULL);
6289 break;
0a4db386 6290 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6291 {
6292 char ch= RExC_parse[1];
6293 if (ch != '<' && ch != '\'') {
6294 if (SIZE_ONLY)
6295 vWARN( RExC_parse + 1,
6296 "Possible broken named back reference treated as literal k");
6297 parse_start--;
6298 goto defchar;
6299 } else {
6300 char* name_start = (RExC_parse += 2);
6301 I32 num = 0;
0a4db386
YO
6302 SV *sv_dat = reg_scan_name(pRExC_state,
6303 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6304 ch= (ch == '<') ? '>' : '\'';
6305
81714fb9
YO
6306 if (RExC_parse == name_start || *RExC_parse != ch)
6307 vFAIL2("Sequence \\k%c... not terminated",
6308 (ch == '>' ? '<' : ch));
6309
6310 RExC_sawback = 1;
6311 ret = reganode(pRExC_state,
6312 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6313 num);
6314 *flagp |= HASWIDTH;
6315
6316
6317 if (!SIZE_ONLY) {
81714fb9
YO
6318 num = add_data( pRExC_state, 1, "S" );
6319 ARG_SET(ret,num);
6320 RExC_rx->data->data[num]=(void*)sv_dat;
6321 SvREFCNT_inc(sv_dat);
6322 }
6323 /* override incorrect value set in reganode MJD */
6324 Set_Node_Offset(ret, parse_start+1);
6325 Set_Node_Cur_Length(ret); /* MJD */
6326 nextchar(pRExC_state);
6327
6328 }
6329 break;
6330 }
a0d0e21e
LW
6331 case 'n':
6332 case 'r':
6333 case 't':
6334 case 'f':
6335 case 'e':
6336 case 'a':
6337 case 'x':
6338 case 'c':
6339 case '0':
6340 goto defchar;
c74340f9 6341 case 'R':
a0d0e21e
LW
6342 case '1': case '2': case '3': case '4':
6343 case '5': case '6': case '7': case '8': case '9':
6344 {
c74340f9
YO
6345 I32 num;
6346 bool isrel=(*RExC_parse=='R');
6347 if (isrel)
6348 RExC_parse++;
6349 num = atoi(RExC_parse);
6350 if (isrel) {
6351 num = RExC_cpar - num;
6352 if (num < 1)
6353 vFAIL("Reference to nonexistent or unclosed group");
6354 }
830247a4 6355 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6356 goto defchar;
6357 else {
3dab1dad 6358 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6359 while (isDIGIT(*RExC_parse))
6360 RExC_parse++;
b45f050a 6361
c74340f9
YO
6362 if (!SIZE_ONLY) {
6363 if (num > (I32)RExC_rx->nparens)
6364 vFAIL("Reference to nonexistent group");
6365 /* People make this error all the time apparently.
6366 So we cant fail on it, even though we should
6367
6368 else if (num >= RExC_cpar)
6369 vFAIL("Reference to unclosed group will always match");
6370 */
6371 }
830247a4 6372 RExC_sawback = 1;
eb160463
GS
6373 ret = reganode(pRExC_state,
6374 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6375 num);
a0d0e21e 6376 *flagp |= HASWIDTH;
2af232bd 6377
fac92740 6378 /* override incorrect value set in reganode MJD */
2af232bd 6379 Set_Node_Offset(ret, parse_start+1);
fac92740 6380 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6381 RExC_parse--;
6382 nextchar(pRExC_state);
a0d0e21e
LW
6383 }
6384 }
6385 break;
6386 case '\0':
830247a4 6387 if (RExC_parse >= RExC_end)
b45f050a 6388 FAIL("Trailing \\");
a0d0e21e
LW
6389 /* FALL THROUGH */
6390 default:
a0288114 6391 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6392 back into the quick-grab loop below */
45948336 6393 parse_start--;
a0d0e21e
LW
6394 goto defchar;
6395 }
6396 break;
4633a7c4
LW
6397
6398 case '#':
e2509266 6399 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6400 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6401 RExC_parse++;
830247a4 6402 if (RExC_parse < RExC_end)
4633a7c4
LW
6403 goto tryagain;
6404 }
6405 /* FALL THROUGH */
6406
a0d0e21e 6407 default: {
ba210ebe 6408 register STRLEN len;
58ae7d3f 6409 register UV ender;
a0d0e21e 6410 register char *p;
3dab1dad 6411 char *s;
80aecb99 6412 STRLEN foldlen;
89ebb4a3 6413 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6414
6415 parse_start = RExC_parse - 1;
a0d0e21e 6416
830247a4 6417 RExC_parse++;
a0d0e21e
LW
6418
6419 defchar:
58ae7d3f 6420 ender = 0;
eb160463
GS
6421 ret = reg_node(pRExC_state,
6422 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6423 s = STRING(ret);
830247a4
IZ
6424 for (len = 0, p = RExC_parse - 1;
6425 len < 127 && p < RExC_end;
a0d0e21e
LW
6426 len++)
6427 {
3dab1dad 6428 char * const oldp = p;
5b5a24f7 6429
e2509266 6430 if (RExC_flags & PMf_EXTENDED)
830247a4 6431 p = regwhite(p, RExC_end);
a0d0e21e
LW
6432 switch (*p) {
6433 case '^':
6434 case '$':
6435 case '.':
6436 case '[':
6437 case '(':
6438 case ')':
6439 case '|':
6440 goto loopdone;
6441 case '\\':
6442 switch (*++p) {
6443 case 'A':
1ed8eac0
JF
6444 case 'C':
6445 case 'X':
a0d0e21e
LW
6446 case 'G':
6447 case 'Z':
b85d18e9 6448 case 'z':
a0d0e21e
LW
6449 case 'w':
6450 case 'W':
6451 case 'b':
6452 case 'B':
6453 case 's':
6454 case 'S':
6455 case 'd':
6456 case 'D':
a14b48bc
LW
6457 case 'p':
6458 case 'P':
fc8cd66c 6459 case 'N':
c74340f9 6460 case 'R':
a0d0e21e
LW
6461 --p;
6462 goto loopdone;
6463 case 'n':
6464 ender = '\n';
6465 p++;
a687059c 6466 break;
a0d0e21e
LW
6467 case 'r':
6468 ender = '\r';
6469 p++;
a687059c 6470 break;
a0d0e21e
LW
6471 case 't':
6472 ender = '\t';
6473 p++;
a687059c 6474 break;
a0d0e21e
LW
6475 case 'f':
6476 ender = '\f';
6477 p++;
a687059c 6478 break;
a0d0e21e 6479 case 'e':
c7f1f016 6480 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6481 p++;
a687059c 6482 break;
a0d0e21e 6483 case 'a':
c7f1f016 6484 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6485 p++;
a687059c 6486 break;
a0d0e21e 6487 case 'x':
a0ed51b3 6488 if (*++p == '{') {
1df70142 6489 char* const e = strchr(p, '}');
b81d288d 6490
b45f050a 6491 if (!e) {
830247a4 6492 RExC_parse = p + 1;
b45f050a
JF
6493 vFAIL("Missing right brace on \\x{}");
6494 }
de5f0749 6495 else {
a4c04bdc
NC
6496 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6497 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6498 STRLEN numlen = e - p - 1;
53305cf1 6499 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6500 if (ender > 0xff)
6501 RExC_utf8 = 1;
a0ed51b3
LW
6502 p = e + 1;
6503 }
a0ed51b3
LW
6504 }
6505 else {
a4c04bdc 6506 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6507 STRLEN numlen = 2;
53305cf1 6508 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6509 p += numlen;
6510 }
9e08bc66
TS
6511 if (PL_encoding && ender < 0x100)
6512 goto recode_encoding;
a687059c 6513 break;
a0d0e21e
LW
6514 case 'c':
6515 p++;
bbce6d69 6516 ender = UCHARAT(p++);
6517 ender = toCTRL(ender);
a687059c 6518 break;
a0d0e21e
LW
6519 case '0': case '1': case '2': case '3':case '4':
6520 case '5': case '6': case '7': case '8':case '9':
6521 if (*p == '0' ||
830247a4 6522 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6523 I32 flags = 0;
1df70142 6524 STRLEN numlen = 3;
53305cf1 6525 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6526 p += numlen;
6527 }
6528 else {
6529 --p;
6530 goto loopdone;
a687059c 6531 }
9e08bc66
TS
6532 if (PL_encoding && ender < 0x100)
6533 goto recode_encoding;
6534 break;
6535 recode_encoding:
6536 {
6537 SV* enc = PL_encoding;
6538 ender = reg_recode((const char)(U8)ender, &enc);
6539 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6540 vWARN(p, "Invalid escape in the specified encoding");
6541 RExC_utf8 = 1;
6542 }
a687059c 6543 break;
a0d0e21e 6544 case '\0':
830247a4 6545 if (p >= RExC_end)
b45f050a 6546 FAIL("Trailing \\");
a687059c 6547 /* FALL THROUGH */
a0d0e21e 6548 default:
041457d9 6549 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6550 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6551 goto normal_default;
a0d0e21e
LW
6552 }
6553 break;
a687059c 6554 default:
a0ed51b3 6555 normal_default:
fd400ab9 6556 if (UTF8_IS_START(*p) && UTF) {
1df70142 6557 STRLEN numlen;
5e12f4fb 6558 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6559 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6560 p += numlen;
6561 }
6562 else
6563 ender = *p++;
a0d0e21e 6564 break;
a687059c 6565 }
e2509266 6566 if (RExC_flags & PMf_EXTENDED)
830247a4 6567 p = regwhite(p, RExC_end);
60a8b682
JH
6568 if (UTF && FOLD) {
6569 /* Prime the casefolded buffer. */
ac7e0132 6570 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6571 }
a0d0e21e
LW
6572 if (ISMULT2(p)) { /* Back off on ?+*. */
6573 if (len)
6574 p = oldp;
16ea2a2e 6575 else if (UTF) {
80aecb99 6576 if (FOLD) {
60a8b682 6577 /* Emit all the Unicode characters. */
1df70142 6578 STRLEN numlen;
80aecb99
JH
6579 for (foldbuf = tmpbuf;
6580 foldlen;
6581 foldlen -= numlen) {
6582 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6583 if (numlen > 0) {
71207a34 6584 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6585 s += unilen;
6586 len += unilen;
6587 /* In EBCDIC the numlen
6588 * and unilen can differ. */
9dc45d57 6589 foldbuf += numlen;
47654450
JH
6590 if (numlen >= foldlen)
6591 break;
9dc45d57
JH
6592 }
6593 else
6594 break; /* "Can't happen." */
80aecb99
JH
6595 }
6596 }
6597 else {
71207a34 6598 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6599 if (unilen > 0) {
0ebc6274
JH
6600 s += unilen;
6601 len += unilen;
9dc45d57 6602 }
80aecb99 6603 }
a0ed51b3 6604 }
a0d0e21e
LW
6605 else {
6606 len++;
eb160463 6607 REGC((char)ender, s++);
a0d0e21e
LW
6608 }
6609 break;
a687059c 6610 }
16ea2a2e 6611 if (UTF) {
80aecb99 6612 if (FOLD) {
60a8b682 6613 /* Emit all the Unicode characters. */
1df70142 6614 STRLEN numlen;
80aecb99
JH
6615 for (foldbuf = tmpbuf;
6616 foldlen;
6617 foldlen -= numlen) {
6618 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6619 if (numlen > 0) {
71207a34 6620 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6621 len += unilen;
6622 s += unilen;
6623 /* In EBCDIC the numlen
6624 * and unilen can differ. */
9dc45d57 6625 foldbuf += numlen;
47654450
JH
6626 if (numlen >= foldlen)
6627 break;
9dc45d57
JH
6628 }
6629 else
6630 break;
80aecb99
JH
6631 }
6632 }
6633 else {
71207a34 6634 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6635 if (unilen > 0) {
0ebc6274
JH
6636 s += unilen;
6637 len += unilen;
9dc45d57 6638 }
80aecb99
JH
6639 }
6640 len--;
a0ed51b3
LW
6641 }
6642 else
eb160463 6643 REGC((char)ender, s++);
a0d0e21e
LW
6644 }
6645 loopdone:
830247a4 6646 RExC_parse = p - 1;
fac92740 6647 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6648 nextchar(pRExC_state);
793db0cb
JH
6649 {
6650 /* len is STRLEN which is unsigned, need to copy to signed */
6651 IV iv = len;
6652 if (iv < 0)
6653 vFAIL("Internal disaster");
6654 }
a0d0e21e
LW
6655 if (len > 0)
6656 *flagp |= HASWIDTH;
090f7165 6657 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6658 *flagp |= SIMPLE;
3dab1dad 6659
cd439c50 6660 if (SIZE_ONLY)
830247a4 6661 RExC_size += STR_SZ(len);
3dab1dad
YO
6662 else {
6663 STR_LEN(ret) = len;
830247a4 6664 RExC_emit += STR_SZ(len);
07be1b83 6665 }
3dab1dad 6666 }
a0d0e21e
LW
6667 break;
6668 }
a687059c 6669
a0d0e21e 6670 return(ret);
a687059c
LW
6671}
6672
873ef191 6673STATIC char *
5f66b61c 6674S_regwhite(char *p, const char *e)
5b5a24f7
CS
6675{
6676 while (p < e) {
6677 if (isSPACE(*p))
6678 ++p;
6679 else if (*p == '#') {
6680 do {
6681 p++;
6682 } while (p < e && *p != '\n');
6683 }
6684 else
6685 break;
6686 }
6687 return p;
6688}
6689
b8c5462f
JH
6690/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6691 Character classes ([:foo:]) can also be negated ([:^foo:]).
6692 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6693 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6694 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6695
6696#define POSIXCC_DONE(c) ((c) == ':')
6697#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6698#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6699
b8c5462f 6700STATIC I32
830247a4 6701S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6702{
97aff369 6703 dVAR;
936ed897 6704 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6705
830247a4 6706 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6707 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6708 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6709 const char c = UCHARAT(RExC_parse);
097eb12c 6710 char* const s = RExC_parse++;
b81d288d 6711
9a86a77b 6712 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6713 RExC_parse++;
6714 if (RExC_parse == RExC_end)
620e46c5 6715 /* Grandfather lone [:, [=, [. */
830247a4 6716 RExC_parse = s;
620e46c5 6717 else {
3dab1dad 6718 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6719 assert(*t == c);
6720
9a86a77b 6721 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6722 const char *posixcc = s + 1;
830247a4 6723 RExC_parse++; /* skip over the ending ] */
3dab1dad 6724
b8c5462f 6725 if (*s == ':') {
1df70142
AL
6726 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6727 const I32 skip = t - posixcc;
80916619
NC
6728
6729 /* Initially switch on the length of the name. */
6730 switch (skip) {
6731 case 4:
3dab1dad
YO
6732 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6733 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6734 break;
80916619
NC
6735 case 5:
6736 /* Names all of length 5. */
6737 /* alnum alpha ascii blank cntrl digit graph lower
6738 print punct space upper */
6739 /* Offset 4 gives the best switch position. */
6740 switch (posixcc[4]) {
6741 case 'a':
3dab1dad
YO
6742 if (memEQ(posixcc, "alph", 4)) /* alpha */
6743 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6744 break;
6745 case 'e':
3dab1dad
YO
6746 if (memEQ(posixcc, "spac", 4)) /* space */
6747 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6748 break;
6749 case 'h':
3dab1dad
YO
6750 if (memEQ(posixcc, "grap", 4)) /* graph */
6751 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6752 break;
6753 case 'i':
3dab1dad
YO
6754 if (memEQ(posixcc, "asci", 4)) /* ascii */
6755 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6756 break;
6757 case 'k':
3dab1dad
YO
6758 if (memEQ(posixcc, "blan", 4)) /* blank */
6759 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6760 break;
6761 case 'l':
3dab1dad
YO
6762 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6763 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6764 break;
6765 case 'm':
3dab1dad
YO
6766 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6767 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6768 break;
6769 case 'r':
3dab1dad
YO
6770 if (memEQ(posixcc, "lowe", 4)) /* lower */
6771 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6772 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6773 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6774 break;
6775 case 't':
3dab1dad
YO
6776 if (memEQ(posixcc, "digi", 4)) /* digit */
6777 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6778 else if (memEQ(posixcc, "prin", 4)) /* print */
6779 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6780 else if (memEQ(posixcc, "punc", 4)) /* punct */
6781 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6782 break;
b8c5462f
JH
6783 }
6784 break;
80916619 6785 case 6:
3dab1dad
YO
6786 if (memEQ(posixcc, "xdigit", 6))
6787 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6788 break;
6789 }
80916619
NC
6790
6791 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6792 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6793 t - s - 1, s + 1);
80916619
NC
6794 assert (posixcc[skip] == ':');
6795 assert (posixcc[skip+1] == ']');
b45f050a 6796 } else if (!SIZE_ONLY) {
b8c5462f 6797 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6798
830247a4 6799 /* adjust RExC_parse so the warning shows after
b45f050a 6800 the class closes */
9a86a77b 6801 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6802 RExC_parse++;
b45f050a
JF
6803 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6804 }
b8c5462f
JH
6805 } else {
6806 /* Maternal grandfather:
6807 * "[:" ending in ":" but not in ":]" */
830247a4 6808 RExC_parse = s;
767d463e 6809 }
620e46c5
JH
6810 }
6811 }
6812
b8c5462f
JH
6813 return namedclass;
6814}
6815
6816STATIC void
830247a4 6817S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6818{
97aff369 6819 dVAR;
3dab1dad 6820 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6821 const char *s = RExC_parse;
6822 const char c = *s++;
b8c5462f 6823
3dab1dad 6824 while (isALNUM(*s))
b8c5462f
JH
6825 s++;
6826 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6827 if (ckWARN(WARN_REGEXP))
6828 vWARN3(s+2,
6829 "POSIX syntax [%c %c] belongs inside character classes",
6830 c, c);
b45f050a
JF
6831
6832 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6833 if (POSIXCC_NOTYET(c)) {
830247a4 6834 /* adjust RExC_parse so the error shows after
b45f050a 6835 the class closes */
9a86a77b 6836 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6837 NOOP;
b45f050a
JF
6838 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6839 }
b8c5462f
JH
6840 }
6841 }
620e46c5
JH
6842}
6843
7f6f358c
YO
6844
6845/*
6846 parse a class specification and produce either an ANYOF node that
6847 matches the pattern. If the pattern matches a single char only and
6848 that char is < 256 then we produce an EXACT node instead.
6849*/
76e3520e 6850STATIC regnode *
3dab1dad 6851S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6852{
97aff369 6853 dVAR;
9ef43ace 6854 register UV value = 0;
9a86a77b 6855 register UV nextvalue;
3568d838 6856 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6857 register IV range = 0;
c277df42 6858 register regnode *ret;
ba210ebe 6859 STRLEN numlen;
ffc61ed2 6860 IV namedclass;
cbbf8932 6861 char *rangebegin = NULL;
936ed897 6862 bool need_class = 0;
c445ea15 6863 SV *listsv = NULL;
ffc61ed2 6864 UV n;
9e55ce06 6865 bool optimize_invert = TRUE;
cbbf8932 6866 AV* unicode_alternate = NULL;
1b2d223b
JH
6867#ifdef EBCDIC
6868 UV literal_endpoint = 0;
6869#endif
7f6f358c 6870 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6871
3dab1dad 6872 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6873 case we need to change the emitted regop to an EXACT. */
07be1b83 6874 const char * orig_parse = RExC_parse;
72f13be8 6875 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6876#ifndef DEBUGGING
6877 PERL_UNUSED_ARG(depth);
6878#endif
72f13be8 6879
3dab1dad 6880 DEBUG_PARSE("clas");
7f6f358c
YO
6881
6882 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6883 ret = reganode(pRExC_state, ANYOF, 0);
6884
6885 if (!SIZE_ONLY)
6886 ANYOF_FLAGS(ret) = 0;
6887
9a86a77b 6888 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6889 RExC_naughty++;
6890 RExC_parse++;
6891 if (!SIZE_ONLY)
6892 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6893 }
a0d0e21e 6894
73060fc4 6895 if (SIZE_ONLY) {
830247a4 6896 RExC_size += ANYOF_SKIP;
73060fc4
JH
6897 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6898 }
936ed897 6899 else {
830247a4 6900 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6901 if (FOLD)
6902 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6903 if (LOC)
6904 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6905 ANYOF_BITMAP_ZERO(ret);
396482e1 6906 listsv = newSVpvs("# comment\n");
a0d0e21e 6907 }
b8c5462f 6908
9a86a77b
JH
6909 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6910
b938889d 6911 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6912 checkposixcc(pRExC_state);
b8c5462f 6913
f064b6ad
HS
6914 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6915 if (UCHARAT(RExC_parse) == ']')
6916 goto charclassloop;
ffc61ed2 6917
fc8cd66c 6918parseit:
9a86a77b 6919 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6920
6921 charclassloop:
6922
6923 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6924
73b437c8 6925 if (!range)
830247a4 6926 rangebegin = RExC_parse;
ffc61ed2 6927 if (UTF) {
5e12f4fb 6928 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6929 RExC_end - RExC_parse,
9f7f3913 6930 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6931 RExC_parse += numlen;
6932 }
6933 else
6934 value = UCHARAT(RExC_parse++);
7f6f358c 6935
9a86a77b
JH
6936 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6937 if (value == '[' && POSIXCC(nextvalue))
830247a4 6938 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6939 else if (value == '\\') {
ffc61ed2 6940 if (UTF) {
5e12f4fb 6941 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6942 RExC_end - RExC_parse,
9f7f3913 6943 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6944 RExC_parse += numlen;
6945 }
6946 else
6947 value = UCHARAT(RExC_parse++);
470c3474 6948 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6949 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6950 * be a problem later if we want switch on Unicode.
6951 * A similar issue a little bit later when switching on
6952 * namedclass. --jhi */
ffc61ed2 6953 switch ((I32)value) {
b8c5462f
JH
6954 case 'w': namedclass = ANYOF_ALNUM; break;
6955 case 'W': namedclass = ANYOF_NALNUM; break;
6956 case 's': namedclass = ANYOF_SPACE; break;
6957 case 'S': namedclass = ANYOF_NSPACE; break;
6958 case 'd': namedclass = ANYOF_DIGIT; break;
6959 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6960 case 'N': /* Handle \N{NAME} in class */
6961 {
6962 /* We only pay attention to the first char of
6963 multichar strings being returned. I kinda wonder
6964 if this makes sense as it does change the behaviour
6965 from earlier versions, OTOH that behaviour was broken
6966 as well. */
6967 UV v; /* value is register so we cant & it /grrr */
6968 if (reg_namedseq(pRExC_state, &v)) {
6969 goto parseit;
6970 }
6971 value= v;
6972 }
6973 break;
ffc61ed2
JH
6974 case 'p':
6975 case 'P':
3dab1dad
YO
6976 {
6977 char *e;
af6f566e 6978 if (RExC_parse >= RExC_end)
2a4859cd 6979 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6980 if (*RExC_parse == '{') {
1df70142 6981 const U8 c = (U8)value;
ffc61ed2
JH
6982 e = strchr(RExC_parse++, '}');
6983 if (!e)
0da60cf5 6984 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6985 while (isSPACE(UCHARAT(RExC_parse)))
6986 RExC_parse++;
6987 if (e == RExC_parse)
0da60cf5 6988 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6989 n = e - RExC_parse;
ab13f0c7
JH
6990 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6991 n--;
ffc61ed2
JH
6992 }
6993 else {
6994 e = RExC_parse;
6995 n = 1;
6996 }
6997 if (!SIZE_ONLY) {
ab13f0c7
JH
6998 if (UCHARAT(RExC_parse) == '^') {
6999 RExC_parse++;
7000 n--;
7001 value = value == 'p' ? 'P' : 'p'; /* toggle */
7002 while (isSPACE(UCHARAT(RExC_parse))) {
7003 RExC_parse++;
7004 n--;
7005 }
7006 }
097eb12c
AL
7007 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7008 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7009 }
7010 RExC_parse = e + 1;
7011 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7012 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7013 }
f81125e2 7014 break;
b8c5462f
JH
7015 case 'n': value = '\n'; break;
7016 case 'r': value = '\r'; break;
7017 case 't': value = '\t'; break;
7018 case 'f': value = '\f'; break;
7019 case 'b': value = '\b'; break;
c7f1f016
NIS
7020 case 'e': value = ASCII_TO_NATIVE('\033');break;
7021 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7022 case 'x':
ffc61ed2 7023 if (*RExC_parse == '{') {
a4c04bdc
NC
7024 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7025 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7026 char * const e = strchr(RExC_parse++, '}');
b81d288d 7027 if (!e)
ffc61ed2 7028 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7029
7030 numlen = e - RExC_parse;
7031 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7032 RExC_parse = e + 1;
7033 }
7034 else {
a4c04bdc 7035 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7036 numlen = 2;
7037 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7038 RExC_parse += numlen;
7039 }
9e08bc66
TS
7040 if (PL_encoding && value < 0x100)
7041 goto recode_encoding;
b8c5462f
JH
7042 break;
7043 case 'c':
830247a4 7044 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7045 value = toCTRL(value);
7046 break;
7047 case '0': case '1': case '2': case '3': case '4':
7048 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7049 {
7050 I32 flags = 0;
7051 numlen = 3;
7052 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7053 RExC_parse += numlen;
7054 if (PL_encoding && value < 0x100)
7055 goto recode_encoding;
7056 break;
7057 }
7058 recode_encoding:
7059 {
7060 SV* enc = PL_encoding;
7061 value = reg_recode((const char)(U8)value, &enc);
7062 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7063 vWARN(RExC_parse,
7064 "Invalid escape in the specified encoding");
7065 break;
7066 }
1028017a 7067 default:
041457d9 7068 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7069 vWARN2(RExC_parse,
7070 "Unrecognized escape \\%c in character class passed through",
7071 (int)value);
1028017a 7072 break;
b8c5462f 7073 }
ffc61ed2 7074 } /* end of \blah */
1b2d223b
JH
7075#ifdef EBCDIC
7076 else
7077 literal_endpoint++;
7078#endif
ffc61ed2
JH
7079
7080 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7081
7082 if (!SIZE_ONLY && !need_class)
936ed897 7083 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7084
936ed897 7085 need_class = 1;
ffc61ed2
JH
7086
7087 /* a bad range like a-\d, a-[:digit:] ? */
7088 if (range) {
73b437c8 7089 if (!SIZE_ONLY) {
afd78fd5 7090 if (ckWARN(WARN_REGEXP)) {
097eb12c 7091 const int w =
afd78fd5
JH
7092 RExC_parse >= rangebegin ?
7093 RExC_parse - rangebegin : 0;
830247a4 7094 vWARN4(RExC_parse,
b45f050a 7095 "False [] range \"%*.*s\"",
097eb12c 7096 w, w, rangebegin);
afd78fd5 7097 }
3568d838
JH
7098 if (prevvalue < 256) {
7099 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7100 ANYOF_BITMAP_SET(ret, '-');
7101 }
7102 else {
7103 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7104 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7105 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7106 }
b8c5462f 7107 }
ffc61ed2
JH
7108
7109 range = 0; /* this was not a true range */
73b437c8 7110 }
ffc61ed2 7111
73b437c8 7112 if (!SIZE_ONLY) {
c49a72a9
NC
7113 const char *what = NULL;
7114 char yesno = 0;
7115
3568d838
JH
7116 if (namedclass > OOB_NAMEDCLASS)
7117 optimize_invert = FALSE;
e2962f66
JH
7118 /* Possible truncation here but in some 64-bit environments
7119 * the compiler gets heartburn about switch on 64-bit values.
7120 * A similar issue a little earlier when switching on value.
98f323fa 7121 * --jhi */
e2962f66 7122 switch ((I32)namedclass) {
73b437c8
JH
7123 case ANYOF_ALNUM:
7124 if (LOC)
936ed897 7125 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7126 else {
7127 for (value = 0; value < 256; value++)
7128 if (isALNUM(value))
936ed897 7129 ANYOF_BITMAP_SET(ret, value);
73b437c8 7130 }
c49a72a9
NC
7131 yesno = '+';
7132 what = "Word";
73b437c8
JH
7133 break;
7134 case ANYOF_NALNUM:
7135 if (LOC)
936ed897 7136 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7137 else {
7138 for (value = 0; value < 256; value++)
7139 if (!isALNUM(value))
936ed897 7140 ANYOF_BITMAP_SET(ret, value);
73b437c8 7141 }
c49a72a9
NC
7142 yesno = '!';
7143 what = "Word";
73b437c8 7144 break;
ffc61ed2 7145 case ANYOF_ALNUMC:
73b437c8 7146 if (LOC)
ffc61ed2 7147 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7148 else {
7149 for (value = 0; value < 256; value++)
ffc61ed2 7150 if (isALNUMC(value))
936ed897 7151 ANYOF_BITMAP_SET(ret, value);
73b437c8 7152 }
c49a72a9
NC
7153 yesno = '+';
7154 what = "Alnum";
73b437c8
JH
7155 break;
7156 case ANYOF_NALNUMC:
7157 if (LOC)
936ed897 7158 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7159 else {
7160 for (value = 0; value < 256; value++)
7161 if (!isALNUMC(value))
936ed897 7162 ANYOF_BITMAP_SET(ret, value);
73b437c8 7163 }
c49a72a9
NC
7164 yesno = '!';
7165 what = "Alnum";
73b437c8
JH
7166 break;
7167 case ANYOF_ALPHA:
7168 if (LOC)
936ed897 7169 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7170 else {
7171 for (value = 0; value < 256; value++)
7172 if (isALPHA(value))
936ed897 7173 ANYOF_BITMAP_SET(ret, value);
73b437c8 7174 }
c49a72a9
NC
7175 yesno = '+';
7176 what = "Alpha";
73b437c8
JH
7177 break;
7178 case ANYOF_NALPHA:
7179 if (LOC)
936ed897 7180 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7181 else {
7182 for (value = 0; value < 256; value++)
7183 if (!isALPHA(value))
936ed897 7184 ANYOF_BITMAP_SET(ret, value);
73b437c8 7185 }
c49a72a9
NC
7186 yesno = '!';
7187 what = "Alpha";
73b437c8
JH
7188 break;
7189 case ANYOF_ASCII:
7190 if (LOC)
936ed897 7191 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7192 else {
c7f1f016 7193#ifndef EBCDIC
1ba5c669
JH
7194 for (value = 0; value < 128; value++)
7195 ANYOF_BITMAP_SET(ret, value);
7196#else /* EBCDIC */
ffbc6a93 7197 for (value = 0; value < 256; value++) {
3a3c4447
JH
7198 if (isASCII(value))
7199 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7200 }
1ba5c669 7201#endif /* EBCDIC */
73b437c8 7202 }
c49a72a9
NC
7203 yesno = '+';
7204 what = "ASCII";
73b437c8
JH
7205 break;
7206 case ANYOF_NASCII:
7207 if (LOC)
936ed897 7208 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7209 else {
c7f1f016 7210#ifndef EBCDIC
1ba5c669
JH
7211 for (value = 128; value < 256; value++)
7212 ANYOF_BITMAP_SET(ret, value);
7213#else /* EBCDIC */
ffbc6a93 7214 for (value = 0; value < 256; value++) {
3a3c4447
JH
7215 if (!isASCII(value))
7216 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7217 }
1ba5c669 7218#endif /* EBCDIC */
73b437c8 7219 }
c49a72a9
NC
7220 yesno = '!';
7221 what = "ASCII";
73b437c8 7222 break;
aaa51d5e
JF
7223 case ANYOF_BLANK:
7224 if (LOC)
7225 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7226 else {
7227 for (value = 0; value < 256; value++)
7228 if (isBLANK(value))
7229 ANYOF_BITMAP_SET(ret, value);
7230 }
c49a72a9
NC
7231 yesno = '+';
7232 what = "Blank";
aaa51d5e
JF
7233 break;
7234 case ANYOF_NBLANK:
7235 if (LOC)
7236 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7237 else {
7238 for (value = 0; value < 256; value++)
7239 if (!isBLANK(value))
7240 ANYOF_BITMAP_SET(ret, value);
7241 }
c49a72a9
NC
7242 yesno = '!';
7243 what = "Blank";
aaa51d5e 7244 break;
73b437c8
JH
7245 case ANYOF_CNTRL:
7246 if (LOC)
936ed897 7247 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7248 else {
7249 for (value = 0; value < 256; value++)
7250 if (isCNTRL(value))
936ed897 7251 ANYOF_BITMAP_SET(ret, value);
73b437c8 7252 }
c49a72a9
NC
7253 yesno = '+';
7254 what = "Cntrl";
73b437c8
JH
7255 break;
7256 case ANYOF_NCNTRL:
7257 if (LOC)
936ed897 7258 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7259 else {
7260 for (value = 0; value < 256; value++)
7261 if (!isCNTRL(value))
936ed897 7262 ANYOF_BITMAP_SET(ret, value);
73b437c8 7263 }
c49a72a9
NC
7264 yesno = '!';
7265 what = "Cntrl";
ffc61ed2
JH
7266 break;
7267 case ANYOF_DIGIT:
7268 if (LOC)
7269 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7270 else {
7271 /* consecutive digits assumed */
7272 for (value = '0'; value <= '9'; value++)
7273 ANYOF_BITMAP_SET(ret, value);
7274 }
c49a72a9
NC
7275 yesno = '+';
7276 what = "Digit";
ffc61ed2
JH
7277 break;
7278 case ANYOF_NDIGIT:
7279 if (LOC)
7280 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7281 else {
7282 /* consecutive digits assumed */
7283 for (value = 0; value < '0'; value++)
7284 ANYOF_BITMAP_SET(ret, value);
7285 for (value = '9' + 1; value < 256; value++)
7286 ANYOF_BITMAP_SET(ret, value);
7287 }
c49a72a9
NC
7288 yesno = '!';
7289 what = "Digit";
73b437c8
JH
7290 break;
7291 case ANYOF_GRAPH:
7292 if (LOC)
936ed897 7293 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7294 else {
7295 for (value = 0; value < 256; value++)
7296 if (isGRAPH(value))
936ed897 7297 ANYOF_BITMAP_SET(ret, value);
73b437c8 7298 }
c49a72a9
NC
7299 yesno = '+';
7300 what = "Graph";
73b437c8
JH
7301 break;
7302 case ANYOF_NGRAPH:
7303 if (LOC)
936ed897 7304 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7305 else {
7306 for (value = 0; value < 256; value++)
7307 if (!isGRAPH(value))
936ed897 7308 ANYOF_BITMAP_SET(ret, value);
73b437c8 7309 }
c49a72a9
NC
7310 yesno = '!';
7311 what = "Graph";
73b437c8
JH
7312 break;
7313 case ANYOF_LOWER:
7314 if (LOC)
936ed897 7315 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7316 else {
7317 for (value = 0; value < 256; value++)
7318 if (isLOWER(value))
936ed897 7319 ANYOF_BITMAP_SET(ret, value);
73b437c8 7320 }
c49a72a9
NC
7321 yesno = '+';
7322 what = "Lower";
73b437c8
JH
7323 break;
7324 case ANYOF_NLOWER:
7325 if (LOC)
936ed897 7326 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7327 else {
7328 for (value = 0; value < 256; value++)
7329 if (!isLOWER(value))
936ed897 7330 ANYOF_BITMAP_SET(ret, value);
73b437c8 7331 }
c49a72a9
NC
7332 yesno = '!';
7333 what = "Lower";
73b437c8
JH
7334 break;
7335 case ANYOF_PRINT:
7336 if (LOC)
936ed897 7337 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7338 else {
7339 for (value = 0; value < 256; value++)
7340 if (isPRINT(value))
936ed897 7341 ANYOF_BITMAP_SET(ret, value);
73b437c8 7342 }
c49a72a9
NC
7343 yesno = '+';
7344 what = "Print";
73b437c8
JH
7345 break;
7346 case ANYOF_NPRINT:
7347 if (LOC)
936ed897 7348 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7349 else {
7350 for (value = 0; value < 256; value++)
7351 if (!isPRINT(value))
936ed897 7352 ANYOF_BITMAP_SET(ret, value);
73b437c8 7353 }
c49a72a9
NC
7354 yesno = '!';
7355 what = "Print";
73b437c8 7356 break;
aaa51d5e
JF
7357 case ANYOF_PSXSPC:
7358 if (LOC)
7359 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7360 else {
7361 for (value = 0; value < 256; value++)
7362 if (isPSXSPC(value))
7363 ANYOF_BITMAP_SET(ret, value);
7364 }
c49a72a9
NC
7365 yesno = '+';
7366 what = "Space";
aaa51d5e
JF
7367 break;
7368 case ANYOF_NPSXSPC:
7369 if (LOC)
7370 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7371 else {
7372 for (value = 0; value < 256; value++)
7373 if (!isPSXSPC(value))
7374 ANYOF_BITMAP_SET(ret, value);
7375 }
c49a72a9
NC
7376 yesno = '!';
7377 what = "Space";
aaa51d5e 7378 break;
73b437c8
JH
7379 case ANYOF_PUNCT:
7380 if (LOC)
936ed897 7381 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7382 else {
7383 for (value = 0; value < 256; value++)
7384 if (isPUNCT(value))
936ed897 7385 ANYOF_BITMAP_SET(ret, value);
73b437c8 7386 }
c49a72a9
NC
7387 yesno = '+';
7388 what = "Punct";
73b437c8
JH
7389 break;
7390 case ANYOF_NPUNCT:
7391 if (LOC)
936ed897 7392 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7393 else {
7394 for (value = 0; value < 256; value++)
7395 if (!isPUNCT(value))
936ed897 7396 ANYOF_BITMAP_SET(ret, value);
73b437c8 7397 }
c49a72a9
NC
7398 yesno = '!';
7399 what = "Punct";
ffc61ed2
JH
7400 break;
7401 case ANYOF_SPACE:
7402 if (LOC)
7403 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7404 else {
7405 for (value = 0; value < 256; value++)
7406 if (isSPACE(value))
7407 ANYOF_BITMAP_SET(ret, value);
7408 }
c49a72a9
NC
7409 yesno = '+';
7410 what = "SpacePerl";
ffc61ed2
JH
7411 break;
7412 case ANYOF_NSPACE:
7413 if (LOC)
7414 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7415 else {
7416 for (value = 0; value < 256; value++)
7417 if (!isSPACE(value))
7418 ANYOF_BITMAP_SET(ret, value);
7419 }
c49a72a9
NC
7420 yesno = '!';
7421 what = "SpacePerl";
73b437c8
JH
7422 break;
7423 case ANYOF_UPPER:
7424 if (LOC)
936ed897 7425 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7426 else {
7427 for (value = 0; value < 256; value++)
7428 if (isUPPER(value))
936ed897 7429 ANYOF_BITMAP_SET(ret, value);
73b437c8 7430 }
c49a72a9
NC
7431 yesno = '+';
7432 what = "Upper";
73b437c8
JH
7433 break;
7434 case ANYOF_NUPPER:
7435 if (LOC)
936ed897 7436 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7437 else {
7438 for (value = 0; value < 256; value++)
7439 if (!isUPPER(value))
936ed897 7440 ANYOF_BITMAP_SET(ret, value);
73b437c8 7441 }
c49a72a9
NC
7442 yesno = '!';
7443 what = "Upper";
73b437c8
JH
7444 break;
7445 case ANYOF_XDIGIT:
7446 if (LOC)
936ed897 7447 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7448 else {
7449 for (value = 0; value < 256; value++)
7450 if (isXDIGIT(value))
936ed897 7451 ANYOF_BITMAP_SET(ret, value);
73b437c8 7452 }
c49a72a9
NC
7453 yesno = '+';
7454 what = "XDigit";
73b437c8
JH
7455 break;
7456 case ANYOF_NXDIGIT:
7457 if (LOC)
936ed897 7458 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7459 else {
7460 for (value = 0; value < 256; value++)
7461 if (!isXDIGIT(value))
936ed897 7462 ANYOF_BITMAP_SET(ret, value);
73b437c8 7463 }
c49a72a9
NC
7464 yesno = '!';
7465 what = "XDigit";
73b437c8 7466 break;
f81125e2
JP
7467 case ANYOF_MAX:
7468 /* this is to handle \p and \P */
7469 break;
73b437c8 7470 default:
b45f050a 7471 vFAIL("Invalid [::] class");
73b437c8 7472 break;
b8c5462f 7473 }
c49a72a9
NC
7474 if (what) {
7475 /* Strings such as "+utf8::isWord\n" */
7476 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7477 }
b8c5462f 7478 if (LOC)
936ed897 7479 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7480 continue;
a0d0e21e 7481 }
ffc61ed2
JH
7482 } /* end of namedclass \blah */
7483
a0d0e21e 7484 if (range) {
eb160463 7485 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7486 const int w = RExC_parse - rangebegin;
7487 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7488 range = 0; /* not a valid range */
73b437c8 7489 }
a0d0e21e
LW
7490 }
7491 else {
3568d838 7492 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7493 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7494 RExC_parse[1] != ']') {
7495 RExC_parse++;
ffc61ed2
JH
7496
7497 /* a bad range like \w-, [:word:]- ? */
7498 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7499 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7500 const int w =
afd78fd5
JH
7501 RExC_parse >= rangebegin ?
7502 RExC_parse - rangebegin : 0;
830247a4 7503 vWARN4(RExC_parse,
b45f050a 7504 "False [] range \"%*.*s\"",
097eb12c 7505 w, w, rangebegin);
afd78fd5 7506 }
73b437c8 7507 if (!SIZE_ONLY)
936ed897 7508 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7509 } else
ffc61ed2
JH
7510 range = 1; /* yeah, it's a range! */
7511 continue; /* but do it the next time */
a0d0e21e 7512 }
a687059c 7513 }
ffc61ed2 7514
93733859 7515 /* now is the next time */
07be1b83 7516 /*stored += (value - prevvalue + 1);*/
ae5c130c 7517 if (!SIZE_ONLY) {
3568d838 7518 if (prevvalue < 256) {
1df70142 7519 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7520 IV i;
3568d838 7521#ifdef EBCDIC
1b2d223b
JH
7522 /* In EBCDIC [\x89-\x91] should include
7523 * the \x8e but [i-j] should not. */
7524 if (literal_endpoint == 2 &&
7525 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7526 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7527 {
3568d838
JH
7528 if (isLOWER(prevvalue)) {
7529 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7530 if (isLOWER(i))
7531 ANYOF_BITMAP_SET(ret, i);
7532 } else {
3568d838 7533 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7534 if (isUPPER(i))
7535 ANYOF_BITMAP_SET(ret, i);
7536 }
8ada0baa 7537 }
ffc61ed2 7538 else
8ada0baa 7539#endif
07be1b83
YO
7540 for (i = prevvalue; i <= ceilvalue; i++) {
7541 if (!ANYOF_BITMAP_TEST(ret,i)) {
7542 stored++;
7543 ANYOF_BITMAP_SET(ret, i);
7544 }
7545 }
3568d838 7546 }
a5961de5 7547 if (value > 255 || UTF) {
1df70142
AL
7548 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7549 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7550 stored+=2; /* can't optimize this class */
ffc61ed2 7551 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7552 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7553 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7554 prevnatvalue, natvalue);
7555 }
7556 else if (prevnatvalue == natvalue) {
7557 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7558 if (FOLD) {
89ebb4a3 7559 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7560 STRLEN foldlen;
1df70142 7561 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7562
e294cc5d
JH
7563#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7564 if (RExC_precomp[0] == ':' &&
7565 RExC_precomp[1] == '[' &&
7566 (f == 0xDF || f == 0x92)) {
7567 f = NATIVE_TO_UNI(f);
7568 }
7569#endif
c840d2a2
JH
7570 /* If folding and foldable and a single
7571 * character, insert also the folded version
7572 * to the charclass. */
9e55ce06 7573 if (f != value) {
e294cc5d
JH
7574#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7575 if ((RExC_precomp[0] == ':' &&
7576 RExC_precomp[1] == '[' &&
7577 (f == 0xA2 &&
7578 (value == 0xFB05 || value == 0xFB06))) ?
7579 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7580 foldlen == (STRLEN)UNISKIP(f) )
7581#else
eb160463 7582 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7583#endif
9e55ce06
JH
7584 Perl_sv_catpvf(aTHX_ listsv,
7585 "%04"UVxf"\n", f);
7586 else {
7587 /* Any multicharacter foldings
7588 * require the following transform:
7589 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7590 * where E folds into "pq" and F folds
7591 * into "rst", all other characters
7592 * fold to single characters. We save
7593 * away these multicharacter foldings,
7594 * to be later saved as part of the
7595 * additional "s" data. */
7596 SV *sv;
7597
7598 if (!unicode_alternate)
7599 unicode_alternate = newAV();
7600 sv = newSVpvn((char*)foldbuf, foldlen);
7601 SvUTF8_on(sv);
7602 av_push(unicode_alternate, sv);
7603 }
7604 }
254ba52a 7605
60a8b682
JH
7606 /* If folding and the value is one of the Greek
7607 * sigmas insert a few more sigmas to make the
7608 * folding rules of the sigmas to work right.
7609 * Note that not all the possible combinations
7610 * are handled here: some of them are handled
9e55ce06
JH
7611 * by the standard folding rules, and some of
7612 * them (literal or EXACTF cases) are handled
7613 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7614 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7615 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7616 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7617 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7618 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7619 }
7620 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7621 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7622 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7623 }
7624 }
ffc61ed2 7625 }
1b2d223b
JH
7626#ifdef EBCDIC
7627 literal_endpoint = 0;
7628#endif
8ada0baa 7629 }
ffc61ed2
JH
7630
7631 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7632 }
ffc61ed2 7633
936ed897 7634 if (need_class) {
4f66b38d 7635 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7636 if (SIZE_ONLY)
830247a4 7637 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7638 else
830247a4 7639 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7640 }
ffc61ed2 7641
7f6f358c
YO
7642
7643 if (SIZE_ONLY)
7644 return ret;
7645 /****** !SIZE_ONLY AFTER HERE *********/
7646
7647 if( stored == 1 && value < 256
7648 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7649 ) {
7650 /* optimize single char class to an EXACT node
7651 but *only* when its not a UTF/high char */
07be1b83
YO
7652 const char * cur_parse= RExC_parse;
7653 RExC_emit = (regnode *)orig_emit;
7654 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7655 ret = reg_node(pRExC_state,
7656 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7657 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7658 *STRING(ret)= (char)value;
7659 STR_LEN(ret)= 1;
7660 RExC_emit += STR_SZ(1);
7661 return ret;
7662 }
ae5c130c 7663 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7664 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7665 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7666 ) {
a0ed51b3 7667 for (value = 0; value < 256; ++value) {
936ed897 7668 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7669 UV fold = PL_fold[value];
ffc61ed2
JH
7670
7671 if (fold != value)
7672 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7673 }
7674 }
936ed897 7675 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7676 }
ffc61ed2 7677
ae5c130c 7678 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7679 if (optimize_invert &&
ffc61ed2
JH
7680 /* If the only flag is inversion. */
7681 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7682 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7683 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7684 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7685 }
7f6f358c 7686 {
097eb12c 7687 AV * const av = newAV();
ffc61ed2 7688 SV *rv;
9e55ce06 7689 /* The 0th element stores the character class description
6a0407ee 7690 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7691 * to initialize the appropriate swash (which gets stored in
7692 * the 1st element), and also useful for dumping the regnode.
7693 * The 2nd element stores the multicharacter foldings,
6a0407ee 7694 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7695 av_store(av, 0, listsv);
7696 av_store(av, 1, NULL);
9e55ce06 7697 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7698 rv = newRV_noinc((SV*)av);
19860706 7699 n = add_data(pRExC_state, 1, "s");
830247a4 7700 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7701 ARG_SET(ret, n);
a0ed51b3 7702 }
a0ed51b3
LW
7703 return ret;
7704}
7705
76e3520e 7706STATIC char*
830247a4 7707S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7708{
097eb12c 7709 char* const retval = RExC_parse++;
a0d0e21e 7710
4633a7c4 7711 for (;;) {
830247a4
IZ
7712 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7713 RExC_parse[2] == '#') {
e994fd66
AE
7714 while (*RExC_parse != ')') {
7715 if (RExC_parse == RExC_end)
7716 FAIL("Sequence (?#... not terminated");
830247a4 7717 RExC_parse++;
e994fd66 7718 }
830247a4 7719 RExC_parse++;
4633a7c4
LW
7720 continue;
7721 }
e2509266 7722 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7723 if (isSPACE(*RExC_parse)) {
7724 RExC_parse++;
748a9306
LW
7725 continue;
7726 }
830247a4 7727 else if (*RExC_parse == '#') {
e994fd66
AE
7728 while (RExC_parse < RExC_end)
7729 if (*RExC_parse++ == '\n') break;
748a9306
LW
7730 continue;
7731 }
748a9306 7732 }
4633a7c4 7733 return retval;
a0d0e21e 7734 }
a687059c
LW
7735}
7736
7737/*
c277df42 7738- reg_node - emit a node
a0d0e21e 7739*/
76e3520e 7740STATIC regnode * /* Location. */
830247a4 7741S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7742{
97aff369 7743 dVAR;
c277df42 7744 register regnode *ptr;
504618e9 7745 regnode * const ret = RExC_emit;
07be1b83 7746 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7747
c277df42 7748 if (SIZE_ONLY) {
830247a4
IZ
7749 SIZE_ALIGN(RExC_size);
7750 RExC_size += 1;
a0d0e21e
LW
7751 return(ret);
7752 }
e2e6a0f1
YO
7753#ifdef DEBUGGING
7754 if (OP(RExC_emit) == 255)
7755 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7756 reg_name[op], OP(RExC_emit));
7757#endif
c277df42 7758 NODE_ALIGN_FILL(ret);
a0d0e21e 7759 ptr = ret;
c277df42 7760 FILL_ADVANCE_NODE(ptr, op);
fac92740 7761 if (RExC_offsets) { /* MJD */
07be1b83 7762 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7763 "reg_node", __LINE__,
7764 reg_name[op],
07be1b83
YO
7765 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7766 ? "Overwriting end of array!\n" : "OK",
7767 (UV)(RExC_emit - RExC_emit_start),
7768 (UV)(RExC_parse - RExC_start),
7769 (UV)RExC_offsets[0]));
ccb2c380 7770 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7771 }
07be1b83 7772
830247a4 7773 RExC_emit = ptr;
a0d0e21e 7774 return(ret);
a687059c
LW
7775}
7776
7777/*
a0d0e21e
LW
7778- reganode - emit a node with an argument
7779*/
76e3520e 7780STATIC regnode * /* Location. */
830247a4 7781S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7782{
97aff369 7783 dVAR;
c277df42 7784 register regnode *ptr;
504618e9 7785 regnode * const ret = RExC_emit;
07be1b83 7786 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7787
c277df42 7788 if (SIZE_ONLY) {
830247a4
IZ
7789 SIZE_ALIGN(RExC_size);
7790 RExC_size += 2;
6bda09f9
YO
7791 /*
7792 We can't do this:
7793
7794 assert(2==regarglen[op]+1);
7795
7796 Anything larger than this has to allocate the extra amount.
7797 If we changed this to be:
7798
7799 RExC_size += (1 + regarglen[op]);
7800
7801 then it wouldn't matter. Its not clear what side effect
7802 might come from that so its not done so far.
7803 -- dmq
7804 */
a0d0e21e
LW
7805 return(ret);
7806 }
e2e6a0f1
YO
7807#ifdef DEBUGGING
7808 if (OP(RExC_emit) == 255)
7809 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7810#endif
c277df42 7811 NODE_ALIGN_FILL(ret);
a0d0e21e 7812 ptr = ret;
c277df42 7813 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7814 if (RExC_offsets) { /* MJD */
07be1b83 7815 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7816 "reganode",
ccb2c380
MP
7817 __LINE__,
7818 reg_name[op],
07be1b83 7819 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7820 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7821 (UV)(RExC_emit - RExC_emit_start),
7822 (UV)(RExC_parse - RExC_start),
7823 (UV)RExC_offsets[0]));
ccb2c380 7824 Set_Cur_Node_Offset;
fac92740
MJD
7825 }
7826
830247a4 7827 RExC_emit = ptr;
a0d0e21e 7828 return(ret);
fe14fcc3
LW
7829}
7830
7831/*
cd439c50 7832- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7833*/
71207a34
AL
7834STATIC STRLEN
7835S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7836{
97aff369 7837 dVAR;
71207a34 7838 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7839}
7840
7841/*
a0d0e21e
LW
7842- reginsert - insert an operator in front of already-emitted operand
7843*
7844* Means relocating the operand.
7845*/
76e3520e 7846STATIC void
6bda09f9 7847S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7848{
97aff369 7849 dVAR;
c277df42
IZ
7850 register regnode *src;
7851 register regnode *dst;
7852 register regnode *place;
504618e9 7853 const int offset = regarglen[(U8)op];
6bda09f9 7854 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7855 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7856/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7857 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7858 if (SIZE_ONLY) {
6bda09f9 7859 RExC_size += size;
a0d0e21e
LW
7860 return;
7861 }
a687059c 7862
830247a4 7863 src = RExC_emit;
6bda09f9 7864 RExC_emit += size;
830247a4 7865 dst = RExC_emit;
40d049e4 7866 if (RExC_open_parens) {
6bda09f9 7867 int paren;
6d99fb9b 7868 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7869 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7870 if ( RExC_open_parens[paren] >= opnd ) {
7871 DEBUG_PARSE_FMT("open"," - %d",size);
7872 RExC_open_parens[paren] += size;
7873 } else {
7874 DEBUG_PARSE_FMT("open"," - %s","ok");
7875 }
7876 if ( RExC_close_parens[paren] >= opnd ) {
7877 DEBUG_PARSE_FMT("close"," - %d",size);
7878 RExC_close_parens[paren] += size;
7879 } else {
7880 DEBUG_PARSE_FMT("close"," - %s","ok");
7881 }
7882 }
6bda09f9 7883 }
40d049e4 7884
fac92740 7885 while (src > opnd) {
c277df42 7886 StructCopy(--src, --dst, regnode);
fac92740 7887 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7888 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7889 "reg_insert",
ccb2c380
MP
7890 __LINE__,
7891 reg_name[op],
07be1b83
YO
7892 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7893 ? "Overwriting end of array!\n" : "OK",
7894 (UV)(src - RExC_emit_start),
7895 (UV)(dst - RExC_emit_start),
7896 (UV)RExC_offsets[0]));
ccb2c380
MP
7897 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7898 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7899 }
7900 }
7901
a0d0e21e
LW
7902
7903 place = opnd; /* Op node, where operand used to be. */
fac92740 7904 if (RExC_offsets) { /* MJD */
07be1b83 7905 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7906 "reginsert",
ccb2c380
MP
7907 __LINE__,
7908 reg_name[op],
07be1b83 7909 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7910 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7911 (UV)(place - RExC_emit_start),
7912 (UV)(RExC_parse - RExC_start),
786e8c11 7913 (UV)RExC_offsets[0]));
ccb2c380 7914 Set_Node_Offset(place, RExC_parse);
45948336 7915 Set_Node_Length(place, 1);
fac92740 7916 }
c277df42
IZ
7917 src = NEXTOPER(place);
7918 FILL_ADVANCE_NODE(place, op);
7919 Zero(src, offset, regnode);
a687059c
LW
7920}
7921
7922/*
c277df42 7923- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7924- SEE ALSO: regtail_study
a0d0e21e 7925*/
097eb12c 7926/* TODO: All three parms should be const */
76e3520e 7927STATIC void
3dab1dad 7928S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7929{
97aff369 7930 dVAR;
c277df42 7931 register regnode *scan;
72f13be8 7932 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7933#ifndef DEBUGGING
7934 PERL_UNUSED_ARG(depth);
7935#endif
a0d0e21e 7936
c277df42 7937 if (SIZE_ONLY)
a0d0e21e
LW
7938 return;
7939
7940 /* Find last node. */
7941 scan = p;
7942 for (;;) {
504618e9 7943 regnode * const temp = regnext(scan);
3dab1dad
YO
7944 DEBUG_PARSE_r({
7945 SV * const mysv=sv_newmortal();
7946 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7947 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7948 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7949 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7950 (temp == NULL ? "->" : ""),
7951 (temp == NULL ? reg_name[OP(val)] : "")
7952 );
3dab1dad
YO
7953 });
7954 if (temp == NULL)
7955 break;
7956 scan = temp;
7957 }
7958
7959 if (reg_off_by_arg[OP(scan)]) {
7960 ARG_SET(scan, val - scan);
7961 }
7962 else {
7963 NEXT_OFF(scan) = val - scan;
7964 }
7965}
7966
07be1b83 7967#ifdef DEBUGGING
3dab1dad
YO
7968/*
7969- regtail_study - set the next-pointer at the end of a node chain of p to val.
7970- Look for optimizable sequences at the same time.
7971- currently only looks for EXACT chains.
07be1b83
YO
7972
7973This is expermental code. The idea is to use this routine to perform
7974in place optimizations on branches and groups as they are constructed,
7975with the long term intention of removing optimization from study_chunk so
7976that it is purely analytical.
7977
7978Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7979to control which is which.
7980
3dab1dad
YO
7981*/
7982/* TODO: All four parms should be const */
07be1b83 7983
3dab1dad
YO
7984STATIC U8
7985S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7986{
7987 dVAR;
7988 register regnode *scan;
07be1b83
YO
7989 U8 exact = PSEUDO;
7990#ifdef EXPERIMENTAL_INPLACESCAN
7991 I32 min = 0;
7992#endif
7993
3dab1dad
YO
7994 GET_RE_DEBUG_FLAGS_DECL;
7995
07be1b83 7996
3dab1dad
YO
7997 if (SIZE_ONLY)
7998 return exact;
7999
8000 /* Find last node. */
8001
8002 scan = p;
8003 for (;;) {
8004 regnode * const temp = regnext(scan);
07be1b83
YO
8005#ifdef EXPERIMENTAL_INPLACESCAN
8006 if (PL_regkind[OP(scan)] == EXACT)
8007 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8008 return EXACT;
8009#endif
3dab1dad
YO
8010 if ( exact ) {
8011 switch (OP(scan)) {
8012 case EXACT:
8013 case EXACTF:
8014 case EXACTFL:
8015 if( exact == PSEUDO )
8016 exact= OP(scan);
07be1b83
YO
8017 else if ( exact != OP(scan) )
8018 exact= 0;
3dab1dad
YO
8019 case NOTHING:
8020 break;
8021 default:
8022 exact= 0;
8023 }
8024 }
8025 DEBUG_PARSE_r({
8026 SV * const mysv=sv_newmortal();
8027 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8028 regprop(RExC_rx, mysv, scan);
eaf3ca90 8029 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8030 SvPV_nolen_const(mysv),
eaf3ca90
YO
8031 REG_NODE_NUM(scan),
8032 reg_name[exact]);
3dab1dad 8033 });
a0d0e21e
LW
8034 if (temp == NULL)
8035 break;
8036 scan = temp;
8037 }
07be1b83
YO
8038 DEBUG_PARSE_r({
8039 SV * const mysv_val=sv_newmortal();
8040 DEBUG_PARSE_MSG("");
8041 regprop(RExC_rx, mysv_val, val);
8042 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8043 SvPV_nolen_const(mysv_val),
8044 REG_NODE_NUM(val),
8045 val - scan
8046 );
8047 });
c277df42
IZ
8048 if (reg_off_by_arg[OP(scan)]) {
8049 ARG_SET(scan, val - scan);
a0ed51b3
LW
8050 }
8051 else {
c277df42
IZ
8052 NEXT_OFF(scan) = val - scan;
8053 }
3dab1dad
YO
8054
8055 return exact;
a687059c 8056}
07be1b83 8057#endif
a687059c
LW
8058
8059/*
a687059c
LW
8060 - regcurly - a little FSA that accepts {\d+,?\d*}
8061 */
79072805 8062STATIC I32
5f66b61c 8063S_regcurly(register const char *s)
a687059c
LW
8064{
8065 if (*s++ != '{')
8066 return FALSE;
f0fcb552 8067 if (!isDIGIT(*s))
a687059c 8068 return FALSE;
f0fcb552 8069 while (isDIGIT(*s))
a687059c
LW
8070 s++;
8071 if (*s == ',')
8072 s++;
f0fcb552 8073 while (isDIGIT(*s))
a687059c
LW
8074 s++;
8075 if (*s != '}')
8076 return FALSE;
8077 return TRUE;
8078}
8079
a687059c
LW
8080
8081/*
fd181c75 8082 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8083 */
8084void
097eb12c 8085Perl_regdump(pTHX_ const regexp *r)
a687059c 8086{
35ff7856 8087#ifdef DEBUGGING
97aff369 8088 dVAR;
c445ea15 8089 SV * const sv = sv_newmortal();
ab3bbdeb 8090 SV *dsv= sv_newmortal();
a687059c 8091
786e8c11 8092 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8093
8094 /* Header fields of interest. */
ab3bbdeb
YO
8095 if (r->anchored_substr) {
8096 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8097 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8098 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8099 "anchored %s%s at %"IVdf" ",
8100 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8101 (IV)r->anchored_offset);
ab3bbdeb
YO
8102 } else if (r->anchored_utf8) {
8103 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8104 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8105 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8106 "anchored utf8 %s%s at %"IVdf" ",
8107 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8108 (IV)r->anchored_offset);
ab3bbdeb
YO
8109 }
8110 if (r->float_substr) {
8111 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8112 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8113 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8114 "floating %s%s at %"IVdf"..%"UVuf" ",
8115 s, RE_SV_TAIL(r->float_substr),
7b0972df 8116 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8117 } else if (r->float_utf8) {
8118 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8119 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8120 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8121 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8122 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8123 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8124 }
33b8afdf 8125 if (r->check_substr || r->check_utf8)
b81d288d 8126 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8127 (const char *)
8128 (r->check_substr == r->float_substr
8129 && r->check_utf8 == r->float_utf8
8130 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
8131 if (r->reganch & ROPT_NOSCAN)
8132 PerlIO_printf(Perl_debug_log, " noscan");
8133 if (r->reganch & ROPT_CHECK_ALL)
8134 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8135 if (r->check_substr || r->check_utf8)
c277df42
IZ
8136 PerlIO_printf(Perl_debug_log, ") ");
8137
46fc3d4c 8138 if (r->regstclass) {
32fc9b6a 8139 regprop(r, sv, r->regstclass);
1de06328 8140 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8141 }
774d564b 8142 if (r->reganch & ROPT_ANCH) {
8143 PerlIO_printf(Perl_debug_log, "anchored");
8144 if (r->reganch & ROPT_ANCH_BOL)
8145 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
8146 if (r->reganch & ROPT_ANCH_MBOL)
8147 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
8148 if (r->reganch & ROPT_ANCH_SBOL)
8149 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 8150 if (r->reganch & ROPT_ANCH_GPOS)
8151 PerlIO_printf(Perl_debug_log, "(GPOS)");
8152 PerlIO_putc(Perl_debug_log, ' ');
8153 }
c277df42 8154 if (r->reganch & ROPT_GPOS_SEEN)
58e23c8d 8155 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", r->gofs);
a0d0e21e 8156 if (r->reganch & ROPT_SKIP)
760ac839 8157 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8158 if (r->reganch & ROPT_IMPLICIT)
760ac839 8159 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8160 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8161 if (r->reganch & ROPT_EVAL_SEEN)
8162 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8163 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8164#else
96a5add6 8165 PERL_UNUSED_CONTEXT;
65e66c80 8166 PERL_UNUSED_ARG(r);
17c3b450 8167#endif /* DEBUGGING */
a687059c
LW
8168}
8169
8170/*
a0d0e21e
LW
8171- regprop - printable representation of opcode
8172*/
46fc3d4c 8173void
32fc9b6a 8174Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8175{
35ff7856 8176#ifdef DEBUGGING
97aff369 8177 dVAR;
9b155405 8178 register int k;
1de06328 8179 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8180
54dc92de 8181 sv_setpvn(sv, "", 0);
8aa23a47 8182
03363afd 8183 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8184 /* It would be nice to FAIL() here, but this may be called from
8185 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8186 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8187 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8188
3dab1dad 8189 k = PL_regkind[OP(o)];
9b155405 8190
2a782b5b 8191 if (k == EXACT) {
396482e1 8192 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8193 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8194 * is a crude hack but it may be the best for now since
8195 * we have no flag "this EXACTish node was UTF-8"
8196 * --jhi */
8197 const char * const s =
ddc5bc0f 8198 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8199 PL_colors[0], PL_colors[1],
8200 PERL_PV_ESCAPE_UNI_DETECT |
8201 PERL_PV_PRETTY_ELIPSES |
8202 PERL_PV_PRETTY_LTGT
8203 );
8204 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8205 } else if (k == TRIE) {
3dab1dad 8206 /* print the details of the trie in dumpuntil instead, as
4f639d21 8207 * prog->data isn't available here */
1de06328
YO
8208 const char op = OP(o);
8209 const I32 n = ARG(o);
8210 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8211 (reg_ac_data *)prog->data->data[n] :
8212 NULL;
8213 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8214 (reg_trie_data*)prog->data->data[n] :
8215 ac->trie;
8216
8217 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8218 DEBUG_TRIE_COMPILE_r(
8219 Perl_sv_catpvf(aTHX_ sv,
8220 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8221 (UV)trie->startstate,
1e2e3d02 8222 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8223 (UV)trie->wordcount,
8224 (UV)trie->minlen,
8225 (UV)trie->maxlen,
8226 (UV)TRIE_CHARCOUNT(trie),
8227 (UV)trie->uniquecharcount
8228 )
8229 );
8230 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8231 int i;
8232 int rangestart = -1;
f46cb337 8233 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8234 Perl_sv_catpvf(aTHX_ sv, "[");
8235 for (i = 0; i <= 256; i++) {
8236 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8237 if (rangestart == -1)
8238 rangestart = i;
8239 } else if (rangestart != -1) {
8240 if (i <= rangestart + 3)
8241 for (; rangestart < i; rangestart++)
8242 put_byte(sv, rangestart);
8243 else {
8244 put_byte(sv, rangestart);
8245 sv_catpvs(sv, "-");
8246 put_byte(sv, i - 1);
8247 }
8248 rangestart = -1;
8249 }
8250 }
8251 Perl_sv_catpvf(aTHX_ sv, "]");
8252 }
8253
a3621e74 8254 } else if (k == CURLY) {
cb434fcc 8255 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8256 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8257 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8258 }
2c2d71f5
JH
8259 else if (k == WHILEM && o->flags) /* Ordinal/of */
8260 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8261 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8262 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8263 else if (k == GOSUB)
6bda09f9 8264 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8265 else if (k == VERB) {
8266 if (!o->flags)
8267 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8268 (SV*)prog->data->data[ ARG( o ) ]);
8269 } else if (k == LOGICAL)
04ebc1ab 8270 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8271 else if (k == ANYOF) {
8272 int i, rangestart = -1;
2d03de9c 8273 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8274
8275 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8276 static const char * const anyofs[] = {
653099ff
GS
8277 "\\w",
8278 "\\W",
8279 "\\s",
8280 "\\S",
8281 "\\d",
8282 "\\D",
8283 "[:alnum:]",
8284 "[:^alnum:]",
8285 "[:alpha:]",
8286 "[:^alpha:]",
8287 "[:ascii:]",
8288 "[:^ascii:]",
8289 "[:ctrl:]",
8290 "[:^ctrl:]",
8291 "[:graph:]",
8292 "[:^graph:]",
8293 "[:lower:]",
8294 "[:^lower:]",
8295 "[:print:]",
8296 "[:^print:]",
8297 "[:punct:]",
8298 "[:^punct:]",
8299 "[:upper:]",
aaa51d5e 8300 "[:^upper:]",
653099ff 8301 "[:xdigit:]",
aaa51d5e
JF
8302 "[:^xdigit:]",
8303 "[:space:]",
8304 "[:^space:]",
8305 "[:blank:]",
8306 "[:^blank:]"
653099ff
GS
8307 };
8308
19860706 8309 if (flags & ANYOF_LOCALE)
396482e1 8310 sv_catpvs(sv, "{loc}");
19860706 8311 if (flags & ANYOF_FOLD)
396482e1 8312 sv_catpvs(sv, "{i}");
653099ff 8313 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8314 if (flags & ANYOF_INVERT)
396482e1 8315 sv_catpvs(sv, "^");
ffc61ed2
JH
8316 for (i = 0; i <= 256; i++) {
8317 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8318 if (rangestart == -1)
8319 rangestart = i;
8320 } else if (rangestart != -1) {
8321 if (i <= rangestart + 3)
8322 for (; rangestart < i; rangestart++)
653099ff 8323 put_byte(sv, rangestart);
ffc61ed2
JH
8324 else {
8325 put_byte(sv, rangestart);
396482e1 8326 sv_catpvs(sv, "-");
ffc61ed2 8327 put_byte(sv, i - 1);
653099ff 8328 }
ffc61ed2 8329 rangestart = -1;
653099ff 8330 }
847a199f 8331 }
ffc61ed2
JH
8332
8333 if (o->flags & ANYOF_CLASS)
bb7a0f54 8334 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8335 if (ANYOF_CLASS_TEST(o,i))
8336 sv_catpv(sv, anyofs[i]);
8337
8338 if (flags & ANYOF_UNICODE)
396482e1 8339 sv_catpvs(sv, "{unicode}");
1aa99e6b 8340 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8341 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8342
8343 {
8344 SV *lv;
32fc9b6a 8345 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8346
ffc61ed2
JH
8347 if (lv) {
8348 if (sw) {
89ebb4a3 8349 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8350
ffc61ed2 8351 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8352 uvchr_to_utf8(s, i);
ffc61ed2 8353
3568d838 8354 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8355 if (rangestart == -1)
8356 rangestart = i;
8357 } else if (rangestart != -1) {
ffc61ed2
JH
8358 if (i <= rangestart + 3)
8359 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8360 const U8 * const e = uvchr_to_utf8(s,rangestart);
8361 U8 *p;
8362 for(p = s; p < e; p++)
ffc61ed2
JH
8363 put_byte(sv, *p);
8364 }
8365 else {
2d03de9c
AL
8366 const U8 *e = uvchr_to_utf8(s,rangestart);
8367 U8 *p;
8368 for (p = s; p < e; p++)
ffc61ed2 8369 put_byte(sv, *p);
396482e1 8370 sv_catpvs(sv, "-");
2d03de9c
AL
8371 e = uvchr_to_utf8(s, i-1);
8372 for (p = s; p < e; p++)
1df70142 8373 put_byte(sv, *p);
ffc61ed2
JH
8374 }
8375 rangestart = -1;
8376 }
19860706 8377 }
ffc61ed2 8378
396482e1 8379 sv_catpvs(sv, "..."); /* et cetera */
19860706 8380 }
fde631ed 8381
ffc61ed2 8382 {
2e0de35c 8383 char *s = savesvpv(lv);
c445ea15 8384 char * const origs = s;
b81d288d 8385
3dab1dad
YO
8386 while (*s && *s != '\n')
8387 s++;
b81d288d 8388
ffc61ed2 8389 if (*s == '\n') {
2d03de9c 8390 const char * const t = ++s;
ffc61ed2
JH
8391
8392 while (*s) {
8393 if (*s == '\n')
8394 *s = ' ';
8395 s++;
8396 }
8397 if (s[-1] == ' ')
8398 s[-1] = 0;
8399
8400 sv_catpv(sv, t);
fde631ed 8401 }
b81d288d 8402
ffc61ed2 8403 Safefree(origs);
fde631ed
JH
8404 }
8405 }
653099ff 8406 }
ffc61ed2 8407
653099ff
GS
8408 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8409 }
9b155405 8410 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8411 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8412#else
96a5add6 8413 PERL_UNUSED_CONTEXT;
65e66c80
SP
8414 PERL_UNUSED_ARG(sv);
8415 PERL_UNUSED_ARG(o);
f9049ba1 8416 PERL_UNUSED_ARG(prog);
17c3b450 8417#endif /* DEBUGGING */
35ff7856 8418}
a687059c 8419
cad2e5aa
JH
8420SV *
8421Perl_re_intuit_string(pTHX_ regexp *prog)
8422{ /* Assume that RE_INTUIT is set */
97aff369 8423 dVAR;
a3621e74 8424 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8425 PERL_UNUSED_CONTEXT;
8426
a3621e74 8427 DEBUG_COMPILE_r(
cfd0369c 8428 {
2d03de9c 8429 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8430 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8431
8432 if (!PL_colorset) reginitcolors();
8433 PerlIO_printf(Perl_debug_log,
a0288114 8434 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8435 PL_colors[4],
8436 prog->check_substr ? "" : "utf8 ",
8437 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8438 s,
8439 PL_colors[1],
8440 (strlen(s) > 60 ? "..." : ""));
8441 } );
8442
33b8afdf 8443 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8444}
8445
84da74a7
YO
8446/*
8447 pregfree - free a regexp
8448
8449 See regdupe below if you change anything here.
8450*/
8451
2b69d0c2 8452void
864dbfa3 8453Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8454{
27da23d5 8455 dVAR;
0df25f3d 8456
fc32ee4a 8457 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8458
7821416a
IZ
8459 if (!r || (--r->refcnt > 0))
8460 return;
ab3bbdeb 8461 DEBUG_COMPILE_r({
0df25f3d
YO
8462 if (!PL_colorset)
8463 reginitcolors();
5c9f2f80
YO
8464 {
8465 SV *dsv= sv_newmortal();
ab3bbdeb
YO
8466 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8467 dsv, r->precomp, r->prelen, 60);
8468 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8469 PL_colors[4],PL_colors[5],s);
8470 }
9e55ce06 8471 });
cad2e5aa 8472
43c5f42d
NC
8473 /* gcov results gave these as non-null 100% of the time, so there's no
8474 optimisation in checking them before calling Safefree */
8475 Safefree(r->precomp);
8476 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8477 RX_MATCH_COPY_FREE(r);
f8c7b90f 8478#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8479 if (r->saved_copy)
8480 SvREFCNT_dec(r->saved_copy);
8481#endif
a193d654
GS
8482 if (r->substrs) {
8483 if (r->anchored_substr)
8484 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8485 if (r->anchored_utf8)
8486 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8487 if (r->float_substr)
8488 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8489 if (r->float_utf8)
8490 SvREFCNT_dec(r->float_utf8);
2779dcf1 8491 Safefree(r->substrs);
a193d654 8492 }
81714fb9
YO
8493 if (r->paren_names)
8494 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8495 if (r->data) {
8496 int n = r->data->count;
f3548bdc
DM
8497 PAD* new_comppad = NULL;
8498 PAD* old_comppad;
4026c95a 8499 PADOFFSET refcnt;
dfad63ad 8500
c277df42 8501 while (--n >= 0) {
261faec3 8502 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8503 switch (r->data->what[n]) {
8504 case 's':
81714fb9 8505 case 'S':
c277df42
IZ
8506 SvREFCNT_dec((SV*)r->data->data[n]);
8507 break;
653099ff
GS
8508 case 'f':
8509 Safefree(r->data->data[n]);
8510 break;
dfad63ad
HS
8511 case 'p':
8512 new_comppad = (AV*)r->data->data[n];
8513 break;
c277df42 8514 case 'o':
dfad63ad 8515 if (new_comppad == NULL)
cea2e8a9 8516 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8517 PAD_SAVE_LOCAL(old_comppad,
8518 /* Watch out for global destruction's random ordering. */
c445ea15 8519 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8520 );
b34c0dd4 8521 OP_REFCNT_LOCK;
4026c95a
SH
8522 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8523 OP_REFCNT_UNLOCK;
8524 if (!refcnt)
9b978d73 8525 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8526
f3548bdc 8527 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8528 SvREFCNT_dec((SV*)new_comppad);
8529 new_comppad = NULL;
c277df42
IZ
8530 break;
8531 case 'n':
9e55ce06 8532 break;
07be1b83 8533 case 'T':
be8e71aa
YO
8534 { /* Aho Corasick add-on structure for a trie node.
8535 Used in stclass optimization only */
07be1b83
YO
8536 U32 refcount;
8537 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8538 OP_REFCNT_LOCK;
8539 refcount = --aho->refcount;
8540 OP_REFCNT_UNLOCK;
8541 if ( !refcount ) {
8542 Safefree(aho->states);
8543 Safefree(aho->fail);
8544 aho->trie=NULL; /* not necessary to free this as it is
8545 handled by the 't' case */
8546 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8547 Safefree(r->regstclass);
07be1b83
YO
8548 }
8549 }
8550 break;
a3621e74 8551 case 't':
07be1b83 8552 {
be8e71aa 8553 /* trie structure. */
07be1b83
YO
8554 U32 refcount;
8555 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8556 OP_REFCNT_LOCK;
8557 refcount = --trie->refcount;
8558 OP_REFCNT_UNLOCK;
8559 if ( !refcount ) {
8560 Safefree(trie->charmap);
8561 if (trie->widecharmap)
8562 SvREFCNT_dec((SV*)trie->widecharmap);
8563 Safefree(trie->states);
8564 Safefree(trie->trans);
8565 if (trie->bitmap)
8566 Safefree(trie->bitmap);
8567 if (trie->wordlen)
8568 Safefree(trie->wordlen);
786e8c11
YO
8569 if (trie->jump)
8570 Safefree(trie->jump);
8571 if (trie->nextword)
8572 Safefree(trie->nextword);
a3621e74 8573#ifdef DEBUGGING
5c9f2f80
YO
8574 if (trie->words)
8575 SvREFCNT_dec((SV*)trie->words);
8576 if (trie->revcharmap)
8577 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8578#endif
07be1b83 8579 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8580 }
07be1b83
YO
8581 }
8582 break;
c277df42 8583 default:
830247a4 8584 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8585 }
8586 }
8587 Safefree(r->data->what);
8588 Safefree(r->data);
a0d0e21e
LW
8589 }
8590 Safefree(r->startp);
8591 Safefree(r->endp);
c74340f9
YO
8592 if (r->swap) {
8593 Safefree(r->swap->startp);
8594 Safefree(r->swap->endp);
8595 Safefree(r->swap);
8596 }
a0d0e21e 8597 Safefree(r);
a687059c 8598}
c277df42 8599
84da74a7
YO
8600#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8601#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8602#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8603#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8604
8605/*
8606 regdupe - duplicate a regexp.
8607
8608 This routine is called by sv.c's re_dup and is expected to clone a
8609 given regexp structure. It is a no-op when not under USE_ITHREADS.
8610 (Originally this *was* re_dup() for change history see sv.c)
8611
8612 See pregfree() above if you change anything here.
8613*/
a3c0e9ca 8614#if defined(USE_ITHREADS)
84da74a7
YO
8615regexp *
8616Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8617{
84da74a7
YO
8618 dVAR;
8619 REGEXP *ret;
8620 int i, len, npar;
8621 struct reg_substr_datum *s;
8622
8623 if (!r)
8624 return (REGEXP *)NULL;
8625
8626 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8627 return ret;
8628
8629 len = r->offsets[0];
8630 npar = r->nparens+1;
8631
8632 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8633 Copy(r->program, ret->program, len+1, regnode);
8634
8635 Newx(ret->startp, npar, I32);
8636 Copy(r->startp, ret->startp, npar, I32);
8637 Newx(ret->endp, npar, I32);
8638 Copy(r->startp, ret->startp, npar, I32);
c74340f9
YO
8639 if(r->swap) {
8640 Newx(ret->swap, 1, regexp_paren_ofs);
8641 /* no need to copy these */
8642 Newx(ret->swap->startp, npar, I32);
8643 Newx(ret->swap->endp, npar, I32);
8644 } else {
8645 ret->swap = NULL;
8646 }
84da74a7
YO
8647
8648 Newx(ret->substrs, 1, struct reg_substr_data);
8649 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8650 s->min_offset = r->substrs->data[i].min_offset;
8651 s->max_offset = r->substrs->data[i].max_offset;
8652 s->end_shift = r->substrs->data[i].end_shift;
8653 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8654 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8655 }
8656
8657 ret->regstclass = NULL;
8658 if (r->data) {
8659 struct reg_data *d;
8660 const int count = r->data->count;
8661 int i;
8662
8663 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8664 char, struct reg_data);
8665 Newx(d->what, count, U8);
8666
8667 d->count = count;
8668 for (i = 0; i < count; i++) {
8669 d->what[i] = r->data->what[i];
8670 switch (d->what[i]) {
e2e6a0f1 8671 /* legal options are one of: sSfpont
84da74a7
YO
8672 see also regcomp.h and pregfree() */
8673 case 's':
81714fb9 8674 case 'S':
84da74a7
YO
8675 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8676 break;
8677 case 'p':
8678 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8679 break;
8680 case 'f':
8681 /* This is cheating. */
8682 Newx(d->data[i], 1, struct regnode_charclass_class);
8683 StructCopy(r->data->data[i], d->data[i],
8684 struct regnode_charclass_class);
8685 ret->regstclass = (regnode*)d->data[i];
8686 break;
8687 case 'o':
8688 /* Compiled op trees are readonly, and can thus be
8689 shared without duplication. */
8690 OP_REFCNT_LOCK;
8691 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8692 OP_REFCNT_UNLOCK;
8693 break;
8694 case 'n':
8695 d->data[i] = r->data->data[i];
8696 break;
8697 case 't':
8698 d->data[i] = r->data->data[i];
8699 OP_REFCNT_LOCK;
8700 ((reg_trie_data*)d->data[i])->refcount++;
8701 OP_REFCNT_UNLOCK;
8702 break;
8703 case 'T':
8704 d->data[i] = r->data->data[i];
8705 OP_REFCNT_LOCK;
8706 ((reg_ac_data*)d->data[i])->refcount++;
8707 OP_REFCNT_UNLOCK;
8708 /* Trie stclasses are readonly and can thus be shared
8709 * without duplication. We free the stclass in pregfree
8710 * when the corresponding reg_ac_data struct is freed.
8711 */
8712 ret->regstclass= r->regstclass;
8713 break;
8714 default:
8715 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8716 }
8717 }
8718
8719 ret->data = d;
8720 }
8721 else
8722 ret->data = NULL;
8723
8724 Newx(ret->offsets, 2*len+1, U32);
8725 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8726
8727 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8728 ret->refcnt = r->refcnt;
8729 ret->minlen = r->minlen;
de8c5301 8730 ret->minlenret = r->minlenret;
84da74a7
YO
8731 ret->prelen = r->prelen;
8732 ret->nparens = r->nparens;
8733 ret->lastparen = r->lastparen;
8734 ret->lastcloseparen = r->lastcloseparen;
8735 ret->reganch = r->reganch;
8736
8737 ret->sublen = r->sublen;
8738
f9f4320a 8739 ret->engine = r->engine;
81714fb9
YO
8740
8741 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8742
84da74a7
YO
8743 if (RX_MATCH_COPIED(ret))
8744 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8745 else
8746 ret->subbeg = NULL;
8747#ifdef PERL_OLD_COPY_ON_WRITE
8748 ret->saved_copy = NULL;
8749#endif
8750
8751 ptr_table_store(PL_ptr_table, r, ret);
8752 return ret;
84da74a7 8753}
a3c0e9ca 8754#endif
84da74a7 8755
de8c5301
YO
8756/*
8757 reg_stringify()
8758
8759 converts a regexp embedded in a MAGIC struct to its stringified form,
8760 caching the converted form in the struct and returns the cached
8761 string.
8762
8763 If lp is nonnull then it is used to return the length of the
8764 resulting string
8765
8766 If flags is nonnull and the returned string contains UTF8 then
8767 (flags & 1) will be true.
8768
8769 If haseval is nonnull then it is used to return whether the pattern
8770 contains evals.
8771
8772 Normally called via macro:
8773
8774 CALLREG_STRINGIFY(mg,0,0);
8775
8776 And internally with
8777
8778 CALLREG_AS_STR(mg,lp,flags,haseval)
8779
8780 See sv_2pv_flags() in sv.c for an example of internal usage.
8781
8782 */
8783
8784char *
8785Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8786 dVAR;
8787 const regexp * const re = (regexp *)mg->mg_obj;
8788
8789 if (!mg->mg_ptr) {
8790 const char *fptr = "msix";
8791 char reflags[6];
8792 char ch;
8793 int left = 0;
8794 int right = 4;
8795 bool need_newline = 0;
8796 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8797
8798 while((ch = *fptr++)) {
8799 if(reganch & 1) {
8800 reflags[left++] = ch;
8801 }
8802 else {
8803 reflags[right--] = ch;
8804 }
8805 reganch >>= 1;
8806 }
8807 if(left != 4) {
8808 reflags[left] = '-';
8809 left = 5;
8810 }
8811
8812 mg->mg_len = re->prelen + 4 + left;
8813 /*
8814 * If /x was used, we have to worry about a regex ending with a
8815 * comment later being embedded within another regex. If so, we don't
8816 * want this regex's "commentization" to leak out to the right part of
8817 * the enclosing regex, we must cap it with a newline.
8818 *
8819 * So, if /x was used, we scan backwards from the end of the regex. If
8820 * we find a '#' before we find a newline, we need to add a newline
8821 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8822 * we don't need to add anything. -jfriedl
8823 */
8824 if (PMf_EXTENDED & re->reganch) {
8825 const char *endptr = re->precomp + re->prelen;
8826 while (endptr >= re->precomp) {
8827 const char c = *(endptr--);
8828 if (c == '\n')
8829 break; /* don't need another */
8830 if (c == '#') {
8831 /* we end while in a comment, so we need a newline */
8832 mg->mg_len++; /* save space for it */
8833 need_newline = 1; /* note to add it */
8834 break;
8835 }
8836 }
8837 }
8838
8839 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8840 mg->mg_ptr[0] = '(';
8841 mg->mg_ptr[1] = '?';
8842 Copy(reflags, mg->mg_ptr+2, left, char);
8843 *(mg->mg_ptr+left+2) = ':';
8844 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8845 if (need_newline)
8846 mg->mg_ptr[mg->mg_len - 2] = '\n';
8847 mg->mg_ptr[mg->mg_len - 1] = ')';
8848 mg->mg_ptr[mg->mg_len] = 0;
8849 }
8850 if (haseval)
8851 *haseval = re->program[0].next_off;
8852 if (flags)
8853 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8854
8855 if (lp)
8856 *lp = mg->mg_len;
8857 return mg->mg_ptr;
8858}
8859
8860
76234dfb 8861#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8862/*
8863 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8864 */
8865regnode *
864dbfa3 8866Perl_regnext(pTHX_ register regnode *p)
c277df42 8867{
97aff369 8868 dVAR;
c277df42
IZ
8869 register I32 offset;
8870
3280af22 8871 if (p == &PL_regdummy)
c277df42
IZ
8872 return(NULL);
8873
8874 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8875 if (offset == 0)
8876 return(NULL);
8877
c277df42 8878 return(p+offset);
c277df42 8879}
76234dfb 8880#endif
c277df42 8881
01f988be 8882STATIC void
cea2e8a9 8883S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8884{
8885 va_list args;
8886 STRLEN l1 = strlen(pat1);
8887 STRLEN l2 = strlen(pat2);
8888 char buf[512];
06bf62c7 8889 SV *msv;
73d840c0 8890 const char *message;
c277df42
IZ
8891
8892 if (l1 > 510)
8893 l1 = 510;
8894 if (l1 + l2 > 510)
8895 l2 = 510 - l1;
8896 Copy(pat1, buf, l1 , char);
8897 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8898 buf[l1 + l2] = '\n';
8899 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8900#ifdef I_STDARG
8901 /* ANSI variant takes additional second argument */
c277df42 8902 va_start(args, pat2);
8736538c
AS
8903#else
8904 va_start(args);
8905#endif
5a844595 8906 msv = vmess(buf, &args);
c277df42 8907 va_end(args);
cfd0369c 8908 message = SvPV_const(msv,l1);
c277df42
IZ
8909 if (l1 > 512)
8910 l1 = 512;
8911 Copy(message, buf, l1 , char);
197cf9b9 8912 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8913 Perl_croak(aTHX_ "%s", buf);
c277df42 8914}
a0ed51b3
LW
8915
8916/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8917
76234dfb 8918#ifndef PERL_IN_XSUB_RE
a0ed51b3 8919void
864dbfa3 8920Perl_save_re_context(pTHX)
b81d288d 8921{
97aff369 8922 dVAR;
1ade1aa1
NC
8923
8924 struct re_save_state *state;
8925
8926 SAVEVPTR(PL_curcop);
8927 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8928
8929 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8930 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8931 SSPUSHINT(SAVEt_RE_STATE);
8932
46ab3289 8933 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8934
a0ed51b3 8935 PL_reg_start_tmp = 0;
a0ed51b3 8936 PL_reg_start_tmpl = 0;
c445ea15 8937 PL_reg_oldsaved = NULL;
a5db57d6 8938 PL_reg_oldsavedlen = 0;
a5db57d6 8939 PL_reg_maxiter = 0;
a5db57d6 8940 PL_reg_leftiter = 0;
c445ea15 8941 PL_reg_poscache = NULL;
a5db57d6 8942 PL_reg_poscache_size = 0;
1ade1aa1
NC
8943#ifdef PERL_OLD_COPY_ON_WRITE
8944 PL_nrs = NULL;
8945#endif
ada6e8a9 8946
c445ea15
AL
8947 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8948 if (PL_curpm) {
8949 const REGEXP * const rx = PM_GETRE(PL_curpm);
8950 if (rx) {
1df70142 8951 U32 i;
ada6e8a9 8952 for (i = 1; i <= rx->nparens; i++) {
1df70142 8953 char digits[TYPE_CHARS(long)];
d9fad198 8954 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8955 GV *const *const gvp
8956 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8957
b37c2d43
AL
8958 if (gvp) {
8959 GV * const gv = *gvp;
8960 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8961 save_scalar(gv);
49f27e4b 8962 }
ada6e8a9
AMS
8963 }
8964 }
8965 }
a0ed51b3 8966}
76234dfb 8967#endif
51371543 8968
51371543 8969static void
acfe0abc 8970clear_re(pTHX_ void *r)
51371543 8971{
97aff369 8972 dVAR;
51371543
GS
8973 ReREFCNT_dec((regexp *)r);
8974}
ffbc6a93 8975
a28509cc
AL
8976#ifdef DEBUGGING
8977
8978STATIC void
8979S_put_byte(pTHX_ SV *sv, int c)
8980{
8981 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8982 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8983 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8984 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8985 else
8986 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8987}
8988
786e8c11 8989
3dab1dad
YO
8990#define CLEAR_OPTSTART \
8991 if (optstart) STMT_START { \
07be1b83 8992 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8993 optstart=NULL; \
8994 } STMT_END
8995
786e8c11 8996#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8997
b5a2f8d8
NC
8998STATIC const regnode *
8999S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9000 const regnode *last, const regnode *plast,
9001 SV* sv, I32 indent, U32 depth)
a28509cc 9002{
97aff369 9003 dVAR;
786e8c11 9004 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9005 register const regnode *next;
3dab1dad
YO
9006 const regnode *optstart= NULL;
9007 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9008
786e8c11
YO
9009#ifdef DEBUG_DUMPUNTIL
9010 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9011 last ? last-start : 0,plast ? plast-start : 0);
9012#endif
9013
9014 if (plast && plast < last)
9015 last= plast;
9016
9017 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9018 /* While that wasn't END last time... */
9019
9020 NODE_ALIGN(node);
9021 op = OP(node);
de734bd5 9022 if (op == CLOSE || op == WHILEM)
786e8c11 9023 indent--;
b5a2f8d8 9024 next = regnext((regnode *)node);
07be1b83 9025
a28509cc 9026 /* Where, what. */
8e11feef 9027 if (OP(node) == OPTIMIZED) {
e68ec53f 9028 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9029 optstart = node;
3dab1dad 9030 else
8e11feef 9031 goto after_print;
3dab1dad
YO
9032 } else
9033 CLEAR_OPTSTART;
07be1b83 9034
32fc9b6a 9035 regprop(r, sv, node);
a28509cc 9036 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9037 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9038
9039 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9040 if (next == NULL) /* Next ptr. */
9041 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9042 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9043 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9044 else
9045 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9046
1de06328 9047 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9048 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9049 }
9050
a28509cc
AL
9051 after_print:
9052 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9053 assert(next);
9054 {
9055 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9056 ? regnext((regnode *)next)
9057 : next);
be8e71aa
YO
9058 if (last && nnode > last)
9059 nnode = last;
786e8c11 9060 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9061 }
a28509cc
AL
9062 }
9063 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9064 assert(next);
786e8c11 9065 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9066 }
9067 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9068 const regnode *this_trie = node;
1de06328 9069 const char op = OP(node);
a28509cc 9070 const I32 n = ARG(node);
1de06328
YO
9071 const reg_ac_data * const ac = op>=AHOCORASICK ?
9072 (reg_ac_data *)r->data->data[n] :
9073 NULL;
9074 const reg_trie_data * const trie = op<AHOCORASICK ?
9075 (reg_trie_data*)r->data->data[n] :
9076 ac->trie;
786e8c11 9077 const regnode *nextbranch= NULL;
a28509cc 9078 I32 word_idx;
1de06328 9079 sv_setpvn(sv, "", 0);
786e8c11 9080 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9081 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9082
9083 PerlIO_printf(Perl_debug_log, "%*s%s ",
9084 (int)(2*(indent+3)), "",
9085 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9086 PL_colors[0], PL_colors[1],
9087 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9088 PERL_PV_PRETTY_ELIPSES |
7f69552c 9089 PERL_PV_PRETTY_LTGT
786e8c11
YO
9090 )
9091 : "???"
9092 );
9093 if (trie->jump) {
40d049e4 9094 U16 dist= trie->jump[word_idx+1];
7f69552c 9095 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 9096 (dist ? this_trie + dist : next) - start);
786e8c11
YO
9097 if (dist) {
9098 if (!nextbranch)
24b23f37 9099 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9100 DUMPUNTIL(this_trie + dist, nextbranch);
9101 }
786e8c11
YO
9102 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9103 nextbranch= regnext((regnode *)nextbranch);
9104 } else {
9105 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9106 }
786e8c11
YO
9107 }
9108 if (last && next > last)
9109 node= last;
9110 else
9111 node= next;
a28509cc 9112 }
786e8c11
YO
9113 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9114 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9115 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9116 }
9117 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9118 assert(next);
786e8c11 9119 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9120 }
9121 else if ( op == PLUS || op == STAR) {
786e8c11 9122 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9123 }
9124 else if (op == ANYOF) {
9125 /* arglen 1 + class block */
9126 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9127 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9128 node = NEXTOPER(node);
9129 }
9130 else if (PL_regkind[(U8)op] == EXACT) {
9131 /* Literal string, where present. */
9132 node += NODE_SZ_STR(node) - 1;
9133 node = NEXTOPER(node);
9134 }
9135 else {
9136 node = NEXTOPER(node);
9137 node += regarglen[(U8)op];
9138 }
9139 if (op == CURLYX || op == OPEN)
786e8c11 9140 indent++;
a28509cc 9141 }
3dab1dad 9142 CLEAR_OPTSTART;
786e8c11
YO
9143#ifdef DEBUG_DUMPUNTIL
9144 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9145#endif
1de06328 9146 return node;
a28509cc
AL
9147}
9148
9149#endif /* DEBUGGING */
9150
241d1a3b
NC
9151/*
9152 * Local variables:
9153 * c-indentation-style: bsd
9154 * c-basic-offset: 4
9155 * indent-tabs-mode: t
9156 * End:
9157 *
37442d52
RGS
9158 * ex: set ts=8 sts=4 sw=4 noet:
9159 */