This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Module::Pluggable diff had some hidden binary files in it. I'm
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
bbe252da
YO
345#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
58e23c8d 373#define _FAIL(code) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
58e23c8d 384 code; \
ccb2c380 385} STMT_END
8615cb43 386
58e23c8d
YO
387#define FAIL(msg) _FAIL( \
388 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
389 msg, (int)len, RExC_precomp, ellipses))
390
391#define FAIL2(msg,arg) _FAIL( \
392 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
393 arg, (int)len, RExC_precomp, ellipses))
394
b45f050a 395/*
b45f050a
JF
396 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
397 */
ccb2c380 398#define Simple_vFAIL(m) STMT_START { \
a28509cc 399 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
400 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
406 */
ccb2c380
MP
407#define vFAIL(m) STMT_START { \
408 if (!SIZE_ONLY) \
409 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
410 Simple_vFAIL(m); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Like Simple_vFAIL(), but accepts two arguments.
415 */
ccb2c380 416#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 417 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
418 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
419 (int)offset, RExC_precomp, RExC_precomp + offset); \
420} STMT_END
b45f050a
JF
421
422/*
423 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
424 */
ccb2c380
MP
425#define vFAIL2(m,a1) STMT_START { \
426 if (!SIZE_ONLY) \
427 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 Simple_vFAIL2(m, a1); \
429} STMT_END
b45f050a
JF
430
431
432/*
433 * Like Simple_vFAIL(), but accepts three arguments.
434 */
ccb2c380 435#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 436 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
443 */
ccb2c380
MP
444#define vFAIL3(m,a1,a2) STMT_START { \
445 if (!SIZE_ONLY) \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
447 Simple_vFAIL3(m, a1, a2); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Like Simple_vFAIL(), but accepts four arguments.
452 */
ccb2c380 453#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 454 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
455 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
456 (int)offset, RExC_precomp, RExC_precomp + offset); \
457} STMT_END
b45f050a 458
ccb2c380 459#define vWARN(loc,m) STMT_START { \
a28509cc 460 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
461 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465#define vWARNdep(loc,m) STMT_START { \
a28509cc 466 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
467 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
468 "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472
473#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 474 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
476 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
478
479#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 480 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
481 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
482 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
483} STMT_END
484
485#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 486 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
487 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
488 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
489} STMT_END
490
491#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 492 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
493 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
9d1d55b5 496
8615cb43 497
cd439c50 498/* Allow for side effects in s */
ccb2c380
MP
499#define REGC(c,s) STMT_START { \
500 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
501} STMT_END
cd439c50 502
fac92740
MJD
503/* Macros for recording node offsets. 20001227 mjd@plover.com
504 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
505 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
506 * Element 0 holds the number n.
07be1b83 507 * Position is 1 indexed.
fac92740
MJD
508 */
509
ccb2c380
MP
510#define Set_Node_Offset_To_R(node,byte) STMT_START { \
511 if (! SIZE_ONLY) { \
512 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 513 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 514 if((node) < 0) { \
551405c4 515 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
516 } else { \
517 RExC_offsets[2*(node)-1] = (byte); \
518 } \
519 } \
520} STMT_END
521
522#define Set_Node_Offset(node,byte) \
523 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
524#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
525
526#define Set_Node_Length_To_R(node,len) STMT_START { \
527 if (! SIZE_ONLY) { \
528 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 529 __LINE__, (int)(node), (int)(len))); \
ccb2c380 530 if((node) < 0) { \
551405c4 531 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
532 } else { \
533 RExC_offsets[2*(node)] = (len); \
534 } \
535 } \
536} STMT_END
537
538#define Set_Node_Length(node,len) \
539 Set_Node_Length_To_R((node)-RExC_emit_start, len)
540#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
541#define Set_Node_Cur_Length(node) \
542 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
543
544/* Get offsets and lengths */
545#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
546#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
547
07be1b83
YO
548#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
549 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
550 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
551} STMT_END
552
553
554#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
555#define EXPERIMENTAL_INPLACESCAN
556#endif
557
1de06328 558#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 559DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 560 PerlIO_printf(Perl_debug_log, \
b515a41d 561 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
562 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
563 (int)(depth)*2, "", \
564 (IV)((data)->pos_min), \
565 (IV)((data)->pos_delta), \
566 (IV)((data)->flags), \
567 (IV)((data)->whilem_c), \
568 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
569 ); \
570 if ((data)->last_found) \
571 PerlIO_printf(Perl_debug_log, \
572 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
573 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
574 SvPVX_const((data)->last_found), \
575 (IV)((data)->last_end), \
576 (IV)((data)->last_start_min), \
577 (IV)((data)->last_start_max), \
578 ((data)->longest && \
579 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
580 SvPVX_const((data)->longest_fixed), \
581 (IV)((data)->offset_fixed), \
582 ((data)->longest && \
583 (data)->longest==&((data)->longest_float)) ? "*" : "", \
584 SvPVX_const((data)->longest_float), \
585 (IV)((data)->offset_float_min), \
586 (IV)((data)->offset_float_max) \
587 ); \
588 PerlIO_printf(Perl_debug_log,"\n"); \
589});
590
acfe0abc 591static void clear_re(pTHX_ void *r);
4327152a 592
653099ff 593/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 594 Update the longest found anchored substring and the longest found
653099ff
GS
595 floating substrings if needed. */
596
4327152a 597STATIC void
1de06328 598S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 599{
e1ec3a88
AL
600 const STRLEN l = CHR_SVLEN(data->last_found);
601 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 602 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 603
c277df42 604 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 605 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
606 if (*data->longest == data->longest_fixed) {
607 data->offset_fixed = l ? data->last_start_min : data->pos_min;
608 if (data->flags & SF_BEFORE_EOL)
b81d288d 609 data->flags
c277df42
IZ
610 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
611 else
612 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
613 data->minlen_fixed=minlenp;
614 data->lookbehind_fixed=0;
a0ed51b3
LW
615 }
616 else {
c277df42 617 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
618 data->offset_float_max = (l
619 ? data->last_start_max
c277df42 620 : data->pos_min + data->pos_delta);
9051bda5
HS
621 if ((U32)data->offset_float_max > (U32)I32_MAX)
622 data->offset_float_max = I32_MAX;
c277df42 623 if (data->flags & SF_BEFORE_EOL)
b81d288d 624 data->flags
c277df42
IZ
625 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
626 else
627 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
628 data->minlen_float=minlenp;
629 data->lookbehind_float=0;
c277df42
IZ
630 }
631 }
632 SvCUR_set(data->last_found, 0);
0eda9292 633 {
a28509cc 634 SV * const sv = data->last_found;
097eb12c
AL
635 if (SvUTF8(sv) && SvMAGICAL(sv)) {
636 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
637 if (mg)
638 mg->mg_len = 0;
639 }
0eda9292 640 }
c277df42
IZ
641 data->last_end = -1;
642 data->flags &= ~SF_BEFORE_EOL;
1de06328 643 DEBUG_STUDYDATA(data,0);
c277df42
IZ
644}
645
653099ff
GS
646/* Can match anything (initialization) */
647STATIC void
097eb12c 648S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 649{
653099ff 650 ANYOF_CLASS_ZERO(cl);
f8bef550 651 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 652 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
653 if (LOC)
654 cl->flags |= ANYOF_LOCALE;
655}
656
657/* Can match anything (initialization) */
658STATIC int
5f66b61c 659S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
660{
661 int value;
662
aaa51d5e 663 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
664 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
665 return 1;
1aa99e6b
IH
666 if (!(cl->flags & ANYOF_UNICODE_ALL))
667 return 0;
10edeb5d 668 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 669 return 0;
653099ff
GS
670 return 1;
671}
672
673/* Can match anything (initialization) */
674STATIC void
097eb12c 675S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 676{
8ecf7187 677 Zero(cl, 1, struct regnode_charclass_class);
653099ff 678 cl->type = ANYOF;
830247a4 679 cl_anything(pRExC_state, cl);
653099ff
GS
680}
681
682STATIC void
097eb12c 683S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 684{
8ecf7187 685 Zero(cl, 1, struct regnode_charclass_class);
653099ff 686 cl->type = ANYOF;
830247a4 687 cl_anything(pRExC_state, cl);
653099ff
GS
688 if (LOC)
689 cl->flags |= ANYOF_LOCALE;
690}
691
692/* 'And' a given class with another one. Can create false positives */
693/* We assume that cl is not inverted */
694STATIC void
5f66b61c 695S_cl_and(struct regnode_charclass_class *cl,
a28509cc 696 const struct regnode_charclass_class *and_with)
653099ff 697{
40d049e4
YO
698
699 assert(and_with->type == ANYOF);
653099ff
GS
700 if (!(and_with->flags & ANYOF_CLASS)
701 && !(cl->flags & ANYOF_CLASS)
702 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
703 && !(and_with->flags & ANYOF_FOLD)
704 && !(cl->flags & ANYOF_FOLD)) {
705 int i;
706
707 if (and_with->flags & ANYOF_INVERT)
708 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
709 cl->bitmap[i] &= ~and_with->bitmap[i];
710 else
711 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
712 cl->bitmap[i] &= and_with->bitmap[i];
713 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
714 if (!(and_with->flags & ANYOF_EOS))
715 cl->flags &= ~ANYOF_EOS;
1aa99e6b 716
14ebb1a2
JH
717 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
718 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
719 cl->flags &= ~ANYOF_UNICODE_ALL;
720 cl->flags |= ANYOF_UNICODE;
721 ARG_SET(cl, ARG(and_with));
722 }
14ebb1a2
JH
723 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
724 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 725 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
726 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
727 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 728 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
729}
730
731/* 'OR' a given class with another one. Can create false positives */
732/* We assume that cl is not inverted */
733STATIC void
097eb12c 734S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 735{
653099ff
GS
736 if (or_with->flags & ANYOF_INVERT) {
737 /* We do not use
738 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
739 * <= (B1 | !B2) | (CL1 | !CL2)
740 * which is wasteful if CL2 is small, but we ignore CL2:
741 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
742 * XXXX Can we handle case-fold? Unclear:
743 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
744 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
745 */
746 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
747 && !(or_with->flags & ANYOF_FOLD)
748 && !(cl->flags & ANYOF_FOLD) ) {
749 int i;
750
751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 cl->bitmap[i] |= ~or_with->bitmap[i];
753 } /* XXXX: logic is complicated otherwise */
754 else {
830247a4 755 cl_anything(pRExC_state, cl);
653099ff
GS
756 }
757 } else {
758 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
759 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 760 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
761 || (cl->flags & ANYOF_FOLD)) ) {
762 int i;
763
764 /* OR char bitmap and class bitmap separately */
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] |= or_with->bitmap[i];
767 if (or_with->flags & ANYOF_CLASS) {
768 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
769 cl->classflags[i] |= or_with->classflags[i];
770 cl->flags |= ANYOF_CLASS;
771 }
772 }
773 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 774 cl_anything(pRExC_state, cl);
653099ff
GS
775 }
776 }
777 if (or_with->flags & ANYOF_EOS)
778 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
779
780 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
781 ARG(cl) != ARG(or_with)) {
782 cl->flags |= ANYOF_UNICODE_ALL;
783 cl->flags &= ~ANYOF_UNICODE;
784 }
785 if (or_with->flags & ANYOF_UNICODE_ALL) {
786 cl->flags |= ANYOF_UNICODE_ALL;
787 cl->flags &= ~ANYOF_UNICODE;
788 }
653099ff
GS
789}
790
a3621e74
YO
791#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
792#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
793#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
794#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
795
3dab1dad
YO
796
797#ifdef DEBUGGING
07be1b83 798/*
3dab1dad
YO
799 dump_trie(trie)
800 dump_trie_interim_list(trie,next_alloc)
801 dump_trie_interim_table(trie,next_alloc)
802
803 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
804 The _interim_ variants are used for debugging the interim
805 tables that are used to generate the final compressed
806 representation which is what dump_trie expects.
807
3dab1dad
YO
808 Part of the reason for their existance is to provide a form
809 of documentation as to how the different representations function.
07be1b83
YO
810
811*/
3dab1dad
YO
812
813/*
814 dump_trie(trie)
815 Dumps the final compressed table form of the trie to Perl_debug_log.
816 Used for debugging make_trie().
817*/
818
819STATIC void
820S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
821{
822 U32 state;
ab3bbdeb
YO
823 SV *sv=sv_newmortal();
824 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
825 GET_RE_DEBUG_FLAGS_DECL;
826
ab3bbdeb 827
3dab1dad
YO
828 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
829 (int)depth * 2 + 2,"",
830 "Match","Base","Ofs" );
831
832 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 833 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 834 if ( tmp ) {
ab3bbdeb
YO
835 PerlIO_printf( Perl_debug_log, "%*s",
836 colwidth,
ddc5bc0f 837 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
838 PL_colors[0], PL_colors[1],
839 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
840 PERL_PV_ESCAPE_FIRSTCHAR
841 )
842 );
3dab1dad
YO
843 }
844 }
845 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
846 (int)depth * 2 + 2,"");
847
848 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 849 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
850 PerlIO_printf( Perl_debug_log, "\n");
851
1e2e3d02 852 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 853 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
854
855 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
856
857 if ( trie->states[ state ].wordnum ) {
858 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
859 } else {
860 PerlIO_printf( Perl_debug_log, "%6s", "" );
861 }
862
863 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
864
865 if ( base ) {
866 U32 ofs = 0;
867
868 while( ( base + ofs < trie->uniquecharcount ) ||
869 ( base + ofs - trie->uniquecharcount < trie->lasttrans
870 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
871 ofs++;
872
873 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
874
875 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
876 if ( ( base + ofs >= trie->uniquecharcount ) &&
877 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
878 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
879 {
ab3bbdeb
YO
880 PerlIO_printf( Perl_debug_log, "%*"UVXf,
881 colwidth,
3dab1dad
YO
882 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
883 } else {
ab3bbdeb 884 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
885 }
886 }
887
888 PerlIO_printf( Perl_debug_log, "]");
889
890 }
891 PerlIO_printf( Perl_debug_log, "\n" );
892 }
893}
894/*
895 dump_trie_interim_list(trie,next_alloc)
896 Dumps a fully constructed but uncompressed trie in list form.
897 List tries normally only are used for construction when the number of
898 possible chars (trie->uniquecharcount) is very high.
899 Used for debugging make_trie().
900*/
901STATIC void
902S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
903{
904 U32 state;
ab3bbdeb
YO
905 SV *sv=sv_newmortal();
906 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
907 GET_RE_DEBUG_FLAGS_DECL;
908 /* print out the table precompression. */
ab3bbdeb
YO
909 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
910 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
911 "------:-----+-----------------\n" );
3dab1dad
YO
912
913 for( state=1 ; state < next_alloc ; state ++ ) {
914 U16 charid;
915
ab3bbdeb 916 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
917 (int)depth * 2 + 2,"", (UV)state );
918 if ( ! trie->states[ state ].wordnum ) {
919 PerlIO_printf( Perl_debug_log, "%5s| ","");
920 } else {
921 PerlIO_printf( Perl_debug_log, "W%4x| ",
922 trie->states[ state ].wordnum
923 );
924 }
925 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 926 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
927 if ( tmp ) {
928 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
929 colwidth,
ddc5bc0f 930 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
931 PL_colors[0], PL_colors[1],
932 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
933 PERL_PV_ESCAPE_FIRSTCHAR
934 ) ,
1e2e3d02
YO
935 TRIE_LIST_ITEM(state,charid).forid,
936 (UV)TRIE_LIST_ITEM(state,charid).newstate
937 );
938 if (!(charid % 10))
664e119d
RGS
939 PerlIO_printf(Perl_debug_log, "\n%*s| ",
940 (int)((depth * 2) + 14), "");
1e2e3d02 941 }
ab3bbdeb
YO
942 }
943 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
944 }
945}
946
947/*
948 dump_trie_interim_table(trie,next_alloc)
949 Dumps a fully constructed but uncompressed trie in table form.
950 This is the normal DFA style state transition table, with a few
951 twists to facilitate compression later.
952 Used for debugging make_trie().
953*/
954STATIC void
955S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
956{
957 U32 state;
958 U16 charid;
ab3bbdeb
YO
959 SV *sv=sv_newmortal();
960 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
961 GET_RE_DEBUG_FLAGS_DECL;
962
963 /*
964 print out the table precompression so that we can do a visual check
965 that they are identical.
966 */
967
968 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
969
970 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 971 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 972 if ( tmp ) {
ab3bbdeb
YO
973 PerlIO_printf( Perl_debug_log, "%*s",
974 colwidth,
ddc5bc0f 975 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
976 PL_colors[0], PL_colors[1],
977 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
978 PERL_PV_ESCAPE_FIRSTCHAR
979 )
980 );
3dab1dad
YO
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 987 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
999 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1000 if (v)
1001 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1002 else
1003 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1004 }
1005 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1006 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1007 } else {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1009 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1010 }
1011 }
07be1b83 1012}
3dab1dad
YO
1013
1014#endif
1015
786e8c11
YO
1016/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1017 startbranch: the first branch in the whole branch sequence
1018 first : start branch of sequence of branch-exact nodes.
1019 May be the same as startbranch
1020 last : Thing following the last branch.
1021 May be the same as tail.
1022 tail : item following the branch sequence
1023 count : words in the sequence
1024 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1025 depth : indent depth
3dab1dad 1026
786e8c11 1027Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1028
786e8c11
YO
1029A trie is an N'ary tree where the branches are determined by digital
1030decomposition of the key. IE, at the root node you look up the 1st character and
1031follow that branch repeat until you find the end of the branches. Nodes can be
1032marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1033
786e8c11 1034 /he|she|his|hers/
72f13be8 1035
786e8c11
YO
1036would convert into the following structure. Numbers represent states, letters
1037following numbers represent valid transitions on the letter from that state, if
1038the number is in square brackets it represents an accepting state, otherwise it
1039will be in parenthesis.
07be1b83 1040
786e8c11
YO
1041 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1042 | |
1043 | (2)
1044 | |
1045 (1) +-i->(6)-+-s->[7]
1046 |
1047 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1048
786e8c11
YO
1049 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1050
1051This shows that when matching against the string 'hers' we will begin at state 1
1052read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1053then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1054is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1055single traverse. We store a mapping from accepting to state to which word was
1056matched, and then when we have multiple possibilities we try to complete the
1057rest of the regex in the order in which they occured in the alternation.
1058
1059The only prior NFA like behaviour that would be changed by the TRIE support is
1060the silent ignoring of duplicate alternations which are of the form:
1061
1062 / (DUPE|DUPE) X? (?{ ... }) Y /x
1063
1064Thus EVAL blocks follwing a trie may be called a different number of times with
1065and without the optimisation. With the optimisations dupes will be silently
1066ignored. This inconsistant behaviour of EVAL type nodes is well established as
1067the following demonstrates:
1068
1069 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1070
1071which prints out 'word' three times, but
1072
1073 'words'=~/(word|word|word)(?{ print $1 })S/
1074
1075which doesnt print it out at all. This is due to other optimisations kicking in.
1076
1077Example of what happens on a structural level:
1078
1079The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1080
1081 1: CURLYM[1] {1,32767}(18)
1082 5: BRANCH(8)
1083 6: EXACT <ac>(16)
1084 8: BRANCH(11)
1085 9: EXACT <ad>(16)
1086 11: BRANCH(14)
1087 12: EXACT <ab>(16)
1088 16: SUCCEED(0)
1089 17: NOTHING(18)
1090 18: END(0)
1091
1092This would be optimizable with startbranch=5, first=5, last=16, tail=16
1093and should turn into:
1094
1095 1: CURLYM[1] {1,32767}(18)
1096 5: TRIE(16)
1097 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1098 <ac>
1099 <ad>
1100 <ab>
1101 16: SUCCEED(0)
1102 17: NOTHING(18)
1103 18: END(0)
1104
1105Cases where tail != last would be like /(?foo|bar)baz/:
1106
1107 1: BRANCH(4)
1108 2: EXACT <foo>(8)
1109 4: BRANCH(7)
1110 5: EXACT <bar>(8)
1111 7: TAIL(8)
1112 8: EXACT <baz>(10)
1113 10: END(0)
1114
1115which would be optimizable with startbranch=1, first=1, last=7, tail=8
1116and would end up looking like:
1117
1118 1: TRIE(8)
1119 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1120 <foo>
1121 <bar>
1122 7: TAIL(8)
1123 8: EXACT <baz>(10)
1124 10: END(0)
1125
1126 d = uvuni_to_utf8_flags(d, uv, 0);
1127
1128is the recommended Unicode-aware way of saying
1129
1130 *(d++) = uv;
1131*/
1132
1e2e3d02 1133#define TRIE_STORE_REVCHAR \
786e8c11 1134 STMT_START { \
1e2e3d02 1135 SV *tmp = newSVpvs(""); \
786e8c11 1136 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1137 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1138 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1139 } STMT_END
1140
1141#define TRIE_READ_CHAR STMT_START { \
1142 wordlen++; \
1143 if ( UTF ) { \
1144 if ( folder ) { \
1145 if ( foldlen > 0 ) { \
1146 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1147 foldlen -= len; \
1148 scan += len; \
1149 len = 0; \
1150 } else { \
1151 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1152 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1153 foldlen -= UNISKIP( uvc ); \
1154 scan = foldbuf + UNISKIP( uvc ); \
1155 } \
1156 } else { \
1157 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1158 } \
1159 } else { \
1160 uvc = (U32)*uc; \
1161 len = 1; \
1162 } \
1163} STMT_END
1164
1165
1166
1167#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1168 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1169 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1170 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1171 } \
1172 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1173 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1174 TRIE_LIST_CUR( state )++; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_LIST_NEW(state) STMT_START { \
1178 Newxz( trie->states[ state ].trans.list, \
1179 4, reg_trie_trans_le ); \
1180 TRIE_LIST_CUR( state ) = 1; \
1181 TRIE_LIST_LEN( state ) = 4; \
1182} STMT_END
07be1b83 1183
786e8c11
YO
1184#define TRIE_HANDLE_WORD(state) STMT_START { \
1185 U16 dupe= trie->states[ state ].wordnum; \
1186 regnode * const noper_next = regnext( noper ); \
1187 \
1188 if (trie->wordlen) \
1189 trie->wordlen[ curword ] = wordlen; \
1190 DEBUG_r({ \
1191 /* store the word for dumping */ \
1192 SV* tmp; \
1193 if (OP(noper) != NOTHING) \
1194 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1195 else \
1196 tmp = newSVpvn( "", 0 ); \
1197 if ( UTF ) SvUTF8_on( tmp ); \
1198 av_push( trie->words, tmp ); \
1199 }); \
1200 \
1201 curword++; \
1202 \
1203 if ( noper_next < tail ) { \
1204 if (!trie->jump) \
1205 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1206 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1207 if (!jumper) \
1208 jumper = noper_next; \
1209 if (!nextbranch) \
1210 nextbranch= regnext(cur); \
1211 } \
1212 \
1213 if ( dupe ) { \
1214 /* So it's a dupe. This means we need to maintain a */\
1215 /* linked-list from the first to the next. */\
1216 /* we only allocate the nextword buffer when there */\
1217 /* a dupe, so first time we have to do the allocation */\
1218 if (!trie->nextword) \
1219 Newxz( trie->nextword, word_count + 1, U16); \
1220 while ( trie->nextword[dupe] ) \
1221 dupe= trie->nextword[dupe]; \
1222 trie->nextword[dupe]= curword; \
1223 } else { \
1224 /* we haven't inserted this word yet. */ \
1225 trie->states[ state ].wordnum = curword; \
1226 } \
1227} STMT_END
07be1b83 1228
3dab1dad 1229
786e8c11
YO
1230#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1231 ( ( base + charid >= ucharcount \
1232 && base + charid < ubound \
1233 && state == trie->trans[ base - ucharcount + charid ].check \
1234 && trie->trans[ base - ucharcount + charid ].next ) \
1235 ? trie->trans[ base - ucharcount + charid ].next \
1236 : ( state==1 ? special : 0 ) \
1237 )
3dab1dad 1238
786e8c11
YO
1239#define MADE_TRIE 1
1240#define MADE_JUMP_TRIE 2
1241#define MADE_EXACT_TRIE 4
3dab1dad 1242
a3621e74 1243STATIC I32
786e8c11 1244S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1245{
27da23d5 1246 dVAR;
a3621e74
YO
1247 /* first pass, loop through and scan words */
1248 reg_trie_data *trie;
1249 regnode *cur;
9f7f3913 1250 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1251 STRLEN len = 0;
1252 UV uvc = 0;
1253 U16 curword = 0;
1254 U32 next_alloc = 0;
786e8c11
YO
1255 regnode *jumper = NULL;
1256 regnode *nextbranch = NULL;
7f69552c 1257 regnode *convert = NULL;
a3621e74 1258 /* we just use folder as a flag in utf8 */
e1ec3a88 1259 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1260 ? PL_fold
1261 : ( flags == EXACTFL
1262 ? PL_fold_locale
1263 : NULL
1264 )
1265 );
1266
e1ec3a88 1267 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1268 SV *re_trie_maxbuff;
3dab1dad
YO
1269#ifndef DEBUGGING
1270 /* these are only used during construction but are useful during
8e11feef 1271 * debugging so we store them in the struct when debugging.
8e11feef 1272 */
3dab1dad 1273 STRLEN trie_charcount=0;
3dab1dad
YO
1274 AV *trie_revcharmap;
1275#endif
a3621e74 1276 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1277#ifndef DEBUGGING
1278 PERL_UNUSED_ARG(depth);
1279#endif
a3621e74 1280
a02a5408 1281 Newxz( trie, 1, reg_trie_data );
a3621e74 1282 trie->refcount = 1;
3dab1dad 1283 trie->startstate = 1;
786e8c11 1284 trie->wordcount = word_count;
a3621e74 1285 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1286 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1287 if (!(UTF && folder))
1288 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1289 DEBUG_r({
1290 trie->words = newAV();
a3621e74 1291 });
3dab1dad 1292 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1293
0111c4fd 1294 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1295 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1296 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1297 }
3dab1dad
YO
1298 DEBUG_OPTIMISE_r({
1299 PerlIO_printf( Perl_debug_log,
786e8c11 1300 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1301 (int)depth * 2 + 2, "",
1302 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1303 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1304 (int)depth);
3dab1dad 1305 });
7f69552c
YO
1306
1307 /* Find the node we are going to overwrite */
1308 if ( first == startbranch && OP( last ) != BRANCH ) {
1309 /* whole branch chain */
1310 convert = first;
1311 } else {
1312 /* branch sub-chain */
1313 convert = NEXTOPER( first );
1314 }
1315
a3621e74
YO
1316 /* -- First loop and Setup --
1317
1318 We first traverse the branches and scan each word to determine if it
1319 contains widechars, and how many unique chars there are, this is
1320 important as we have to build a table with at least as many columns as we
1321 have unique chars.
1322
1323 We use an array of integers to represent the character codes 0..255
1324 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1325 native representation of the character value as the key and IV's for the
1326 coded index.
1327
1328 *TODO* If we keep track of how many times each character is used we can
1329 remap the columns so that the table compression later on is more
1330 efficient in terms of memory by ensuring most common value is in the
1331 middle and the least common are on the outside. IMO this would be better
1332 than a most to least common mapping as theres a decent chance the most
1333 common letter will share a node with the least common, meaning the node
1334 will not be compressable. With a middle is most common approach the worst
1335 case is when we have the least common nodes twice.
1336
1337 */
1338
a3621e74 1339 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1340 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1341 const U8 *uc = (U8*)STRING( noper );
a28509cc 1342 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1343 STRLEN foldlen = 0;
1344 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1345 const U8 *scan = (U8*)NULL;
07be1b83 1346 U32 wordlen = 0; /* required init */
3dab1dad 1347 STRLEN chars=0;
a3621e74 1348
3dab1dad
YO
1349 if (OP(noper) == NOTHING) {
1350 trie->minlen= 0;
1351 continue;
1352 }
1353 if (trie->bitmap) {
1354 TRIE_BITMAP_SET(trie,*uc);
1355 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1356 }
a3621e74 1357 for ( ; uc < e ; uc += len ) {
3dab1dad 1358 TRIE_CHARCOUNT(trie)++;
a3621e74 1359 TRIE_READ_CHAR;
3dab1dad 1360 chars++;
a3621e74
YO
1361 if ( uvc < 256 ) {
1362 if ( !trie->charmap[ uvc ] ) {
1363 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1364 if ( folder )
1365 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1366 TRIE_STORE_REVCHAR;
a3621e74
YO
1367 }
1368 } else {
1369 SV** svpp;
1370 if ( !trie->widecharmap )
1371 trie->widecharmap = newHV();
1372
1373 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1374
1375 if ( !svpp )
e4584336 1376 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1377
1378 if ( !SvTRUE( *svpp ) ) {
1379 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1380 TRIE_STORE_REVCHAR;
a3621e74
YO
1381 }
1382 }
1383 }
3dab1dad
YO
1384 if( cur == first ) {
1385 trie->minlen=chars;
1386 trie->maxlen=chars;
1387 } else if (chars < trie->minlen) {
1388 trie->minlen=chars;
1389 } else if (chars > trie->maxlen) {
1390 trie->maxlen=chars;
1391 }
1392
a3621e74
YO
1393 } /* end first pass */
1394 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1395 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1396 (int)depth * 2 + 2,"",
85c3142d 1397 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1398 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1399 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1400 );
786e8c11 1401 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1402
1403 /*
1404 We now know what we are dealing with in terms of unique chars and
1405 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1406 representation using a flat table will take. If it's over a reasonable
1407 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1408 conservative but potentially much slower representation using an array
1409 of lists.
1410
1411 At the end we convert both representations into the same compressed
1412 form that will be used in regexec.c for matching with. The latter
1413 is a form that cannot be used to construct with but has memory
1414 properties similar to the list form and access properties similar
1415 to the table form making it both suitable for fast searches and
1416 small enough that its feasable to store for the duration of a program.
1417
1418 See the comment in the code where the compressed table is produced
1419 inplace from the flat tabe representation for an explanation of how
1420 the compression works.
1421
1422 */
1423
1424
3dab1dad 1425 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1426 /*
1427 Second Pass -- Array Of Lists Representation
1428
1429 Each state will be represented by a list of charid:state records
1430 (reg_trie_trans_le) the first such element holds the CUR and LEN
1431 points of the allocated array. (See defines above).
1432
1433 We build the initial structure using the lists, and then convert
1434 it into the compressed table form which allows faster lookups
1435 (but cant be modified once converted).
a3621e74
YO
1436 */
1437
a3621e74
YO
1438 STRLEN transcount = 1;
1439
1e2e3d02
YO
1440 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1441 "%*sCompiling trie using list compiler\n",
1442 (int)depth * 2 + 2, ""));
1443
3dab1dad 1444 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1445 TRIE_LIST_NEW(1);
1446 next_alloc = 2;
1447
1448 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1449
c445ea15
AL
1450 regnode * const noper = NEXTOPER( cur );
1451 U8 *uc = (U8*)STRING( noper );
1452 const U8 * const e = uc + STR_LEN( noper );
1453 U32 state = 1; /* required init */
1454 U16 charid = 0; /* sanity init */
1455 U8 *scan = (U8*)NULL; /* sanity init */
1456 STRLEN foldlen = 0; /* required init */
07be1b83 1457 U32 wordlen = 0; /* required init */
c445ea15
AL
1458 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459
3dab1dad 1460 if (OP(noper) != NOTHING) {
786e8c11 1461 for ( ; uc < e ; uc += len ) {
c445ea15 1462
786e8c11 1463 TRIE_READ_CHAR;
c445ea15 1464
786e8c11
YO
1465 if ( uvc < 256 ) {
1466 charid = trie->charmap[ uvc ];
c445ea15 1467 } else {
786e8c11
YO
1468 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1469 if ( !svpp ) {
1470 charid = 0;
1471 } else {
1472 charid=(U16)SvIV( *svpp );
1473 }
c445ea15 1474 }
786e8c11
YO
1475 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1476 if ( charid ) {
a3621e74 1477
786e8c11
YO
1478 U16 check;
1479 U32 newstate = 0;
a3621e74 1480
786e8c11
YO
1481 charid--;
1482 if ( !trie->states[ state ].trans.list ) {
1483 TRIE_LIST_NEW( state );
c445ea15 1484 }
786e8c11
YO
1485 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1486 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1487 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1488 break;
1489 }
1490 }
1491 if ( ! newstate ) {
1492 newstate = next_alloc++;
1493 TRIE_LIST_PUSH( state, charid, newstate );
1494 transcount++;
1495 }
1496 state = newstate;
1497 } else {
1498 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1499 }
a28509cc 1500 }
c445ea15 1501 }
3dab1dad 1502 TRIE_HANDLE_WORD(state);
a3621e74
YO
1503
1504 } /* end second pass */
1505
1e2e3d02
YO
1506 /* next alloc is the NEXT state to be allocated */
1507 trie->statecount = next_alloc;
a3621e74
YO
1508 Renew( trie->states, next_alloc, reg_trie_state );
1509
3dab1dad
YO
1510 /* and now dump it out before we compress it */
1511 DEBUG_TRIE_COMPILE_MORE_r(
1512 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1513 );
a3621e74 1514
a02a5408 1515 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1516 {
1517 U32 state;
a3621e74
YO
1518 U32 tp = 0;
1519 U32 zp = 0;
1520
1521
1522 for( state=1 ; state < next_alloc ; state ++ ) {
1523 U32 base=0;
1524
1525 /*
1526 DEBUG_TRIE_COMPILE_MORE_r(
1527 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1528 );
1529 */
1530
1531 if (trie->states[state].trans.list) {
1532 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1533 U16 maxid=minid;
a28509cc 1534 U16 idx;
a3621e74
YO
1535
1536 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1537 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1538 if ( forid < minid ) {
1539 minid=forid;
1540 } else if ( forid > maxid ) {
1541 maxid=forid;
1542 }
a3621e74
YO
1543 }
1544 if ( transcount < tp + maxid - minid + 1) {
1545 transcount *= 2;
1546 Renew( trie->trans, transcount, reg_trie_trans );
1547 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1548 }
1549 base = trie->uniquecharcount + tp - minid;
1550 if ( maxid == minid ) {
1551 U32 set = 0;
1552 for ( ; zp < tp ; zp++ ) {
1553 if ( ! trie->trans[ zp ].next ) {
1554 base = trie->uniquecharcount + zp - minid;
1555 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ zp ].check = state;
1557 set = 1;
1558 break;
1559 }
1560 }
1561 if ( !set ) {
1562 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1563 trie->trans[ tp ].check = state;
1564 tp++;
1565 zp = tp;
1566 }
1567 } else {
1568 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1569 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1570 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1571 trie->trans[ tid ].check = state;
1572 }
1573 tp += ( maxid - minid + 1 );
1574 }
1575 Safefree(trie->states[ state ].trans.list);
1576 }
1577 /*
1578 DEBUG_TRIE_COMPILE_MORE_r(
1579 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1580 );
1581 */
1582 trie->states[ state ].trans.base=base;
1583 }
cc601c31 1584 trie->lasttrans = tp + 1;
a3621e74
YO
1585 }
1586 } else {
1587 /*
1588 Second Pass -- Flat Table Representation.
1589
1590 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1591 We know that we will need Charcount+1 trans at most to store the data
1592 (one row per char at worst case) So we preallocate both structures
1593 assuming worst case.
1594
1595 We then construct the trie using only the .next slots of the entry
1596 structs.
1597
1598 We use the .check field of the first entry of the node temporarily to
1599 make compression both faster and easier by keeping track of how many non
1600 zero fields are in the node.
1601
1602 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1603 transition.
1604
1605 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1606 number representing the first entry of the node, and state as a
1607 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1608 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1609 are 2 entrys per node. eg:
1610
1611 A B A B
1612 1. 2 4 1. 3 7
1613 2. 0 3 3. 0 5
1614 3. 0 0 5. 0 0
1615 4. 0 0 7. 0 0
1616
1617 The table is internally in the right hand, idx form. However as we also
1618 have to deal with the states array which is indexed by nodenum we have to
1619 use TRIE_NODENUM() to convert.
1620
1621 */
1e2e3d02
YO
1622 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1623 "%*sCompiling trie using table compiler\n",
1624 (int)depth * 2 + 2, ""));
3dab1dad
YO
1625
1626 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1627 reg_trie_trans );
3dab1dad 1628 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1629 next_alloc = trie->uniquecharcount + 1;
1630
3dab1dad 1631
a3621e74
YO
1632 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1633
c445ea15 1634 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1635 const U8 *uc = (U8*)STRING( noper );
1636 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1637
1638 U32 state = 1; /* required init */
1639
1640 U16 charid = 0; /* sanity init */
1641 U32 accept_state = 0; /* sanity init */
1642 U8 *scan = (U8*)NULL; /* sanity init */
1643
1644 STRLEN foldlen = 0; /* required init */
07be1b83 1645 U32 wordlen = 0; /* required init */
a3621e74
YO
1646 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1647
3dab1dad 1648 if ( OP(noper) != NOTHING ) {
786e8c11 1649 for ( ; uc < e ; uc += len ) {
a3621e74 1650
786e8c11 1651 TRIE_READ_CHAR;
a3621e74 1652
786e8c11
YO
1653 if ( uvc < 256 ) {
1654 charid = trie->charmap[ uvc ];
1655 } else {
1656 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1657 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1658 }
786e8c11
YO
1659 if ( charid ) {
1660 charid--;
1661 if ( !trie->trans[ state + charid ].next ) {
1662 trie->trans[ state + charid ].next = next_alloc;
1663 trie->trans[ state ].check++;
1664 next_alloc += trie->uniquecharcount;
1665 }
1666 state = trie->trans[ state + charid ].next;
1667 } else {
1668 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1669 }
1670 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1671 }
a3621e74 1672 }
3dab1dad
YO
1673 accept_state = TRIE_NODENUM( state );
1674 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1675
1676 } /* end second pass */
1677
3dab1dad
YO
1678 /* and now dump it out before we compress it */
1679 DEBUG_TRIE_COMPILE_MORE_r(
1680 dump_trie_interim_table(trie,next_alloc,depth+1)
1681 );
a3621e74 1682
a3621e74
YO
1683 {
1684 /*
1685 * Inplace compress the table.*
1686
1687 For sparse data sets the table constructed by the trie algorithm will
1688 be mostly 0/FAIL transitions or to put it another way mostly empty.
1689 (Note that leaf nodes will not contain any transitions.)
1690
1691 This algorithm compresses the tables by eliminating most such
1692 transitions, at the cost of a modest bit of extra work during lookup:
1693
1694 - Each states[] entry contains a .base field which indicates the
1695 index in the state[] array wheres its transition data is stored.
1696
1697 - If .base is 0 there are no valid transitions from that node.
1698
1699 - If .base is nonzero then charid is added to it to find an entry in
1700 the trans array.
1701
1702 -If trans[states[state].base+charid].check!=state then the
1703 transition is taken to be a 0/Fail transition. Thus if there are fail
1704 transitions at the front of the node then the .base offset will point
1705 somewhere inside the previous nodes data (or maybe even into a node
1706 even earlier), but the .check field determines if the transition is
1707 valid.
1708
786e8c11 1709 XXX - wrong maybe?
a3621e74
YO
1710 The following process inplace converts the table to the compressed
1711 table: We first do not compress the root node 1,and mark its all its
1712 .check pointers as 1 and set its .base pointer as 1 as well. This
1713 allows to do a DFA construction from the compressed table later, and
1714 ensures that any .base pointers we calculate later are greater than
1715 0.
1716
1717 - We set 'pos' to indicate the first entry of the second node.
1718
1719 - We then iterate over the columns of the node, finding the first and
1720 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1721 and set the .check pointers accordingly, and advance pos
1722 appropriately and repreat for the next node. Note that when we copy
1723 the next pointers we have to convert them from the original
1724 NODEIDX form to NODENUM form as the former is not valid post
1725 compression.
1726
1727 - If a node has no transitions used we mark its base as 0 and do not
1728 advance the pos pointer.
1729
1730 - If a node only has one transition we use a second pointer into the
1731 structure to fill in allocated fail transitions from other states.
1732 This pointer is independent of the main pointer and scans forward
1733 looking for null transitions that are allocated to a state. When it
1734 finds one it writes the single transition into the "hole". If the
786e8c11 1735 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1736
1737 - Once compressed we can Renew/realloc the structures to release the
1738 excess space.
1739
1740 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1741 specifically Fig 3.47 and the associated pseudocode.
1742
1743 demq
1744 */
a3b680e6 1745 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1746 U32 state, charid;
a3621e74 1747 U32 pos = 0, zp=0;
1e2e3d02 1748 trie->statecount = laststate;
a3621e74
YO
1749
1750 for ( state = 1 ; state < laststate ; state++ ) {
1751 U8 flag = 0;
a28509cc
AL
1752 const U32 stateidx = TRIE_NODEIDX( state );
1753 const U32 o_used = trie->trans[ stateidx ].check;
1754 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1755 trie->trans[ stateidx ].check = 0;
1756
1757 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1758 if ( flag || trie->trans[ stateidx + charid ].next ) {
1759 if ( trie->trans[ stateidx + charid ].next ) {
1760 if (o_used == 1) {
1761 for ( ; zp < pos ; zp++ ) {
1762 if ( ! trie->trans[ zp ].next ) {
1763 break;
1764 }
1765 }
1766 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1767 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ zp ].check = state;
1769 if ( ++zp > pos ) pos = zp;
1770 break;
1771 }
1772 used--;
1773 }
1774 if ( !flag ) {
1775 flag = 1;
1776 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1777 }
1778 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ pos ].check = state;
1780 pos++;
1781 }
1782 }
1783 }
cc601c31 1784 trie->lasttrans = pos + 1;
1e2e3d02 1785 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1786 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1787 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1788 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1789 (int)depth * 2 + 2,"",
1790 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1791 (IV)next_alloc,
1792 (IV)pos,
a3621e74
YO
1793 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1794 );
1795
1796 } /* end table compress */
1797 }
1e2e3d02
YO
1798 DEBUG_TRIE_COMPILE_MORE_r(
1799 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1800 (int)depth * 2 + 2, "",
1801 (UV)trie->statecount,
1802 (UV)trie->lasttrans)
1803 );
cc601c31
YO
1804 /* resize the trans array to remove unused space */
1805 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1806
3dab1dad
YO
1807 /* and now dump out the compressed format */
1808 DEBUG_TRIE_COMPILE_r(
1809 dump_trie(trie,depth+1)
1810 );
07be1b83 1811
3dab1dad 1812 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1813 U8 nodetype =(U8)(flags & 0xFF);
1814 char *str=NULL;
786e8c11 1815
07be1b83 1816#ifdef DEBUGGING
e62cc96a 1817 regnode *optimize = NULL;
b57a0404
JH
1818 U32 mjd_offset = 0;
1819 U32 mjd_nodelen = 0;
07be1b83 1820#endif
a3621e74 1821 /*
3dab1dad
YO
1822 This means we convert either the first branch or the first Exact,
1823 depending on whether the thing following (in 'last') is a branch
1824 or not and whther first is the startbranch (ie is it a sub part of
1825 the alternation or is it the whole thing.)
1826 Assuming its a sub part we conver the EXACT otherwise we convert
1827 the whole branch sequence, including the first.
a3621e74 1828 */
3dab1dad 1829 /* Find the node we are going to overwrite */
7f69552c 1830 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1831 /* branch sub-chain */
3dab1dad 1832 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1833 DEBUG_r({
1834 mjd_offset= Node_Offset((convert));
1835 mjd_nodelen= Node_Length((convert));
1836 });
7f69552c
YO
1837 /* whole branch chain */
1838 } else {
1839 DEBUG_r({
1840 const regnode *nop = NEXTOPER( convert );
1841 mjd_offset= Node_Offset((nop));
1842 mjd_nodelen= Node_Length((nop));
1843 });
07be1b83 1844 }
7f69552c 1845
07be1b83
YO
1846 DEBUG_OPTIMISE_r(
1847 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1848 (int)depth * 2 + 2, "",
786e8c11 1849 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1850 );
a3621e74 1851
3dab1dad
YO
1852 /* But first we check to see if there is a common prefix we can
1853 split out as an EXACT and put in front of the TRIE node. */
1854 trie->startstate= 1;
786e8c11 1855 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1856 U32 state;
1e2e3d02 1857 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1858 U32 ofs = 0;
8e11feef
RGS
1859 I32 idx = -1;
1860 U32 count = 0;
1861 const U32 base = trie->states[ state ].trans.base;
a3621e74 1862
3dab1dad 1863 if ( trie->states[state].wordnum )
8e11feef 1864 count = 1;
a3621e74 1865
8e11feef 1866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1867 if ( ( base + ofs >= trie->uniquecharcount ) &&
1868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1870 {
3dab1dad 1871 if ( ++count > 1 ) {
8e11feef 1872 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1873 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1874 if ( state == 1 ) break;
3dab1dad
YO
1875 if ( count == 2 ) {
1876 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1877 DEBUG_OPTIMISE_r(
8e11feef
RGS
1878 PerlIO_printf(Perl_debug_log,
1879 "%*sNew Start State=%"UVuf" Class: [",
1880 (int)depth * 2 + 2, "",
786e8c11 1881 (UV)state));
be8e71aa
YO
1882 if (idx >= 0) {
1883 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1884 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1885
3dab1dad 1886 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1887 if ( folder )
1888 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1889 DEBUG_OPTIMISE_r(
07be1b83 1890 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1891 );
8e11feef
RGS
1892 }
1893 }
1894 TRIE_BITMAP_SET(trie,*ch);
1895 if ( folder )
1896 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1897 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1898 }
1899 idx = ofs;
1900 }
3dab1dad
YO
1901 }
1902 if ( count == 1 ) {
1903 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1904 char *ch = SvPV_nolen( *tmp );
1905 DEBUG_OPTIMISE_r({
1906 SV *sv=sv_newmortal();
8e11feef
RGS
1907 PerlIO_printf( Perl_debug_log,
1908 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1909 (int)depth * 2 + 2, "",
de734bd5
A
1910 (UV)state, (UV)idx,
1911 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1912 PL_colors[0], PL_colors[1],
1913 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1914 PERL_PV_ESCAPE_FIRSTCHAR
1915 )
1916 );
1917 });
3dab1dad
YO
1918 if ( state==1 ) {
1919 OP( convert ) = nodetype;
1920 str=STRING(convert);
1921 STR_LEN(convert)=0;
1922 }
de734bd5
A
1923 while (*ch) {
1924 *str++ = *ch++;
1925 STR_LEN(convert)++;
1926 }
1927
8e11feef 1928 } else {
f9049ba1 1929#ifdef DEBUGGING
8e11feef
RGS
1930 if (state>1)
1931 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1932#endif
8e11feef
RGS
1933 break;
1934 }
1935 }
3dab1dad 1936 if (str) {
8e11feef 1937 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1938 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1939 trie->startstate = state;
07be1b83
YO
1940 trie->minlen -= (state - 1);
1941 trie->maxlen -= (state - 1);
1942 DEBUG_r({
1943 regnode *fix = convert;
de734bd5 1944 U32 word = trie->wordcount;
07be1b83
YO
1945 mjd_nodelen++;
1946 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1947 while( ++fix < n ) {
1948 Set_Node_Offset_Length(fix, 0, 0);
1949 }
de734bd5
A
1950 while (word--) {
1951 SV ** const tmp = av_fetch( trie->words, word, 0 );
1952 if (tmp) {
1953 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1954 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1955 else
1956 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1957 }
1958 }
07be1b83 1959 });
8e11feef
RGS
1960 if (trie->maxlen) {
1961 convert = n;
1962 } else {
3dab1dad 1963 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1964 DEBUG_r(optimize= n);
3dab1dad
YO
1965 }
1966 }
1967 }
a5ca303d
YO
1968 if (!jumper)
1969 jumper = last;
3dab1dad 1970 if ( trie->maxlen ) {
8e11feef
RGS
1971 NEXT_OFF( convert ) = (U16)(tail - convert);
1972 ARG_SET( convert, data_slot );
786e8c11
YO
1973 /* Store the offset to the first unabsorbed branch in
1974 jump[0], which is otherwise unused by the jump logic.
1975 We use this when dumping a trie and during optimisation. */
1976 if (trie->jump)
7f69552c 1977 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1978
786e8c11
YO
1979 /* XXXX */
1980 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1981 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1982 {
1983 OP( convert ) = TRIEC;
1984 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1985 Safefree(trie->bitmap);
1986 trie->bitmap= NULL;
1987 } else
1988 OP( convert ) = TRIE;
a3621e74 1989
3dab1dad
YO
1990 /* store the type in the flags */
1991 convert->flags = nodetype;
a5ca303d
YO
1992 DEBUG_r({
1993 optimize = convert
1994 + NODE_STEP_REGNODE
1995 + regarglen[ OP( convert ) ];
1996 });
1997 /* XXX We really should free up the resource in trie now,
1998 as we won't use them - (which resources?) dmq */
3dab1dad 1999 }
a3621e74 2000 /* needed for dumping*/
e62cc96a 2001 DEBUG_r(if (optimize) {
07be1b83 2002 regnode *opt = convert;
e62cc96a 2003 while ( ++opt < optimize) {
07be1b83
YO
2004 Set_Node_Offset_Length(opt,0,0);
2005 }
786e8c11
YO
2006 /*
2007 Try to clean up some of the debris left after the
2008 optimisation.
a3621e74 2009 */
786e8c11 2010 while( optimize < jumper ) {
07be1b83 2011 mjd_nodelen += Node_Length((optimize));
a3621e74 2012 OP( optimize ) = OPTIMIZED;
07be1b83 2013 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2014 optimize++;
2015 }
07be1b83 2016 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2017 });
2018 } /* end node insert */
07be1b83 2019#ifndef DEBUGGING
6e8b4190 2020 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2021#endif
786e8c11
YO
2022 return trie->jump
2023 ? MADE_JUMP_TRIE
2024 : trie->startstate>1
2025 ? MADE_EXACT_TRIE
2026 : MADE_TRIE;
2027}
2028
2029STATIC void
2030S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2031{
2032/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2033
2034 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2035 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2036 ISBN 0-201-10088-6
2037
2038 We find the fail state for each state in the trie, this state is the longest proper
2039 suffix of the current states 'word' that is also a proper prefix of another word in our
2040 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2041 the DFA not to have to restart after its tried and failed a word at a given point, it
2042 simply continues as though it had been matching the other word in the first place.
2043 Consider
2044 'abcdgu'=~/abcdefg|cdgu/
2045 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2046 fail, which would bring use to the state representing 'd' in the second word where we would
2047 try 'g' and succeed, prodceding to match 'cdgu'.
2048 */
2049 /* add a fail transition */
2050 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2051 U32 *q;
2052 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2053 const U32 numstates = trie->statecount;
786e8c11
YO
2054 const U32 ubound = trie->lasttrans + ucharcount;
2055 U32 q_read = 0;
2056 U32 q_write = 0;
2057 U32 charid;
2058 U32 base = trie->states[ 1 ].trans.base;
2059 U32 *fail;
2060 reg_ac_data *aho;
2061 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2062 GET_RE_DEBUG_FLAGS_DECL;
2063#ifndef DEBUGGING
2064 PERL_UNUSED_ARG(depth);
2065#endif
2066
2067
2068 ARG_SET( stclass, data_slot );
2069 Newxz( aho, 1, reg_ac_data );
2070 RExC_rx->data->data[ data_slot ] = (void*)aho;
2071 aho->trie=trie;
2072 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2073 numstates * sizeof(reg_trie_state));
786e8c11
YO
2074 Newxz( q, numstates, U32);
2075 Newxz( aho->fail, numstates, U32 );
2076 aho->refcount = 1;
2077 fail = aho->fail;
2078 /* initialize fail[0..1] to be 1 so that we always have
2079 a valid final fail state */
2080 fail[ 0 ] = fail[ 1 ] = 1;
2081
2082 for ( charid = 0; charid < ucharcount ; charid++ ) {
2083 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2084 if ( newstate ) {
2085 q[ q_write ] = newstate;
2086 /* set to point at the root */
2087 fail[ q[ q_write++ ] ]=1;
2088 }
2089 }
2090 while ( q_read < q_write) {
2091 const U32 cur = q[ q_read++ % numstates ];
2092 base = trie->states[ cur ].trans.base;
2093
2094 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2095 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2096 if (ch_state) {
2097 U32 fail_state = cur;
2098 U32 fail_base;
2099 do {
2100 fail_state = fail[ fail_state ];
2101 fail_base = aho->states[ fail_state ].trans.base;
2102 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2103
2104 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2105 fail[ ch_state ] = fail_state;
2106 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2107 {
2108 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2109 }
2110 q[ q_write++ % numstates] = ch_state;
2111 }
2112 }
2113 }
2114 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2115 when we fail in state 1, this allows us to use the
2116 charclass scan to find a valid start char. This is based on the principle
2117 that theres a good chance the string being searched contains lots of stuff
2118 that cant be a start char.
2119 */
2120 fail[ 0 ] = fail[ 1 ] = 0;
2121 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2122 PerlIO_printf(Perl_debug_log,
2123 "%*sStclass Failtable (%"UVuf" states): 0",
2124 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2125 );
786e8c11
YO
2126 for( q_read=1; q_read<numstates; q_read++ ) {
2127 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2128 }
2129 PerlIO_printf(Perl_debug_log, "\n");
2130 });
2131 Safefree(q);
2132 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2133}
2134
786e8c11 2135
a3621e74 2136/*
5d1c421c
JH
2137 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2138 * These need to be revisited when a newer toolchain becomes available.
2139 */
2140#if defined(__sparc64__) && defined(__GNUC__)
2141# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2142# undef SPARC64_GCC_WORKAROUND
2143# define SPARC64_GCC_WORKAROUND 1
2144# endif
2145#endif
2146
07be1b83 2147#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2148 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2149 SV * const mysv=sv_newmortal(); \
2150 regnode *Next = regnext(scan); \
2151 regprop(RExC_rx, mysv, scan); \
7f69552c 2152 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2153 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2154 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2155 }});
07be1b83 2156
1de06328
YO
2157
2158
2159
2160
07be1b83
YO
2161#define JOIN_EXACT(scan,min,flags) \
2162 if (PL_regkind[OP(scan)] == EXACT) \
2163 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2164
be8e71aa 2165STATIC U32
07be1b83
YO
2166S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2167 /* Merge several consecutive EXACTish nodes into one. */
2168 regnode *n = regnext(scan);
2169 U32 stringok = 1;
2170 regnode *next = scan + NODE_SZ_STR(scan);
2171 U32 merged = 0;
2172 U32 stopnow = 0;
2173#ifdef DEBUGGING
2174 regnode *stop = scan;
72f13be8 2175 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2176#else
d47053eb
RGS
2177 PERL_UNUSED_ARG(depth);
2178#endif
2179#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2180 PERL_UNUSED_ARG(flags);
2181 PERL_UNUSED_ARG(val);
07be1b83 2182#endif
07be1b83
YO
2183 DEBUG_PEEP("join",scan,depth);
2184
2185 /* Skip NOTHING, merge EXACT*. */
2186 while (n &&
2187 ( PL_regkind[OP(n)] == NOTHING ||
2188 (stringok && (OP(n) == OP(scan))))
2189 && NEXT_OFF(n)
2190 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2191
2192 if (OP(n) == TAIL || n > next)
2193 stringok = 0;
2194 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2195 DEBUG_PEEP("skip:",n,depth);
2196 NEXT_OFF(scan) += NEXT_OFF(n);
2197 next = n + NODE_STEP_REGNODE;
2198#ifdef DEBUGGING
2199 if (stringok)
2200 stop = n;
2201#endif
2202 n = regnext(n);
2203 }
2204 else if (stringok) {
786e8c11 2205 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2206 regnode * const nnext = regnext(n);
2207
2208 DEBUG_PEEP("merg",n,depth);
2209
2210 merged++;
2211 if (oldl + STR_LEN(n) > U8_MAX)
2212 break;
2213 NEXT_OFF(scan) += NEXT_OFF(n);
2214 STR_LEN(scan) += STR_LEN(n);
2215 next = n + NODE_SZ_STR(n);
2216 /* Now we can overwrite *n : */
2217 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2218#ifdef DEBUGGING
2219 stop = next - 1;
2220#endif
2221 n = nnext;
2222 if (stopnow) break;
2223 }
2224
d47053eb
RGS
2225#ifdef EXPERIMENTAL_INPLACESCAN
2226 if (flags && !NEXT_OFF(n)) {
2227 DEBUG_PEEP("atch", val, depth);
2228 if (reg_off_by_arg[OP(n)]) {
2229 ARG_SET(n, val - n);
2230 }
2231 else {
2232 NEXT_OFF(n) = val - n;
2233 }
2234 stopnow = 1;
2235 }
07be1b83
YO
2236#endif
2237 }
2238
2239 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2240 /*
2241 Two problematic code points in Unicode casefolding of EXACT nodes:
2242
2243 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2244 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2245
2246 which casefold to
2247
2248 Unicode UTF-8
2249
2250 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2251 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2252
2253 This means that in case-insensitive matching (or "loose matching",
2254 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2255 length of the above casefolded versions) can match a target string
2256 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2257 This would rather mess up the minimum length computation.
2258
2259 What we'll do is to look for the tail four bytes, and then peek
2260 at the preceding two bytes to see whether we need to decrease
2261 the minimum length by four (six minus two).
2262
2263 Thanks to the design of UTF-8, there cannot be false matches:
2264 A sequence of valid UTF-8 bytes cannot be a subsequence of
2265 another valid sequence of UTF-8 bytes.
2266
2267 */
2268 char * const s0 = STRING(scan), *s, *t;
2269 char * const s1 = s0 + STR_LEN(scan) - 1;
2270 char * const s2 = s1 - 4;
e294cc5d
JH
2271#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2272 const char t0[] = "\xaf\x49\xaf\x42";
2273#else
07be1b83 2274 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2275#endif
07be1b83
YO
2276 const char * const t1 = t0 + 3;
2277
2278 for (s = s0 + 2;
2279 s < s2 && (t = ninstr(s, s1, t0, t1));
2280 s = t + 4) {
e294cc5d
JH
2281#ifdef EBCDIC
2282 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2283 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2284#else
07be1b83
YO
2285 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2286 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2287#endif
07be1b83
YO
2288 *min -= 4;
2289 }
2290 }
2291
2292#ifdef DEBUGGING
2293 /* Allow dumping */
2294 n = scan + NODE_SZ_STR(scan);
2295 while (n <= stop) {
2296 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2297 OP(n) = OPTIMIZED;
2298 NEXT_OFF(n) = 0;
2299 }
2300 n++;
2301 }
2302#endif
2303 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2304 return stopnow;
2305}
2306
653099ff
GS
2307/* REx optimizer. Converts nodes into quickier variants "in place".
2308 Finds fixed substrings. */
2309
a0288114 2310/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2311 to the position after last scanned or to NULL. */
2312
40d049e4
YO
2313#define INIT_AND_WITHP \
2314 assert(!and_withp); \
2315 Newx(and_withp,1,struct regnode_charclass_class); \
2316 SAVEFREEPV(and_withp)
07be1b83 2317
b515a41d
YO
2318/* this is a chain of data about sub patterns we are processing that
2319 need to be handled seperately/specially in study_chunk. Its so
2320 we can simulate recursion without losing state. */
2321struct scan_frame;
2322typedef struct scan_frame {
2323 regnode *last; /* last node to process in this frame */
2324 regnode *next; /* next node to process when last is reached */
2325 struct scan_frame *prev; /*previous frame*/
2326 I32 stop; /* what stopparen do we use */
2327} scan_frame;
2328
76e3520e 2329STATIC I32
40d049e4 2330S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2331 I32 *minlenp, I32 *deltap,
40d049e4
YO
2332 regnode *last,
2333 scan_data_t *data,
2334 I32 stopparen,
2335 U8* recursed,
2336 struct regnode_charclass_class *and_withp,
2337 U32 flags, U32 depth)
c277df42
IZ
2338 /* scanp: Start here (read-write). */
2339 /* deltap: Write maxlen-minlen here. */
2340 /* last: Stop before this one. */
40d049e4
YO
2341 /* data: string data about the pattern */
2342 /* stopparen: treat close N as END */
2343 /* recursed: which subroutines have we recursed into */
2344 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2345{
97aff369 2346 dVAR;
c277df42
IZ
2347 I32 min = 0, pars = 0, code;
2348 regnode *scan = *scanp, *next;
2349 I32 delta = 0;
2350 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2351 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2352 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2353 scan_data_t data_fake;
a3621e74 2354 SV *re_trie_maxbuff = NULL;
786e8c11 2355 regnode *first_non_open = scan;
e2e6a0f1 2356 I32 stopmin = I32_MAX;
8aa23a47
YO
2357 scan_frame *frame = NULL;
2358
a3621e74 2359 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2360
13a24bad 2361#ifdef DEBUGGING
40d049e4 2362 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2363#endif
40d049e4 2364
786e8c11 2365 if ( depth == 0 ) {
40d049e4 2366 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2367 first_non_open=regnext(first_non_open);
2368 }
2369
b81d288d 2370
8aa23a47
YO
2371 fake_study_recurse:
2372 while ( scan && OP(scan) != END && scan < last ){
2373 /* Peephole optimizer: */
2374 DEBUG_STUDYDATA(data,depth);
2375 DEBUG_PEEP("Peep",scan,depth);
2376 JOIN_EXACT(scan,&min,0);
2377
2378 /* Follow the next-chain of the current node and optimize
2379 away all the NOTHINGs from it. */
2380 if (OP(scan) != CURLYX) {
2381 const int max = (reg_off_by_arg[OP(scan)]
2382 ? I32_MAX
2383 /* I32 may be smaller than U16 on CRAYs! */
2384 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2385 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2386 int noff;
2387 regnode *n = scan;
2388
2389 /* Skip NOTHING and LONGJMP. */
2390 while ((n = regnext(n))
2391 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2392 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2393 && off + noff < max)
2394 off += noff;
2395 if (reg_off_by_arg[OP(scan)])
2396 ARG(scan) = off;
2397 else
2398 NEXT_OFF(scan) = off;
2399 }
a3621e74 2400
c277df42 2401
8aa23a47
YO
2402
2403 /* The principal pseudo-switch. Cannot be a switch, since we
2404 look into several different things. */
2405 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2406 || OP(scan) == IFTHEN) {
2407 next = regnext(scan);
2408 code = OP(scan);
2409 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2410
2411 if (OP(next) == code || code == IFTHEN) {
2412 /* NOTE - There is similar code to this block below for handling
2413 TRIE nodes on a re-study. If you change stuff here check there
2414 too. */
2415 I32 max1 = 0, min1 = I32_MAX, num = 0;
2416 struct regnode_charclass_class accum;
2417 regnode * const startbranch=scan;
2418
2419 if (flags & SCF_DO_SUBSTR)
2420 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2421 if (flags & SCF_DO_STCLASS)
2422 cl_init_zero(pRExC_state, &accum);
2423
2424 while (OP(scan) == code) {
2425 I32 deltanext, minnext, f = 0, fake;
2426 struct regnode_charclass_class this_class;
2427
2428 num++;
2429 data_fake.flags = 0;
2430 if (data) {
2431 data_fake.whilem_c = data->whilem_c;
2432 data_fake.last_closep = data->last_closep;
2433 }
2434 else
2435 data_fake.last_closep = &fake;
58e23c8d
YO
2436
2437 data_fake.pos_delta = delta;
8aa23a47
YO
2438 next = regnext(scan);
2439 scan = NEXTOPER(scan);
2440 if (code != BRANCH)
c277df42 2441 scan = NEXTOPER(scan);
8aa23a47
YO
2442 if (flags & SCF_DO_STCLASS) {
2443 cl_init(pRExC_state, &this_class);
2444 data_fake.start_class = &this_class;
2445 f = SCF_DO_STCLASS_AND;
58e23c8d 2446 }
8aa23a47
YO
2447 if (flags & SCF_WHILEM_VISITED_POS)
2448 f |= SCF_WHILEM_VISITED_POS;
2449
2450 /* we suppose the run is continuous, last=next...*/
2451 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2452 next, &data_fake,
2453 stopparen, recursed, NULL, f,depth+1);
2454 if (min1 > minnext)
2455 min1 = minnext;
2456 if (max1 < minnext + deltanext)
2457 max1 = minnext + deltanext;
2458 if (deltanext == I32_MAX)
2459 is_inf = is_inf_internal = 1;
2460 scan = next;
2461 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2462 pars++;
2463 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2464 if ( stopmin > minnext)
2465 stopmin = min + min1;
2466 flags &= ~SCF_DO_SUBSTR;
2467 if (data)
2468 data->flags |= SCF_SEEN_ACCEPT;
2469 }
2470 if (data) {
2471 if (data_fake.flags & SF_HAS_EVAL)
2472 data->flags |= SF_HAS_EVAL;
2473 data->whilem_c = data_fake.whilem_c;
3dab1dad 2474 }
8aa23a47
YO
2475 if (flags & SCF_DO_STCLASS)
2476 cl_or(pRExC_state, &accum, &this_class);
2477 }
2478 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2479 min1 = 0;
2480 if (flags & SCF_DO_SUBSTR) {
2481 data->pos_min += min1;
2482 data->pos_delta += max1 - min1;
2483 if (max1 != min1 || is_inf)
2484 data->longest = &(data->longest_float);
2485 }
2486 min += min1;
2487 delta += max1 - min1;
2488 if (flags & SCF_DO_STCLASS_OR) {
2489 cl_or(pRExC_state, data->start_class, &accum);
2490 if (min1) {
2491 cl_and(data->start_class, and_withp);
2492 flags &= ~SCF_DO_STCLASS;
653099ff 2493 }
8aa23a47
YO
2494 }
2495 else if (flags & SCF_DO_STCLASS_AND) {
2496 if (min1) {
2497 cl_and(data->start_class, &accum);
2498 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2499 }
8aa23a47
YO
2500 else {
2501 /* Switch to OR mode: cache the old value of
2502 * data->start_class */
2503 INIT_AND_WITHP;
2504 StructCopy(data->start_class, and_withp,
2505 struct regnode_charclass_class);
2506 flags &= ~SCF_DO_STCLASS_AND;
2507 StructCopy(&accum, data->start_class,
2508 struct regnode_charclass_class);
2509 flags |= SCF_DO_STCLASS_OR;
2510 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2511 }
8aa23a47 2512 }
a3621e74 2513
8aa23a47
YO
2514 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2515 /* demq.
a3621e74 2516
8aa23a47
YO
2517 Assuming this was/is a branch we are dealing with: 'scan' now
2518 points at the item that follows the branch sequence, whatever
2519 it is. We now start at the beginning of the sequence and look
2520 for subsequences of
a3621e74 2521
8aa23a47
YO
2522 BRANCH->EXACT=>x1
2523 BRANCH->EXACT=>x2
2524 tail
a3621e74 2525
8aa23a47 2526 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2527
8aa23a47
YO
2528 If we can find such a subseqence we need to turn the first
2529 element into a trie and then add the subsequent branch exact
2530 strings to the trie.
a3621e74 2531
8aa23a47 2532 We have two cases
a3621e74 2533
8aa23a47 2534 1. patterns where the whole set of branch can be converted.
a3621e74 2535
8aa23a47 2536 2. patterns where only a subset can be converted.
a3621e74 2537
8aa23a47
YO
2538 In case 1 we can replace the whole set with a single regop
2539 for the trie. In case 2 we need to keep the start and end
2540 branchs so
a3621e74 2541
8aa23a47
YO
2542 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2543 becomes BRANCH TRIE; BRANCH X;
786e8c11 2544
8aa23a47
YO
2545 There is an additional case, that being where there is a
2546 common prefix, which gets split out into an EXACT like node
2547 preceding the TRIE node.
a3621e74 2548
8aa23a47
YO
2549 If x(1..n)==tail then we can do a simple trie, if not we make
2550 a "jump" trie, such that when we match the appropriate word
2551 we "jump" to the appopriate tail node. Essentailly we turn
2552 a nested if into a case structure of sorts.
b515a41d 2553
8aa23a47
YO
2554 */
2555
2556 int made=0;
2557 if (!re_trie_maxbuff) {
2558 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2559 if (!SvIOK(re_trie_maxbuff))
2560 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2561 }
2562 if ( SvIV(re_trie_maxbuff)>=0 ) {
2563 regnode *cur;
2564 regnode *first = (regnode *)NULL;
2565 regnode *last = (regnode *)NULL;
2566 regnode *tail = scan;
2567 U8 optype = 0;
2568 U32 count=0;
a3621e74
YO
2569
2570#ifdef DEBUGGING
8aa23a47 2571 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2572#endif
8aa23a47
YO
2573 /* var tail is used because there may be a TAIL
2574 regop in the way. Ie, the exacts will point to the
2575 thing following the TAIL, but the last branch will
2576 point at the TAIL. So we advance tail. If we
2577 have nested (?:) we may have to move through several
2578 tails.
2579 */
2580
2581 while ( OP( tail ) == TAIL ) {
2582 /* this is the TAIL generated by (?:) */
2583 tail = regnext( tail );
2584 }
a3621e74 2585
8aa23a47
YO
2586
2587 DEBUG_OPTIMISE_r({
2588 regprop(RExC_rx, mysv, tail );
2589 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2590 (int)depth * 2 + 2, "",
2591 "Looking for TRIE'able sequences. Tail node is: ",
2592 SvPV_nolen_const( mysv )
2593 );
2594 });
2595
2596 /*
2597
2598 step through the branches, cur represents each
2599 branch, noper is the first thing to be matched
2600 as part of that branch and noper_next is the
2601 regnext() of that node. if noper is an EXACT
2602 and noper_next is the same as scan (our current
2603 position in the regex) then the EXACT branch is
2604 a possible optimization target. Once we have
2605 two or more consequetive such branches we can
2606 create a trie of the EXACT's contents and stich
2607 it in place. If the sequence represents all of
2608 the branches we eliminate the whole thing and
2609 replace it with a single TRIE. If it is a
2610 subsequence then we need to stitch it in. This
2611 means the first branch has to remain, and needs
2612 to be repointed at the item on the branch chain
2613 following the last branch optimized. This could
2614 be either a BRANCH, in which case the
2615 subsequence is internal, or it could be the
2616 item following the branch sequence in which
2617 case the subsequence is at the end.
2618
2619 */
2620
2621 /* dont use tail as the end marker for this traverse */
2622 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2623 regnode * const noper = NEXTOPER( cur );
b515a41d 2624#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2625 regnode * const noper_next = regnext( noper );
b515a41d
YO
2626#endif
2627
8aa23a47
YO
2628 DEBUG_OPTIMISE_r({
2629 regprop(RExC_rx, mysv, cur);
2630 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2631 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2632
2633 regprop(RExC_rx, mysv, noper);
2634 PerlIO_printf( Perl_debug_log, " -> %s",
2635 SvPV_nolen_const(mysv));
2636
2637 if ( noper_next ) {
2638 regprop(RExC_rx, mysv, noper_next );
2639 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2640 SvPV_nolen_const(mysv));
2641 }
2642 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2643 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2644 });
2645 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2646 : PL_regkind[ OP( noper ) ] == EXACT )
2647 || OP(noper) == NOTHING )
786e8c11 2648#ifdef NOJUMPTRIE
8aa23a47 2649 && noper_next == tail
786e8c11 2650#endif
8aa23a47
YO
2651 && count < U16_MAX)
2652 {
2653 count++;
2654 if ( !first || optype == NOTHING ) {
2655 if (!first) first = cur;
2656 optype = OP( noper );
2657 } else {
2658 last = cur;
2659 }
2660 } else {
2661 if ( last ) {
2662 make_trie( pRExC_state,
2663 startbranch, first, cur, tail, count,
2664 optype, depth+1 );
2665 }
2666 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2667#ifdef NOJUMPTRIE
8aa23a47 2668 && noper_next == tail
786e8c11 2669#endif
8aa23a47
YO
2670 ){
2671 count = 1;
2672 first = cur;
2673 optype = OP( noper );
2674 } else {
2675 count = 0;
2676 first = NULL;
2677 optype = 0;
2678 }
2679 last = NULL;
2680 }
2681 }
2682 DEBUG_OPTIMISE_r({
2683 regprop(RExC_rx, mysv, cur);
2684 PerlIO_printf( Perl_debug_log,
2685 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2686 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2687
2688 });
2689 if ( last ) {
2690 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2691#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2692 if ( ((made == MADE_EXACT_TRIE &&
2693 startbranch == first)
2694 || ( first_non_open == first )) &&
2695 depth==0 ) {
2696 flags |= SCF_TRIE_RESTUDY;
2697 if ( startbranch == first
2698 && scan == tail )
2699 {
2700 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2701 }
2702 }
3dab1dad 2703#endif
8aa23a47
YO
2704 }
2705 }
2706
2707 } /* do trie */
2708
653099ff 2709 }
8aa23a47
YO
2710 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2711 scan = NEXTOPER(NEXTOPER(scan));
2712 } else /* single branch is optimized. */
2713 scan = NEXTOPER(scan);
2714 continue;
2715 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2716 scan_frame *newframe = NULL;
2717 I32 paren;
2718 regnode *start;
2719 regnode *end;
2720
2721 if (OP(scan) != SUSPEND) {
2722 /* set the pointer */
2723 if (OP(scan) == GOSUB) {
2724 paren = ARG(scan);
2725 RExC_recurse[ARG2L(scan)] = scan;
2726 start = RExC_open_parens[paren-1];
2727 end = RExC_close_parens[paren-1];
2728 } else {
2729 paren = 0;
2730 start = RExC_rx->program + 1;
2731 end = RExC_opend;
2732 }
2733 if (!recursed) {
2734 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2735 SAVEFREEPV(recursed);
2736 }
2737 if (!PAREN_TEST(recursed,paren+1)) {
2738 PAREN_SET(recursed,paren+1);
2739 Newx(newframe,1,scan_frame);
2740 } else {
2741 if (flags & SCF_DO_SUBSTR) {
2742 scan_commit(pRExC_state,data,minlenp);
2743 data->longest = &(data->longest_float);
2744 }
2745 is_inf = is_inf_internal = 1;
2746 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2747 cl_anything(pRExC_state, data->start_class);
2748 flags &= ~SCF_DO_STCLASS;
2749 }
2750 } else {
2751 Newx(newframe,1,scan_frame);
2752 paren = stopparen;
2753 start = scan+2;
2754 end = regnext(scan);
2755 }
2756 if (newframe) {
2757 assert(start);
2758 assert(end);
2759 SAVEFREEPV(newframe);
2760 newframe->next = regnext(scan);
2761 newframe->last = last;
2762 newframe->stop = stopparen;
2763 newframe->prev = frame;
2764
2765 frame = newframe;
2766 scan = start;
2767 stopparen = paren;
2768 last = end;
2769
2770 continue;
2771 }
2772 }
2773 else if (OP(scan) == EXACT) {
2774 I32 l = STR_LEN(scan);
2775 UV uc;
2776 if (UTF) {
2777 const U8 * const s = (U8*)STRING(scan);
2778 l = utf8_length(s, s + l);
2779 uc = utf8_to_uvchr(s, NULL);
2780 } else {
2781 uc = *((U8*)STRING(scan));
2782 }
2783 min += l;
2784 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2785 /* The code below prefers earlier match for fixed
2786 offset, later match for variable offset. */
2787 if (data->last_end == -1) { /* Update the start info. */
2788 data->last_start_min = data->pos_min;
2789 data->last_start_max = is_inf
2790 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2791 }
8aa23a47
YO
2792 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2793 if (UTF)
2794 SvUTF8_on(data->last_found);
2795 {
2796 SV * const sv = data->last_found;
2797 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2798 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2799 if (mg && mg->mg_len >= 0)
2800 mg->mg_len += utf8_length((U8*)STRING(scan),
2801 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2802 }
8aa23a47
YO
2803 data->last_end = data->pos_min + l;
2804 data->pos_min += l; /* As in the first entry. */
2805 data->flags &= ~SF_BEFORE_EOL;
2806 }
2807 if (flags & SCF_DO_STCLASS_AND) {
2808 /* Check whether it is compatible with what we know already! */
2809 int compat = 1;
2810
2811 if (uc >= 0x100 ||
2812 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2813 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2814 && (!(data->start_class->flags & ANYOF_FOLD)
2815 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2816 )
2817 compat = 0;
2818 ANYOF_CLASS_ZERO(data->start_class);
2819 ANYOF_BITMAP_ZERO(data->start_class);
2820 if (compat)
2821 ANYOF_BITMAP_SET(data->start_class, uc);
2822 data->start_class->flags &= ~ANYOF_EOS;
2823 if (uc < 0x100)
2824 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2825 }
2826 else if (flags & SCF_DO_STCLASS_OR) {
2827 /* false positive possible if the class is case-folded */
2828 if (uc < 0x100)
2829 ANYOF_BITMAP_SET(data->start_class, uc);
2830 else
2831 data->start_class->flags |= ANYOF_UNICODE_ALL;
2832 data->start_class->flags &= ~ANYOF_EOS;
2833 cl_and(data->start_class, and_withp);
2834 }
2835 flags &= ~SCF_DO_STCLASS;
2836 }
2837 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2838 I32 l = STR_LEN(scan);
2839 UV uc = *((U8*)STRING(scan));
2840
2841 /* Search for fixed substrings supports EXACT only. */
2842 if (flags & SCF_DO_SUBSTR) {
2843 assert(data);
2844 scan_commit(pRExC_state, data, minlenp);
2845 }
2846 if (UTF) {
2847 const U8 * const s = (U8 *)STRING(scan);
2848 l = utf8_length(s, s + l);
2849 uc = utf8_to_uvchr(s, NULL);
2850 }
2851 min += l;
2852 if (flags & SCF_DO_SUBSTR)
2853 data->pos_min += l;
2854 if (flags & SCF_DO_STCLASS_AND) {
2855 /* Check whether it is compatible with what we know already! */
2856 int compat = 1;
2857
2858 if (uc >= 0x100 ||
2859 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2860 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2861 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2862 compat = 0;
2863 ANYOF_CLASS_ZERO(data->start_class);
2864 ANYOF_BITMAP_ZERO(data->start_class);
2865 if (compat) {
2866 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2867 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2868 data->start_class->flags |= ANYOF_FOLD;
2869 if (OP(scan) == EXACTFL)
2870 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2871 }
8aa23a47
YO
2872 }
2873 else if (flags & SCF_DO_STCLASS_OR) {
2874 if (data->start_class->flags & ANYOF_FOLD) {
2875 /* false positive possible if the class is case-folded.
2876 Assume that the locale settings are the same... */
1aa99e6b
IH
2877 if (uc < 0x100)
2878 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2879 data->start_class->flags &= ~ANYOF_EOS;
2880 }
8aa23a47 2881 cl_and(data->start_class, and_withp);
653099ff 2882 }
8aa23a47
YO
2883 flags &= ~SCF_DO_STCLASS;
2884 }
2885 else if (strchr((const char*)PL_varies,OP(scan))) {
2886 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2887 I32 f = flags, pos_before = 0;
2888 regnode * const oscan = scan;
2889 struct regnode_charclass_class this_class;
2890 struct regnode_charclass_class *oclass = NULL;
2891 I32 next_is_eval = 0;
2892
2893 switch (PL_regkind[OP(scan)]) {
2894 case WHILEM: /* End of (?:...)* . */
2895 scan = NEXTOPER(scan);
2896 goto finish;
2897 case PLUS:
2898 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2899 next = NEXTOPER(scan);
2900 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2901 mincount = 1;
2902 maxcount = REG_INFTY;
2903 next = regnext(scan);
2904 scan = NEXTOPER(scan);
2905 goto do_curly;
2906 }
2907 }
2908 if (flags & SCF_DO_SUBSTR)
2909 data->pos_min++;
2910 min++;
2911 /* Fall through. */
2912 case STAR:
2913 if (flags & SCF_DO_STCLASS) {
2914 mincount = 0;
2915 maxcount = REG_INFTY;
2916 next = regnext(scan);
2917 scan = NEXTOPER(scan);
2918 goto do_curly;
2919 }
2920 is_inf = is_inf_internal = 1;
2921 scan = regnext(scan);
c277df42 2922 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2923 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2924 data->longest = &(data->longest_float);
c277df42 2925 }
8aa23a47
YO
2926 goto optimize_curly_tail;
2927 case CURLY:
2928 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2929 && (scan->flags == stopparen))
2930 {
2931 mincount = 1;
2932 maxcount = 1;
2933 } else {
2934 mincount = ARG1(scan);
2935 maxcount = ARG2(scan);
653099ff 2936 }
8aa23a47
YO
2937 next = regnext(scan);
2938 if (OP(scan) == CURLYX) {
2939 I32 lp = (data ? *(data->last_closep) : 0);
2940 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2941 }
8aa23a47
YO
2942 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2943 next_is_eval = (OP(scan) == EVAL);
2944 do_curly:
2945 if (flags & SCF_DO_SUBSTR) {
2946 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2947 pos_before = data->pos_min;
b45f050a 2948 }
8aa23a47
YO
2949 if (data) {
2950 fl = data->flags;
2951 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2952 if (is_inf)
2953 data->flags |= SF_IS_INF;
2954 }
2955 if (flags & SCF_DO_STCLASS) {
2956 cl_init(pRExC_state, &this_class);
2957 oclass = data->start_class;
2958 data->start_class = &this_class;
2959 f |= SCF_DO_STCLASS_AND;
2960 f &= ~SCF_DO_STCLASS_OR;
2961 }
2962 /* These are the cases when once a subexpression
2963 fails at a particular position, it cannot succeed
2964 even after backtracking at the enclosing scope.
2965
2966 XXXX what if minimal match and we are at the
2967 initial run of {n,m}? */
2968 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2969 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2970
8aa23a47
YO
2971 /* This will finish on WHILEM, setting scan, or on NULL: */
2972 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2973 last, data, stopparen, recursed, NULL,
2974 (mincount == 0
2975 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2976
8aa23a47
YO
2977 if (flags & SCF_DO_STCLASS)
2978 data->start_class = oclass;
2979 if (mincount == 0 || minnext == 0) {
2980 if (flags & SCF_DO_STCLASS_OR) {
2981 cl_or(pRExC_state, data->start_class, &this_class);
2982 }
2983 else if (flags & SCF_DO_STCLASS_AND) {
2984 /* Switch to OR mode: cache the old value of
2985 * data->start_class */
2986 INIT_AND_WITHP;
2987 StructCopy(data->start_class, and_withp,
2988 struct regnode_charclass_class);
2989 flags &= ~SCF_DO_STCLASS_AND;
2990 StructCopy(&this_class, data->start_class,
2991 struct regnode_charclass_class);
2992 flags |= SCF_DO_STCLASS_OR;
2993 data->start_class->flags |= ANYOF_EOS;
2994 }
2995 } else { /* Non-zero len */
2996 if (flags & SCF_DO_STCLASS_OR) {
2997 cl_or(pRExC_state, data->start_class, &this_class);
2998 cl_and(data->start_class, and_withp);
2999 }
3000 else if (flags & SCF_DO_STCLASS_AND)
3001 cl_and(data->start_class, &this_class);
3002 flags &= ~SCF_DO_STCLASS;
3003 }
3004 if (!scan) /* It was not CURLYX, but CURLY. */
3005 scan = next;
3006 if ( /* ? quantifier ok, except for (?{ ... }) */
3007 (next_is_eval || !(mincount == 0 && maxcount == 1))
3008 && (minnext == 0) && (deltanext == 0)
3009 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3010 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3011 && ckWARN(WARN_REGEXP))
3012 {
3013 vWARN(RExC_parse,
3014 "Quantifier unexpected on zero-length expression");
3015 }
3016
3017 min += minnext * mincount;
3018 is_inf_internal |= ((maxcount == REG_INFTY
3019 && (minnext + deltanext) > 0)
3020 || deltanext == I32_MAX);
3021 is_inf |= is_inf_internal;
3022 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3023
3024 /* Try powerful optimization CURLYX => CURLYN. */
3025 if ( OP(oscan) == CURLYX && data
3026 && data->flags & SF_IN_PAR
3027 && !(data->flags & SF_HAS_EVAL)
3028 && !deltanext && minnext == 1 ) {
3029 /* Try to optimize to CURLYN. */
3030 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3031 regnode * const nxt1 = nxt;
497b47a8 3032#ifdef DEBUGGING
8aa23a47 3033 regnode *nxt2;
497b47a8 3034#endif
c277df42 3035
8aa23a47
YO
3036 /* Skip open. */
3037 nxt = regnext(nxt);
3038 if (!strchr((const char*)PL_simple,OP(nxt))
3039 && !(PL_regkind[OP(nxt)] == EXACT
3040 && STR_LEN(nxt) == 1))
3041 goto nogo;
497b47a8 3042#ifdef DEBUGGING
8aa23a47 3043 nxt2 = nxt;
497b47a8 3044#endif
8aa23a47
YO
3045 nxt = regnext(nxt);
3046 if (OP(nxt) != CLOSE)
3047 goto nogo;
3048 if (RExC_open_parens) {
3049 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3050 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3051 }
3052 /* Now we know that nxt2 is the only contents: */
3053 oscan->flags = (U8)ARG(nxt);
3054 OP(oscan) = CURLYN;
3055 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3056
c277df42 3057#ifdef DEBUGGING
8aa23a47
YO
3058 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3059 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3060 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3061 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3062 OP(nxt + 1) = OPTIMIZED; /* was count. */
3063 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3064#endif
8aa23a47
YO
3065 }
3066 nogo:
3067
3068 /* Try optimization CURLYX => CURLYM. */
3069 if ( OP(oscan) == CURLYX && data
3070 && !(data->flags & SF_HAS_PAR)
3071 && !(data->flags & SF_HAS_EVAL)
3072 && !deltanext /* atom is fixed width */
3073 && minnext != 0 /* CURLYM can't handle zero width */
3074 ) {
3075 /* XXXX How to optimize if data == 0? */
3076 /* Optimize to a simpler form. */
3077 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3078 regnode *nxt2;
3079
3080 OP(oscan) = CURLYM;
3081 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3082 && (OP(nxt2) != WHILEM))
3083 nxt = nxt2;
3084 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3085 /* Need to optimize away parenths. */
3086 if (data->flags & SF_IN_PAR) {
3087 /* Set the parenth number. */
3088 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3089
3090 if (OP(nxt) != CLOSE)
3091 FAIL("Panic opt close");
3092 oscan->flags = (U8)ARG(nxt);
3093 if (RExC_open_parens) {
3094 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3095 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3096 }
8aa23a47
YO
3097 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3098 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3099
c277df42 3100#ifdef DEBUGGING
8aa23a47
YO
3101 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3102 OP(nxt + 1) = OPTIMIZED; /* was count. */
3103 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3104 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3105#endif
c277df42 3106#if 0
8aa23a47
YO
3107 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3108 regnode *nnxt = regnext(nxt1);
3109
3110 if (nnxt == nxt) {
3111 if (reg_off_by_arg[OP(nxt1)])
3112 ARG_SET(nxt1, nxt2 - nxt1);
3113 else if (nxt2 - nxt1 < U16_MAX)
3114 NEXT_OFF(nxt1) = nxt2 - nxt1;
3115 else
3116 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3117 }
8aa23a47 3118 nxt1 = nnxt;
c277df42 3119 }
5d1c421c 3120#endif
8aa23a47
YO
3121 /* Optimize again: */
3122 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3123 NULL, stopparen, recursed, NULL, 0,depth+1);
3124 }
3125 else
3126 oscan->flags = 0;
3127 }
3128 else if ((OP(oscan) == CURLYX)
3129 && (flags & SCF_WHILEM_VISITED_POS)
3130 /* See the comment on a similar expression above.
3131 However, this time it not a subexpression
3132 we care about, but the expression itself. */
3133 && (maxcount == REG_INFTY)
3134 && data && ++data->whilem_c < 16) {
3135 /* This stays as CURLYX, we can put the count/of pair. */
3136 /* Find WHILEM (as in regexec.c) */
3137 regnode *nxt = oscan + NEXT_OFF(oscan);
3138
3139 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3140 nxt += ARG(nxt);
3141 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3142 | (RExC_whilem_seen << 4)); /* On WHILEM */
3143 }
3144 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3145 pars++;
3146 if (flags & SCF_DO_SUBSTR) {
3147 SV *last_str = NULL;
3148 int counted = mincount != 0;
a0ed51b3 3149
8aa23a47
YO
3150 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3151#if defined(SPARC64_GCC_WORKAROUND)
3152 I32 b = 0;
3153 STRLEN l = 0;
3154 const char *s = NULL;
3155 I32 old = 0;
b515a41d 3156
8aa23a47
YO
3157 if (pos_before >= data->last_start_min)
3158 b = pos_before;
3159 else
3160 b = data->last_start_min;
b515a41d 3161
8aa23a47
YO
3162 l = 0;
3163 s = SvPV_const(data->last_found, l);
3164 old = b - data->last_start_min;
3165
3166#else
3167 I32 b = pos_before >= data->last_start_min
3168 ? pos_before : data->last_start_min;
3169 STRLEN l;
3170 const char * const s = SvPV_const(data->last_found, l);
3171 I32 old = b - data->last_start_min;
3172#endif
3173
3174 if (UTF)
3175 old = utf8_hop((U8*)s, old) - (U8*)s;
3176
3177 l -= old;
3178 /* Get the added string: */
3179 last_str = newSVpvn(s + old, l);
3180 if (UTF)
3181 SvUTF8_on(last_str);
3182 if (deltanext == 0 && pos_before == b) {
3183 /* What was added is a constant string */
3184 if (mincount > 1) {
3185 SvGROW(last_str, (mincount * l) + 1);
3186 repeatcpy(SvPVX(last_str) + l,
3187 SvPVX_const(last_str), l, mincount - 1);
3188 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3189 /* Add additional parts. */
3190 SvCUR_set(data->last_found,
3191 SvCUR(data->last_found) - l);
3192 sv_catsv(data->last_found, last_str);
3193 {
3194 SV * sv = data->last_found;
3195 MAGIC *mg =
3196 SvUTF8(sv) && SvMAGICAL(sv) ?
3197 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3198 if (mg && mg->mg_len >= 0)
3199 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3200 }
8aa23a47 3201 data->last_end += l * (mincount - 1);
b515a41d 3202 }
8aa23a47
YO
3203 } else {
3204 /* start offset must point into the last copy */
3205 data->last_start_min += minnext * (mincount - 1);
3206 data->last_start_max += is_inf ? I32_MAX
3207 : (maxcount - 1) * (minnext + data->pos_delta);
3208 }
c277df42 3209 }
8aa23a47
YO
3210 /* It is counted once already... */
3211 data->pos_min += minnext * (mincount - counted);
3212 data->pos_delta += - counted * deltanext +
3213 (minnext + deltanext) * maxcount - minnext * mincount;
3214 if (mincount != maxcount) {
3215 /* Cannot extend fixed substrings found inside
3216 the group. */
3217 scan_commit(pRExC_state,data,minlenp);
3218 if (mincount && last_str) {
3219 SV * const sv = data->last_found;
3220 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3221 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3222
3223 if (mg)
3224 mg->mg_len = -1;
3225 sv_setsv(sv, last_str);
3226 data->last_end = data->pos_min;
3227 data->last_start_min =
3228 data->pos_min - CHR_SVLEN(last_str);
3229 data->last_start_max = is_inf
3230 ? I32_MAX
3231 : data->pos_min + data->pos_delta
3232 - CHR_SVLEN(last_str);
3233 }
3234 data->longest = &(data->longest_float);
3235 }
3236 SvREFCNT_dec(last_str);
c277df42 3237 }
8aa23a47
YO
3238 if (data && (fl & SF_HAS_EVAL))
3239 data->flags |= SF_HAS_EVAL;
3240 optimize_curly_tail:
3241 if (OP(oscan) != CURLYX) {
3242 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3243 && NEXT_OFF(next))
3244 NEXT_OFF(oscan) += NEXT_OFF(next);
3245 }
3246 continue;
3247 default: /* REF and CLUMP only? */
3248 if (flags & SCF_DO_SUBSTR) {
3249 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3250 data->longest = &(data->longest_float);
3251 }
3252 is_inf = is_inf_internal = 1;
3253 if (flags & SCF_DO_STCLASS_OR)
3254 cl_anything(pRExC_state, data->start_class);
3255 flags &= ~SCF_DO_STCLASS;
3256 break;
c277df42 3257 }
8aa23a47
YO
3258 }
3259 else if (strchr((const char*)PL_simple,OP(scan))) {
3260 int value = 0;
653099ff 3261
8aa23a47
YO
3262 if (flags & SCF_DO_SUBSTR) {
3263 scan_commit(pRExC_state,data,minlenp);
3264 data->pos_min++;
3265 }
3266 min++;
3267 if (flags & SCF_DO_STCLASS) {
3268 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3269
8aa23a47
YO
3270 /* Some of the logic below assumes that switching
3271 locale on will only add false positives. */
3272 switch (PL_regkind[OP(scan)]) {
3273 case SANY:
3274 default:
3275 do_default:
3276 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3277 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3278 cl_anything(pRExC_state, data->start_class);
3279 break;
3280 case REG_ANY:
3281 if (OP(scan) == SANY)
3282 goto do_default;
3283 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3284 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3285 || (data->start_class->flags & ANYOF_CLASS));
3286 cl_anything(pRExC_state, data->start_class);
653099ff 3287 }
8aa23a47
YO
3288 if (flags & SCF_DO_STCLASS_AND || !value)
3289 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3290 break;
3291 case ANYOF:
3292 if (flags & SCF_DO_STCLASS_AND)
3293 cl_and(data->start_class,
3294 (struct regnode_charclass_class*)scan);
653099ff 3295 else
8aa23a47
YO
3296 cl_or(pRExC_state, data->start_class,
3297 (struct regnode_charclass_class*)scan);
3298 break;
3299 case ALNUM:
3300 if (flags & SCF_DO_STCLASS_AND) {
3301 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3302 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3303 for (value = 0; value < 256; value++)
3304 if (!isALNUM(value))
3305 ANYOF_BITMAP_CLEAR(data->start_class, value);
3306 }
653099ff 3307 }
8aa23a47
YO
3308 else {
3309 if (data->start_class->flags & ANYOF_LOCALE)
3310 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3311 else {
3312 for (value = 0; value < 256; value++)
3313 if (isALNUM(value))
3314 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3315 }
8aa23a47
YO
3316 }
3317 break;
3318 case ALNUML:
3319 if (flags & SCF_DO_STCLASS_AND) {
3320 if (data->start_class->flags & ANYOF_LOCALE)
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3322 }
3323 else {
3324 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3325 data->start_class->flags |= ANYOF_LOCALE;
3326 }
3327 break;
3328 case NALNUM:
3329 if (flags & SCF_DO_STCLASS_AND) {
3330 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3331 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3332 for (value = 0; value < 256; value++)
3333 if (isALNUM(value))
3334 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3335 }
3336 }
8aa23a47
YO
3337 else {
3338 if (data->start_class->flags & ANYOF_LOCALE)
3339 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3340 else {
3341 for (value = 0; value < 256; value++)
3342 if (!isALNUM(value))
3343 ANYOF_BITMAP_SET(data->start_class, value);
3344 }
653099ff 3345 }
8aa23a47
YO
3346 break;
3347 case NALNUML:
3348 if (flags & SCF_DO_STCLASS_AND) {
3349 if (data->start_class->flags & ANYOF_LOCALE)
3350 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3351 }
8aa23a47
YO
3352 else {
3353 data->start_class->flags |= ANYOF_LOCALE;
3354 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3355 }
3356 break;
3357 case SPACE:
3358 if (flags & SCF_DO_STCLASS_AND) {
3359 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3361 for (value = 0; value < 256; value++)
3362 if (!isSPACE(value))
3363 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3364 }
3365 }
8aa23a47
YO
3366 else {
3367 if (data->start_class->flags & ANYOF_LOCALE)
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3369 else {
3370 for (value = 0; value < 256; value++)
3371 if (isSPACE(value))
3372 ANYOF_BITMAP_SET(data->start_class, value);
3373 }
653099ff 3374 }
8aa23a47
YO
3375 break;
3376 case SPACEL:
3377 if (flags & SCF_DO_STCLASS_AND) {
3378 if (data->start_class->flags & ANYOF_LOCALE)
3379 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3380 }
3381 else {
3382 data->start_class->flags |= ANYOF_LOCALE;
3383 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3384 }
3385 break;
3386 case NSPACE:
3387 if (flags & SCF_DO_STCLASS_AND) {
3388 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3389 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3390 for (value = 0; value < 256; value++)
3391 if (isSPACE(value))
3392 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3393 }
8aa23a47
YO
3394 }
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (!isSPACE(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3402 }
3403 }
8aa23a47
YO
3404 break;
3405 case NSPACEL:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 if (data->start_class->flags & ANYOF_LOCALE) {
3408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3409 for (value = 0; value < 256; value++)
3410 if (!isSPACE(value))
3411 ANYOF_BITMAP_CLEAR(data->start_class, value);
3412 }
653099ff 3413 }
8aa23a47
YO
3414 else {
3415 data->start_class->flags |= ANYOF_LOCALE;
3416 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3417 }
3418 break;
3419 case DIGIT:
3420 if (flags & SCF_DO_STCLASS_AND) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3422 for (value = 0; value < 256; value++)
3423 if (!isDIGIT(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
3425 }
3426 else {
3427 if (data->start_class->flags & ANYOF_LOCALE)
3428 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3429 else {
3430 for (value = 0; value < 256; value++)
3431 if (isDIGIT(value))
3432 ANYOF_BITMAP_SET(data->start_class, value);
3433 }
3434 }
3435 break;
3436 case NDIGIT:
3437 if (flags & SCF_DO_STCLASS_AND) {
3438 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3439 for (value = 0; value < 256; value++)
3440 if (isDIGIT(value))
3441 ANYOF_BITMAP_CLEAR(data->start_class, value);
3442 }
3443 else {
3444 if (data->start_class->flags & ANYOF_LOCALE)
3445 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3446 else {
3447 for (value = 0; value < 256; value++)
3448 if (!isDIGIT(value))
3449 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3450 }
3451 }
8aa23a47
YO
3452 break;
3453 }
3454 if (flags & SCF_DO_STCLASS_OR)
3455 cl_and(data->start_class, and_withp);
3456 flags &= ~SCF_DO_STCLASS;
3457 }
3458 }
3459 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3460 data->flags |= (OP(scan) == MEOL
3461 ? SF_BEFORE_MEOL
3462 : SF_BEFORE_SEOL);
3463 }
3464 else if ( PL_regkind[OP(scan)] == BRANCHJ
3465 /* Lookbehind, or need to calculate parens/evals/stclass: */
3466 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3467 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3468 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3469 || OP(scan) == UNLESSM )
3470 {
3471 /* Negative Lookahead/lookbehind
3472 In this case we can't do fixed string optimisation.
3473 */
1de06328 3474
8aa23a47
YO
3475 I32 deltanext, minnext, fake = 0;
3476 regnode *nscan;
3477 struct regnode_charclass_class intrnl;
3478 int f = 0;
1de06328 3479
8aa23a47
YO
3480 data_fake.flags = 0;
3481 if (data) {
3482 data_fake.whilem_c = data->whilem_c;
3483 data_fake.last_closep = data->last_closep;
c277df42 3484 }
8aa23a47
YO
3485 else
3486 data_fake.last_closep = &fake;
58e23c8d 3487 data_fake.pos_delta = delta;
8aa23a47
YO
3488 if ( flags & SCF_DO_STCLASS && !scan->flags
3489 && OP(scan) == IFMATCH ) { /* Lookahead */
3490 cl_init(pRExC_state, &intrnl);
3491 data_fake.start_class = &intrnl;
3492 f |= SCF_DO_STCLASS_AND;
3493 }
3494 if (flags & SCF_WHILEM_VISITED_POS)
3495 f |= SCF_WHILEM_VISITED_POS;
3496 next = regnext(scan);
3497 nscan = NEXTOPER(NEXTOPER(scan));
3498 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3499 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3500 if (scan->flags) {
3501 if (deltanext) {
58e23c8d 3502 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3503 }
3504 else if (minnext > (I32)U8_MAX) {
58e23c8d 3505 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3506 }
3507 scan->flags = (U8)minnext;
3508 }
3509 if (data) {
3510 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3511 pars++;
3512 if (data_fake.flags & SF_HAS_EVAL)
3513 data->flags |= SF_HAS_EVAL;
3514 data->whilem_c = data_fake.whilem_c;
3515 }
3516 if (f & SCF_DO_STCLASS_AND) {
3517 const int was = (data->start_class->flags & ANYOF_EOS);
3518
3519 cl_and(data->start_class, &intrnl);
3520 if (was)
3521 data->start_class->flags |= ANYOF_EOS;
3522 }
cb434fcc 3523 }
8aa23a47
YO
3524#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3525 else {
3526 /* Positive Lookahead/lookbehind
3527 In this case we can do fixed string optimisation,
3528 but we must be careful about it. Note in the case of
3529 lookbehind the positions will be offset by the minimum
3530 length of the pattern, something we won't know about
3531 until after the recurse.
3532 */
3533 I32 deltanext, fake = 0;
3534 regnode *nscan;
3535 struct regnode_charclass_class intrnl;
3536 int f = 0;
3537 /* We use SAVEFREEPV so that when the full compile
3538 is finished perl will clean up the allocated
3539 minlens when its all done. This was we don't
3540 have to worry about freeing them when we know
3541 they wont be used, which would be a pain.
3542 */
3543 I32 *minnextp;
3544 Newx( minnextp, 1, I32 );
3545 SAVEFREEPV(minnextp);
3546
3547 if (data) {
3548 StructCopy(data, &data_fake, scan_data_t);
3549 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3550 f |= SCF_DO_SUBSTR;
3551 if (scan->flags)
3552 scan_commit(pRExC_state, &data_fake,minlenp);
3553 data_fake.last_found=newSVsv(data->last_found);
3554 }
3555 }
3556 else
3557 data_fake.last_closep = &fake;
3558 data_fake.flags = 0;
58e23c8d 3559 data_fake.pos_delta = delta;
8aa23a47
YO
3560 if (is_inf)
3561 data_fake.flags |= SF_IS_INF;
3562 if ( flags & SCF_DO_STCLASS && !scan->flags
3563 && OP(scan) == IFMATCH ) { /* Lookahead */
3564 cl_init(pRExC_state, &intrnl);
3565 data_fake.start_class = &intrnl;
3566 f |= SCF_DO_STCLASS_AND;
3567 }
3568 if (flags & SCF_WHILEM_VISITED_POS)
3569 f |= SCF_WHILEM_VISITED_POS;
3570 next = regnext(scan);
3571 nscan = NEXTOPER(NEXTOPER(scan));
3572
3573 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3574 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3575 if (scan->flags) {
3576 if (deltanext) {
58e23c8d 3577 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3578 }
3579 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3580 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3581 }
3582 scan->flags = (U8)*minnextp;
3583 }
3584
3585 *minnextp += min;
3586
3587 if (f & SCF_DO_STCLASS_AND) {
3588 const int was = (data->start_class->flags & ANYOF_EOS);
3589
3590 cl_and(data->start_class, &intrnl);
3591 if (was)
3592 data->start_class->flags |= ANYOF_EOS;
3593 }
3594 if (data) {
3595 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3596 pars++;
3597 if (data_fake.flags & SF_HAS_EVAL)
3598 data->flags |= SF_HAS_EVAL;
3599 data->whilem_c = data_fake.whilem_c;
3600 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3601 if (RExC_rx->minlen<*minnextp)
3602 RExC_rx->minlen=*minnextp;
3603 scan_commit(pRExC_state, &data_fake, minnextp);
3604 SvREFCNT_dec(data_fake.last_found);
3605
3606 if ( data_fake.minlen_fixed != minlenp )
3607 {
3608 data->offset_fixed= data_fake.offset_fixed;
3609 data->minlen_fixed= data_fake.minlen_fixed;
3610 data->lookbehind_fixed+= scan->flags;
3611 }
3612 if ( data_fake.minlen_float != minlenp )
3613 {
3614 data->minlen_float= data_fake.minlen_float;
3615 data->offset_float_min=data_fake.offset_float_min;
3616 data->offset_float_max=data_fake.offset_float_max;
3617 data->lookbehind_float+= scan->flags;
3618 }
3619 }
3620 }
3621
3622
40d049e4 3623 }
8aa23a47
YO
3624#endif
3625 }
3626 else if (OP(scan) == OPEN) {
3627 if (stopparen != (I32)ARG(scan))
3628 pars++;
3629 }
3630 else if (OP(scan) == CLOSE) {
3631 if (stopparen == (I32)ARG(scan)) {
3632 break;
3633 }
3634 if ((I32)ARG(scan) == is_par) {
3635 next = regnext(scan);
b515a41d 3636
8aa23a47
YO
3637 if ( next && (OP(next) != WHILEM) && next < last)
3638 is_par = 0; /* Disable optimization */
40d049e4 3639 }
8aa23a47
YO
3640 if (data)
3641 *(data->last_closep) = ARG(scan);
3642 }
3643 else if (OP(scan) == EVAL) {
c277df42
IZ
3644 if (data)
3645 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3646 }
3647 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3648 if (flags & SCF_DO_SUBSTR) {
3649 scan_commit(pRExC_state,data,minlenp);
3650 flags &= ~SCF_DO_SUBSTR;
40d049e4 3651 }
8aa23a47
YO
3652 if (data && OP(scan)==ACCEPT) {
3653 data->flags |= SCF_SEEN_ACCEPT;
3654 if (stopmin > min)
3655 stopmin = min;
e2e6a0f1 3656 }
8aa23a47
YO
3657 }
3658 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3659 {
0f5d15d6 3660 if (flags & SCF_DO_SUBSTR) {
1de06328 3661 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3662 data->longest = &(data->longest_float);
3663 }
3664 is_inf = is_inf_internal = 1;
653099ff 3665 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3666 cl_anything(pRExC_state, data->start_class);
96776eda 3667 flags &= ~SCF_DO_STCLASS;
8aa23a47 3668 }
58e23c8d 3669 else if (OP(scan) == GPOS) {
bbe252da 3670 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3671 !(delta || is_inf || (data && data->pos_delta)))
3672 {
bbe252da
YO
3673 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3674 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3675 if (RExC_rx->gofs < (U32)min)
3676 RExC_rx->gofs = min;
3677 } else {
bbe252da 3678 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3679 RExC_rx->gofs = 0;
3680 }
3681 }
786e8c11 3682#ifdef TRIE_STUDY_OPT
40d049e4 3683#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3684 else if (PL_regkind[OP(scan)] == TRIE) {
3685 /* NOTE - There is similar code to this block above for handling
3686 BRANCH nodes on the initial study. If you change stuff here
3687 check there too. */
3688 regnode *trie_node= scan;
3689 regnode *tail= regnext(scan);
3690 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3691 I32 max1 = 0, min1 = I32_MAX;
3692 struct regnode_charclass_class accum;
3693
3694 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3695 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3696 if (flags & SCF_DO_STCLASS)
3697 cl_init_zero(pRExC_state, &accum);
3698
3699 if (!trie->jump) {
3700 min1= trie->minlen;
3701 max1= trie->maxlen;
3702 } else {
3703 const regnode *nextbranch= NULL;
3704 U32 word;
3705
3706 for ( word=1 ; word <= trie->wordcount ; word++)
3707 {
3708 I32 deltanext=0, minnext=0, f = 0, fake;
3709 struct regnode_charclass_class this_class;
3710
3711 data_fake.flags = 0;
3712 if (data) {
3713 data_fake.whilem_c = data->whilem_c;
3714 data_fake.last_closep = data->last_closep;
3715 }
3716 else
3717 data_fake.last_closep = &fake;
58e23c8d 3718 data_fake.pos_delta = delta;
8aa23a47
YO
3719 if (flags & SCF_DO_STCLASS) {
3720 cl_init(pRExC_state, &this_class);
3721 data_fake.start_class = &this_class;
3722 f = SCF_DO_STCLASS_AND;
3723 }
3724 if (flags & SCF_WHILEM_VISITED_POS)
3725 f |= SCF_WHILEM_VISITED_POS;
3726
3727 if (trie->jump[word]) {
3728 if (!nextbranch)
3729 nextbranch = trie_node + trie->jump[0];
3730 scan= trie_node + trie->jump[word];
3731 /* We go from the jump point to the branch that follows
3732 it. Note this means we need the vestigal unused branches
3733 even though they arent otherwise used.
3734 */
3735 minnext = study_chunk(pRExC_state, &scan, minlenp,
3736 &deltanext, (regnode *)nextbranch, &data_fake,
3737 stopparen, recursed, NULL, f,depth+1);
3738 }
3739 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3740 nextbranch= regnext((regnode*)nextbranch);
3741
3742 if (min1 > (I32)(minnext + trie->minlen))
3743 min1 = minnext + trie->minlen;
3744 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3745 max1 = minnext + deltanext + trie->maxlen;
3746 if (deltanext == I32_MAX)
3747 is_inf = is_inf_internal = 1;
3748
3749 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3750 pars++;
3751 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3752 if ( stopmin > min + min1)
3753 stopmin = min + min1;
3754 flags &= ~SCF_DO_SUBSTR;
3755 if (data)
3756 data->flags |= SCF_SEEN_ACCEPT;
3757 }
3758 if (data) {
3759 if (data_fake.flags & SF_HAS_EVAL)
3760 data->flags |= SF_HAS_EVAL;
3761 data->whilem_c = data_fake.whilem_c;
3762 }
3763 if (flags & SCF_DO_STCLASS)
3764 cl_or(pRExC_state, &accum, &this_class);
3765 }
3766 }
3767 if (flags & SCF_DO_SUBSTR) {
3768 data->pos_min += min1;
3769 data->pos_delta += max1 - min1;
3770 if (max1 != min1 || is_inf)
3771 data->longest = &(data->longest_float);
3772 }
3773 min += min1;
3774 delta += max1 - min1;
3775 if (flags & SCF_DO_STCLASS_OR) {
3776 cl_or(pRExC_state, data->start_class, &accum);
3777 if (min1) {
3778 cl_and(data->start_class, and_withp);
3779 flags &= ~SCF_DO_STCLASS;
3780 }
3781 }
3782 else if (flags & SCF_DO_STCLASS_AND) {
3783 if (min1) {
3784 cl_and(data->start_class, &accum);
3785 flags &= ~SCF_DO_STCLASS;
3786 }
3787 else {
3788 /* Switch to OR mode: cache the old value of
3789 * data->start_class */
3790 INIT_AND_WITHP;
3791 StructCopy(data->start_class, and_withp,
3792 struct regnode_charclass_class);
3793 flags &= ~SCF_DO_STCLASS_AND;
3794 StructCopy(&accum, data->start_class,
3795 struct regnode_charclass_class);
3796 flags |= SCF_DO_STCLASS_OR;
3797 data->start_class->flags |= ANYOF_EOS;
3798 }
3799 }
3800 scan= tail;
3801 continue;
3802 }
786e8c11 3803#else
8aa23a47
YO
3804 else if (PL_regkind[OP(scan)] == TRIE) {
3805 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3806 U8*bang=NULL;
3807
3808 min += trie->minlen;
3809 delta += (trie->maxlen - trie->minlen);
3810 flags &= ~SCF_DO_STCLASS; /* xxx */
3811 if (flags & SCF_DO_SUBSTR) {
3812 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3813 data->pos_min += trie->minlen;
3814 data->pos_delta += (trie->maxlen - trie->minlen);
3815 if (trie->maxlen != trie->minlen)
3816 data->longest = &(data->longest_float);
3817 }
3818 if (trie->jump) /* no more substrings -- for now /grr*/
3819 flags &= ~SCF_DO_SUBSTR;
b515a41d 3820 }
8aa23a47
YO
3821#endif /* old or new */
3822#endif /* TRIE_STUDY_OPT */
3823 /* Else: zero-length, ignore. */
3824 scan = regnext(scan);
3825 }
3826 if (frame) {
3827 last = frame->last;
3828 scan = frame->next;
3829 stopparen = frame->stop;
3830 frame = frame->prev;
3831 goto fake_study_recurse;
c277df42
IZ
3832 }
3833
3834 finish:
8aa23a47
YO
3835 assert(!frame);
3836
c277df42 3837 *scanp = scan;
aca2d497 3838 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3839 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3840 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3841 if (is_par > (I32)U8_MAX)
c277df42
IZ
3842 is_par = 0;
3843 if (is_par && pars==1 && data) {
3844 data->flags |= SF_IN_PAR;
3845 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3846 }
3847 else if (pars && data) {
c277df42
IZ
3848 data->flags |= SF_HAS_PAR;
3849 data->flags &= ~SF_IN_PAR;
3850 }
653099ff 3851 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3852 cl_and(data->start_class, and_withp);
786e8c11
YO
3853 if (flags & SCF_TRIE_RESTUDY)
3854 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3855
3856 DEBUG_STUDYDATA(data,depth);
3857
e2e6a0f1 3858 return min < stopmin ? min : stopmin;
c277df42
IZ
3859}
3860
76e3520e 3861STATIC I32
5f66b61c 3862S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3863{
830247a4 3864 if (RExC_rx->data) {
2eb97020 3865 const U32 count = RExC_rx->data->count;
b81d288d 3866 Renewc(RExC_rx->data,
2eb97020 3867 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3868 char, struct reg_data);
2eb97020 3869 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3870 RExC_rx->data->count += n;
a0ed51b3
LW
3871 }
3872 else {
a02a5408 3873 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3874 char, struct reg_data);
a02a5408 3875 Newx(RExC_rx->data->what, n, U8);
830247a4 3876 RExC_rx->data->count = n;
c277df42 3877 }
830247a4
IZ
3878 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3879 return RExC_rx->data->count - n;
c277df42
IZ
3880}
3881
76234dfb 3882#ifndef PERL_IN_XSUB_RE
d88dccdf 3883void
864dbfa3 3884Perl_reginitcolors(pTHX)
d88dccdf 3885{
97aff369 3886 dVAR;
1df70142 3887 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3888 if (s) {
1df70142
AL
3889 char *t = savepv(s);
3890 int i = 0;
3891 PL_colors[0] = t;
d88dccdf 3892 while (++i < 6) {
1df70142
AL
3893 t = strchr(t, '\t');
3894 if (t) {
3895 *t = '\0';
3896 PL_colors[i] = ++t;
d88dccdf
IZ
3897 }
3898 else
1df70142 3899 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3900 }
3901 } else {
1df70142 3902 int i = 0;
b81d288d 3903 while (i < 6)
06b5626a 3904 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3905 }
3906 PL_colorset = 1;
3907}
76234dfb 3908#endif
8615cb43 3909
07be1b83 3910
786e8c11
YO
3911#ifdef TRIE_STUDY_OPT
3912#define CHECK_RESTUDY_GOTO \
3913 if ( \
3914 (data.flags & SCF_TRIE_RESTUDY) \
3915 && ! restudied++ \
3916 ) goto reStudy
3917#else
3918#define CHECK_RESTUDY_GOTO
3919#endif
f9f4320a 3920
a687059c 3921/*
e50aee73 3922 - pregcomp - compile a regular expression into internal code
a687059c
LW
3923 *
3924 * We can't allocate space until we know how big the compiled form will be,
3925 * but we can't compile it (and thus know how big it is) until we've got a
3926 * place to put the code. So we cheat: we compile it twice, once with code
3927 * generation turned off and size counting turned on, and once "for real".
3928 * This also means that we don't allocate space until we are sure that the
3929 * thing really will compile successfully, and we never have to move the
3930 * code and thus invalidate pointers into it. (Note that it has to be in
3931 * one piece because free() must be able to free it all.) [NB: not true in perl]
3932 *
3933 * Beware that the optimization-preparation code in here knows about some
3934 * of the structure of the compiled regexp. [I'll say.]
3935 */
b9b4dddf
YO
3936
3937
3938
f9f4320a 3939#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3940#define RE_ENGINE_PTR &PL_core_reg_engine
3941#else
f9f4320a
YO
3942extern const struct regexp_engine my_reg_engine;
3943#define RE_ENGINE_PTR &my_reg_engine
3944#endif
b9b4dddf
YO
3945/* these make a few things look better, to avoid indentation */
3946#define BEGIN_BLOCK {
f9f4320a
YO
3947#define END_BLOCK }
3948
a687059c 3949regexp *
864dbfa3 3950Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3951{
97aff369 3952 dVAR;
f9f4320a
YO
3953 GET_RE_DEBUG_FLAGS_DECL;
3954 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3955#ifndef PERL_IN_XSUB_RE
3956 BEGIN_BLOCK
f9f4320a
YO
3957 /* Dispatch a request to compile a regexp to correct
3958 regexp engine. */
3959 HV * const table = GvHV(PL_hintgv);
3960 if (table) {
3961 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3962 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3963 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3964 DEBUG_COMPILE_r({
8d8756e7 3965 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3966 SvIV(*ptr));
3967 });
f2f78491 3968 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3969 }
b9b4dddf
YO
3970 }
3971 END_BLOCK
3972#endif
3973 BEGIN_BLOCK
a0d0e21e 3974 register regexp *r;
c277df42 3975 regnode *scan;
c277df42 3976 regnode *first;
a0d0e21e 3977 I32 flags;
a0d0e21e
LW
3978 I32 minlen = 0;
3979 I32 sawplus = 0;
3980 I32 sawopen = 0;
2c2d71f5 3981 scan_data_t data;
830247a4 3982 RExC_state_t RExC_state;
be8e71aa 3983 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3984#ifdef TRIE_STUDY_OPT
3985 int restudied= 0;
3986 RExC_state_t copyRExC_state;
3987#endif
a0d0e21e 3988 if (exp == NULL)
c277df42 3989 FAIL("NULL regexp argument");
a0d0e21e 3990
a5961de5 3991 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3992
5cfc7842 3993 RExC_precomp = exp;
a3621e74 3994 DEBUG_COMPILE_r({
ab3bbdeb
YO
3995 SV *dsv= sv_newmortal();
3996 RE_PV_QUOTED_DECL(s, RExC_utf8,
3997 dsv, RExC_precomp, (xend - exp), 60);
3998 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3999 PL_colors[4],PL_colors[5],s);
a5961de5 4000 });
e2509266 4001 RExC_flags = pm->op_pmflags;
830247a4 4002 RExC_sawback = 0;
bbce6d69 4003
830247a4
IZ
4004 RExC_seen = 0;
4005 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4006 RExC_seen_evals = 0;
4007 RExC_extralen = 0;
c277df42 4008
bbce6d69 4009 /* First pass: determine size, legality. */
830247a4 4010 RExC_parse = exp;
fac92740 4011 RExC_start = exp;
830247a4
IZ
4012 RExC_end = xend;
4013 RExC_naughty = 0;
4014 RExC_npar = 1;
c74340f9 4015 RExC_cpar = 1;
e2e6a0f1 4016 RExC_nestroot = 0;
830247a4
IZ
4017 RExC_size = 0L;
4018 RExC_emit = &PL_regdummy;
4019 RExC_whilem_seen = 0;
fc8cd66c 4020 RExC_charnames = NULL;
40d049e4
YO
4021 RExC_open_parens = NULL;
4022 RExC_close_parens = NULL;
4023 RExC_opend = NULL;
81714fb9 4024 RExC_paren_names = NULL;
40d049e4
YO
4025 RExC_recurse = NULL;
4026 RExC_recurse_count = 0;
81714fb9 4027
85ddcde9
JH
4028#if 0 /* REGC() is (currently) a NOP at the first pass.
4029 * Clever compilers notice this and complain. --jhi */
830247a4 4030 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4031#endif
3dab1dad
YO
4032 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4033 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4034 RExC_precomp = NULL;
a0d0e21e
LW
4035 return(NULL);
4036 }
07be1b83 4037 DEBUG_PARSE_r({
81714fb9
YO
4038 PerlIO_printf(Perl_debug_log,
4039 "Required size %"IVdf" nodes\n"
4040 "Starting second pass (creation)\n",
4041 (IV)RExC_size);
07be1b83
YO
4042 RExC_lastnum=0;
4043 RExC_lastparse=NULL;
4044 });
c277df42
IZ
4045 /* Small enough for pointer-storage convention?
4046 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4047 if (RExC_size >= 0x10000L && RExC_extralen)
4048 RExC_size += RExC_extralen;
c277df42 4049 else
830247a4
IZ
4050 RExC_extralen = 0;
4051 if (RExC_whilem_seen > 15)
4052 RExC_whilem_seen = 15;
a0d0e21e 4053
e2e6a0f1
YO
4054#ifdef DEBUGGING
4055 /* Make room for a sentinel value at the end of the program */
4056 RExC_size++;
4057#endif
4058
f9f4320a
YO
4059 /* Allocate space and zero-initialize. Note, the two step process
4060 of zeroing when in debug mode, thus anything assigned has to
4061 happen after that */
a02a5408 4062 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 4063 char, regexp);
a0d0e21e 4064 if (r == NULL)
b45f050a 4065 FAIL("Regexp out of space");
0f79a09d
GS
4066#ifdef DEBUGGING
4067 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 4068 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d
YO
4069#else
4070 /* bulk initialize fields with 0. */
4071 Zero(r, sizeof(regexp), char);
0f79a09d 4072#endif
58e23c8d
YO
4073
4074 /* non-zero initialization begins here */
f9f4320a 4075 r->engine= RE_ENGINE_PTR;
c277df42 4076 r->refcnt = 1;
bbce6d69 4077 r->prelen = xend - exp;
5cfc7842 4078 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4079 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4080 r->intflags = 0;
830247a4 4081 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4082
6bda09f9 4083 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4084 Newxz(RExC_open_parens, RExC_npar,regnode *);
4085 SAVEFREEPV(RExC_open_parens);
4086 Newxz(RExC_close_parens,RExC_npar,regnode *);
4087 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4088 }
4089
4090 /* Useful during FAIL. */
a02a5408 4091 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4092 if (r->offsets) {
2af232bd 4093 r->offsets[0] = RExC_size;
fac92740 4094 }
a3621e74 4095 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4096 "%s %"UVuf" bytes for offset annotations.\n",
4097 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4098 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4099
830247a4 4100 RExC_rx = r;
bbce6d69 4101
4102 /* Second pass: emit code. */
e2509266 4103 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4104 RExC_parse = exp;
4105 RExC_end = xend;
4106 RExC_naughty = 0;
4107 RExC_npar = 1;
c74340f9 4108 RExC_cpar = 1;
fac92740 4109 RExC_emit_start = r->program;
830247a4 4110 RExC_emit = r->program;
e2e6a0f1
YO
4111#ifdef DEBUGGING
4112 /* put a sentinal on the end of the program so we can check for
4113 overwrites */
4114 r->program[RExC_size].type = 255;
4115#endif
2cd61cdb 4116 /* Store the count of eval-groups for security checks: */
786e8c11 4117 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4118 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4119 r->data = 0;
3dab1dad 4120 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4121 return(NULL);
6bda09f9 4122
07be1b83
YO
4123 /* XXXX To minimize changes to RE engine we always allocate
4124 3-units-long substrs field. */
4125 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4126 if (RExC_recurse_count) {
4127 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4128 SAVEFREEPV(RExC_recurse);
4129 }
a0d0e21e 4130
07be1b83 4131reStudy:
1de06328 4132 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4133 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4134
07be1b83
YO
4135#ifdef TRIE_STUDY_OPT
4136 if ( restudied ) {
5d458dd8 4137 U32 seen=RExC_seen;
07be1b83 4138 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4139
4140 RExC_state = copyRExC_state;
4141 if (seen & REG_TOP_LEVEL_BRANCHES)
4142 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4143 else
4144 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4145 if (data.last_found) {
07be1b83 4146 SvREFCNT_dec(data.longest_fixed);
07be1b83 4147 SvREFCNT_dec(data.longest_float);
07be1b83 4148 SvREFCNT_dec(data.last_found);
1de06328 4149 }
40d049e4 4150 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4151 } else {
40d049e4 4152 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4153 copyRExC_state = RExC_state;
07be1b83 4154 }
40d049e4
YO
4155#else
4156 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4157#endif
fc8cd66c 4158
a0d0e21e 4159 /* Dig out information for optimizations. */
bbe252da 4160 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4161 pm->op_pmflags = RExC_flags;
a0ed51b3 4162 if (UTF)
bbe252da 4163 r->extflags |= RXf_UTF8; /* Unicode in it? */
c277df42 4164 r->regstclass = NULL;
830247a4 4165 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4166 r->intflags |= PREGf_NAUGHTY;
c277df42 4167 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4168
1de06328
YO
4169 /* testing for BRANCH here tells us whether there is "must appear"
4170 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4171 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4172 I32 fake;
c5254dd6 4173 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4174 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4175 int stclass_flag;
07be1b83 4176 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4177
4178 first = scan;
c277df42 4179 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4180 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4181 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4182 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4183 /* for now we can't handle lookbehind IFMATCH*/
4184 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4185 (OP(first) == PLUS) ||
4186 (OP(first) == MINMOD) ||
653099ff 4187 /* An {n,m} with n>0 */
07be1b83
YO
4188 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4189 {
786e8c11 4190
a0d0e21e
LW
4191 if (OP(first) == PLUS)
4192 sawplus = 1;
4193 else
3dab1dad 4194 first += regarglen[OP(first)];
07be1b83
YO
4195 if (OP(first) == IFMATCH) {
4196 first = NEXTOPER(first);
4197 first += EXTRA_STEP_2ARGS;
7c167cea 4198 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4199 first = NEXTOPER(first);
a687059c
LW
4200 }
4201
a0d0e21e
LW
4202 /* Starting-point info. */
4203 again:
786e8c11 4204 DEBUG_PEEP("first:",first,0);
07be1b83 4205 /* Ignore EXACT as we deal with it later. */
3dab1dad 4206 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4207 if (OP(first) == EXACT)
6f207bd3 4208 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4209 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4210 r->regstclass = first;
4211 }
07be1b83 4212#ifdef TRIE_STCLASS
786e8c11 4213 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4214 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4215 {
786e8c11 4216 regnode *trie_op;
07be1b83 4217 /* this can happen only on restudy */
786e8c11
YO
4218 if ( OP(first) == TRIE ) {
4219 struct regnode_1 *trieop;
4220 Newxz(trieop,1,struct regnode_1);
4221 StructCopy(first,trieop,struct regnode_1);
4222 trie_op=(regnode *)trieop;
4223 } else {
4224 struct regnode_charclass *trieop;
4225 Newxz(trieop,1,struct regnode_charclass);
4226 StructCopy(first,trieop,struct regnode_charclass);
4227 trie_op=(regnode *)trieop;
4228 }
1de06328 4229 OP(trie_op)+=2;
786e8c11
YO
4230 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4231 r->regstclass = trie_op;
07be1b83
YO
4232 }
4233#endif
bfed75c6 4234 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4235 r->regstclass = first;
3dab1dad
YO
4236 else if (PL_regkind[OP(first)] == BOUND ||
4237 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4238 r->regstclass = first;
3dab1dad 4239 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4240 r->extflags |= (OP(first) == MBOL
4241 ? RXf_ANCH_MBOL
cad2e5aa 4242 : (OP(first) == SBOL
bbe252da
YO
4243 ? RXf_ANCH_SBOL
4244 : RXf_ANCH_BOL));
a0d0e21e 4245 first = NEXTOPER(first);
774d564b 4246 goto again;
4247 }
4248 else if (OP(first) == GPOS) {
bbe252da 4249 r->extflags |= RXf_ANCH_GPOS;
774d564b 4250 first = NEXTOPER(first);
4251 goto again;
a0d0e21e 4252 }
cf2a2b69
YO
4253 else if ((!sawopen || !RExC_sawback) &&
4254 (OP(first) == STAR &&
3dab1dad 4255 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4256 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4257 {
4258 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4259 const int type =
4260 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4261 ? RXf_ANCH_MBOL
4262 : RXf_ANCH_SBOL;
4263 r->extflags |= type;
4264 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4265 first = NEXTOPER(first);
774d564b 4266 goto again;
a0d0e21e 4267 }
b81d288d 4268 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4269 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4270 /* x+ must match at the 1st pos of run of x's */
bbe252da 4271 r->intflags |= PREGf_SKIP;
a0d0e21e 4272
c277df42 4273 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4274#ifdef TRIE_STUDY_OPT
81714fb9 4275 DEBUG_PARSE_r(
be8e71aa
YO
4276 if (!restudied)
4277 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4278 (IV)(first - scan + 1))
4279 );
4280#else
81714fb9 4281 DEBUG_PARSE_r(
be8e71aa
YO
4282 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4283 (IV)(first - scan + 1))
4284 );
4285#endif
4286
4287
a0d0e21e
LW
4288 /*
4289 * If there's something expensive in the r.e., find the
4290 * longest literal string that must appear and make it the
4291 * regmust. Resolve ties in favor of later strings, since
4292 * the regstart check works with the beginning of the r.e.
4293 * and avoiding duplication strengthens checking. Not a
4294 * strong reason, but sufficient in the absence of others.
4295 * [Now we resolve ties in favor of the earlier string if
c277df42 4296 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4297 * earlier string may buy us something the later one won't.]
4298 */
de8c5301 4299
396482e1
GA
4300 data.longest_fixed = newSVpvs("");
4301 data.longest_float = newSVpvs("");
4302 data.last_found = newSVpvs("");
c277df42
IZ
4303 data.longest = &(data.longest_fixed);
4304 first = scan;
653099ff 4305 if (!r->regstclass) {
830247a4 4306 cl_init(pRExC_state, &ch_class);
653099ff
GS
4307 data.start_class = &ch_class;
4308 stclass_flag = SCF_DO_STCLASS_AND;
4309 } else /* XXXX Check for BOUND? */
4310 stclass_flag = 0;
cb434fcc 4311 data.last_closep = &last_close;
de8c5301 4312
1de06328 4313 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4314 &data, -1, NULL, NULL,
4315 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4316
07be1b83 4317
786e8c11
YO
4318 CHECK_RESTUDY_GOTO;
4319
4320
830247a4 4321 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4322 && data.last_start_min == 0 && data.last_end > 0
830247a4 4323 && !RExC_seen_zerolen
bbe252da
YO
4324 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4325 r->extflags |= RXf_CHECK_ALL;
1de06328 4326 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4327 SvREFCNT_dec(data.last_found);
4328
1de06328
YO
4329 /* Note that code very similar to this but for anchored string
4330 follows immediately below, changes may need to be made to both.
4331 Be careful.
4332 */
a0ed51b3 4333 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4334 if (longest_float_length
c277df42
IZ
4335 || (data.flags & SF_FL_BEFORE_EOL
4336 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4337 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4338 {
1182767e 4339 I32 t,ml;
cf93c79d 4340
1de06328 4341 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4342 && data.offset_fixed == data.offset_float_min
4343 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4344 goto remove_float; /* As in (a)+. */
4345
1de06328
YO
4346 /* copy the information about the longest float from the reg_scan_data
4347 over to the program. */
33b8afdf
JH
4348 if (SvUTF8(data.longest_float)) {
4349 r->float_utf8 = data.longest_float;
c445ea15 4350 r->float_substr = NULL;
33b8afdf
JH
4351 } else {
4352 r->float_substr = data.longest_float;
c445ea15 4353 r->float_utf8 = NULL;
33b8afdf 4354 }
1de06328
YO
4355 /* float_end_shift is how many chars that must be matched that
4356 follow this item. We calculate it ahead of time as once the
4357 lookbehind offset is added in we lose the ability to correctly
4358 calculate it.*/
4359 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4360 : (I32)longest_float_length;
1de06328
YO
4361 r->float_end_shift = ml - data.offset_float_min
4362 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4363 + data.lookbehind_float;
4364 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4365 r->float_max_offset = data.offset_float_max;
1182767e 4366 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4367 r->float_max_offset -= data.lookbehind_float;
4368
cf93c79d
IZ
4369 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4370 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4371 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4372 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4373 }
4374 else {
aca2d497 4375 remove_float:
c445ea15 4376 r->float_substr = r->float_utf8 = NULL;
c277df42 4377 SvREFCNT_dec(data.longest_float);
c5254dd6 4378 longest_float_length = 0;
a0d0e21e 4379 }
c277df42 4380
1de06328
YO
4381 /* Note that code very similar to this but for floating string
4382 is immediately above, changes may need to be made to both.
4383 Be careful.
4384 */
a0ed51b3 4385 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4386 if (longest_fixed_length
c277df42
IZ
4387 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4388 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4389 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4390 {
1182767e 4391 I32 t,ml;
cf93c79d 4392
1de06328
YO
4393 /* copy the information about the longest fixed
4394 from the reg_scan_data over to the program. */
33b8afdf
JH
4395 if (SvUTF8(data.longest_fixed)) {
4396 r->anchored_utf8 = data.longest_fixed;
c445ea15 4397 r->anchored_substr = NULL;
33b8afdf
JH
4398 } else {
4399 r->anchored_substr = data.longest_fixed;
c445ea15 4400 r->anchored_utf8 = NULL;
33b8afdf 4401 }
1de06328
YO
4402 /* fixed_end_shift is how many chars that must be matched that
4403 follow this item. We calculate it ahead of time as once the
4404 lookbehind offset is added in we lose the ability to correctly
4405 calculate it.*/
4406 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4407 : (I32)longest_fixed_length;
1de06328
YO
4408 r->anchored_end_shift = ml - data.offset_fixed
4409 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4410 + data.lookbehind_fixed;
4411 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4412
cf93c79d
IZ
4413 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4414 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4415 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4416 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4417 }
4418 else {
c445ea15 4419 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4420 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4421 longest_fixed_length = 0;
a0d0e21e 4422 }
b81d288d 4423 if (r->regstclass
ffc61ed2 4424 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4425 r->regstclass = NULL;
33b8afdf
JH
4426 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4427 && stclass_flag
653099ff 4428 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4429 && !cl_is_anything(data.start_class))
4430 {
1df70142 4431 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4432
a02a5408 4433 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4434 struct regnode_charclass_class);
4435 StructCopy(data.start_class,
830247a4 4436 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4437 struct regnode_charclass_class);
830247a4 4438 r->regstclass = (regnode*)RExC_rx->data->data[n];
bbe252da 4439 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4440 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4441 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4442 PerlIO_printf(Perl_debug_log,
a0288114 4443 "synthetic stclass \"%s\".\n",
3f7c398e 4444 SvPVX_const(sv));});
653099ff 4445 }
c277df42
IZ
4446
4447 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4448 if (longest_fixed_length > longest_float_length) {
1de06328 4449 r->check_end_shift = r->anchored_end_shift;
c277df42 4450 r->check_substr = r->anchored_substr;
33b8afdf 4451 r->check_utf8 = r->anchored_utf8;
c277df42 4452 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4453 if (r->extflags & RXf_ANCH_SINGLE)
4454 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4455 }
4456 else {
1de06328 4457 r->check_end_shift = r->float_end_shift;
c277df42 4458 r->check_substr = r->float_substr;
33b8afdf 4459 r->check_utf8 = r->float_utf8;
1de06328
YO
4460 r->check_offset_min = r->float_min_offset;
4461 r->check_offset_max = r->float_max_offset;
a0d0e21e 4462 }
30382c73
IZ
4463 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4464 This should be changed ASAP! */
bbe252da
YO
4465 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4466 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4467 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4468 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4469 }
1de06328
YO
4470 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4471 if ( (STRLEN)minlen < longest_float_length )
4472 minlen= longest_float_length;
4473 if ( (STRLEN)minlen < longest_fixed_length )
4474 minlen= longest_fixed_length;
4475 */
a0ed51b3
LW
4476 }
4477 else {
c277df42
IZ
4478 /* Several toplevels. Best we can is to set minlen. */
4479 I32 fake;
653099ff 4480 struct regnode_charclass_class ch_class;
cb434fcc 4481 I32 last_close = 0;
c277df42 4482
5d458dd8 4483 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4484
c277df42 4485 scan = r->program + 1;
830247a4 4486 cl_init(pRExC_state, &ch_class);
653099ff 4487 data.start_class = &ch_class;
cb434fcc 4488 data.last_closep = &last_close;
07be1b83 4489
de8c5301 4490
1de06328 4491 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4492 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4493
786e8c11 4494 CHECK_RESTUDY_GOTO;
07be1b83 4495
33b8afdf 4496 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4497 = r->float_substr = r->float_utf8 = NULL;
653099ff 4498 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4499 && !cl_is_anything(data.start_class))
4500 {
1df70142 4501 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4502
a02a5408 4503 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4504 struct regnode_charclass_class);
4505 StructCopy(data.start_class,
830247a4 4506 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4507 struct regnode_charclass_class);
830247a4 4508 r->regstclass = (regnode*)RExC_rx->data->data[n];
bbe252da 4509 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4510 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4511 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4512 PerlIO_printf(Perl_debug_log,
a0288114 4513 "synthetic stclass \"%s\".\n",
3f7c398e 4514 SvPVX_const(sv));});
653099ff 4515 }
a0d0e21e
LW
4516 }
4517
1de06328
YO
4518 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4519 the "real" pattern. */
cf9788e3
RGS
4520 DEBUG_OPTIMISE_r({
4521 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4522 minlen, r->minlen);
4523 });
de8c5301 4524 r->minlenret = minlen;
1de06328
YO
4525 if (r->minlen < minlen)
4526 r->minlen = minlen;
4527
b81d288d 4528 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4529 r->extflags |= RXf_GPOS_SEEN;
830247a4 4530 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4531 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4532 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4533 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4534 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4535 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4536 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4537 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4538 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4539 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4540 if (RExC_paren_names)
4541 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4542 else
4543 r->paren_names = NULL;
4544
40d049e4
YO
4545 if (RExC_recurse_count) {
4546 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4547 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4548 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4549 }
4550 }
a02a5408
JC
4551 Newxz(r->startp, RExC_npar, I32);
4552 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4553 /* assume we don't need to swap parens around before we match */
4554
be8e71aa
YO
4555 DEBUG_DUMP_r({
4556 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4557 regdump(r);
4558 });
8e9a8a48
YO
4559 DEBUG_OFFSETS_r(if (r->offsets) {
4560 const U32 len = r->offsets[0];
4561 U32 i;
4562 GET_RE_DEBUG_FLAGS_DECL;
4563 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4564 for (i = 1; i <= len; i++) {
4565 if (r->offsets[i*2-1] || r->offsets[i*2])
4566 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4567 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4568 }
4569 PerlIO_printf(Perl_debug_log, "\n");
4570 });
a0d0e21e 4571 return(r);
f9f4320a 4572 END_BLOCK
a687059c
LW
4573}
4574
f9f4320a
YO
4575#undef CORE_ONLY_BLOCK
4576#undef END_BLOCK
4577#undef RE_ENGINE_PTR
3dab1dad 4578
9af228c6 4579#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4580SV*
4581Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4582{
4583 I32 parno = 0; /* no match */
4584 if (PL_curpm) {
4585 const REGEXP * const rx = PM_GETRE(PL_curpm);
4586 if (rx && rx->paren_names) {
4587 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4588 if (he_str) {
4589 IV i;
4590 SV* sv_dat=HeVAL(he_str);
4591 I32 *nums=(I32*)SvPVX(sv_dat);
4592 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4593 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4594 rx->endp[nums[i]] != -1)
4595 {
4596 parno = nums[i];
4597 break;
4598 }
4599 }
4600 }
4601 }
4602 }
4603 if ( !parno ) {
4604 return 0;
4605 } else {
4606 GV *gv_paren;
4607 SV *sv= sv_newmortal();
4608 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4609 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4610 return GvSVn(gv_paren);
4611 }
4612}
9af228c6 4613#endif
0a4db386 4614
894be9b7 4615/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4616 * If flags is REG_RSN_RETURN_NULL returns null.
4617 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4618 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4619 * to the parsed name as looked up in the RExC_paren_names hash.
4620 * If there is an error throws a vFAIL().. type exception.
894be9b7 4621 */
0a4db386
YO
4622
4623#define REG_RSN_RETURN_NULL 0
4624#define REG_RSN_RETURN_NAME 1
4625#define REG_RSN_RETURN_DATA 2
4626
894be9b7
YO
4627STATIC SV*
4628S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4629 char *name_start = RExC_parse;
0a4db386 4630 if ( UTF ) {
894be9b7 4631 STRLEN numlen;
0a4db386
YO
4632 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4633 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4634 {
4635 RExC_parse += numlen;
4636 }
4637 } else {
4638 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4639 RExC_parse++;
4640 }
0a4db386
YO
4641 if ( flags ) {
4642 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4643 (int)(RExC_parse - name_start)));
894be9b7 4644 if (UTF)
0a4db386
YO
4645 SvUTF8_on(sv_name);
4646 if ( flags == REG_RSN_RETURN_NAME)
4647 return sv_name;
4648 else if (flags==REG_RSN_RETURN_DATA) {
4649 HE *he_str = NULL;
4650 SV *sv_dat = NULL;
4651 if ( ! sv_name ) /* should not happen*/
4652 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4653 if (RExC_paren_names)
4654 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4655 if ( he_str )
4656 sv_dat = HeVAL(he_str);
4657 if ( ! sv_dat )
4658 vFAIL("Reference to nonexistent named group");
4659 return sv_dat;
4660 }
4661 else {
4662 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4663 }
4664 /* NOT REACHED */
894be9b7 4665 }
0a4db386 4666 return NULL;
894be9b7
YO
4667}
4668
3dab1dad
YO
4669#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4670 int rem=(int)(RExC_end - RExC_parse); \
4671 int cut; \
4672 int num; \
4673 int iscut=0; \
4674 if (rem>10) { \
4675 rem=10; \
4676 iscut=1; \
4677 } \
4678 cut=10-rem; \
4679 if (RExC_lastparse!=RExC_parse) \
4680 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4681 rem, RExC_parse, \
4682 cut + 4, \
4683 iscut ? "..." : "<" \
4684 ); \
4685 else \
4686 PerlIO_printf(Perl_debug_log,"%16s",""); \
4687 \
4688 if (SIZE_ONLY) \
4689 num=RExC_size; \
4690 else \
4691 num=REG_NODE_NUM(RExC_emit); \
4692 if (RExC_lastnum!=num) \
0a4db386 4693 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4694 else \
0a4db386 4695 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4696 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4697 (int)((depth*2)), "", \
3dab1dad
YO
4698 (funcname) \
4699 ); \
4700 RExC_lastnum=num; \
4701 RExC_lastparse=RExC_parse; \
4702})
4703
07be1b83
YO
4704
4705
3dab1dad
YO
4706#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4707 DEBUG_PARSE_MSG((funcname)); \
4708 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4709})
6bda09f9
YO
4710#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4711 DEBUG_PARSE_MSG((funcname)); \
4712 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4713})
a687059c
LW
4714/*
4715 - reg - regular expression, i.e. main body or parenthesized thing
4716 *
4717 * Caller must absorb opening parenthesis.
4718 *
4719 * Combining parenthesis handling with the base level of regular expression
4720 * is a trifle forced, but the need to tie the tails of the branches to what
4721 * follows makes it hard to avoid.
4722 */
07be1b83
YO
4723#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4724#ifdef DEBUGGING
4725#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4726#else
4727#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4728#endif
3dab1dad 4729
e2e6a0f1
YO
4730/* this idea is borrowed from STR_WITH_LEN in handy.h */
4731#define CHECK_WORD(s,v,l) \
4732 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4733
76e3520e 4734STATIC regnode *
3dab1dad 4735S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4736 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4737{
27da23d5 4738 dVAR;
c277df42
IZ
4739 register regnode *ret; /* Will be the head of the group. */
4740 register regnode *br;
4741 register regnode *lastbr;
cbbf8932 4742 register regnode *ender = NULL;
a0d0e21e 4743 register I32 parno = 0;
cbbf8932
AL
4744 I32 flags;
4745 const I32 oregflags = RExC_flags;
6136c704
AL
4746 bool have_branch = 0;
4747 bool is_open = 0;
9d1d55b5
JP
4748
4749 /* for (?g), (?gc), and (?o) warnings; warning
4750 about (?c) will warn about (?g) -- japhy */
4751
6136c704
AL
4752#define WASTED_O 0x01
4753#define WASTED_G 0x02
4754#define WASTED_C 0x04
4755#define WASTED_GC (0x02|0x04)
cbbf8932 4756 I32 wastedflags = 0x00;
9d1d55b5 4757
fac92740 4758 char * parse_start = RExC_parse; /* MJD */
a28509cc 4759 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4760
3dab1dad
YO
4761 GET_RE_DEBUG_FLAGS_DECL;
4762 DEBUG_PARSE("reg ");
4763
4764
821b33a5 4765 *flagp = 0; /* Tentatively. */
a0d0e21e 4766
9d1d55b5 4767
a0d0e21e
LW
4768 /* Make an OPEN node, if parenthesized. */
4769 if (paren) {
e2e6a0f1
YO
4770 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4771 char *start_verb = RExC_parse;
4772 STRLEN verb_len = 0;
4773 char *start_arg = NULL;
4774 unsigned char op = 0;
4775 int argok = 1;
4776 int internal_argval = 0; /* internal_argval is only useful if !argok */
4777 while ( *RExC_parse && *RExC_parse != ')' ) {
4778 if ( *RExC_parse == ':' ) {
4779 start_arg = RExC_parse + 1;
4780 break;
4781 }
4782 RExC_parse++;
4783 }
4784 ++start_verb;
4785 verb_len = RExC_parse - start_verb;
4786 if ( start_arg ) {
4787 RExC_parse++;
4788 while ( *RExC_parse && *RExC_parse != ')' )
4789 RExC_parse++;
4790 if ( *RExC_parse != ')' )
4791 vFAIL("Unterminated verb pattern argument");
4792 if ( RExC_parse == start_arg )
4793 start_arg = NULL;
4794 } else {
4795 if ( *RExC_parse != ')' )
4796 vFAIL("Unterminated verb pattern");
4797 }
5d458dd8 4798
e2e6a0f1
YO
4799 switch ( *start_verb ) {
4800 case 'A': /* (*ACCEPT) */
4801 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4802 op = ACCEPT;
4803 internal_argval = RExC_nestroot;
4804 }
4805 break;
4806 case 'C': /* (*COMMIT) */
4807 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4808 op = COMMIT;
e2e6a0f1
YO
4809 break;
4810 case 'F': /* (*FAIL) */
4811 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4812 op = OPFAIL;
4813 argok = 0;
4814 }
4815 break;
5d458dd8
YO
4816 case ':': /* (*:NAME) */
4817 case 'M': /* (*MARK:NAME) */
4818 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4819 op = MARKPOINT;
5d458dd8
YO
4820 argok = -1;
4821 }
4822 break;
4823 case 'P': /* (*PRUNE) */
4824 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4825 op = PRUNE;
e2e6a0f1 4826 break;
5d458dd8
YO
4827 case 'S': /* (*SKIP) */
4828 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4829 op = SKIP;
4830 break;
4831 case 'T': /* (*THEN) */
4832 /* [19:06] <TimToady> :: is then */
4833 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4834 op = CUTGROUP;
4835 RExC_seen |= REG_SEEN_CUTGROUP;
4836 }
e2e6a0f1
YO
4837 break;
4838 }
4839 if ( ! op ) {
4840 RExC_parse++;
4841 vFAIL3("Unknown verb pattern '%.*s'",
4842 verb_len, start_verb);
4843 }
4844 if ( argok ) {
4845 if ( start_arg && internal_argval ) {
4846 vFAIL3("Verb pattern '%.*s' may not have an argument",
4847 verb_len, start_verb);
4848 } else if ( argok < 0 && !start_arg ) {
4849 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4850 verb_len, start_verb);
4851 } else {
4852 ret = reganode(pRExC_state, op, internal_argval);
4853 if ( ! internal_argval && ! SIZE_ONLY ) {
4854 if (start_arg) {
4855 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4856 ARG(ret) = add_data( pRExC_state, 1, "S" );
4857 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4858 ret->flags = 0;
4859 } else {
4860 ret->flags = 1;
4861 }
4862 }
4863 }
4864 if (!internal_argval)
4865 RExC_seen |= REG_SEEN_VERBARG;
4866 } else if ( start_arg ) {
4867 vFAIL3("Verb pattern '%.*s' may not have an argument",
4868 verb_len, start_verb);
4869 } else {
4870 ret = reg_node(pRExC_state, op);
4871 }
4872 nextchar(pRExC_state);
4873 return ret;
4874 } else
fac92740 4875 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4876 U32 posflags = 0, negflags = 0;
4877 U32 *flagsp = &posflags;
6136c704 4878 bool is_logical = 0;
a28509cc 4879 const char * const seqstart = RExC_parse;
ca9dfc88 4880
830247a4
IZ
4881 RExC_parse++;
4882 paren = *RExC_parse++;
c277df42 4883 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4884 switch (paren) {
894be9b7 4885
fac92740 4886 case '<': /* (?<...) */
b81d288d 4887 if (*RExC_parse == '!')
c277df42 4888 paren = ',';
0a4db386
YO
4889 else if (*RExC_parse != '=')
4890 { /* (?<...>) */
81714fb9 4891 char *name_start;
894be9b7 4892 SV *svname;
81714fb9
YO
4893 paren= '>';
4894 case '\'': /* (?'...') */
4895 name_start= RExC_parse;
0a4db386
YO
4896 svname = reg_scan_name(pRExC_state,
4897 SIZE_ONLY ? /* reverse test from the others */
4898 REG_RSN_RETURN_NAME :
4899 REG_RSN_RETURN_NULL);
81714fb9
YO
4900 if (RExC_parse == name_start)
4901 goto unknown;
4902 if (*RExC_parse != paren)
4903 vFAIL2("Sequence (?%c... not terminated",
4904 paren=='>' ? '<' : paren);
4905 if (SIZE_ONLY) {
e62cc96a
YO
4906 HE *he_str;
4907 SV *sv_dat = NULL;
894be9b7
YO
4908 if (!svname) /* shouldnt happen */
4909 Perl_croak(aTHX_
4910 "panic: reg_scan_name returned NULL");
81714fb9
YO
4911 if (!RExC_paren_names) {
4912 RExC_paren_names= newHV();
4913 sv_2mortal((SV*)RExC_paren_names);
4914 }
4915 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4916 if ( he_str )
81714fb9 4917 sv_dat = HeVAL(he_str);
e62cc96a 4918 if ( ! sv_dat ) {
81714fb9 4919 /* croak baby croak */
e62cc96a
YO
4920 Perl_croak(aTHX_
4921 "panic: paren_name hash element allocation failed");
4922 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4923 IV count=SvIV(sv_dat);
4924 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4925 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4926 pv[count]=RExC_npar;
4927 SvIVX(sv_dat)++;
4928 } else {
4929 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4930 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4931 SvIOK_on(sv_dat);
4932 SvIVX(sv_dat)= 1;
e62cc96a
YO
4933 }
4934
81714fb9
YO
4935 /*sv_dump(sv_dat);*/
4936 }
4937 nextchar(pRExC_state);
4938 paren = 1;
4939 goto capturing_parens;
4940 }
4941 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4942 RExC_parse++;
fac92740
MJD
4943 case '=': /* (?=...) */
4944 case '!': /* (?!...) */
830247a4 4945 RExC_seen_zerolen++;
e2e6a0f1
YO
4946 if (*RExC_parse == ')') {
4947 ret=reg_node(pRExC_state, OPFAIL);
4948 nextchar(pRExC_state);
4949 return ret;
4950 }
fac92740
MJD
4951 case ':': /* (?:...) */
4952 case '>': /* (?>...) */
a0d0e21e 4953 break;
fac92740
MJD
4954 case '$': /* (?$...) */
4955 case '@': /* (?@...) */
8615cb43 4956 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4957 break;
fac92740 4958 case '#': /* (?#...) */
830247a4
IZ
4959 while (*RExC_parse && *RExC_parse != ')')
4960 RExC_parse++;
4961 if (*RExC_parse != ')')
c277df42 4962 FAIL("Sequence (?#... not terminated");
830247a4 4963 nextchar(pRExC_state);
a0d0e21e
LW
4964 *flagp = TRYAGAIN;
4965 return NULL;
894be9b7
YO
4966 case '0' : /* (?0) */
4967 case 'R' : /* (?R) */
4968 if (*RExC_parse != ')')
6bda09f9 4969 FAIL("Sequence (?R) not terminated");
1a147d38 4970 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4971 nextchar(pRExC_state);
4972 return ret;
4973 /*notreached*/
894be9b7
YO
4974 { /* named and numeric backreferences */
4975 I32 num;
4976 char * parse_start;
4977 case '&': /* (?&NAME) */
4978 parse_start = RExC_parse - 1;
4979 {
0a4db386
YO
4980 SV *sv_dat = reg_scan_name(pRExC_state,
4981 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4982 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4983 }
4984 goto gen_recurse_regop;
4985 /* NOT REACHED */
542fa716
YO
4986 case '+':
4987 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4988 RExC_parse++;
4989 vFAIL("Illegal pattern");
4990 }
4991 goto parse_recursion;
4992 /* NOT REACHED*/
4993 case '-': /* (?-1) */
4994 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4995 RExC_parse--; /* rewind to let it be handled later */
4996 goto parse_flags;
4997 }
4998 /*FALLTHROUGH */
6bda09f9
YO
4999 case '1': case '2': case '3': case '4': /* (?1) */
5000 case '5': case '6': case '7': case '8': case '9':
5001 RExC_parse--;
542fa716 5002 parse_recursion:
894be9b7
YO
5003 num = atoi(RExC_parse);
5004 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5005 if (*RExC_parse == '-')
5006 RExC_parse++;
6bda09f9
YO
5007 while (isDIGIT(*RExC_parse))
5008 RExC_parse++;
5009 if (*RExC_parse!=')')
5010 vFAIL("Expecting close bracket");
894be9b7
YO
5011
5012 gen_recurse_regop:
542fa716
YO
5013 if ( paren == '-' ) {
5014 /*
5015 Diagram of capture buffer numbering.
5016 Top line is the normal capture buffer numbers
5017 Botton line is the negative indexing as from
5018 the X (the (?-2))
5019
5020 + 1 2 3 4 5 X 6 7
5021 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5022 - 5 4 3 2 1 X x x
5023
5024 */
5025 num = RExC_npar + num;
5026 if (num < 1) {
5027 RExC_parse++;
5028 vFAIL("Reference to nonexistent group");
5029 }
5030 } else if ( paren == '+' ) {
5031 num = RExC_npar + num - 1;
5032 }
5033
1a147d38 5034 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5035 if (!SIZE_ONLY) {
5036 if (num > (I32)RExC_rx->nparens) {
5037 RExC_parse++;
5038 vFAIL("Reference to nonexistent group");
5039 }
40d049e4 5040 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5041 RExC_emit++;
226de585 5042 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5043 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5044 } else {
6bda09f9 5045 RExC_size++;
6bda09f9 5046 }
0a4db386 5047 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5048 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5049 Set_Node_Offset(ret, parse_start); /* MJD */
5050
6bda09f9
YO
5051 nextchar(pRExC_state);
5052 return ret;
894be9b7
YO
5053 } /* named and numeric backreferences */
5054 /* NOT REACHED */
5055
fac92740 5056 case 'p': /* (?p...) */
9014280d 5057 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5058 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5059 /* FALL THROUGH*/
fac92740 5060 case '?': /* (??...) */
6136c704 5061 is_logical = 1;
438a3801
YST
5062 if (*RExC_parse != '{')
5063 goto unknown;
830247a4 5064 paren = *RExC_parse++;
0f5d15d6 5065 /* FALL THROUGH */
fac92740 5066 case '{': /* (?{...}) */
c277df42 5067 {
c277df42
IZ
5068 I32 count = 1, n = 0;
5069 char c;
830247a4 5070 char *s = RExC_parse;
c277df42 5071
830247a4
IZ
5072 RExC_seen_zerolen++;
5073 RExC_seen |= REG_SEEN_EVAL;
5074 while (count && (c = *RExC_parse)) {
6136c704
AL
5075 if (c == '\\') {
5076 if (RExC_parse[1])
5077 RExC_parse++;
5078 }
b81d288d 5079 else if (c == '{')
c277df42 5080 count++;
b81d288d 5081 else if (c == '}')
c277df42 5082 count--;
830247a4 5083 RExC_parse++;
c277df42 5084 }
6136c704 5085 if (*RExC_parse != ')') {
b81d288d 5086 RExC_parse = s;
b45f050a
JF
5087 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5088 }
c277df42 5089 if (!SIZE_ONLY) {
f3548bdc 5090 PAD *pad;
6136c704
AL
5091 OP_4tree *sop, *rop;
5092 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5093
569233ed
SB
5094 ENTER;
5095 Perl_save_re_context(aTHX);
f3548bdc 5096 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5097 sop->op_private |= OPpREFCOUNTED;
5098 /* re_dup will OpREFCNT_inc */
5099 OpREFCNT_set(sop, 1);
569233ed 5100 LEAVE;
c277df42 5101
830247a4
IZ
5102 n = add_data(pRExC_state, 3, "nop");
5103 RExC_rx->data->data[n] = (void*)rop;
5104 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 5105 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 5106 SvREFCNT_dec(sv);
a0ed51b3 5107 }
e24b16f9 5108 else { /* First pass */
830247a4 5109 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5110 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5111 /* No compiled RE interpolated, has runtime
5112 components ===> unsafe. */
5113 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5114 if (PL_tainting && PL_tainted)
cc6b7395 5115 FAIL("Eval-group in insecure regular expression");
54df2634 5116#if PERL_VERSION > 8
923e4eb5 5117 if (IN_PERL_COMPILETIME)
b5c19bd7 5118 PL_cv_has_eval = 1;
54df2634 5119#endif
c277df42 5120 }
b5c19bd7 5121
830247a4 5122 nextchar(pRExC_state);
6136c704 5123 if (is_logical) {
830247a4 5124 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5125 if (!SIZE_ONLY)
5126 ret->flags = 2;
3dab1dad 5127 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5128 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5129 return ret;
5130 }
ccb2c380
MP
5131 ret = reganode(pRExC_state, EVAL, n);
5132 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5133 Set_Node_Offset(ret, parse_start);
5134 return ret;
c277df42 5135 }
fac92740 5136 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5137 {
0a4db386 5138 int is_define= 0;
fac92740 5139 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5140 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5141 || RExC_parse[1] == '<'
830247a4 5142 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5143 I32 flag;
5144
830247a4 5145 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5146 if (!SIZE_ONLY)
5147 ret->flags = 1;
3dab1dad 5148 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5149 goto insert_if;
b81d288d 5150 }
a0ed51b3 5151 }
0a4db386
YO
5152 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5153 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5154 {
5155 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5156 char *name_start= RExC_parse++;
5157 I32 num = 0;
5158 SV *sv_dat=reg_scan_name(pRExC_state,
5159 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5160 if (RExC_parse == name_start || *RExC_parse != ch)
5161 vFAIL2("Sequence (?(%c... not terminated",
5162 (ch == '>' ? '<' : ch));
5163 RExC_parse++;
5164 if (!SIZE_ONLY) {
5165 num = add_data( pRExC_state, 1, "S" );
5166 RExC_rx->data->data[num]=(void*)sv_dat;
5167 SvREFCNT_inc(sv_dat);
5168 }
5169 ret = reganode(pRExC_state,NGROUPP,num);
5170 goto insert_if_check_paren;
5171 }
5172 else if (RExC_parse[0] == 'D' &&
5173 RExC_parse[1] == 'E' &&
5174 RExC_parse[2] == 'F' &&
5175 RExC_parse[3] == 'I' &&
5176 RExC_parse[4] == 'N' &&
5177 RExC_parse[5] == 'E')
5178 {
5179 ret = reganode(pRExC_state,DEFINEP,0);
5180 RExC_parse +=6 ;
5181 is_define = 1;
5182 goto insert_if_check_paren;
5183 }
5184 else if (RExC_parse[0] == 'R') {
5185 RExC_parse++;
5186 parno = 0;
5187 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5188 parno = atoi(RExC_parse++);
5189 while (isDIGIT(*RExC_parse))
5190 RExC_parse++;
5191 } else if (RExC_parse[0] == '&') {
5192 SV *sv_dat;
5193 RExC_parse++;
5194 sv_dat = reg_scan_name(pRExC_state,
5195 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5196 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5197 }
1a147d38 5198 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5199 goto insert_if_check_paren;
5200 }
830247a4 5201 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5202 /* (?(1)...) */
6136c704 5203 char c;
830247a4 5204 parno = atoi(RExC_parse++);
c277df42 5205
830247a4
IZ
5206 while (isDIGIT(*RExC_parse))
5207 RExC_parse++;
fac92740 5208 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5209
0a4db386 5210 insert_if_check_paren:
830247a4 5211 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5212 vFAIL("Switch condition not recognized");
c277df42 5213 insert_if:
3dab1dad
YO
5214 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5215 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5216 if (br == NULL)
830247a4 5217 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5218 else
3dab1dad 5219 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5220 c = *nextchar(pRExC_state);
d1b80229
IZ
5221 if (flags&HASWIDTH)
5222 *flagp |= HASWIDTH;
c277df42 5223 if (c == '|') {
0a4db386
YO
5224 if (is_define)
5225 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5226 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5227 regbranch(pRExC_state, &flags, 1,depth+1);
5228 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5229 if (flags&HASWIDTH)
5230 *flagp |= HASWIDTH;
830247a4 5231 c = *nextchar(pRExC_state);
a0ed51b3
LW
5232 }
5233 else
c277df42
IZ
5234 lastbr = NULL;
5235 if (c != ')')
8615cb43 5236 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5237 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5238 REGTAIL(pRExC_state, br, ender);
c277df42 5239 if (lastbr) {
3dab1dad
YO
5240 REGTAIL(pRExC_state, lastbr, ender);
5241 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5242 }
5243 else
3dab1dad 5244 REGTAIL(pRExC_state, ret, ender);
c277df42 5245 return ret;
a0ed51b3
LW
5246 }
5247 else {
830247a4 5248 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5249 }
5250 }
1b1626e4 5251 case 0:
830247a4 5252 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5253 vFAIL("Sequence (? incomplete");
1b1626e4 5254 break;
a0d0e21e 5255 default:
830247a4 5256 --RExC_parse;
fac92740 5257 parse_flags: /* (?i) */
830247a4 5258 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5259 /* (?g), (?gc) and (?o) are useless here
5260 and must be globally applied -- japhy */
5261
5262 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5263 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5264 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5265 if (! (wastedflags & wflagbit) ) {
5266 wastedflags |= wflagbit;
5267 vWARN5(
5268 RExC_parse + 1,
5269 "Useless (%s%c) - %suse /%c modifier",
5270 flagsp == &negflags ? "?-" : "?",
5271 *RExC_parse,
5272 flagsp == &negflags ? "don't " : "",
5273 *RExC_parse
5274 );
5275 }
5276 }
5277 }
5278 else if (*RExC_parse == 'c') {
5279 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5280 if (! (wastedflags & WASTED_C) ) {
5281 wastedflags |= WASTED_GC;
9d1d55b5
JP
5282 vWARN3(
5283 RExC_parse + 1,
5284 "Useless (%sc) - %suse /gc modifier",
5285 flagsp == &negflags ? "?-" : "?",
5286 flagsp == &negflags ? "don't " : ""
5287 );
5288 }
5289 }
5290 }
5291 else { pmflag(flagsp, *RExC_parse); }
5292
830247a4 5293 ++RExC_parse;
ca9dfc88 5294 }
830247a4 5295 if (*RExC_parse == '-') {
ca9dfc88 5296 flagsp = &negflags;
9d1d55b5 5297 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5298 ++RExC_parse;
ca9dfc88 5299 goto parse_flags;
48c036b1 5300 }
e2509266
JH
5301 RExC_flags |= posflags;
5302 RExC_flags &= ~negflags;
830247a4
IZ
5303 if (*RExC_parse == ':') {
5304 RExC_parse++;
ca9dfc88
IZ
5305 paren = ':';
5306 break;
5307 }
c277df42 5308 unknown:
830247a4
IZ
5309 if (*RExC_parse != ')') {
5310 RExC_parse++;
5311 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5312 }
830247a4 5313 nextchar(pRExC_state);
a0d0e21e
LW
5314 *flagp = TRYAGAIN;
5315 return NULL;
5316 }
5317 }
fac92740 5318 else { /* (...) */
81714fb9 5319 capturing_parens:
830247a4
IZ
5320 parno = RExC_npar;
5321 RExC_npar++;
e2e6a0f1 5322
830247a4 5323 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5324 if (!SIZE_ONLY ){
5325 if (!RExC_nestroot)
5326 RExC_nestroot = parno;
5327 if (RExC_seen & REG_SEEN_RECURSE) {
5328 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5329 "Setting open paren #%"IVdf" to %d\n",
5330 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5331 RExC_open_parens[parno-1]= ret;
5332 }
6bda09f9 5333 }
fac92740
MJD
5334 Set_Node_Length(ret, 1); /* MJD */
5335 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5336 is_open = 1;
a0d0e21e 5337 }
a0ed51b3 5338 }
fac92740 5339 else /* ! paren */
a0d0e21e
LW
5340 ret = NULL;
5341
5342 /* Pick up the branches, linking them together. */
fac92740 5343 parse_start = RExC_parse; /* MJD */
3dab1dad 5344 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5345 /* branch_len = (paren != 0); */
2af232bd 5346
a0d0e21e
LW
5347 if (br == NULL)
5348 return(NULL);
830247a4
IZ
5349 if (*RExC_parse == '|') {
5350 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5351 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5352 }
fac92740 5353 else { /* MJD */
6bda09f9 5354 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5355 Set_Node_Length(br, paren != 0);
5356 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5357 }
c277df42
IZ
5358 have_branch = 1;
5359 if (SIZE_ONLY)
830247a4 5360 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5361 }
5362 else if (paren == ':') {
c277df42
IZ
5363 *flagp |= flags&SIMPLE;
5364 }
6136c704 5365 if (is_open) { /* Starts with OPEN. */
3dab1dad 5366 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5367 }
5368 else if (paren != '?') /* Not Conditional */
a0d0e21e 5369 ret = br;
32a0ca98 5370 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5371 lastbr = br;
830247a4
IZ
5372 while (*RExC_parse == '|') {
5373 if (!SIZE_ONLY && RExC_extralen) {
5374 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5375 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5376 }
5377 if (SIZE_ONLY)
830247a4
IZ
5378 RExC_extralen += 2; /* Account for LONGJMP. */
5379 nextchar(pRExC_state);
3dab1dad 5380 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5381
a687059c 5382 if (br == NULL)
a0d0e21e 5383 return(NULL);
3dab1dad 5384 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5385 lastbr = br;
821b33a5
IZ
5386 if (flags&HASWIDTH)
5387 *flagp |= HASWIDTH;
a687059c 5388 *flagp |= flags&SPSTART;
a0d0e21e
LW
5389 }
5390
c277df42
IZ
5391 if (have_branch || paren != ':') {
5392 /* Make a closing node, and hook it on the end. */
5393 switch (paren) {
5394 case ':':
830247a4 5395 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5396 break;
5397 case 1:
c74340f9 5398 RExC_cpar++;
830247a4 5399 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5400 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5401 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5402 "Setting close paren #%"IVdf" to %d\n",
5403 (IV)parno, REG_NODE_NUM(ender)));
5404 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5405 if (RExC_nestroot == parno)
5406 RExC_nestroot = 0;
40d049e4 5407 }
fac92740
MJD
5408 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5409 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5410 break;
5411 case '<':
c277df42
IZ
5412 case ',':
5413 case '=':
5414 case '!':
c277df42 5415 *flagp &= ~HASWIDTH;
821b33a5
IZ
5416 /* FALL THROUGH */
5417 case '>':
830247a4 5418 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5419 break;
5420 case 0:
830247a4 5421 ender = reg_node(pRExC_state, END);
40d049e4
YO
5422 if (!SIZE_ONLY) {
5423 assert(!RExC_opend); /* there can only be one! */
5424 RExC_opend = ender;
5425 }
c277df42
IZ
5426 break;
5427 }
eaf3ca90 5428 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5429
9674d46a 5430 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5431 if (depth==1)
5432 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5433
c277df42 5434 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5435 for (br = ret; br; br = regnext(br)) {
5436 const U8 op = PL_regkind[OP(br)];
5437 if (op == BRANCH) {
07be1b83 5438 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5439 }
5440 else if (op == BRANCHJ) {
07be1b83 5441 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5442 }
c277df42
IZ
5443 }
5444 }
a0d0e21e 5445 }
c277df42
IZ
5446
5447 {
e1ec3a88
AL
5448 const char *p;
5449 static const char parens[] = "=!<,>";
c277df42
IZ
5450
5451 if (paren && (p = strchr(parens, paren))) {
eb160463 5452 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5453 int flag = (p - parens) > 1;
5454
5455 if (paren == '>')
5456 node = SUSPEND, flag = 0;
6bda09f9 5457 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5458 Set_Node_Cur_Length(ret);
5459 Set_Node_Offset(ret, parse_start + 1);
c277df42 5460 ret->flags = flag;
07be1b83 5461 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5462 }
a0d0e21e
LW
5463 }
5464
5465 /* Check for proper termination. */
ce3e6498 5466 if (paren) {
e2509266 5467 RExC_flags = oregflags;
830247a4
IZ
5468 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5469 RExC_parse = oregcomp_parse;
380a0633 5470 vFAIL("Unmatched (");
ce3e6498 5471 }
a0ed51b3 5472 }
830247a4
IZ
5473 else if (!paren && RExC_parse < RExC_end) {
5474 if (*RExC_parse == ')') {
5475 RExC_parse++;
380a0633 5476 vFAIL("Unmatched )");
a0ed51b3
LW
5477 }
5478 else
b45f050a 5479 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5480 /* NOTREACHED */
5481 }
a687059c 5482
a0d0e21e 5483 return(ret);
a687059c
LW
5484}
5485
5486/*
5487 - regbranch - one alternative of an | operator
5488 *
5489 * Implements the concatenation operator.
5490 */
76e3520e 5491STATIC regnode *
3dab1dad 5492S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5493{
97aff369 5494 dVAR;
c277df42
IZ
5495 register regnode *ret;
5496 register regnode *chain = NULL;
5497 register regnode *latest;
5498 I32 flags = 0, c = 0;
3dab1dad
YO
5499 GET_RE_DEBUG_FLAGS_DECL;
5500 DEBUG_PARSE("brnc");
b81d288d 5501 if (first)
c277df42
IZ
5502 ret = NULL;
5503 else {
b81d288d 5504 if (!SIZE_ONLY && RExC_extralen)
830247a4 5505 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5506 else {
830247a4 5507 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5508 Set_Node_Length(ret, 1);
5509 }
c277df42
IZ
5510 }
5511
b81d288d 5512 if (!first && SIZE_ONLY)
830247a4 5513 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5514
c277df42 5515 *flagp = WORST; /* Tentatively. */
a0d0e21e 5516
830247a4
IZ
5517 RExC_parse--;
5518 nextchar(pRExC_state);
5519 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5520 flags &= ~TRYAGAIN;
3dab1dad 5521 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5522 if (latest == NULL) {
5523 if (flags & TRYAGAIN)
5524 continue;
5525 return(NULL);
a0ed51b3
LW
5526 }
5527 else if (ret == NULL)
c277df42 5528 ret = latest;
a0d0e21e 5529 *flagp |= flags&HASWIDTH;
c277df42 5530 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5531 *flagp |= flags&SPSTART;
5532 else {
830247a4 5533 RExC_naughty++;
3dab1dad 5534 REGTAIL(pRExC_state, chain, latest);
a687059c 5535 }
a0d0e21e 5536 chain = latest;
c277df42
IZ
5537 c++;
5538 }
5539 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5540 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5541 if (ret == NULL)
5542 ret = chain;
5543 }
5544 if (c == 1) {
5545 *flagp |= flags&SIMPLE;
a0d0e21e 5546 }
a687059c 5547
d4c19fe8 5548 return ret;
a687059c
LW
5549}
5550
5551/*
5552 - regpiece - something followed by possible [*+?]
5553 *
5554 * Note that the branching code sequences used for ? and the general cases
5555 * of * and + are somewhat optimized: they use the same NOTHING node as
5556 * both the endmarker for their branch list and the body of the last branch.
5557 * It might seem that this node could be dispensed with entirely, but the
5558 * endmarker role is not redundant.
5559 */
76e3520e 5560STATIC regnode *
3dab1dad 5561S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5562{
97aff369 5563 dVAR;
c277df42 5564 register regnode *ret;
a0d0e21e
LW
5565 register char op;
5566 register char *next;
5567 I32 flags;
1df70142 5568 const char * const origparse = RExC_parse;
a0d0e21e 5569 I32 min;
c277df42 5570 I32 max = REG_INFTY;
fac92740 5571 char *parse_start;
10edeb5d 5572 const char *maxpos = NULL;
3dab1dad
YO
5573 GET_RE_DEBUG_FLAGS_DECL;
5574 DEBUG_PARSE("piec");
a0d0e21e 5575
3dab1dad 5576 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5577 if (ret == NULL) {
5578 if (flags & TRYAGAIN)
5579 *flagp |= TRYAGAIN;
5580 return(NULL);
5581 }
5582
830247a4 5583 op = *RExC_parse;
a0d0e21e 5584
830247a4 5585 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5586 maxpos = NULL;
fac92740 5587 parse_start = RExC_parse; /* MJD */
830247a4 5588 next = RExC_parse + 1;
a0d0e21e
LW
5589 while (isDIGIT(*next) || *next == ',') {
5590 if (*next == ',') {
5591 if (maxpos)
5592 break;
5593 else
5594 maxpos = next;
a687059c 5595 }
a0d0e21e
LW
5596 next++;
5597 }
5598 if (*next == '}') { /* got one */
5599 if (!maxpos)
5600 maxpos = next;
830247a4
IZ
5601 RExC_parse++;
5602 min = atoi(RExC_parse);
a0d0e21e
LW
5603 if (*maxpos == ',')
5604 maxpos++;
5605 else
830247a4 5606 maxpos = RExC_parse;
a0d0e21e
LW
5607 max = atoi(maxpos);
5608 if (!max && *maxpos != '0')
c277df42
IZ
5609 max = REG_INFTY; /* meaning "infinity" */
5610 else if (max >= REG_INFTY)
8615cb43 5611 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5612 RExC_parse = next;
5613 nextchar(pRExC_state);
a0d0e21e
LW
5614
5615 do_curly:
5616 if ((flags&SIMPLE)) {
830247a4 5617 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5618 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5619 Set_Node_Offset(ret, parse_start+1); /* MJD */
5620 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5621 }
5622 else {
3dab1dad 5623 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5624
5625 w->flags = 0;
3dab1dad 5626 REGTAIL(pRExC_state, ret, w);
830247a4 5627 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5628 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5629 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5630 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5631 }
6bda09f9 5632 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5633 /* MJD hk */
5634 Set_Node_Offset(ret, parse_start+1);
2af232bd 5635 Set_Node_Length(ret,
fac92740 5636 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5637
830247a4 5638 if (!SIZE_ONLY && RExC_extralen)
c277df42 5639 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5640 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5641 if (SIZE_ONLY)
830247a4
IZ
5642 RExC_whilem_seen++, RExC_extralen += 3;
5643 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5644 }
c277df42 5645 ret->flags = 0;
a0d0e21e
LW
5646
5647 if (min > 0)
821b33a5
IZ
5648 *flagp = WORST;
5649 if (max > 0)
5650 *flagp |= HASWIDTH;
a0d0e21e 5651 if (max && max < min)
8615cb43 5652 vFAIL("Can't do {n,m} with n > m");
c277df42 5653 if (!SIZE_ONLY) {
eb160463
GS
5654 ARG1_SET(ret, (U16)min);
5655 ARG2_SET(ret, (U16)max);
a687059c 5656 }
a687059c 5657
a0d0e21e 5658 goto nest_check;
a687059c 5659 }
a0d0e21e 5660 }
a687059c 5661
a0d0e21e
LW
5662 if (!ISMULT1(op)) {
5663 *flagp = flags;
a687059c 5664 return(ret);
a0d0e21e 5665 }
bb20fd44 5666
c277df42 5667#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5668
5669 /* if this is reinstated, don't forget to put this back into perldiag:
5670
5671 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5672
5673 (F) The part of the regexp subject to either the * or + quantifier
5674 could match an empty string. The {#} shows in the regular
5675 expression about where the problem was discovered.
5676
5677 */
5678
bb20fd44 5679 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5680 vFAIL("Regexp *+ operand could be empty");
b81d288d 5681#endif
bb20fd44 5682
fac92740 5683 parse_start = RExC_parse;
830247a4 5684 nextchar(pRExC_state);
a0d0e21e 5685
821b33a5 5686 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5687
5688 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5689 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5690 ret->flags = 0;
830247a4 5691 RExC_naughty += 4;
a0d0e21e
LW
5692 }
5693 else if (op == '*') {
5694 min = 0;
5695 goto do_curly;
a0ed51b3
LW
5696 }
5697 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5698 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5699 ret->flags = 0;
830247a4 5700 RExC_naughty += 3;
a0d0e21e
LW
5701 }
5702 else if (op == '+') {
5703 min = 1;
5704 goto do_curly;
a0ed51b3
LW
5705 }
5706 else if (op == '?') {
a0d0e21e
LW
5707 min = 0; max = 1;
5708 goto do_curly;
5709 }
5710 nest_check:
041457d9 5711 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5712 vWARN3(RExC_parse,
b45f050a 5713 "%.*s matches null string many times",
afd78fd5 5714 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5715 origparse);
a0d0e21e
LW
5716 }
5717
b9b4dddf 5718 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5719 nextchar(pRExC_state);
6bda09f9 5720 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5721 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5722 }
b9b4dddf
YO
5723#ifndef REG_ALLOW_MINMOD_SUSPEND
5724 else
5725#endif
5726 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5727 regnode *ender;
5728 nextchar(pRExC_state);
5729 ender = reg_node(pRExC_state, SUCCEED);
5730 REGTAIL(pRExC_state, ret, ender);
5731 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5732 ret->flags = 0;
5733 ender = reg_node(pRExC_state, TAIL);
5734 REGTAIL(pRExC_state, ret, ender);
5735 /*ret= ender;*/
5736 }
5737
5738 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5739 RExC_parse++;
b45f050a
JF
5740 vFAIL("Nested quantifiers");
5741 }
a0d0e21e
LW
5742
5743 return(ret);
a687059c
LW
5744}
5745
fc8cd66c
YO
5746
5747/* reg_namedseq(pRExC_state,UVp)
5748
5749 This is expected to be called by a parser routine that has
5750 recognized'\N' and needs to handle the rest. RExC_parse is
5751 expected to point at the first char following the N at the time
5752 of the call.
5753
5754 If valuep is non-null then it is assumed that we are parsing inside
5755 of a charclass definition and the first codepoint in the resolved
5756 string is returned via *valuep and the routine will return NULL.
5757 In this mode if a multichar string is returned from the charnames
5758 handler a warning will be issued, and only the first char in the
5759 sequence will be examined. If the string returned is zero length
5760 then the value of *valuep is undefined and NON-NULL will
5761 be returned to indicate failure. (This will NOT be a valid pointer
5762 to a regnode.)
5763
5764 If value is null then it is assumed that we are parsing normal text
5765 and inserts a new EXACT node into the program containing the resolved
5766 string and returns a pointer to the new node. If the string is
5767 zerolength a NOTHING node is emitted.
5768
5769 On success RExC_parse is set to the char following the endbrace.
5770 Parsing failures will generate a fatal errorvia vFAIL(...)
5771
5772 NOTE: We cache all results from the charnames handler locally in
5773 the RExC_charnames hash (created on first use) to prevent a charnames
5774 handler from playing silly-buggers and returning a short string and
5775 then a long string for a given pattern. Since the regexp program
5776 size is calculated during an initial parse this would result
5777 in a buffer overrun so we cache to prevent the charname result from
5778 changing during the course of the parse.
5779
5780 */
5781STATIC regnode *
5782S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5783{
5784 char * name; /* start of the content of the name */
5785 char * endbrace; /* endbrace following the name */
5786 SV *sv_str = NULL;
5787 SV *sv_name = NULL;
5788 STRLEN len; /* this has various purposes throughout the code */
5789 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5790 regnode *ret = NULL;
5791
5792 if (*RExC_parse != '{') {
5793 vFAIL("Missing braces on \\N{}");
5794 }
5795 name = RExC_parse+1;
5796 endbrace = strchr(RExC_parse, '}');
5797 if ( ! endbrace ) {
5798 RExC_parse++;
5799 vFAIL("Missing right brace on \\N{}");
5800 }
5801 RExC_parse = endbrace + 1;
5802
5803
5804 /* RExC_parse points at the beginning brace,
5805 endbrace points at the last */
5806 if ( name[0]=='U' && name[1]=='+' ) {
5807 /* its a "unicode hex" notation {U+89AB} */
5808 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5809 | PERL_SCAN_DISALLOW_PREFIX
5810 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5811 UV cp;
196f1508 5812 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5813 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5814 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5815 cp = 0xFFFD;
5816 }
5817 if (cp > 0xff)
5818 RExC_utf8 = 1;
5819 if ( valuep ) {
5820 *valuep = cp;
5821 return NULL;
5822 }
5823 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5824 } else {
5825 /* fetch the charnames handler for this scope */
5826 HV * const table = GvHV(PL_hintgv);
5827 SV **cvp= table ?
5828 hv_fetchs(table, "charnames", FALSE) :
5829 NULL;
5830 SV *cv= cvp ? *cvp : NULL;
5831 HE *he_str;
5832 int count;
5833 /* create an SV with the name as argument */
5834 sv_name = newSVpvn(name, endbrace - name);
5835
5836 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5837 vFAIL2("Constant(\\N{%s}) unknown: "
5838 "(possibly a missing \"use charnames ...\")",
5839 SvPVX(sv_name));
5840 }
5841 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5842 vFAIL2("Constant(\\N{%s}): "
5843 "$^H{charnames} is not defined",SvPVX(sv_name));
5844 }
5845
5846
5847
5848 if (!RExC_charnames) {
5849 /* make sure our cache is allocated */
5850 RExC_charnames = newHV();
6bda09f9 5851 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5852 }
5853 /* see if we have looked this one up before */
5854 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5855 if ( he_str ) {
5856 sv_str = HeVAL(he_str);
5857 cached = 1;
5858 } else {
5859 dSP ;
5860
5861 ENTER ;
5862 SAVETMPS ;
5863 PUSHMARK(SP) ;
5864
5865 XPUSHs(sv_name);
5866
5867 PUTBACK ;
5868
5869 count= call_sv(cv, G_SCALAR);
5870
5871 if (count == 1) { /* XXXX is this right? dmq */
5872 sv_str = POPs;
5873 SvREFCNT_inc_simple_void(sv_str);
5874 }
5875
5876 SPAGAIN ;
5877 PUTBACK ;
5878 FREETMPS ;
5879 LEAVE ;
5880
5881 if ( !sv_str || !SvOK(sv_str) ) {
5882 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5883 "did not return a defined value",SvPVX(sv_name));
5884 }
5885 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5886 cached = 1;
5887 }
5888 }
5889 if (valuep) {
5890 char *p = SvPV(sv_str, len);
5891 if (len) {
5892 STRLEN numlen = 1;
5893 if ( SvUTF8(sv_str) ) {
196f1508 5894 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5895 if (*valuep > 0x7F)
5896 RExC_utf8 = 1;
5897 /* XXXX
5898 We have to turn on utf8 for high bit chars otherwise
5899 we get failures with
5900
5901 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5902 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5903
5904 This is different from what \x{} would do with the same
5905 codepoint, where the condition is > 0xFF.
5906 - dmq
5907 */
5908
5909
5910 } else {
5911 *valuep = (UV)*p;
5912 /* warn if we havent used the whole string? */
5913 }
5914 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5915 vWARN2(RExC_parse,
5916 "Ignoring excess chars from \\N{%s} in character class",
5917 SvPVX(sv_name)
5918 );
5919 }
5920 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5921 vWARN2(RExC_parse,
5922 "Ignoring zero length \\N{%s} in character class",
5923 SvPVX(sv_name)
5924 );
5925 }
5926 if (sv_name)
5927 SvREFCNT_dec(sv_name);
5928 if (!cached)
5929 SvREFCNT_dec(sv_str);
5930 return len ? NULL : (regnode *)&len;
5931 } else if(SvCUR(sv_str)) {
5932
5933 char *s;
5934 char *p, *pend;
5935 STRLEN charlen = 1;
5936 char * parse_start = name-3; /* needed for the offsets */
5937 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5938
5939 ret = reg_node(pRExC_state,
5940 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5941 s= STRING(ret);
5942
5943 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5944 sv_utf8_upgrade(sv_str);
5945 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5946 RExC_utf8= 1;
5947 }
5948
5949 p = SvPV(sv_str, len);
5950 pend = p + len;
5951 /* len is the length written, charlen is the size the char read */
5952 for ( len = 0; p < pend; p += charlen ) {
5953 if (UTF) {
196f1508 5954 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5955 if (FOLD) {
5956 STRLEN foldlen,numlen;
5957 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5958 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5959 /* Emit all the Unicode characters. */
5960
5961 for (foldbuf = tmpbuf;
5962 foldlen;
5963 foldlen -= numlen)
5964 {
5965 uvc = utf8_to_uvchr(foldbuf, &numlen);
5966 if (numlen > 0) {
5967 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5968 s += unilen;
5969 len += unilen;
5970 /* In EBCDIC the numlen
5971 * and unilen can differ. */
5972 foldbuf += numlen;
5973 if (numlen >= foldlen)
5974 break;
5975 }
5976 else
5977 break; /* "Can't happen." */
5978 }
5979 } else {
5980 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5981 if (unilen > 0) {
5982 s += unilen;
5983 len += unilen;
5984 }
5985 }
5986 } else {
5987 len++;
5988 REGC(*p, s++);
5989 }
5990 }
5991 if (SIZE_ONLY) {
5992 RExC_size += STR_SZ(len);
5993 } else {
5994 STR_LEN(ret) = len;
5995 RExC_emit += STR_SZ(len);
5996 }
5997 Set_Node_Cur_Length(ret); /* MJD */
5998 RExC_parse--;
5999 nextchar(pRExC_state);
6000 } else {
6001 ret = reg_node(pRExC_state,NOTHING);
6002 }
6003 if (!cached) {
6004 SvREFCNT_dec(sv_str);
6005 }
6006 if (sv_name) {
6007 SvREFCNT_dec(sv_name);
6008 }
6009 return ret;
6010
6011}
6012
6013
9e08bc66
TS
6014/*
6015 * reg_recode
6016 *
6017 * It returns the code point in utf8 for the value in *encp.
6018 * value: a code value in the source encoding
6019 * encp: a pointer to an Encode object
6020 *
6021 * If the result from Encode is not a single character,
6022 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6023 */
6024STATIC UV
6025S_reg_recode(pTHX_ const char value, SV **encp)
6026{
6027 STRLEN numlen = 1;
6028 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6029 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6030 : SvPVX(sv);
6031 const STRLEN newlen = SvCUR(sv);
6032 UV uv = UNICODE_REPLACEMENT;
6033
6034 if (newlen)
6035 uv = SvUTF8(sv)
6036 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6037 : *(U8*)s;
6038
6039 if (!newlen || numlen != newlen) {
6040 uv = UNICODE_REPLACEMENT;
6041 if (encp)
6042 *encp = NULL;
6043 }
6044 return uv;
6045}
6046
fc8cd66c 6047
a687059c
LW
6048/*
6049 - regatom - the lowest level
6050 *
6051 * Optimization: gobbles an entire sequence of ordinary characters so that
6052 * it can turn them into a single node, which is smaller to store and
6053 * faster to run. Backslashed characters are exceptions, each becoming a
6054 * separate node; the code is simpler that way and it's not worth fixing.
6055 *
7f6f358c
YO
6056 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6057 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6058 */
76e3520e 6059STATIC regnode *
3dab1dad 6060S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6061{
97aff369 6062 dVAR;
cbbf8932 6063 register regnode *ret = NULL;
a0d0e21e 6064 I32 flags;
45948336 6065 char *parse_start = RExC_parse;
3dab1dad
YO
6066 GET_RE_DEBUG_FLAGS_DECL;
6067 DEBUG_PARSE("atom");
a0d0e21e
LW
6068 *flagp = WORST; /* Tentatively. */
6069
6070tryagain:
830247a4 6071 switch (*RExC_parse) {
a0d0e21e 6072 case '^':
830247a4
IZ
6073 RExC_seen_zerolen++;
6074 nextchar(pRExC_state);
bbe252da 6075 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6076 ret = reg_node(pRExC_state, MBOL);
bbe252da 6077 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6078 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6079 else
830247a4 6080 ret = reg_node(pRExC_state, BOL);
fac92740 6081 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6082 break;
6083 case '$':
830247a4 6084 nextchar(pRExC_state);
b81d288d 6085 if (*RExC_parse)
830247a4 6086 RExC_seen_zerolen++;
bbe252da 6087 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6088 ret = reg_node(pRExC_state, MEOL);
bbe252da 6089 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6090 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6091 else
830247a4 6092 ret = reg_node(pRExC_state, EOL);
fac92740 6093 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6094 break;
6095 case '.':
830247a4 6096 nextchar(pRExC_state);
bbe252da 6097 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6098 ret = reg_node(pRExC_state, SANY);
6099 else
6100 ret = reg_node(pRExC_state, REG_ANY);
6101 *flagp |= HASWIDTH|SIMPLE;
830247a4 6102 RExC_naughty++;
fac92740 6103 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6104 break;
6105 case '[':
b45f050a 6106 {
3dab1dad
YO
6107 char * const oregcomp_parse = ++RExC_parse;
6108 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6109 if (*RExC_parse != ']') {
6110 RExC_parse = oregcomp_parse;
b45f050a
JF
6111 vFAIL("Unmatched [");
6112 }
830247a4 6113 nextchar(pRExC_state);
a0d0e21e 6114 *flagp |= HASWIDTH|SIMPLE;
fac92740 6115 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6116 break;
b45f050a 6117 }
a0d0e21e 6118 case '(':
830247a4 6119 nextchar(pRExC_state);
3dab1dad 6120 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6121 if (ret == NULL) {
bf93d4cc 6122 if (flags & TRYAGAIN) {
830247a4 6123 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6124 /* Make parent create an empty node if needed. */
6125 *flagp |= TRYAGAIN;
6126 return(NULL);
6127 }
a0d0e21e 6128 goto tryagain;
bf93d4cc 6129 }
a0d0e21e
LW
6130 return(NULL);
6131 }
c277df42 6132 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6133 break;
6134 case '|':
6135 case ')':
6136 if (flags & TRYAGAIN) {
6137 *flagp |= TRYAGAIN;
6138 return NULL;
6139 }
b45f050a 6140 vFAIL("Internal urp");
a0d0e21e
LW
6141 /* Supposed to be caught earlier. */
6142 break;
85afd4ae 6143 case '{':
830247a4
IZ
6144 if (!regcurly(RExC_parse)) {
6145 RExC_parse++;
85afd4ae
CS
6146 goto defchar;
6147 }
6148 /* FALL THROUGH */
a0d0e21e
LW
6149 case '?':
6150 case '+':
6151 case '*':
830247a4 6152 RExC_parse++;
b45f050a 6153 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6154 break;
6155 case '\\':
830247a4 6156 switch (*++RExC_parse) {
a0d0e21e 6157 case 'A':
830247a4
IZ
6158 RExC_seen_zerolen++;
6159 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6160 *flagp |= SIMPLE;
830247a4 6161 nextchar(pRExC_state);
fac92740 6162 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6163 break;
6164 case 'G':
830247a4
IZ
6165 ret = reg_node(pRExC_state, GPOS);
6166 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6167 *flagp |= SIMPLE;
830247a4 6168 nextchar(pRExC_state);
fac92740 6169 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6170 break;
6171 case 'Z':
830247a4 6172 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6173 *flagp |= SIMPLE;
a1917ab9 6174 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6175 nextchar(pRExC_state);
a0d0e21e 6176 break;
b85d18e9 6177 case 'z':
830247a4 6178 ret = reg_node(pRExC_state, EOS);
b85d18e9 6179 *flagp |= SIMPLE;
830247a4
IZ
6180 RExC_seen_zerolen++; /* Do not optimize RE away */
6181 nextchar(pRExC_state);
fac92740 6182 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6183 break;
4a2d328f 6184 case 'C':
f33976b4
DB
6185 ret = reg_node(pRExC_state, CANY);
6186 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6187 *flagp |= HASWIDTH|SIMPLE;
830247a4 6188 nextchar(pRExC_state);
fac92740 6189 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6190 break;
6191 case 'X':
830247a4 6192 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6193 *flagp |= HASWIDTH;
830247a4 6194 nextchar(pRExC_state);
fac92740 6195 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6196 break;
a0d0e21e 6197 case 'w':
eb160463 6198 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6199 *flagp |= HASWIDTH|SIMPLE;
830247a4 6200 nextchar(pRExC_state);
fac92740 6201 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6202 break;
6203 case 'W':
eb160463 6204 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6205 *flagp |= HASWIDTH|SIMPLE;
830247a4 6206 nextchar(pRExC_state);
fac92740 6207 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6208 break;
6209 case 'b':
830247a4
IZ
6210 RExC_seen_zerolen++;
6211 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6212 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6213 *flagp |= SIMPLE;
830247a4 6214 nextchar(pRExC_state);
fac92740 6215 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6216 break;
6217 case 'B':
830247a4
IZ
6218 RExC_seen_zerolen++;
6219 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6220 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6221 *flagp |= SIMPLE;
830247a4 6222 nextchar(pRExC_state);
fac92740 6223 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6224 break;
6225 case 's':
eb160463 6226 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6227 *flagp |= HASWIDTH|SIMPLE;
830247a4 6228 nextchar(pRExC_state);
fac92740 6229 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6230 break;
6231 case 'S':
eb160463 6232 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6233 *flagp |= HASWIDTH|SIMPLE;
830247a4 6234 nextchar(pRExC_state);
fac92740 6235 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6236 break;
6237 case 'd':
ffc61ed2 6238 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6239 *flagp |= HASWIDTH|SIMPLE;
830247a4 6240 nextchar(pRExC_state);
fac92740 6241 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6242 break;
6243 case 'D':
ffc61ed2 6244 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6245 *flagp |= HASWIDTH|SIMPLE;
830247a4 6246 nextchar(pRExC_state);
fac92740 6247 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6248 break;
a14b48bc
LW
6249 case 'p':
6250 case 'P':
3568d838 6251 {
3dab1dad 6252 char* const oldregxend = RExC_end;
ccb2c380 6253 char* parse_start = RExC_parse - 2;
a14b48bc 6254
830247a4 6255 if (RExC_parse[1] == '{') {
3568d838 6256 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6257 RExC_end = strchr(RExC_parse, '}');
6258 if (!RExC_end) {
3dab1dad 6259 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6260 RExC_parse += 2;
6261 RExC_end = oldregxend;
0da60cf5 6262 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6263 }
830247a4 6264 RExC_end++;
a14b48bc 6265 }
af6f566e 6266 else {
830247a4 6267 RExC_end = RExC_parse + 2;
af6f566e
HS
6268 if (RExC_end > oldregxend)
6269 RExC_end = oldregxend;
6270 }
830247a4 6271 RExC_parse--;
a14b48bc 6272
3dab1dad 6273 ret = regclass(pRExC_state,depth+1);
a14b48bc 6274
830247a4
IZ
6275 RExC_end = oldregxend;
6276 RExC_parse--;
ccb2c380
MP
6277
6278 Set_Node_Offset(ret, parse_start + 2);
6279 Set_Node_Cur_Length(ret);
830247a4 6280 nextchar(pRExC_state);
a14b48bc
LW
6281 *flagp |= HASWIDTH|SIMPLE;
6282 }
6283 break;
fc8cd66c
YO
6284 case 'N':
6285 /* Handle \N{NAME} here and not below because it can be
6286 multicharacter. join_exact() will join them up later on.
6287 Also this makes sure that things like /\N{BLAH}+/ and
6288 \N{BLAH} being multi char Just Happen. dmq*/
6289 ++RExC_parse;
6290 ret= reg_namedseq(pRExC_state, NULL);
6291 break;
0a4db386 6292 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6293 {
6294 char ch= RExC_parse[1];
6295 if (ch != '<' && ch != '\'') {
6296 if (SIZE_ONLY)
6297 vWARN( RExC_parse + 1,
6298 "Possible broken named back reference treated as literal k");
6299 parse_start--;
6300 goto defchar;
6301 } else {
6302 char* name_start = (RExC_parse += 2);
6303 I32 num = 0;
0a4db386
YO
6304 SV *sv_dat = reg_scan_name(pRExC_state,
6305 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6306 ch= (ch == '<') ? '>' : '\'';
6307
81714fb9
YO
6308 if (RExC_parse == name_start || *RExC_parse != ch)
6309 vFAIL2("Sequence \\k%c... not terminated",
6310 (ch == '>' ? '<' : ch));
6311
6312 RExC_sawback = 1;
6313 ret = reganode(pRExC_state,
6314 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6315 num);
6316 *flagp |= HASWIDTH;
6317
6318
6319 if (!SIZE_ONLY) {
81714fb9
YO
6320 num = add_data( pRExC_state, 1, "S" );
6321 ARG_SET(ret,num);
6322 RExC_rx->data->data[num]=(void*)sv_dat;
6323 SvREFCNT_inc(sv_dat);
6324 }
6325 /* override incorrect value set in reganode MJD */
6326 Set_Node_Offset(ret, parse_start+1);
6327 Set_Node_Cur_Length(ret); /* MJD */
6328 nextchar(pRExC_state);
6329
6330 }
6331 break;
6332 }
a0d0e21e
LW
6333 case 'n':
6334 case 'r':
6335 case 't':
6336 case 'f':
6337 case 'e':
6338 case 'a':
6339 case 'x':
6340 case 'c':
6341 case '0':
6342 goto defchar;
c74340f9 6343 case 'R':
a0d0e21e
LW
6344 case '1': case '2': case '3': case '4':
6345 case '5': case '6': case '7': case '8': case '9':
6346 {
c74340f9
YO
6347 I32 num;
6348 bool isrel=(*RExC_parse=='R');
6349 if (isrel)
6350 RExC_parse++;
6351 num = atoi(RExC_parse);
6352 if (isrel) {
6353 num = RExC_cpar - num;
6354 if (num < 1)
6355 vFAIL("Reference to nonexistent or unclosed group");
6356 }
830247a4 6357 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6358 goto defchar;
6359 else {
3dab1dad 6360 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6361 while (isDIGIT(*RExC_parse))
6362 RExC_parse++;
b45f050a 6363
c74340f9
YO
6364 if (!SIZE_ONLY) {
6365 if (num > (I32)RExC_rx->nparens)
6366 vFAIL("Reference to nonexistent group");
6367 /* People make this error all the time apparently.
6368 So we cant fail on it, even though we should
6369
6370 else if (num >= RExC_cpar)
6371 vFAIL("Reference to unclosed group will always match");
6372 */
6373 }
830247a4 6374 RExC_sawback = 1;
eb160463
GS
6375 ret = reganode(pRExC_state,
6376 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6377 num);
a0d0e21e 6378 *flagp |= HASWIDTH;
2af232bd 6379
fac92740 6380 /* override incorrect value set in reganode MJD */
2af232bd 6381 Set_Node_Offset(ret, parse_start+1);
fac92740 6382 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6383 RExC_parse--;
6384 nextchar(pRExC_state);
a0d0e21e
LW
6385 }
6386 }
6387 break;
6388 case '\0':
830247a4 6389 if (RExC_parse >= RExC_end)
b45f050a 6390 FAIL("Trailing \\");
a0d0e21e
LW
6391 /* FALL THROUGH */
6392 default:
a0288114 6393 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6394 back into the quick-grab loop below */
45948336 6395 parse_start--;
a0d0e21e
LW
6396 goto defchar;
6397 }
6398 break;
4633a7c4
LW
6399
6400 case '#':
bbe252da 6401 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6402 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6403 RExC_parse++;
830247a4 6404 if (RExC_parse < RExC_end)
4633a7c4
LW
6405 goto tryagain;
6406 }
6407 /* FALL THROUGH */
6408
a0d0e21e 6409 default: {
ba210ebe 6410 register STRLEN len;
58ae7d3f 6411 register UV ender;
a0d0e21e 6412 register char *p;
3dab1dad 6413 char *s;
80aecb99 6414 STRLEN foldlen;
89ebb4a3 6415 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6416
6417 parse_start = RExC_parse - 1;
a0d0e21e 6418
830247a4 6419 RExC_parse++;
a0d0e21e
LW
6420
6421 defchar:
58ae7d3f 6422 ender = 0;
eb160463
GS
6423 ret = reg_node(pRExC_state,
6424 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6425 s = STRING(ret);
830247a4
IZ
6426 for (len = 0, p = RExC_parse - 1;
6427 len < 127 && p < RExC_end;
a0d0e21e
LW
6428 len++)
6429 {
3dab1dad 6430 char * const oldp = p;
5b5a24f7 6431
bbe252da 6432 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6433 p = regwhite(p, RExC_end);
a0d0e21e
LW
6434 switch (*p) {
6435 case '^':
6436 case '$':
6437 case '.':
6438 case '[':
6439 case '(':
6440 case ')':
6441 case '|':
6442 goto loopdone;
6443 case '\\':
6444 switch (*++p) {
6445 case 'A':
1ed8eac0
JF
6446 case 'C':
6447 case 'X':
a0d0e21e
LW
6448 case 'G':
6449 case 'Z':
b85d18e9 6450 case 'z':
a0d0e21e
LW
6451 case 'w':
6452 case 'W':
6453 case 'b':
6454 case 'B':
6455 case 's':
6456 case 'S':
6457 case 'd':
6458 case 'D':
a14b48bc
LW
6459 case 'p':
6460 case 'P':
fc8cd66c 6461 case 'N':
c74340f9 6462 case 'R':
a0d0e21e
LW
6463 --p;
6464 goto loopdone;
6465 case 'n':
6466 ender = '\n';
6467 p++;
a687059c 6468 break;
a0d0e21e
LW
6469 case 'r':
6470 ender = '\r';
6471 p++;
a687059c 6472 break;
a0d0e21e
LW
6473 case 't':
6474 ender = '\t';
6475 p++;
a687059c 6476 break;
a0d0e21e
LW
6477 case 'f':
6478 ender = '\f';
6479 p++;
a687059c 6480 break;
a0d0e21e 6481 case 'e':
c7f1f016 6482 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6483 p++;
a687059c 6484 break;
a0d0e21e 6485 case 'a':
c7f1f016 6486 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6487 p++;
a687059c 6488 break;
a0d0e21e 6489 case 'x':
a0ed51b3 6490 if (*++p == '{') {
1df70142 6491 char* const e = strchr(p, '}');
b81d288d 6492
b45f050a 6493 if (!e) {
830247a4 6494 RExC_parse = p + 1;
b45f050a
JF
6495 vFAIL("Missing right brace on \\x{}");
6496 }
de5f0749 6497 else {
a4c04bdc
NC
6498 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6499 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6500 STRLEN numlen = e - p - 1;
53305cf1 6501 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6502 if (ender > 0xff)
6503 RExC_utf8 = 1;
a0ed51b3
LW
6504 p = e + 1;
6505 }
a0ed51b3
LW
6506 }
6507 else {
a4c04bdc 6508 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6509 STRLEN numlen = 2;
53305cf1 6510 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6511 p += numlen;
6512 }
9e08bc66
TS
6513 if (PL_encoding && ender < 0x100)
6514 goto recode_encoding;
a687059c 6515 break;
a0d0e21e
LW
6516 case 'c':
6517 p++;
bbce6d69 6518 ender = UCHARAT(p++);
6519 ender = toCTRL(ender);
a687059c 6520 break;
a0d0e21e
LW
6521 case '0': case '1': case '2': case '3':case '4':
6522 case '5': case '6': case '7': case '8':case '9':
6523 if (*p == '0' ||
830247a4 6524 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6525 I32 flags = 0;
1df70142 6526 STRLEN numlen = 3;
53305cf1 6527 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6528 p += numlen;
6529 }
6530 else {
6531 --p;
6532 goto loopdone;
a687059c 6533 }
9e08bc66
TS
6534 if (PL_encoding && ender < 0x100)
6535 goto recode_encoding;
6536 break;
6537 recode_encoding:
6538 {
6539 SV* enc = PL_encoding;
6540 ender = reg_recode((const char)(U8)ender, &enc);
6541 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6542 vWARN(p, "Invalid escape in the specified encoding");
6543 RExC_utf8 = 1;
6544 }
a687059c 6545 break;
a0d0e21e 6546 case '\0':
830247a4 6547 if (p >= RExC_end)
b45f050a 6548 FAIL("Trailing \\");
a687059c 6549 /* FALL THROUGH */
a0d0e21e 6550 default:
041457d9 6551 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6552 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6553 goto normal_default;
a0d0e21e
LW
6554 }
6555 break;
a687059c 6556 default:
a0ed51b3 6557 normal_default:
fd400ab9 6558 if (UTF8_IS_START(*p) && UTF) {
1df70142 6559 STRLEN numlen;
5e12f4fb 6560 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6561 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6562 p += numlen;
6563 }
6564 else
6565 ender = *p++;
a0d0e21e 6566 break;
a687059c 6567 }
bbe252da 6568 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6569 p = regwhite(p, RExC_end);
60a8b682
JH
6570 if (UTF && FOLD) {
6571 /* Prime the casefolded buffer. */
ac7e0132 6572 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6573 }
a0d0e21e
LW
6574 if (ISMULT2(p)) { /* Back off on ?+*. */
6575 if (len)
6576 p = oldp;
16ea2a2e 6577 else if (UTF) {
80aecb99 6578 if (FOLD) {
60a8b682 6579 /* Emit all the Unicode characters. */
1df70142 6580 STRLEN numlen;
80aecb99
JH
6581 for (foldbuf = tmpbuf;
6582 foldlen;
6583 foldlen -= numlen) {
6584 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6585 if (numlen > 0) {
71207a34 6586 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6587 s += unilen;
6588 len += unilen;
6589 /* In EBCDIC the numlen
6590 * and unilen can differ. */
9dc45d57 6591 foldbuf += numlen;
47654450
JH
6592 if (numlen >= foldlen)
6593 break;
9dc45d57
JH
6594 }
6595 else
6596 break; /* "Can't happen." */
80aecb99
JH
6597 }
6598 }
6599 else {
71207a34 6600 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6601 if (unilen > 0) {
0ebc6274
JH
6602 s += unilen;
6603 len += unilen;
9dc45d57 6604 }
80aecb99 6605 }
a0ed51b3 6606 }
a0d0e21e
LW
6607 else {
6608 len++;
eb160463 6609 REGC((char)ender, s++);
a0d0e21e
LW
6610 }
6611 break;
a687059c 6612 }
16ea2a2e 6613 if (UTF) {
80aecb99 6614 if (FOLD) {
60a8b682 6615 /* Emit all the Unicode characters. */
1df70142 6616 STRLEN numlen;
80aecb99
JH
6617 for (foldbuf = tmpbuf;
6618 foldlen;
6619 foldlen -= numlen) {
6620 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6621 if (numlen > 0) {
71207a34 6622 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6623 len += unilen;
6624 s += unilen;
6625 /* In EBCDIC the numlen
6626 * and unilen can differ. */
9dc45d57 6627 foldbuf += numlen;
47654450
JH
6628 if (numlen >= foldlen)
6629 break;
9dc45d57
JH
6630 }
6631 else
6632 break;
80aecb99
JH
6633 }
6634 }
6635 else {
71207a34 6636 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6637 if (unilen > 0) {
0ebc6274
JH
6638 s += unilen;
6639 len += unilen;
9dc45d57 6640 }
80aecb99
JH
6641 }
6642 len--;
a0ed51b3
LW
6643 }
6644 else
eb160463 6645 REGC((char)ender, s++);
a0d0e21e
LW
6646 }
6647 loopdone:
830247a4 6648 RExC_parse = p - 1;
fac92740 6649 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6650 nextchar(pRExC_state);
793db0cb
JH
6651 {
6652 /* len is STRLEN which is unsigned, need to copy to signed */
6653 IV iv = len;
6654 if (iv < 0)
6655 vFAIL("Internal disaster");
6656 }
a0d0e21e
LW
6657 if (len > 0)
6658 *flagp |= HASWIDTH;
090f7165 6659 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6660 *flagp |= SIMPLE;
3dab1dad 6661
cd439c50 6662 if (SIZE_ONLY)
830247a4 6663 RExC_size += STR_SZ(len);
3dab1dad
YO
6664 else {
6665 STR_LEN(ret) = len;
830247a4 6666 RExC_emit += STR_SZ(len);
07be1b83 6667 }
3dab1dad 6668 }
a0d0e21e
LW
6669 break;
6670 }
a687059c 6671
a0d0e21e 6672 return(ret);
a687059c
LW
6673}
6674
873ef191 6675STATIC char *
5f66b61c 6676S_regwhite(char *p, const char *e)
5b5a24f7
CS
6677{
6678 while (p < e) {
6679 if (isSPACE(*p))
6680 ++p;
6681 else if (*p == '#') {
6682 do {
6683 p++;
6684 } while (p < e && *p != '\n');
6685 }
6686 else
6687 break;
6688 }
6689 return p;
6690}
6691
b8c5462f
JH
6692/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6693 Character classes ([:foo:]) can also be negated ([:^foo:]).
6694 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6695 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6696 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6697
6698#define POSIXCC_DONE(c) ((c) == ':')
6699#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6700#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6701
b8c5462f 6702STATIC I32
830247a4 6703S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6704{
97aff369 6705 dVAR;
936ed897 6706 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6707
830247a4 6708 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6709 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6710 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6711 const char c = UCHARAT(RExC_parse);
097eb12c 6712 char* const s = RExC_parse++;
b81d288d 6713
9a86a77b 6714 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6715 RExC_parse++;
6716 if (RExC_parse == RExC_end)
620e46c5 6717 /* Grandfather lone [:, [=, [. */
830247a4 6718 RExC_parse = s;
620e46c5 6719 else {
3dab1dad 6720 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6721 assert(*t == c);
6722
9a86a77b 6723 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6724 const char *posixcc = s + 1;
830247a4 6725 RExC_parse++; /* skip over the ending ] */
3dab1dad 6726
b8c5462f 6727 if (*s == ':') {
1df70142
AL
6728 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6729 const I32 skip = t - posixcc;
80916619
NC
6730
6731 /* Initially switch on the length of the name. */
6732 switch (skip) {
6733 case 4:
3dab1dad
YO
6734 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6735 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6736 break;
80916619
NC
6737 case 5:
6738 /* Names all of length 5. */
6739 /* alnum alpha ascii blank cntrl digit graph lower
6740 print punct space upper */
6741 /* Offset 4 gives the best switch position. */
6742 switch (posixcc[4]) {
6743 case 'a':
3dab1dad
YO
6744 if (memEQ(posixcc, "alph", 4)) /* alpha */
6745 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6746 break;
6747 case 'e':
3dab1dad
YO
6748 if (memEQ(posixcc, "spac", 4)) /* space */
6749 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6750 break;
6751 case 'h':
3dab1dad
YO
6752 if (memEQ(posixcc, "grap", 4)) /* graph */
6753 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6754 break;
6755 case 'i':
3dab1dad
YO
6756 if (memEQ(posixcc, "asci", 4)) /* ascii */
6757 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6758 break;
6759 case 'k':
3dab1dad
YO
6760 if (memEQ(posixcc, "blan", 4)) /* blank */
6761 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6762 break;
6763 case 'l':
3dab1dad
YO
6764 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6765 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6766 break;
6767 case 'm':
3dab1dad
YO
6768 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6769 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6770 break;
6771 case 'r':
3dab1dad
YO
6772 if (memEQ(posixcc, "lowe", 4)) /* lower */
6773 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6774 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6775 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6776 break;
6777 case 't':
3dab1dad
YO
6778 if (memEQ(posixcc, "digi", 4)) /* digit */
6779 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6780 else if (memEQ(posixcc, "prin", 4)) /* print */
6781 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6782 else if (memEQ(posixcc, "punc", 4)) /* punct */
6783 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6784 break;
b8c5462f
JH
6785 }
6786 break;
80916619 6787 case 6:
3dab1dad
YO
6788 if (memEQ(posixcc, "xdigit", 6))
6789 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6790 break;
6791 }
80916619
NC
6792
6793 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6794 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6795 t - s - 1, s + 1);
80916619
NC
6796 assert (posixcc[skip] == ':');
6797 assert (posixcc[skip+1] == ']');
b45f050a 6798 } else if (!SIZE_ONLY) {
b8c5462f 6799 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6800
830247a4 6801 /* adjust RExC_parse so the warning shows after
b45f050a 6802 the class closes */
9a86a77b 6803 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6804 RExC_parse++;
b45f050a
JF
6805 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6806 }
b8c5462f
JH
6807 } else {
6808 /* Maternal grandfather:
6809 * "[:" ending in ":" but not in ":]" */
830247a4 6810 RExC_parse = s;
767d463e 6811 }
620e46c5
JH
6812 }
6813 }
6814
b8c5462f
JH
6815 return namedclass;
6816}
6817
6818STATIC void
830247a4 6819S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6820{
97aff369 6821 dVAR;
3dab1dad 6822 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6823 const char *s = RExC_parse;
6824 const char c = *s++;
b8c5462f 6825
3dab1dad 6826 while (isALNUM(*s))
b8c5462f
JH
6827 s++;
6828 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6829 if (ckWARN(WARN_REGEXP))
6830 vWARN3(s+2,
6831 "POSIX syntax [%c %c] belongs inside character classes",
6832 c, c);
b45f050a
JF
6833
6834 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6835 if (POSIXCC_NOTYET(c)) {
830247a4 6836 /* adjust RExC_parse so the error shows after
b45f050a 6837 the class closes */
9a86a77b 6838 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6839 NOOP;
b45f050a
JF
6840 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6841 }
b8c5462f
JH
6842 }
6843 }
620e46c5
JH
6844}
6845
7f6f358c
YO
6846
6847/*
6848 parse a class specification and produce either an ANYOF node that
6849 matches the pattern. If the pattern matches a single char only and
6850 that char is < 256 then we produce an EXACT node instead.
6851*/
76e3520e 6852STATIC regnode *
3dab1dad 6853S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6854{
97aff369 6855 dVAR;
9ef43ace 6856 register UV value = 0;
9a86a77b 6857 register UV nextvalue;
3568d838 6858 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6859 register IV range = 0;
c277df42 6860 register regnode *ret;
ba210ebe 6861 STRLEN numlen;
ffc61ed2 6862 IV namedclass;
cbbf8932 6863 char *rangebegin = NULL;
936ed897 6864 bool need_class = 0;
c445ea15 6865 SV *listsv = NULL;
ffc61ed2 6866 UV n;
9e55ce06 6867 bool optimize_invert = TRUE;
cbbf8932 6868 AV* unicode_alternate = NULL;
1b2d223b
JH
6869#ifdef EBCDIC
6870 UV literal_endpoint = 0;
6871#endif
7f6f358c 6872 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6873
3dab1dad 6874 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6875 case we need to change the emitted regop to an EXACT. */
07be1b83 6876 const char * orig_parse = RExC_parse;
72f13be8 6877 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6878#ifndef DEBUGGING
6879 PERL_UNUSED_ARG(depth);
6880#endif
72f13be8 6881
3dab1dad 6882 DEBUG_PARSE("clas");
7f6f358c
YO
6883
6884 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6885 ret = reganode(pRExC_state, ANYOF, 0);
6886
6887 if (!SIZE_ONLY)
6888 ANYOF_FLAGS(ret) = 0;
6889
9a86a77b 6890 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6891 RExC_naughty++;
6892 RExC_parse++;
6893 if (!SIZE_ONLY)
6894 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6895 }
a0d0e21e 6896
73060fc4 6897 if (SIZE_ONLY) {
830247a4 6898 RExC_size += ANYOF_SKIP;
73060fc4
JH
6899 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6900 }
936ed897 6901 else {
830247a4 6902 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6903 if (FOLD)
6904 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6905 if (LOC)
6906 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6907 ANYOF_BITMAP_ZERO(ret);
396482e1 6908 listsv = newSVpvs("# comment\n");
a0d0e21e 6909 }
b8c5462f 6910
9a86a77b
JH
6911 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6912
b938889d 6913 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6914 checkposixcc(pRExC_state);
b8c5462f 6915
f064b6ad
HS
6916 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6917 if (UCHARAT(RExC_parse) == ']')
6918 goto charclassloop;
ffc61ed2 6919
fc8cd66c 6920parseit:
9a86a77b 6921 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6922
6923 charclassloop:
6924
6925 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6926
73b437c8 6927 if (!range)
830247a4 6928 rangebegin = RExC_parse;
ffc61ed2 6929 if (UTF) {
5e12f4fb 6930 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6931 RExC_end - RExC_parse,
9f7f3913 6932 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6933 RExC_parse += numlen;
6934 }
6935 else
6936 value = UCHARAT(RExC_parse++);
7f6f358c 6937
9a86a77b
JH
6938 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6939 if (value == '[' && POSIXCC(nextvalue))
830247a4 6940 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6941 else if (value == '\\') {
ffc61ed2 6942 if (UTF) {
5e12f4fb 6943 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6944 RExC_end - RExC_parse,
9f7f3913 6945 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6946 RExC_parse += numlen;
6947 }
6948 else
6949 value = UCHARAT(RExC_parse++);
470c3474 6950 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6951 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6952 * be a problem later if we want switch on Unicode.
6953 * A similar issue a little bit later when switching on
6954 * namedclass. --jhi */
ffc61ed2 6955 switch ((I32)value) {
b8c5462f
JH
6956 case 'w': namedclass = ANYOF_ALNUM; break;
6957 case 'W': namedclass = ANYOF_NALNUM; break;
6958 case 's': namedclass = ANYOF_SPACE; break;
6959 case 'S': namedclass = ANYOF_NSPACE; break;
6960 case 'd': namedclass = ANYOF_DIGIT; break;
6961 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6962 case 'N': /* Handle \N{NAME} in class */
6963 {
6964 /* We only pay attention to the first char of
6965 multichar strings being returned. I kinda wonder
6966 if this makes sense as it does change the behaviour
6967 from earlier versions, OTOH that behaviour was broken
6968 as well. */
6969 UV v; /* value is register so we cant & it /grrr */
6970 if (reg_namedseq(pRExC_state, &v)) {
6971 goto parseit;
6972 }
6973 value= v;
6974 }
6975 break;
ffc61ed2
JH
6976 case 'p':
6977 case 'P':
3dab1dad
YO
6978 {
6979 char *e;
af6f566e 6980 if (RExC_parse >= RExC_end)
2a4859cd 6981 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6982 if (*RExC_parse == '{') {
1df70142 6983 const U8 c = (U8)value;
ffc61ed2
JH
6984 e = strchr(RExC_parse++, '}');
6985 if (!e)
0da60cf5 6986 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6987 while (isSPACE(UCHARAT(RExC_parse)))
6988 RExC_parse++;
6989 if (e == RExC_parse)
0da60cf5 6990 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6991 n = e - RExC_parse;
ab13f0c7
JH
6992 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6993 n--;
ffc61ed2
JH
6994 }
6995 else {
6996 e = RExC_parse;
6997 n = 1;
6998 }
6999 if (!SIZE_ONLY) {
ab13f0c7
JH
7000 if (UCHARAT(RExC_parse) == '^') {
7001 RExC_parse++;
7002 n--;
7003 value = value == 'p' ? 'P' : 'p'; /* toggle */
7004 while (isSPACE(UCHARAT(RExC_parse))) {
7005 RExC_parse++;
7006 n--;
7007 }
7008 }
097eb12c
AL
7009 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7010 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7011 }
7012 RExC_parse = e + 1;
7013 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7014 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7015 }
f81125e2 7016 break;
b8c5462f
JH
7017 case 'n': value = '\n'; break;
7018 case 'r': value = '\r'; break;
7019 case 't': value = '\t'; break;
7020 case 'f': value = '\f'; break;
7021 case 'b': value = '\b'; break;
c7f1f016
NIS
7022 case 'e': value = ASCII_TO_NATIVE('\033');break;
7023 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7024 case 'x':
ffc61ed2 7025 if (*RExC_parse == '{') {
a4c04bdc
NC
7026 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7027 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7028 char * const e = strchr(RExC_parse++, '}');
b81d288d 7029 if (!e)
ffc61ed2 7030 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7031
7032 numlen = e - RExC_parse;
7033 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7034 RExC_parse = e + 1;
7035 }
7036 else {
a4c04bdc 7037 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7038 numlen = 2;
7039 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7040 RExC_parse += numlen;
7041 }
9e08bc66
TS
7042 if (PL_encoding && value < 0x100)
7043 goto recode_encoding;
b8c5462f
JH
7044 break;
7045 case 'c':
830247a4 7046 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7047 value = toCTRL(value);
7048 break;
7049 case '0': case '1': case '2': case '3': case '4':
7050 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7051 {
7052 I32 flags = 0;
7053 numlen = 3;
7054 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7055 RExC_parse += numlen;
7056 if (PL_encoding && value < 0x100)
7057 goto recode_encoding;
7058 break;
7059 }
7060 recode_encoding:
7061 {
7062 SV* enc = PL_encoding;
7063 value = reg_recode((const char)(U8)value, &enc);
7064 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7065 vWARN(RExC_parse,
7066 "Invalid escape in the specified encoding");
7067 break;
7068 }
1028017a 7069 default:
041457d9 7070 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7071 vWARN2(RExC_parse,
7072 "Unrecognized escape \\%c in character class passed through",
7073 (int)value);
1028017a 7074 break;
b8c5462f 7075 }
ffc61ed2 7076 } /* end of \blah */
1b2d223b
JH
7077#ifdef EBCDIC
7078 else
7079 literal_endpoint++;
7080#endif
ffc61ed2
JH
7081
7082 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7083
7084 if (!SIZE_ONLY && !need_class)
936ed897 7085 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7086
936ed897 7087 need_class = 1;
ffc61ed2
JH
7088
7089 /* a bad range like a-\d, a-[:digit:] ? */
7090 if (range) {
73b437c8 7091 if (!SIZE_ONLY) {
afd78fd5 7092 if (ckWARN(WARN_REGEXP)) {
097eb12c 7093 const int w =
afd78fd5
JH
7094 RExC_parse >= rangebegin ?
7095 RExC_parse - rangebegin : 0;
830247a4 7096 vWARN4(RExC_parse,
b45f050a 7097 "False [] range \"%*.*s\"",
097eb12c 7098 w, w, rangebegin);
afd78fd5 7099 }
3568d838
JH
7100 if (prevvalue < 256) {
7101 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7102 ANYOF_BITMAP_SET(ret, '-');
7103 }
7104 else {
7105 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7106 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7107 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7108 }
b8c5462f 7109 }
ffc61ed2
JH
7110
7111 range = 0; /* this was not a true range */
73b437c8 7112 }
ffc61ed2 7113
73b437c8 7114 if (!SIZE_ONLY) {
c49a72a9
NC
7115 const char *what = NULL;
7116 char yesno = 0;
7117
3568d838
JH
7118 if (namedclass > OOB_NAMEDCLASS)
7119 optimize_invert = FALSE;
e2962f66
JH
7120 /* Possible truncation here but in some 64-bit environments
7121 * the compiler gets heartburn about switch on 64-bit values.
7122 * A similar issue a little earlier when switching on value.
98f323fa 7123 * --jhi */
e2962f66 7124 switch ((I32)namedclass) {
73b437c8
JH
7125 case ANYOF_ALNUM:
7126 if (LOC)
936ed897 7127 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7128 else {
7129 for (value = 0; value < 256; value++)
7130 if (isALNUM(value))
936ed897 7131 ANYOF_BITMAP_SET(ret, value);
73b437c8 7132 }
c49a72a9
NC
7133 yesno = '+';
7134 what = "Word";
73b437c8
JH
7135 break;
7136 case ANYOF_NALNUM:
7137 if (LOC)
936ed897 7138 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7139 else {
7140 for (value = 0; value < 256; value++)
7141 if (!isALNUM(value))
936ed897 7142 ANYOF_BITMAP_SET(ret, value);
73b437c8 7143 }
c49a72a9
NC
7144 yesno = '!';
7145 what = "Word";
73b437c8 7146 break;
ffc61ed2 7147 case ANYOF_ALNUMC:
73b437c8 7148 if (LOC)
ffc61ed2 7149 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7150 else {
7151 for (value = 0; value < 256; value++)
ffc61ed2 7152 if (isALNUMC(value))
936ed897 7153 ANYOF_BITMAP_SET(ret, value);
73b437c8 7154 }
c49a72a9
NC
7155 yesno = '+';
7156 what = "Alnum";
73b437c8
JH
7157 break;
7158 case ANYOF_NALNUMC:
7159 if (LOC)
936ed897 7160 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7161 else {
7162 for (value = 0; value < 256; value++)
7163 if (!isALNUMC(value))
936ed897 7164 ANYOF_BITMAP_SET(ret, value);
73b437c8 7165 }
c49a72a9
NC
7166 yesno = '!';
7167 what = "Alnum";
73b437c8
JH
7168 break;
7169 case ANYOF_ALPHA:
7170 if (LOC)
936ed897 7171 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7172 else {
7173 for (value = 0; value < 256; value++)
7174 if (isALPHA(value))
936ed897 7175 ANYOF_BITMAP_SET(ret, value);
73b437c8 7176 }
c49a72a9
NC
7177 yesno = '+';
7178 what = "Alpha";
73b437c8
JH
7179 break;
7180 case ANYOF_NALPHA:
7181 if (LOC)
936ed897 7182 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7183 else {
7184 for (value = 0; value < 256; value++)
7185 if (!isALPHA(value))
936ed897 7186 ANYOF_BITMAP_SET(ret, value);
73b437c8 7187 }
c49a72a9
NC
7188 yesno = '!';
7189 what = "Alpha";
73b437c8
JH
7190 break;
7191 case ANYOF_ASCII:
7192 if (LOC)
936ed897 7193 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7194 else {
c7f1f016 7195#ifndef EBCDIC
1ba5c669
JH
7196 for (value = 0; value < 128; value++)
7197 ANYOF_BITMAP_SET(ret, value);
7198#else /* EBCDIC */
ffbc6a93 7199 for (value = 0; value < 256; value++) {
3a3c4447
JH
7200 if (isASCII(value))
7201 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7202 }
1ba5c669 7203#endif /* EBCDIC */
73b437c8 7204 }
c49a72a9
NC
7205 yesno = '+';
7206 what = "ASCII";
73b437c8
JH
7207 break;
7208 case ANYOF_NASCII:
7209 if (LOC)
936ed897 7210 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7211 else {
c7f1f016 7212#ifndef EBCDIC
1ba5c669
JH
7213 for (value = 128; value < 256; value++)
7214 ANYOF_BITMAP_SET(ret, value);
7215#else /* EBCDIC */
ffbc6a93 7216 for (value = 0; value < 256; value++) {
3a3c4447
JH
7217 if (!isASCII(value))
7218 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7219 }
1ba5c669 7220#endif /* EBCDIC */
73b437c8 7221 }
c49a72a9
NC
7222 yesno = '!';
7223 what = "ASCII";
73b437c8 7224 break;
aaa51d5e
JF
7225 case ANYOF_BLANK:
7226 if (LOC)
7227 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7228 else {
7229 for (value = 0; value < 256; value++)
7230 if (isBLANK(value))
7231 ANYOF_BITMAP_SET(ret, value);
7232 }
c49a72a9
NC
7233 yesno = '+';
7234 what = "Blank";
aaa51d5e
JF
7235 break;
7236 case ANYOF_NBLANK:
7237 if (LOC)
7238 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7239 else {
7240 for (value = 0; value < 256; value++)
7241 if (!isBLANK(value))
7242 ANYOF_BITMAP_SET(ret, value);
7243 }
c49a72a9
NC
7244 yesno = '!';
7245 what = "Blank";
aaa51d5e 7246 break;
73b437c8
JH
7247 case ANYOF_CNTRL:
7248 if (LOC)
936ed897 7249 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7250 else {
7251 for (value = 0; value < 256; value++)
7252 if (isCNTRL(value))
936ed897 7253 ANYOF_BITMAP_SET(ret, value);
73b437c8 7254 }
c49a72a9
NC
7255 yesno = '+';
7256 what = "Cntrl";
73b437c8
JH
7257 break;
7258 case ANYOF_NCNTRL:
7259 if (LOC)
936ed897 7260 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7261 else {
7262 for (value = 0; value < 256; value++)
7263 if (!isCNTRL(value))
936ed897 7264 ANYOF_BITMAP_SET(ret, value);
73b437c8 7265 }
c49a72a9
NC
7266 yesno = '!';
7267 what = "Cntrl";
ffc61ed2
JH
7268 break;
7269 case ANYOF_DIGIT:
7270 if (LOC)
7271 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7272 else {
7273 /* consecutive digits assumed */
7274 for (value = '0'; value <= '9'; value++)
7275 ANYOF_BITMAP_SET(ret, value);
7276 }
c49a72a9
NC
7277 yesno = '+';
7278 what = "Digit";
ffc61ed2
JH
7279 break;
7280 case ANYOF_NDIGIT:
7281 if (LOC)
7282 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7283 else {
7284 /* consecutive digits assumed */
7285 for (value = 0; value < '0'; value++)
7286 ANYOF_BITMAP_SET(ret, value);
7287 for (value = '9' + 1; value < 256; value++)
7288 ANYOF_BITMAP_SET(ret, value);
7289 }
c49a72a9
NC
7290 yesno = '!';
7291 what = "Digit";
73b437c8
JH
7292 break;
7293 case ANYOF_GRAPH:
7294 if (LOC)
936ed897 7295 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7296 else {
7297 for (value = 0; value < 256; value++)
7298 if (isGRAPH(value))
936ed897 7299 ANYOF_BITMAP_SET(ret, value);
73b437c8 7300 }
c49a72a9
NC
7301 yesno = '+';
7302 what = "Graph";
73b437c8
JH
7303 break;
7304 case ANYOF_NGRAPH:
7305 if (LOC)
936ed897 7306 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7307 else {
7308 for (value = 0; value < 256; value++)
7309 if (!isGRAPH(value))
936ed897 7310 ANYOF_BITMAP_SET(ret, value);
73b437c8 7311 }
c49a72a9
NC
7312 yesno = '!';
7313 what = "Graph";
73b437c8
JH
7314 break;
7315 case ANYOF_LOWER:
7316 if (LOC)
936ed897 7317 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7318 else {
7319 for (value = 0; value < 256; value++)
7320 if (isLOWER(value))
936ed897 7321 ANYOF_BITMAP_SET(ret, value);
73b437c8 7322 }
c49a72a9
NC
7323 yesno = '+';
7324 what = "Lower";
73b437c8
JH
7325 break;
7326 case ANYOF_NLOWER:
7327 if (LOC)
936ed897 7328 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7329 else {
7330 for (value = 0; value < 256; value++)
7331 if (!isLOWER(value))
936ed897 7332 ANYOF_BITMAP_SET(ret, value);
73b437c8 7333 }
c49a72a9
NC
7334 yesno = '!';
7335 what = "Lower";
73b437c8
JH
7336 break;
7337 case ANYOF_PRINT:
7338 if (LOC)
936ed897 7339 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7340 else {
7341 for (value = 0; value < 256; value++)
7342 if (isPRINT(value))
936ed897 7343 ANYOF_BITMAP_SET(ret, value);
73b437c8 7344 }
c49a72a9
NC
7345 yesno = '+';
7346 what = "Print";
73b437c8
JH
7347 break;
7348 case ANYOF_NPRINT:
7349 if (LOC)
936ed897 7350 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7351 else {
7352 for (value = 0; value < 256; value++)
7353 if (!isPRINT(value))
936ed897 7354 ANYOF_BITMAP_SET(ret, value);
73b437c8 7355 }
c49a72a9
NC
7356 yesno = '!';
7357 what = "Print";
73b437c8 7358 break;
aaa51d5e
JF
7359 case ANYOF_PSXSPC:
7360 if (LOC)
7361 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7362 else {
7363 for (value = 0; value < 256; value++)
7364 if (isPSXSPC(value))
7365 ANYOF_BITMAP_SET(ret, value);
7366 }
c49a72a9
NC
7367 yesno = '+';
7368 what = "Space";
aaa51d5e
JF
7369 break;
7370 case ANYOF_NPSXSPC:
7371 if (LOC)
7372 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7373 else {
7374 for (value = 0; value < 256; value++)
7375 if (!isPSXSPC(value))
7376 ANYOF_BITMAP_SET(ret, value);
7377 }
c49a72a9
NC
7378 yesno = '!';
7379 what = "Space";
aaa51d5e 7380 break;
73b437c8
JH
7381 case ANYOF_PUNCT:
7382 if (LOC)
936ed897 7383 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7384 else {
7385 for (value = 0; value < 256; value++)
7386 if (isPUNCT(value))
936ed897 7387 ANYOF_BITMAP_SET(ret, value);
73b437c8 7388 }
c49a72a9
NC
7389 yesno = '+';
7390 what = "Punct";
73b437c8
JH
7391 break;
7392 case ANYOF_NPUNCT:
7393 if (LOC)
936ed897 7394 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7395 else {
7396 for (value = 0; value < 256; value++)
7397 if (!isPUNCT(value))
936ed897 7398 ANYOF_BITMAP_SET(ret, value);
73b437c8 7399 }
c49a72a9
NC
7400 yesno = '!';
7401 what = "Punct";
ffc61ed2
JH
7402 break;
7403 case ANYOF_SPACE:
7404 if (LOC)
7405 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7406 else {
7407 for (value = 0; value < 256; value++)
7408 if (isSPACE(value))
7409 ANYOF_BITMAP_SET(ret, value);
7410 }
c49a72a9
NC
7411 yesno = '+';
7412 what = "SpacePerl";
ffc61ed2
JH
7413 break;
7414 case ANYOF_NSPACE:
7415 if (LOC)
7416 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7417 else {
7418 for (value = 0; value < 256; value++)
7419 if (!isSPACE(value))
7420 ANYOF_BITMAP_SET(ret, value);
7421 }
c49a72a9
NC
7422 yesno = '!';
7423 what = "SpacePerl";
73b437c8
JH
7424 break;
7425 case ANYOF_UPPER:
7426 if (LOC)
936ed897 7427 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7428 else {
7429 for (value = 0; value < 256; value++)
7430 if (isUPPER(value))
936ed897 7431 ANYOF_BITMAP_SET(ret, value);
73b437c8 7432 }
c49a72a9
NC
7433 yesno = '+';
7434 what = "Upper";
73b437c8
JH
7435 break;
7436 case ANYOF_NUPPER:
7437 if (LOC)
936ed897 7438 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7439 else {
7440 for (value = 0; value < 256; value++)
7441 if (!isUPPER(value))
936ed897 7442 ANYOF_BITMAP_SET(ret, value);
73b437c8 7443 }
c49a72a9
NC
7444 yesno = '!';
7445 what = "Upper";
73b437c8
JH
7446 break;
7447 case ANYOF_XDIGIT:
7448 if (LOC)
936ed897 7449 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7450 else {
7451 for (value = 0; value < 256; value++)
7452 if (isXDIGIT(value))
936ed897 7453 ANYOF_BITMAP_SET(ret, value);
73b437c8 7454 }
c49a72a9
NC
7455 yesno = '+';
7456 what = "XDigit";
73b437c8
JH
7457 break;
7458 case ANYOF_NXDIGIT:
7459 if (LOC)
936ed897 7460 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7461 else {
7462 for (value = 0; value < 256; value++)
7463 if (!isXDIGIT(value))
936ed897 7464 ANYOF_BITMAP_SET(ret, value);
73b437c8 7465 }
c49a72a9
NC
7466 yesno = '!';
7467 what = "XDigit";
73b437c8 7468 break;
f81125e2
JP
7469 case ANYOF_MAX:
7470 /* this is to handle \p and \P */
7471 break;
73b437c8 7472 default:
b45f050a 7473 vFAIL("Invalid [::] class");
73b437c8 7474 break;
b8c5462f 7475 }
c49a72a9
NC
7476 if (what) {
7477 /* Strings such as "+utf8::isWord\n" */
7478 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7479 }
b8c5462f 7480 if (LOC)
936ed897 7481 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7482 continue;
a0d0e21e 7483 }
ffc61ed2
JH
7484 } /* end of namedclass \blah */
7485
a0d0e21e 7486 if (range) {
eb160463 7487 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7488 const int w = RExC_parse - rangebegin;
7489 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7490 range = 0; /* not a valid range */
73b437c8 7491 }
a0d0e21e
LW
7492 }
7493 else {
3568d838 7494 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7495 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7496 RExC_parse[1] != ']') {
7497 RExC_parse++;
ffc61ed2
JH
7498
7499 /* a bad range like \w-, [:word:]- ? */
7500 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7501 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7502 const int w =
afd78fd5
JH
7503 RExC_parse >= rangebegin ?
7504 RExC_parse - rangebegin : 0;
830247a4 7505 vWARN4(RExC_parse,
b45f050a 7506 "False [] range \"%*.*s\"",
097eb12c 7507 w, w, rangebegin);
afd78fd5 7508 }
73b437c8 7509 if (!SIZE_ONLY)
936ed897 7510 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7511 } else
ffc61ed2
JH
7512 range = 1; /* yeah, it's a range! */
7513 continue; /* but do it the next time */
a0d0e21e 7514 }
a687059c 7515 }
ffc61ed2 7516
93733859 7517 /* now is the next time */
07be1b83 7518 /*stored += (value - prevvalue + 1);*/
ae5c130c 7519 if (!SIZE_ONLY) {
3568d838 7520 if (prevvalue < 256) {
1df70142 7521 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7522 IV i;
3568d838 7523#ifdef EBCDIC
1b2d223b
JH
7524 /* In EBCDIC [\x89-\x91] should include
7525 * the \x8e but [i-j] should not. */
7526 if (literal_endpoint == 2 &&
7527 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7528 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7529 {
3568d838
JH
7530 if (isLOWER(prevvalue)) {
7531 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7532 if (isLOWER(i))
7533 ANYOF_BITMAP_SET(ret, i);
7534 } else {
3568d838 7535 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7536 if (isUPPER(i))
7537 ANYOF_BITMAP_SET(ret, i);
7538 }
8ada0baa 7539 }
ffc61ed2 7540 else
8ada0baa 7541#endif
07be1b83
YO
7542 for (i = prevvalue; i <= ceilvalue; i++) {
7543 if (!ANYOF_BITMAP_TEST(ret,i)) {
7544 stored++;
7545 ANYOF_BITMAP_SET(ret, i);
7546 }
7547 }
3568d838 7548 }
a5961de5 7549 if (value > 255 || UTF) {
1df70142
AL
7550 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7551 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7552 stored+=2; /* can't optimize this class */
ffc61ed2 7553 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7554 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7555 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7556 prevnatvalue, natvalue);
7557 }
7558 else if (prevnatvalue == natvalue) {
7559 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7560 if (FOLD) {
89ebb4a3 7561 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7562 STRLEN foldlen;
1df70142 7563 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7564
e294cc5d
JH
7565#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7566 if (RExC_precomp[0] == ':' &&
7567 RExC_precomp[1] == '[' &&
7568 (f == 0xDF || f == 0x92)) {
7569 f = NATIVE_TO_UNI(f);
7570 }
7571#endif
c840d2a2
JH
7572 /* If folding and foldable and a single
7573 * character, insert also the folded version
7574 * to the charclass. */
9e55ce06 7575 if (f != value) {
e294cc5d
JH
7576#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7577 if ((RExC_precomp[0] == ':' &&
7578 RExC_precomp[1] == '[' &&
7579 (f == 0xA2 &&
7580 (value == 0xFB05 || value == 0xFB06))) ?
7581 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7582 foldlen == (STRLEN)UNISKIP(f) )
7583#else
eb160463 7584 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7585#endif
9e55ce06
JH
7586 Perl_sv_catpvf(aTHX_ listsv,
7587 "%04"UVxf"\n", f);
7588 else {
7589 /* Any multicharacter foldings
7590 * require the following transform:
7591 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7592 * where E folds into "pq" and F folds
7593 * into "rst", all other characters
7594 * fold to single characters. We save
7595 * away these multicharacter foldings,
7596 * to be later saved as part of the
7597 * additional "s" data. */
7598 SV *sv;
7599
7600 if (!unicode_alternate)
7601 unicode_alternate = newAV();
7602 sv = newSVpvn((char*)foldbuf, foldlen);
7603 SvUTF8_on(sv);
7604 av_push(unicode_alternate, sv);
7605 }
7606 }
254ba52a 7607
60a8b682
JH
7608 /* If folding and the value is one of the Greek
7609 * sigmas insert a few more sigmas to make the
7610 * folding rules of the sigmas to work right.
7611 * Note that not all the possible combinations
7612 * are handled here: some of them are handled
9e55ce06
JH
7613 * by the standard folding rules, and some of
7614 * them (literal or EXACTF cases) are handled
7615 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7616 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7617 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7618 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7619 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7620 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7621 }
7622 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7623 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7624 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7625 }
7626 }
ffc61ed2 7627 }
1b2d223b
JH
7628#ifdef EBCDIC
7629 literal_endpoint = 0;
7630#endif
8ada0baa 7631 }
ffc61ed2
JH
7632
7633 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7634 }
ffc61ed2 7635
936ed897 7636 if (need_class) {
4f66b38d 7637 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7638 if (SIZE_ONLY)
830247a4 7639 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7640 else
830247a4 7641 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7642 }
ffc61ed2 7643
7f6f358c
YO
7644
7645 if (SIZE_ONLY)
7646 return ret;
7647 /****** !SIZE_ONLY AFTER HERE *********/
7648
7649 if( stored == 1 && value < 256
7650 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7651 ) {
7652 /* optimize single char class to an EXACT node
7653 but *only* when its not a UTF/high char */
07be1b83
YO
7654 const char * cur_parse= RExC_parse;
7655 RExC_emit = (regnode *)orig_emit;
7656 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7657 ret = reg_node(pRExC_state,
7658 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7659 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7660 *STRING(ret)= (char)value;
7661 STR_LEN(ret)= 1;
7662 RExC_emit += STR_SZ(1);
7663 return ret;
7664 }
ae5c130c 7665 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7666 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7667 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7668 ) {
a0ed51b3 7669 for (value = 0; value < 256; ++value) {
936ed897 7670 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7671 UV fold = PL_fold[value];
ffc61ed2
JH
7672
7673 if (fold != value)
7674 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7675 }
7676 }
936ed897 7677 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7678 }
ffc61ed2 7679
ae5c130c 7680 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7681 if (optimize_invert &&
ffc61ed2
JH
7682 /* If the only flag is inversion. */
7683 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7684 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7685 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7686 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7687 }
7f6f358c 7688 {
097eb12c 7689 AV * const av = newAV();
ffc61ed2 7690 SV *rv;
9e55ce06 7691 /* The 0th element stores the character class description
6a0407ee 7692 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7693 * to initialize the appropriate swash (which gets stored in
7694 * the 1st element), and also useful for dumping the regnode.
7695 * The 2nd element stores the multicharacter foldings,
6a0407ee 7696 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7697 av_store(av, 0, listsv);
7698 av_store(av, 1, NULL);
9e55ce06 7699 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7700 rv = newRV_noinc((SV*)av);
19860706 7701 n = add_data(pRExC_state, 1, "s");
830247a4 7702 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7703 ARG_SET(ret, n);
a0ed51b3 7704 }
a0ed51b3
LW
7705 return ret;
7706}
7707
76e3520e 7708STATIC char*
830247a4 7709S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7710{
097eb12c 7711 char* const retval = RExC_parse++;
a0d0e21e 7712
4633a7c4 7713 for (;;) {
830247a4
IZ
7714 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7715 RExC_parse[2] == '#') {
e994fd66
AE
7716 while (*RExC_parse != ')') {
7717 if (RExC_parse == RExC_end)
7718 FAIL("Sequence (?#... not terminated");
830247a4 7719 RExC_parse++;
e994fd66 7720 }
830247a4 7721 RExC_parse++;
4633a7c4
LW
7722 continue;
7723 }
bbe252da 7724 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7725 if (isSPACE(*RExC_parse)) {
7726 RExC_parse++;
748a9306
LW
7727 continue;
7728 }
830247a4 7729 else if (*RExC_parse == '#') {
e994fd66
AE
7730 while (RExC_parse < RExC_end)
7731 if (*RExC_parse++ == '\n') break;
748a9306
LW
7732 continue;
7733 }
748a9306 7734 }
4633a7c4 7735 return retval;
a0d0e21e 7736 }
a687059c
LW
7737}
7738
7739/*
c277df42 7740- reg_node - emit a node
a0d0e21e 7741*/
76e3520e 7742STATIC regnode * /* Location. */
830247a4 7743S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7744{
97aff369 7745 dVAR;
c277df42 7746 register regnode *ptr;
504618e9 7747 regnode * const ret = RExC_emit;
07be1b83 7748 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7749
c277df42 7750 if (SIZE_ONLY) {
830247a4
IZ
7751 SIZE_ALIGN(RExC_size);
7752 RExC_size += 1;
a0d0e21e
LW
7753 return(ret);
7754 }
e2e6a0f1
YO
7755#ifdef DEBUGGING
7756 if (OP(RExC_emit) == 255)
7757 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7758 reg_name[op], OP(RExC_emit));
7759#endif
c277df42 7760 NODE_ALIGN_FILL(ret);
a0d0e21e 7761 ptr = ret;
c277df42 7762 FILL_ADVANCE_NODE(ptr, op);
fac92740 7763 if (RExC_offsets) { /* MJD */
07be1b83 7764 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7765 "reg_node", __LINE__,
7766 reg_name[op],
07be1b83
YO
7767 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7768 ? "Overwriting end of array!\n" : "OK",
7769 (UV)(RExC_emit - RExC_emit_start),
7770 (UV)(RExC_parse - RExC_start),
7771 (UV)RExC_offsets[0]));
ccb2c380 7772 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7773 }
07be1b83 7774
830247a4 7775 RExC_emit = ptr;
a0d0e21e 7776 return(ret);
a687059c
LW
7777}
7778
7779/*
a0d0e21e
LW
7780- reganode - emit a node with an argument
7781*/
76e3520e 7782STATIC regnode * /* Location. */
830247a4 7783S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7784{
97aff369 7785 dVAR;
c277df42 7786 register regnode *ptr;
504618e9 7787 regnode * const ret = RExC_emit;
07be1b83 7788 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7789
c277df42 7790 if (SIZE_ONLY) {
830247a4
IZ
7791 SIZE_ALIGN(RExC_size);
7792 RExC_size += 2;
6bda09f9
YO
7793 /*
7794 We can't do this:
7795
7796 assert(2==regarglen[op]+1);
7797
7798 Anything larger than this has to allocate the extra amount.
7799 If we changed this to be:
7800
7801 RExC_size += (1 + regarglen[op]);
7802
7803 then it wouldn't matter. Its not clear what side effect
7804 might come from that so its not done so far.
7805 -- dmq
7806 */
a0d0e21e
LW
7807 return(ret);
7808 }
e2e6a0f1
YO
7809#ifdef DEBUGGING
7810 if (OP(RExC_emit) == 255)
7811 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7812#endif
c277df42 7813 NODE_ALIGN_FILL(ret);
a0d0e21e 7814 ptr = ret;
c277df42 7815 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7816 if (RExC_offsets) { /* MJD */
07be1b83 7817 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7818 "reganode",
ccb2c380
MP
7819 __LINE__,
7820 reg_name[op],
07be1b83 7821 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7822 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7823 (UV)(RExC_emit - RExC_emit_start),
7824 (UV)(RExC_parse - RExC_start),
7825 (UV)RExC_offsets[0]));
ccb2c380 7826 Set_Cur_Node_Offset;
fac92740
MJD
7827 }
7828
830247a4 7829 RExC_emit = ptr;
a0d0e21e 7830 return(ret);
fe14fcc3
LW
7831}
7832
7833/*
cd439c50 7834- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7835*/
71207a34
AL
7836STATIC STRLEN
7837S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7838{
97aff369 7839 dVAR;
71207a34 7840 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7841}
7842
7843/*
a0d0e21e
LW
7844- reginsert - insert an operator in front of already-emitted operand
7845*
7846* Means relocating the operand.
7847*/
76e3520e 7848STATIC void
6bda09f9 7849S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7850{
97aff369 7851 dVAR;
c277df42
IZ
7852 register regnode *src;
7853 register regnode *dst;
7854 register regnode *place;
504618e9 7855 const int offset = regarglen[(U8)op];
6bda09f9 7856 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7857 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7858/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7859 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7860 if (SIZE_ONLY) {
6bda09f9 7861 RExC_size += size;
a0d0e21e
LW
7862 return;
7863 }
a687059c 7864
830247a4 7865 src = RExC_emit;
6bda09f9 7866 RExC_emit += size;
830247a4 7867 dst = RExC_emit;
40d049e4 7868 if (RExC_open_parens) {
6bda09f9 7869 int paren;
6d99fb9b 7870 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7871 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7872 if ( RExC_open_parens[paren] >= opnd ) {
7873 DEBUG_PARSE_FMT("open"," - %d",size);
7874 RExC_open_parens[paren] += size;
7875 } else {
7876 DEBUG_PARSE_FMT("open"," - %s","ok");
7877 }
7878 if ( RExC_close_parens[paren] >= opnd ) {
7879 DEBUG_PARSE_FMT("close"," - %d",size);
7880 RExC_close_parens[paren] += size;
7881 } else {
7882 DEBUG_PARSE_FMT("close"," - %s","ok");
7883 }
7884 }
6bda09f9 7885 }
40d049e4 7886
fac92740 7887 while (src > opnd) {
c277df42 7888 StructCopy(--src, --dst, regnode);
fac92740 7889 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7890 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7891 "reg_insert",
ccb2c380
MP
7892 __LINE__,
7893 reg_name[op],
07be1b83
YO
7894 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7895 ? "Overwriting end of array!\n" : "OK",
7896 (UV)(src - RExC_emit_start),
7897 (UV)(dst - RExC_emit_start),
7898 (UV)RExC_offsets[0]));
ccb2c380
MP
7899 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7900 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7901 }
7902 }
7903
a0d0e21e
LW
7904
7905 place = opnd; /* Op node, where operand used to be. */
fac92740 7906 if (RExC_offsets) { /* MJD */
07be1b83 7907 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7908 "reginsert",
ccb2c380
MP
7909 __LINE__,
7910 reg_name[op],
07be1b83 7911 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7912 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7913 (UV)(place - RExC_emit_start),
7914 (UV)(RExC_parse - RExC_start),
786e8c11 7915 (UV)RExC_offsets[0]));
ccb2c380 7916 Set_Node_Offset(place, RExC_parse);
45948336 7917 Set_Node_Length(place, 1);
fac92740 7918 }
c277df42
IZ
7919 src = NEXTOPER(place);
7920 FILL_ADVANCE_NODE(place, op);
7921 Zero(src, offset, regnode);
a687059c
LW
7922}
7923
7924/*
c277df42 7925- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7926- SEE ALSO: regtail_study
a0d0e21e 7927*/
097eb12c 7928/* TODO: All three parms should be const */
76e3520e 7929STATIC void
3dab1dad 7930S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7931{
97aff369 7932 dVAR;
c277df42 7933 register regnode *scan;
72f13be8 7934 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7935#ifndef DEBUGGING
7936 PERL_UNUSED_ARG(depth);
7937#endif
a0d0e21e 7938
c277df42 7939 if (SIZE_ONLY)
a0d0e21e
LW
7940 return;
7941
7942 /* Find last node. */
7943 scan = p;
7944 for (;;) {
504618e9 7945 regnode * const temp = regnext(scan);
3dab1dad
YO
7946 DEBUG_PARSE_r({
7947 SV * const mysv=sv_newmortal();
7948 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7949 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7950 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7951 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7952 (temp == NULL ? "->" : ""),
7953 (temp == NULL ? reg_name[OP(val)] : "")
7954 );
3dab1dad
YO
7955 });
7956 if (temp == NULL)
7957 break;
7958 scan = temp;
7959 }
7960
7961 if (reg_off_by_arg[OP(scan)]) {
7962 ARG_SET(scan, val - scan);
7963 }
7964 else {
7965 NEXT_OFF(scan) = val - scan;
7966 }
7967}
7968
07be1b83 7969#ifdef DEBUGGING
3dab1dad
YO
7970/*
7971- regtail_study - set the next-pointer at the end of a node chain of p to val.
7972- Look for optimizable sequences at the same time.
7973- currently only looks for EXACT chains.
07be1b83
YO
7974
7975This is expermental code. The idea is to use this routine to perform
7976in place optimizations on branches and groups as they are constructed,
7977with the long term intention of removing optimization from study_chunk so
7978that it is purely analytical.
7979
7980Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7981to control which is which.
7982
3dab1dad
YO
7983*/
7984/* TODO: All four parms should be const */
07be1b83 7985
3dab1dad
YO
7986STATIC U8
7987S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7988{
7989 dVAR;
7990 register regnode *scan;
07be1b83
YO
7991 U8 exact = PSEUDO;
7992#ifdef EXPERIMENTAL_INPLACESCAN
7993 I32 min = 0;
7994#endif
7995
3dab1dad
YO
7996 GET_RE_DEBUG_FLAGS_DECL;
7997
07be1b83 7998
3dab1dad
YO
7999 if (SIZE_ONLY)
8000 return exact;
8001
8002 /* Find last node. */
8003
8004 scan = p;
8005 for (;;) {
8006 regnode * const temp = regnext(scan);
07be1b83
YO
8007#ifdef EXPERIMENTAL_INPLACESCAN
8008 if (PL_regkind[OP(scan)] == EXACT)
8009 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8010 return EXACT;
8011#endif
3dab1dad
YO
8012 if ( exact ) {
8013 switch (OP(scan)) {
8014 case EXACT:
8015 case EXACTF:
8016 case EXACTFL:
8017 if( exact == PSEUDO )
8018 exact= OP(scan);
07be1b83
YO
8019 else if ( exact != OP(scan) )
8020 exact= 0;
3dab1dad
YO
8021 case NOTHING:
8022 break;
8023 default:
8024 exact= 0;
8025 }
8026 }
8027 DEBUG_PARSE_r({
8028 SV * const mysv=sv_newmortal();
8029 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8030 regprop(RExC_rx, mysv, scan);
eaf3ca90 8031 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8032 SvPV_nolen_const(mysv),
eaf3ca90
YO
8033 REG_NODE_NUM(scan),
8034 reg_name[exact]);
3dab1dad 8035 });
a0d0e21e
LW
8036 if (temp == NULL)
8037 break;
8038 scan = temp;
8039 }
07be1b83
YO
8040 DEBUG_PARSE_r({
8041 SV * const mysv_val=sv_newmortal();
8042 DEBUG_PARSE_MSG("");
8043 regprop(RExC_rx, mysv_val, val);
8044 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8045 SvPV_nolen_const(mysv_val),
8046 REG_NODE_NUM(val),
8047 val - scan
8048 );
8049 });
c277df42
IZ
8050 if (reg_off_by_arg[OP(scan)]) {
8051 ARG_SET(scan, val - scan);
a0ed51b3
LW
8052 }
8053 else {
c277df42
IZ
8054 NEXT_OFF(scan) = val - scan;
8055 }
3dab1dad
YO
8056
8057 return exact;
a687059c 8058}
07be1b83 8059#endif
a687059c
LW
8060
8061/*
a687059c
LW
8062 - regcurly - a little FSA that accepts {\d+,?\d*}
8063 */
79072805 8064STATIC I32
5f66b61c 8065S_regcurly(register const char *s)
a687059c
LW
8066{
8067 if (*s++ != '{')
8068 return FALSE;
f0fcb552 8069 if (!isDIGIT(*s))
a687059c 8070 return FALSE;
f0fcb552 8071 while (isDIGIT(*s))
a687059c
LW
8072 s++;
8073 if (*s == ',')
8074 s++;
f0fcb552 8075 while (isDIGIT(*s))
a687059c
LW
8076 s++;
8077 if (*s != '}')
8078 return FALSE;
8079 return TRUE;
8080}
8081
a687059c
LW
8082
8083/*
fd181c75 8084 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8085 */
8086void
097eb12c 8087Perl_regdump(pTHX_ const regexp *r)
a687059c 8088{
35ff7856 8089#ifdef DEBUGGING
97aff369 8090 dVAR;
c445ea15 8091 SV * const sv = sv_newmortal();
ab3bbdeb 8092 SV *dsv= sv_newmortal();
a687059c 8093
786e8c11 8094 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8095
8096 /* Header fields of interest. */
ab3bbdeb
YO
8097 if (r->anchored_substr) {
8098 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8099 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8100 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8101 "anchored %s%s at %"IVdf" ",
8102 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8103 (IV)r->anchored_offset);
ab3bbdeb
YO
8104 } else if (r->anchored_utf8) {
8105 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8106 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8107 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8108 "anchored utf8 %s%s at %"IVdf" ",
8109 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8110 (IV)r->anchored_offset);
ab3bbdeb
YO
8111 }
8112 if (r->float_substr) {
8113 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8114 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8115 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8116 "floating %s%s at %"IVdf"..%"UVuf" ",
8117 s, RE_SV_TAIL(r->float_substr),
7b0972df 8118 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8119 } else if (r->float_utf8) {
8120 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8121 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8122 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8123 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8124 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8125 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8126 }
33b8afdf 8127 if (r->check_substr || r->check_utf8)
b81d288d 8128 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8129 (const char *)
8130 (r->check_substr == r->float_substr
8131 && r->check_utf8 == r->float_utf8
8132 ? "(checking floating" : "(checking anchored"));
bbe252da 8133 if (r->extflags & RXf_NOSCAN)
c277df42 8134 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8135 if (r->extflags & RXf_CHECK_ALL)
c277df42 8136 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8137 if (r->check_substr || r->check_utf8)
c277df42
IZ
8138 PerlIO_printf(Perl_debug_log, ") ");
8139
46fc3d4c 8140 if (r->regstclass) {
32fc9b6a 8141 regprop(r, sv, r->regstclass);
1de06328 8142 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8143 }
bbe252da 8144 if (r->extflags & RXf_ANCH) {
774d564b 8145 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8146 if (r->extflags & RXf_ANCH_BOL)
774d564b 8147 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8148 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8149 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8150 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8151 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8152 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8153 PerlIO_printf(Perl_debug_log, "(GPOS)");
8154 PerlIO_putc(Perl_debug_log, ' ');
8155 }
bbe252da 8156 if (r->extflags & RXf_GPOS_SEEN)
58e23c8d 8157 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", r->gofs);
bbe252da 8158 if (r->intflags & PREGf_SKIP)
760ac839 8159 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8160 if (r->intflags & PREGf_IMPLICIT)
760ac839 8161 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8162 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
bbe252da 8163 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8164 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8165 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8166#else
96a5add6 8167 PERL_UNUSED_CONTEXT;
65e66c80 8168 PERL_UNUSED_ARG(r);
17c3b450 8169#endif /* DEBUGGING */
a687059c
LW
8170}
8171
8172/*
a0d0e21e
LW
8173- regprop - printable representation of opcode
8174*/
46fc3d4c 8175void
32fc9b6a 8176Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8177{
35ff7856 8178#ifdef DEBUGGING
97aff369 8179 dVAR;
9b155405 8180 register int k;
1de06328 8181 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8182
54dc92de 8183 sv_setpvn(sv, "", 0);
8aa23a47 8184
03363afd 8185 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8186 /* It would be nice to FAIL() here, but this may be called from
8187 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8188 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8189 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8190
3dab1dad 8191 k = PL_regkind[OP(o)];
9b155405 8192
2a782b5b 8193 if (k == EXACT) {
396482e1 8194 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8195 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8196 * is a crude hack but it may be the best for now since
8197 * we have no flag "this EXACTish node was UTF-8"
8198 * --jhi */
8199 const char * const s =
ddc5bc0f 8200 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8201 PL_colors[0], PL_colors[1],
8202 PERL_PV_ESCAPE_UNI_DETECT |
8203 PERL_PV_PRETTY_ELIPSES |
8204 PERL_PV_PRETTY_LTGT
8205 );
8206 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8207 } else if (k == TRIE) {
3dab1dad 8208 /* print the details of the trie in dumpuntil instead, as
4f639d21 8209 * prog->data isn't available here */
1de06328
YO
8210 const char op = OP(o);
8211 const I32 n = ARG(o);
8212 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8213 (reg_ac_data *)prog->data->data[n] :
8214 NULL;
8215 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8216 (reg_trie_data*)prog->data->data[n] :
8217 ac->trie;
8218
8219 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8220 DEBUG_TRIE_COMPILE_r(
8221 Perl_sv_catpvf(aTHX_ sv,
8222 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8223 (UV)trie->startstate,
1e2e3d02 8224 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8225 (UV)trie->wordcount,
8226 (UV)trie->minlen,
8227 (UV)trie->maxlen,
8228 (UV)TRIE_CHARCOUNT(trie),
8229 (UV)trie->uniquecharcount
8230 )
8231 );
8232 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8233 int i;
8234 int rangestart = -1;
f46cb337 8235 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8236 Perl_sv_catpvf(aTHX_ sv, "[");
8237 for (i = 0; i <= 256; i++) {
8238 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8239 if (rangestart == -1)
8240 rangestart = i;
8241 } else if (rangestart != -1) {
8242 if (i <= rangestart + 3)
8243 for (; rangestart < i; rangestart++)
8244 put_byte(sv, rangestart);
8245 else {
8246 put_byte(sv, rangestart);
8247 sv_catpvs(sv, "-");
8248 put_byte(sv, i - 1);
8249 }
8250 rangestart = -1;
8251 }
8252 }
8253 Perl_sv_catpvf(aTHX_ sv, "]");
8254 }
8255
a3621e74 8256 } else if (k == CURLY) {
cb434fcc 8257 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8258 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8259 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8260 }
2c2d71f5
JH
8261 else if (k == WHILEM && o->flags) /* Ordinal/of */
8262 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8263 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8264 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8265 else if (k == GOSUB)
6bda09f9 8266 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8267 else if (k == VERB) {
8268 if (!o->flags)
8269 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8270 (SV*)prog->data->data[ ARG( o ) ]);
8271 } else if (k == LOGICAL)
04ebc1ab 8272 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8273 else if (k == ANYOF) {
8274 int i, rangestart = -1;
2d03de9c 8275 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8276
8277 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8278 static const char * const anyofs[] = {
653099ff
GS
8279 "\\w",
8280 "\\W",
8281 "\\s",
8282 "\\S",
8283 "\\d",
8284 "\\D",
8285 "[:alnum:]",
8286 "[:^alnum:]",
8287 "[:alpha:]",
8288 "[:^alpha:]",
8289 "[:ascii:]",
8290 "[:^ascii:]",
8291 "[:ctrl:]",
8292 "[:^ctrl:]",
8293 "[:graph:]",
8294 "[:^graph:]",
8295 "[:lower:]",
8296 "[:^lower:]",
8297 "[:print:]",
8298 "[:^print:]",
8299 "[:punct:]",
8300 "[:^punct:]",
8301 "[:upper:]",
aaa51d5e 8302 "[:^upper:]",
653099ff 8303 "[:xdigit:]",
aaa51d5e
JF
8304 "[:^xdigit:]",
8305 "[:space:]",
8306 "[:^space:]",
8307 "[:blank:]",
8308 "[:^blank:]"
653099ff
GS
8309 };
8310
19860706 8311 if (flags & ANYOF_LOCALE)
396482e1 8312 sv_catpvs(sv, "{loc}");
19860706 8313 if (flags & ANYOF_FOLD)
396482e1 8314 sv_catpvs(sv, "{i}");
653099ff 8315 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8316 if (flags & ANYOF_INVERT)
396482e1 8317 sv_catpvs(sv, "^");
ffc61ed2
JH
8318 for (i = 0; i <= 256; i++) {
8319 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8320 if (rangestart == -1)
8321 rangestart = i;
8322 } else if (rangestart != -1) {
8323 if (i <= rangestart + 3)
8324 for (; rangestart < i; rangestart++)
653099ff 8325 put_byte(sv, rangestart);
ffc61ed2
JH
8326 else {
8327 put_byte(sv, rangestart);
396482e1 8328 sv_catpvs(sv, "-");
ffc61ed2 8329 put_byte(sv, i - 1);
653099ff 8330 }
ffc61ed2 8331 rangestart = -1;
653099ff 8332 }
847a199f 8333 }
ffc61ed2
JH
8334
8335 if (o->flags & ANYOF_CLASS)
bb7a0f54 8336 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8337 if (ANYOF_CLASS_TEST(o,i))
8338 sv_catpv(sv, anyofs[i]);
8339
8340 if (flags & ANYOF_UNICODE)
396482e1 8341 sv_catpvs(sv, "{unicode}");
1aa99e6b 8342 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8343 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8344
8345 {
8346 SV *lv;
32fc9b6a 8347 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8348
ffc61ed2
JH
8349 if (lv) {
8350 if (sw) {
89ebb4a3 8351 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8352
ffc61ed2 8353 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8354 uvchr_to_utf8(s, i);
ffc61ed2 8355
3568d838 8356 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8357 if (rangestart == -1)
8358 rangestart = i;
8359 } else if (rangestart != -1) {
ffc61ed2
JH
8360 if (i <= rangestart + 3)
8361 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8362 const U8 * const e = uvchr_to_utf8(s,rangestart);
8363 U8 *p;
8364 for(p = s; p < e; p++)
ffc61ed2
JH
8365 put_byte(sv, *p);
8366 }
8367 else {
2d03de9c
AL
8368 const U8 *e = uvchr_to_utf8(s,rangestart);
8369 U8 *p;
8370 for (p = s; p < e; p++)
ffc61ed2 8371 put_byte(sv, *p);
396482e1 8372 sv_catpvs(sv, "-");
2d03de9c
AL
8373 e = uvchr_to_utf8(s, i-1);
8374 for (p = s; p < e; p++)
1df70142 8375 put_byte(sv, *p);
ffc61ed2
JH
8376 }
8377 rangestart = -1;
8378 }
19860706 8379 }
ffc61ed2 8380
396482e1 8381 sv_catpvs(sv, "..."); /* et cetera */
19860706 8382 }
fde631ed 8383
ffc61ed2 8384 {
2e0de35c 8385 char *s = savesvpv(lv);
c445ea15 8386 char * const origs = s;
b81d288d 8387
3dab1dad
YO
8388 while (*s && *s != '\n')
8389 s++;
b81d288d 8390
ffc61ed2 8391 if (*s == '\n') {
2d03de9c 8392 const char * const t = ++s;
ffc61ed2
JH
8393
8394 while (*s) {
8395 if (*s == '\n')
8396 *s = ' ';
8397 s++;
8398 }
8399 if (s[-1] == ' ')
8400 s[-1] = 0;
8401
8402 sv_catpv(sv, t);
fde631ed 8403 }
b81d288d 8404
ffc61ed2 8405 Safefree(origs);
fde631ed
JH
8406 }
8407 }
653099ff 8408 }
ffc61ed2 8409
653099ff
GS
8410 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8411 }
9b155405 8412 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8413 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8414#else
96a5add6 8415 PERL_UNUSED_CONTEXT;
65e66c80
SP
8416 PERL_UNUSED_ARG(sv);
8417 PERL_UNUSED_ARG(o);
f9049ba1 8418 PERL_UNUSED_ARG(prog);
17c3b450 8419#endif /* DEBUGGING */
35ff7856 8420}
a687059c 8421
cad2e5aa
JH
8422SV *
8423Perl_re_intuit_string(pTHX_ regexp *prog)
8424{ /* Assume that RE_INTUIT is set */
97aff369 8425 dVAR;
a3621e74 8426 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8427 PERL_UNUSED_CONTEXT;
8428
a3621e74 8429 DEBUG_COMPILE_r(
cfd0369c 8430 {
2d03de9c 8431 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8432 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8433
8434 if (!PL_colorset) reginitcolors();
8435 PerlIO_printf(Perl_debug_log,
a0288114 8436 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8437 PL_colors[4],
8438 prog->check_substr ? "" : "utf8 ",
8439 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8440 s,
8441 PL_colors[1],
8442 (strlen(s) > 60 ? "..." : ""));
8443 } );
8444
33b8afdf 8445 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8446}
8447
84da74a7
YO
8448/*
8449 pregfree - free a regexp
8450
8451 See regdupe below if you change anything here.
8452*/
8453
2b69d0c2 8454void
864dbfa3 8455Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8456{
27da23d5 8457 dVAR;
0df25f3d 8458
fc32ee4a 8459 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8460
7821416a
IZ
8461 if (!r || (--r->refcnt > 0))
8462 return;
ab3bbdeb 8463 DEBUG_COMPILE_r({
0df25f3d
YO
8464 if (!PL_colorset)
8465 reginitcolors();
5c9f2f80
YO
8466 {
8467 SV *dsv= sv_newmortal();
bbe252da 8468 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
ab3bbdeb
YO
8469 dsv, r->precomp, r->prelen, 60);
8470 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8471 PL_colors[4],PL_colors[5],s);
8472 }
9e55ce06 8473 });
cad2e5aa 8474
43c5f42d
NC
8475 /* gcov results gave these as non-null 100% of the time, so there's no
8476 optimisation in checking them before calling Safefree */
8477 Safefree(r->precomp);
8478 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8479 RX_MATCH_COPY_FREE(r);
f8c7b90f 8480#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8481 if (r->saved_copy)
8482 SvREFCNT_dec(r->saved_copy);
8483#endif
a193d654
GS
8484 if (r->substrs) {
8485 if (r->anchored_substr)
8486 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8487 if (r->anchored_utf8)
8488 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8489 if (r->float_substr)
8490 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8491 if (r->float_utf8)
8492 SvREFCNT_dec(r->float_utf8);
2779dcf1 8493 Safefree(r->substrs);
a193d654 8494 }
81714fb9
YO
8495 if (r->paren_names)
8496 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8497 if (r->data) {
8498 int n = r->data->count;
f3548bdc
DM
8499 PAD* new_comppad = NULL;
8500 PAD* old_comppad;
4026c95a 8501 PADOFFSET refcnt;
dfad63ad 8502
c277df42 8503 while (--n >= 0) {
261faec3 8504 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8505 switch (r->data->what[n]) {
8506 case 's':
81714fb9 8507 case 'S':
c277df42
IZ
8508 SvREFCNT_dec((SV*)r->data->data[n]);
8509 break;
653099ff
GS
8510 case 'f':
8511 Safefree(r->data->data[n]);
8512 break;
dfad63ad
HS
8513 case 'p':
8514 new_comppad = (AV*)r->data->data[n];
8515 break;
c277df42 8516 case 'o':
dfad63ad 8517 if (new_comppad == NULL)
cea2e8a9 8518 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8519 PAD_SAVE_LOCAL(old_comppad,
8520 /* Watch out for global destruction's random ordering. */
c445ea15 8521 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8522 );
b34c0dd4 8523 OP_REFCNT_LOCK;
4026c95a
SH
8524 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8525 OP_REFCNT_UNLOCK;
8526 if (!refcnt)
9b978d73 8527 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8528
f3548bdc 8529 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8530 SvREFCNT_dec((SV*)new_comppad);
8531 new_comppad = NULL;
c277df42
IZ
8532 break;
8533 case 'n':
9e55ce06 8534 break;
07be1b83 8535 case 'T':
be8e71aa
YO
8536 { /* Aho Corasick add-on structure for a trie node.
8537 Used in stclass optimization only */
07be1b83
YO
8538 U32 refcount;
8539 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8540 OP_REFCNT_LOCK;
8541 refcount = --aho->refcount;
8542 OP_REFCNT_UNLOCK;
8543 if ( !refcount ) {
8544 Safefree(aho->states);
8545 Safefree(aho->fail);
8546 aho->trie=NULL; /* not necessary to free this as it is
8547 handled by the 't' case */
8548 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8549 Safefree(r->regstclass);
07be1b83
YO
8550 }
8551 }
8552 break;
a3621e74 8553 case 't':
07be1b83 8554 {
be8e71aa 8555 /* trie structure. */
07be1b83
YO
8556 U32 refcount;
8557 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8558 OP_REFCNT_LOCK;
8559 refcount = --trie->refcount;
8560 OP_REFCNT_UNLOCK;
8561 if ( !refcount ) {
8562 Safefree(trie->charmap);
8563 if (trie->widecharmap)
8564 SvREFCNT_dec((SV*)trie->widecharmap);
8565 Safefree(trie->states);
8566 Safefree(trie->trans);
8567 if (trie->bitmap)
8568 Safefree(trie->bitmap);
8569 if (trie->wordlen)
8570 Safefree(trie->wordlen);
786e8c11
YO
8571 if (trie->jump)
8572 Safefree(trie->jump);
8573 if (trie->nextword)
8574 Safefree(trie->nextword);
a3621e74 8575#ifdef DEBUGGING
5c9f2f80
YO
8576 if (trie->words)
8577 SvREFCNT_dec((SV*)trie->words);
8578 if (trie->revcharmap)
8579 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8580#endif
07be1b83 8581 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8582 }
07be1b83
YO
8583 }
8584 break;
c277df42 8585 default:
830247a4 8586 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8587 }
8588 }
8589 Safefree(r->data->what);
8590 Safefree(r->data);
a0d0e21e
LW
8591 }
8592 Safefree(r->startp);
8593 Safefree(r->endp);
c74340f9
YO
8594 if (r->swap) {
8595 Safefree(r->swap->startp);
8596 Safefree(r->swap->endp);
8597 Safefree(r->swap);
8598 }
a0d0e21e 8599 Safefree(r);
a687059c 8600}
c277df42 8601
84da74a7
YO
8602#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8603#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8604#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8605#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8606
8607/*
8608 regdupe - duplicate a regexp.
8609
8610 This routine is called by sv.c's re_dup and is expected to clone a
8611 given regexp structure. It is a no-op when not under USE_ITHREADS.
8612 (Originally this *was* re_dup() for change history see sv.c)
8613
8614 See pregfree() above if you change anything here.
8615*/
a3c0e9ca 8616#if defined(USE_ITHREADS)
84da74a7
YO
8617regexp *
8618Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8619{
84da74a7
YO
8620 dVAR;
8621 REGEXP *ret;
8622 int i, len, npar;
8623 struct reg_substr_datum *s;
8624
8625 if (!r)
8626 return (REGEXP *)NULL;
8627
8628 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8629 return ret;
8630
8631 len = r->offsets[0];
8632 npar = r->nparens+1;
8633
8634 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8635 Copy(r->program, ret->program, len+1, regnode);
8636
8637 Newx(ret->startp, npar, I32);
8638 Copy(r->startp, ret->startp, npar, I32);
8639 Newx(ret->endp, npar, I32);
8640 Copy(r->startp, ret->startp, npar, I32);
c74340f9
YO
8641 if(r->swap) {
8642 Newx(ret->swap, 1, regexp_paren_ofs);
8643 /* no need to copy these */
8644 Newx(ret->swap->startp, npar, I32);
8645 Newx(ret->swap->endp, npar, I32);
8646 } else {
8647 ret->swap = NULL;
8648 }
84da74a7
YO
8649
8650 Newx(ret->substrs, 1, struct reg_substr_data);
8651 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8652 s->min_offset = r->substrs->data[i].min_offset;
8653 s->max_offset = r->substrs->data[i].max_offset;
8654 s->end_shift = r->substrs->data[i].end_shift;
8655 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8656 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8657 }
8658
8659 ret->regstclass = NULL;
8660 if (r->data) {
8661 struct reg_data *d;
8662 const int count = r->data->count;
8663 int i;
8664
8665 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8666 char, struct reg_data);
8667 Newx(d->what, count, U8);
8668
8669 d->count = count;
8670 for (i = 0; i < count; i++) {
8671 d->what[i] = r->data->what[i];
8672 switch (d->what[i]) {
bbe252da 8673 /* legal options are one of: sSfpontT
84da74a7
YO
8674 see also regcomp.h and pregfree() */
8675 case 's':
81714fb9 8676 case 'S':
84da74a7
YO
8677 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8678 break;
8679 case 'p':
8680 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8681 break;
8682 case 'f':
8683 /* This is cheating. */
8684 Newx(d->data[i], 1, struct regnode_charclass_class);
8685 StructCopy(r->data->data[i], d->data[i],
8686 struct regnode_charclass_class);
8687 ret->regstclass = (regnode*)d->data[i];
8688 break;
8689 case 'o':
bbe252da
YO
8690 /* Compiled op trees are readonly and in shared memory,
8691 and can thus be shared without duplication. */
84da74a7
YO
8692 OP_REFCNT_LOCK;
8693 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8694 OP_REFCNT_UNLOCK;
8695 break;
8696 case 'n':
8697 d->data[i] = r->data->data[i];
8698 break;
8699 case 't':
8700 d->data[i] = r->data->data[i];
8701 OP_REFCNT_LOCK;
8702 ((reg_trie_data*)d->data[i])->refcount++;
8703 OP_REFCNT_UNLOCK;
8704 break;
8705 case 'T':
8706 d->data[i] = r->data->data[i];
8707 OP_REFCNT_LOCK;
8708 ((reg_ac_data*)d->data[i])->refcount++;
8709 OP_REFCNT_UNLOCK;
8710 /* Trie stclasses are readonly and can thus be shared
8711 * without duplication. We free the stclass in pregfree
8712 * when the corresponding reg_ac_data struct is freed.
8713 */
8714 ret->regstclass= r->regstclass;
8715 break;
8716 default:
8717 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8718 }
8719 }
8720
8721 ret->data = d;
8722 }
8723 else
8724 ret->data = NULL;
8725
8726 Newx(ret->offsets, 2*len+1, U32);
8727 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8728
8729 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8730 ret->refcnt = r->refcnt;
8731 ret->minlen = r->minlen;
de8c5301 8732 ret->minlenret = r->minlenret;
84da74a7
YO
8733 ret->prelen = r->prelen;
8734 ret->nparens = r->nparens;
8735 ret->lastparen = r->lastparen;
8736 ret->lastcloseparen = r->lastcloseparen;
bbe252da
YO
8737 ret->intflags = r->intflags;
8738 ret->extflags = r->extflags;
84da74a7
YO
8739
8740 ret->sublen = r->sublen;
8741
f9f4320a 8742 ret->engine = r->engine;
81714fb9
YO
8743
8744 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8745
84da74a7
YO
8746 if (RX_MATCH_COPIED(ret))
8747 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8748 else
8749 ret->subbeg = NULL;
8750#ifdef PERL_OLD_COPY_ON_WRITE
8751 ret->saved_copy = NULL;
8752#endif
8753
8754 ptr_table_store(PL_ptr_table, r, ret);
8755 return ret;
84da74a7 8756}
a3c0e9ca 8757#endif
84da74a7 8758
de8c5301
YO
8759/*
8760 reg_stringify()
8761
8762 converts a regexp embedded in a MAGIC struct to its stringified form,
8763 caching the converted form in the struct and returns the cached
8764 string.
8765
8766 If lp is nonnull then it is used to return the length of the
8767 resulting string
8768
8769 If flags is nonnull and the returned string contains UTF8 then
8770 (flags & 1) will be true.
8771
8772 If haseval is nonnull then it is used to return whether the pattern
8773 contains evals.
8774
8775 Normally called via macro:
8776
8777 CALLREG_STRINGIFY(mg,0,0);
8778
8779 And internally with
8780
8781 CALLREG_AS_STR(mg,lp,flags,haseval)
8782
8783 See sv_2pv_flags() in sv.c for an example of internal usage.
8784
8785 */
8786
8787char *
8788Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8789 dVAR;
8790 const regexp * const re = (regexp *)mg->mg_obj;
8791
8792 if (!mg->mg_ptr) {
8793 const char *fptr = "msix";
8794 char reflags[6];
8795 char ch;
8796 int left = 0;
8797 int right = 4;
8798 bool need_newline = 0;
bbe252da 8799 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8800
8801 while((ch = *fptr++)) {
8802 if(reganch & 1) {
8803 reflags[left++] = ch;
8804 }
8805 else {
8806 reflags[right--] = ch;
8807 }
8808 reganch >>= 1;
8809 }
8810 if(left != 4) {
8811 reflags[left] = '-';
8812 left = 5;
8813 }
8814
8815 mg->mg_len = re->prelen + 4 + left;
8816 /*
8817 * If /x was used, we have to worry about a regex ending with a
8818 * comment later being embedded within another regex. If so, we don't
8819 * want this regex's "commentization" to leak out to the right part of
8820 * the enclosing regex, we must cap it with a newline.
8821 *
8822 * So, if /x was used, we scan backwards from the end of the regex. If
8823 * we find a '#' before we find a newline, we need to add a newline
8824 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8825 * we don't need to add anything. -jfriedl
8826 */
bbe252da 8827 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8828 const char *endptr = re->precomp + re->prelen;
8829 while (endptr >= re->precomp) {
8830 const char c = *(endptr--);
8831 if (c == '\n')
8832 break; /* don't need another */
8833 if (c == '#') {
8834 /* we end while in a comment, so we need a newline */
8835 mg->mg_len++; /* save space for it */
8836 need_newline = 1; /* note to add it */
8837 break;
8838 }
8839 }
8840 }
8841
8842 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8843 mg->mg_ptr[0] = '(';
8844 mg->mg_ptr[1] = '?';
8845 Copy(reflags, mg->mg_ptr+2, left, char);
8846 *(mg->mg_ptr+left+2) = ':';
8847 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8848 if (need_newline)
8849 mg->mg_ptr[mg->mg_len - 2] = '\n';
8850 mg->mg_ptr[mg->mg_len - 1] = ')';
8851 mg->mg_ptr[mg->mg_len] = 0;
8852 }
8853 if (haseval)
8854 *haseval = re->program[0].next_off;
8855 if (flags)
bbe252da 8856 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8857
8858 if (lp)
8859 *lp = mg->mg_len;
8860 return mg->mg_ptr;
8861}
8862
8863
76234dfb 8864#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8865/*
8866 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8867 */
8868regnode *
864dbfa3 8869Perl_regnext(pTHX_ register regnode *p)
c277df42 8870{
97aff369 8871 dVAR;
c277df42
IZ
8872 register I32 offset;
8873
3280af22 8874 if (p == &PL_regdummy)
c277df42
IZ
8875 return(NULL);
8876
8877 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8878 if (offset == 0)
8879 return(NULL);
8880
c277df42 8881 return(p+offset);
c277df42 8882}
76234dfb 8883#endif
c277df42 8884
01f988be 8885STATIC void
cea2e8a9 8886S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8887{
8888 va_list args;
8889 STRLEN l1 = strlen(pat1);
8890 STRLEN l2 = strlen(pat2);
8891 char buf[512];
06bf62c7 8892 SV *msv;
73d840c0 8893 const char *message;
c277df42
IZ
8894
8895 if (l1 > 510)
8896 l1 = 510;
8897 if (l1 + l2 > 510)
8898 l2 = 510 - l1;
8899 Copy(pat1, buf, l1 , char);
8900 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8901 buf[l1 + l2] = '\n';
8902 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8903#ifdef I_STDARG
8904 /* ANSI variant takes additional second argument */
c277df42 8905 va_start(args, pat2);
8736538c
AS
8906#else
8907 va_start(args);
8908#endif
5a844595 8909 msv = vmess(buf, &args);
c277df42 8910 va_end(args);
cfd0369c 8911 message = SvPV_const(msv,l1);
c277df42
IZ
8912 if (l1 > 512)
8913 l1 = 512;
8914 Copy(message, buf, l1 , char);
197cf9b9 8915 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8916 Perl_croak(aTHX_ "%s", buf);
c277df42 8917}
a0ed51b3
LW
8918
8919/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8920
76234dfb 8921#ifndef PERL_IN_XSUB_RE
a0ed51b3 8922void
864dbfa3 8923Perl_save_re_context(pTHX)
b81d288d 8924{
97aff369 8925 dVAR;
1ade1aa1
NC
8926
8927 struct re_save_state *state;
8928
8929 SAVEVPTR(PL_curcop);
8930 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8931
8932 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8933 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8934 SSPUSHINT(SAVEt_RE_STATE);
8935
46ab3289 8936 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8937
a0ed51b3 8938 PL_reg_start_tmp = 0;
a0ed51b3 8939 PL_reg_start_tmpl = 0;
c445ea15 8940 PL_reg_oldsaved = NULL;
a5db57d6 8941 PL_reg_oldsavedlen = 0;
a5db57d6 8942 PL_reg_maxiter = 0;
a5db57d6 8943 PL_reg_leftiter = 0;
c445ea15 8944 PL_reg_poscache = NULL;
a5db57d6 8945 PL_reg_poscache_size = 0;
1ade1aa1
NC
8946#ifdef PERL_OLD_COPY_ON_WRITE
8947 PL_nrs = NULL;
8948#endif
ada6e8a9 8949
c445ea15
AL
8950 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8951 if (PL_curpm) {
8952 const REGEXP * const rx = PM_GETRE(PL_curpm);
8953 if (rx) {
1df70142 8954 U32 i;
ada6e8a9 8955 for (i = 1; i <= rx->nparens; i++) {
1df70142 8956 char digits[TYPE_CHARS(long)];
d9fad198 8957 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8958 GV *const *const gvp
8959 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8960
b37c2d43
AL
8961 if (gvp) {
8962 GV * const gv = *gvp;
8963 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8964 save_scalar(gv);
49f27e4b 8965 }
ada6e8a9
AMS
8966 }
8967 }
8968 }
a0ed51b3 8969}
76234dfb 8970#endif
51371543 8971
51371543 8972static void
acfe0abc 8973clear_re(pTHX_ void *r)
51371543 8974{
97aff369 8975 dVAR;
51371543
GS
8976 ReREFCNT_dec((regexp *)r);
8977}
ffbc6a93 8978
a28509cc
AL
8979#ifdef DEBUGGING
8980
8981STATIC void
8982S_put_byte(pTHX_ SV *sv, int c)
8983{
8984 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8985 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8986 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8987 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8988 else
8989 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8990}
8991
786e8c11 8992
3dab1dad
YO
8993#define CLEAR_OPTSTART \
8994 if (optstart) STMT_START { \
07be1b83 8995 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8996 optstart=NULL; \
8997 } STMT_END
8998
786e8c11 8999#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9000
b5a2f8d8
NC
9001STATIC const regnode *
9002S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9003 const regnode *last, const regnode *plast,
9004 SV* sv, I32 indent, U32 depth)
a28509cc 9005{
97aff369 9006 dVAR;
786e8c11 9007 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9008 register const regnode *next;
3dab1dad
YO
9009 const regnode *optstart= NULL;
9010 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9011
786e8c11
YO
9012#ifdef DEBUG_DUMPUNTIL
9013 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9014 last ? last-start : 0,plast ? plast-start : 0);
9015#endif
9016
9017 if (plast && plast < last)
9018 last= plast;
9019
9020 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9021 /* While that wasn't END last time... */
9022
9023 NODE_ALIGN(node);
9024 op = OP(node);
de734bd5 9025 if (op == CLOSE || op == WHILEM)
786e8c11 9026 indent--;
b5a2f8d8 9027 next = regnext((regnode *)node);
07be1b83 9028
a28509cc 9029 /* Where, what. */
8e11feef 9030 if (OP(node) == OPTIMIZED) {
e68ec53f 9031 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9032 optstart = node;
3dab1dad 9033 else
8e11feef 9034 goto after_print;
3dab1dad
YO
9035 } else
9036 CLEAR_OPTSTART;
07be1b83 9037
32fc9b6a 9038 regprop(r, sv, node);
a28509cc 9039 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9040 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9041
9042 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9043 if (next == NULL) /* Next ptr. */
9044 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9045 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9046 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9047 else
9048 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9049
1de06328 9050 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9051 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9052 }
9053
a28509cc
AL
9054 after_print:
9055 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9056 assert(next);
9057 {
9058 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9059 ? regnext((regnode *)next)
9060 : next);
be8e71aa
YO
9061 if (last && nnode > last)
9062 nnode = last;
786e8c11 9063 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9064 }
a28509cc
AL
9065 }
9066 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9067 assert(next);
786e8c11 9068 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9069 }
9070 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9071 const regnode *this_trie = node;
1de06328 9072 const char op = OP(node);
a28509cc 9073 const I32 n = ARG(node);
1de06328
YO
9074 const reg_ac_data * const ac = op>=AHOCORASICK ?
9075 (reg_ac_data *)r->data->data[n] :
9076 NULL;
9077 const reg_trie_data * const trie = op<AHOCORASICK ?
9078 (reg_trie_data*)r->data->data[n] :
9079 ac->trie;
786e8c11 9080 const regnode *nextbranch= NULL;
a28509cc 9081 I32 word_idx;
1de06328 9082 sv_setpvn(sv, "", 0);
786e8c11 9083 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9084 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9085
9086 PerlIO_printf(Perl_debug_log, "%*s%s ",
9087 (int)(2*(indent+3)), "",
9088 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9089 PL_colors[0], PL_colors[1],
9090 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9091 PERL_PV_PRETTY_ELIPSES |
7f69552c 9092 PERL_PV_PRETTY_LTGT
786e8c11
YO
9093 )
9094 : "???"
9095 );
9096 if (trie->jump) {
40d049e4 9097 U16 dist= trie->jump[word_idx+1];
7f69552c 9098 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 9099 (dist ? this_trie + dist : next) - start);
786e8c11
YO
9100 if (dist) {
9101 if (!nextbranch)
24b23f37 9102 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9103 DUMPUNTIL(this_trie + dist, nextbranch);
9104 }
786e8c11
YO
9105 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9106 nextbranch= regnext((regnode *)nextbranch);
9107 } else {
9108 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9109 }
786e8c11
YO
9110 }
9111 if (last && next > last)
9112 node= last;
9113 else
9114 node= next;
a28509cc 9115 }
786e8c11
YO
9116 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9117 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9118 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9119 }
9120 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9121 assert(next);
786e8c11 9122 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9123 }
9124 else if ( op == PLUS || op == STAR) {
786e8c11 9125 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9126 }
9127 else if (op == ANYOF) {
9128 /* arglen 1 + class block */
9129 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9130 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9131 node = NEXTOPER(node);
9132 }
9133 else if (PL_regkind[(U8)op] == EXACT) {
9134 /* Literal string, where present. */
9135 node += NODE_SZ_STR(node) - 1;
9136 node = NEXTOPER(node);
9137 }
9138 else {
9139 node = NEXTOPER(node);
9140 node += regarglen[(U8)op];
9141 }
9142 if (op == CURLYX || op == OPEN)
786e8c11 9143 indent++;
a28509cc 9144 }
3dab1dad 9145 CLEAR_OPTSTART;
786e8c11
YO
9146#ifdef DEBUG_DUMPUNTIL
9147 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9148#endif
1de06328 9149 return node;
a28509cc
AL
9150}
9151
9152#endif /* DEBUGGING */
9153
241d1a3b
NC
9154/*
9155 * Local variables:
9156 * c-indentation-style: bsd
9157 * c-basic-offset: 4
9158 * indent-tabs-mode: t
9159 * End:
9160 *
37442d52
RGS
9161 * ex: set ts=8 sts=4 sw=4 noet:
9162 */