This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Extend the test-reonly make target on windows (Yves Orton)
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 553 PerlIO_printf(Perl_debug_log, \
b515a41d 554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1897 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1898 DEBUG_OPTIMISE_r(
8e11feef
RGS
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
786e8c11 1902 (UV)state, (UV)idx, ch)
3dab1dad
YO
1903 );
1904 if ( state==1 ) {
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1907 STR_LEN(convert)=0;
1908 }
1909 *str++=*ch;
1910 STR_LEN(convert)++;
a3621e74 1911
8e11feef 1912 } else {
f9049ba1 1913#ifdef DEBUGGING
8e11feef
RGS
1914 if (state>1)
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1916#endif
8e11feef
RGS
1917 break;
1918 }
1919 }
3dab1dad 1920 if (str) {
8e11feef 1921 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1923 trie->startstate = state;
07be1b83
YO
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1926 DEBUG_r({
1927 regnode *fix = convert;
1928 mjd_nodelen++;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1932 }
1933 });
8e11feef
RGS
1934 if (trie->maxlen) {
1935 convert = n;
1936 } else {
3dab1dad 1937 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1938 DEBUG_r(optimize= n);
3dab1dad
YO
1939 }
1940 }
1941 }
a5ca303d
YO
1942 if (!jumper)
1943 jumper = last;
3dab1dad 1944 if ( trie->maxlen ) {
8e11feef
RGS
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
786e8c11
YO
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1950 if (trie->jump)
7f69552c 1951 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1952
786e8c11
YO
1953 /* XXXX */
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1956 {
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1960 trie->bitmap= NULL;
1961 } else
1962 OP( convert ) = TRIE;
a3621e74 1963
3dab1dad
YO
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
a5ca303d
YO
1966 DEBUG_r({
1967 optimize = convert
1968 + NODE_STEP_REGNODE
1969 + regarglen[ OP( convert ) ];
1970 });
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
3dab1dad 1973 }
a3621e74 1974 /* needed for dumping*/
e62cc96a 1975 DEBUG_r(if (optimize) {
07be1b83 1976 regnode *opt = convert;
e62cc96a 1977 while ( ++opt < optimize) {
07be1b83
YO
1978 Set_Node_Offset_Length(opt,0,0);
1979 }
786e8c11
YO
1980 /*
1981 Try to clean up some of the debris left after the
1982 optimisation.
a3621e74 1983 */
786e8c11 1984 while( optimize < jumper ) {
07be1b83 1985 mjd_nodelen += Node_Length((optimize));
a3621e74 1986 OP( optimize ) = OPTIMIZED;
07be1b83 1987 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1988 optimize++;
1989 }
07be1b83 1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1991 });
1992 } /* end node insert */
07be1b83 1993#ifndef DEBUGGING
6e8b4190 1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1995#endif
786e8c11
YO
1996 return trie->jump
1997 ? MADE_JUMP_TRIE
1998 : trie->startstate>1
1999 ? MADE_EXACT_TRIE
2000 : MADE_TRIE;
2001}
2002
2003STATIC void
2004S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2005{
2006/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2007
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 ISBN 0-201-10088-6
2011
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2017 Consider
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2022 */
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2025 U32 *q;
2026 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2027 const U32 numstates = trie->statecount;
786e8c11
YO
2028 const U32 ubound = trie->lasttrans + ucharcount;
2029 U32 q_read = 0;
2030 U32 q_write = 0;
2031 U32 charid;
2032 U32 base = trie->states[ 1 ].trans.base;
2033 U32 *fail;
2034 reg_ac_data *aho;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2037#ifndef DEBUGGING
2038 PERL_UNUSED_ARG(depth);
2039#endif
2040
2041
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2045 aho->trie=trie;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2047 numstates * sizeof(reg_trie_state));
786e8c11
YO
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2050 aho->refcount = 1;
2051 fail = aho->fail;
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2055
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2058 if ( newstate ) {
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2062 }
2063 }
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2067
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2070 if (ch_state) {
2071 U32 fail_state = cur;
2072 U32 fail_base;
2073 do {
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2077
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2081 {
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2083 }
2084 q[ q_write++ % numstates] = ch_state;
2085 }
2086 }
2087 }
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2093 */
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2099 );
786e8c11
YO
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2102 }
2103 PerlIO_printf(Perl_debug_log, "\n");
2104 });
2105 Safefree(q);
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2107}
2108
786e8c11 2109
a3621e74 2110/*
5d1c421c
JH
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2113 */
2114#if defined(__sparc64__) && defined(__GNUC__)
2115# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116# undef SPARC64_GCC_WORKAROUND
2117# define SPARC64_GCC_WORKAROUND 1
2118# endif
2119#endif
2120
07be1b83 2121#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2122 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
7f69552c 2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2129 }});
07be1b83 2130
1de06328
YO
2131
2132
2133
2134
07be1b83
YO
2135#define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138
be8e71aa 2139STATIC U32
07be1b83
YO
2140S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2143 U32 stringok = 1;
2144 regnode *next = scan + NODE_SZ_STR(scan);
2145 U32 merged = 0;
2146 U32 stopnow = 0;
2147#ifdef DEBUGGING
2148 regnode *stop = scan;
72f13be8 2149 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2150#else
d47053eb
RGS
2151 PERL_UNUSED_ARG(depth);
2152#endif
2153#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
07be1b83 2156#endif
07be1b83
YO
2157 DEBUG_PEEP("join",scan,depth);
2158
2159 /* Skip NOTHING, merge EXACT*. */
2160 while (n &&
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2163 && NEXT_OFF(n)
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2165
2166 if (OP(n) == TAIL || n > next)
2167 stringok = 0;
2168 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2172#ifdef DEBUGGING
2173 if (stringok)
2174 stop = n;
2175#endif
2176 n = regnext(n);
2177 }
2178 else if (stringok) {
786e8c11 2179 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2180 regnode * const nnext = regnext(n);
2181
2182 DEBUG_PEEP("merg",n,depth);
2183
2184 merged++;
2185 if (oldl + STR_LEN(n) > U8_MAX)
2186 break;
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2192#ifdef DEBUGGING
2193 stop = next - 1;
2194#endif
2195 n = nnext;
2196 if (stopnow) break;
2197 }
2198
d47053eb
RGS
2199#ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2204 }
2205 else {
2206 NEXT_OFF(n) = val - n;
2207 }
2208 stopnow = 1;
2209 }
07be1b83
YO
2210#endif
2211 }
2212
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2214 /*
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2216
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2219
2220 which casefold to
2221
2222 Unicode UTF-8
2223
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2226
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2232
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2236
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2240
2241 */
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
e294cc5d
JH
2245#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2247#else
07be1b83 2248 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2249#endif
07be1b83
YO
2250 const char * const t1 = t0 + 3;
2251
2252 for (s = s0 + 2;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 s = t + 4) {
e294cc5d
JH
2255#ifdef EBCDIC
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2258#else
07be1b83
YO
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2261#endif
07be1b83
YO
2262 *min -= 4;
2263 }
2264 }
2265
2266#ifdef DEBUGGING
2267 /* Allow dumping */
2268 n = scan + NODE_SZ_STR(scan);
2269 while (n <= stop) {
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2271 OP(n) = OPTIMIZED;
2272 NEXT_OFF(n) = 0;
2273 }
2274 n++;
2275 }
2276#endif
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 return stopnow;
2279}
2280
653099ff
GS
2281/* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2283
a0288114 2284/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2285 to the position after last scanned or to NULL. */
2286
40d049e4
YO
2287#define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
07be1b83 2291
b515a41d
YO
2292/* this is a chain of data about sub patterns we are processing that
2293 need to be handled seperately/specially in study_chunk. Its so
2294 we can simulate recursion without losing state. */
2295struct scan_frame;
2296typedef struct scan_frame {
2297 regnode *last; /* last node to process in this frame */
2298 regnode *next; /* next node to process when last is reached */
2299 struct scan_frame *prev; /*previous frame*/
2300 I32 stop; /* what stopparen do we use */
2301} scan_frame;
2302
76e3520e 2303STATIC I32
40d049e4 2304S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2305 I32 *minlenp, I32 *deltap,
40d049e4
YO
2306 regnode *last,
2307 scan_data_t *data,
2308 I32 stopparen,
2309 U8* recursed,
2310 struct regnode_charclass_class *and_withp,
2311 U32 flags, U32 depth)
c277df42
IZ
2312 /* scanp: Start here (read-write). */
2313 /* deltap: Write maxlen-minlen here. */
2314 /* last: Stop before this one. */
40d049e4
YO
2315 /* data: string data about the pattern */
2316 /* stopparen: treat close N as END */
2317 /* recursed: which subroutines have we recursed into */
2318 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2319{
97aff369 2320 dVAR;
c277df42
IZ
2321 I32 min = 0, pars = 0, code;
2322 regnode *scan = *scanp, *next;
2323 I32 delta = 0;
2324 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2325 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2326 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2327 scan_data_t data_fake;
a3621e74 2328 SV *re_trie_maxbuff = NULL;
786e8c11 2329 regnode *first_non_open = scan;
e2e6a0f1 2330 I32 stopmin = I32_MAX;
8aa23a47
YO
2331 scan_frame *frame = NULL;
2332
a3621e74 2333 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2334
13a24bad 2335#ifdef DEBUGGING
40d049e4 2336 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2337#endif
40d049e4 2338
786e8c11 2339 if ( depth == 0 ) {
40d049e4 2340 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2341 first_non_open=regnext(first_non_open);
2342 }
2343
b81d288d 2344
8aa23a47
YO
2345 fake_study_recurse:
2346 while ( scan && OP(scan) != END && scan < last ){
2347 /* Peephole optimizer: */
2348 DEBUG_STUDYDATA(data,depth);
2349 DEBUG_PEEP("Peep",scan,depth);
2350 JOIN_EXACT(scan,&min,0);
2351
2352 /* Follow the next-chain of the current node and optimize
2353 away all the NOTHINGs from it. */
2354 if (OP(scan) != CURLYX) {
2355 const int max = (reg_off_by_arg[OP(scan)]
2356 ? I32_MAX
2357 /* I32 may be smaller than U16 on CRAYs! */
2358 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2359 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2360 int noff;
2361 regnode *n = scan;
2362
2363 /* Skip NOTHING and LONGJMP. */
2364 while ((n = regnext(n))
2365 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2366 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2367 && off + noff < max)
2368 off += noff;
2369 if (reg_off_by_arg[OP(scan)])
2370 ARG(scan) = off;
2371 else
2372 NEXT_OFF(scan) = off;
2373 }
a3621e74 2374
c277df42 2375
8aa23a47
YO
2376
2377 /* The principal pseudo-switch. Cannot be a switch, since we
2378 look into several different things. */
2379 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2380 || OP(scan) == IFTHEN) {
2381 next = regnext(scan);
2382 code = OP(scan);
2383 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2384
2385 if (OP(next) == code || code == IFTHEN) {
2386 /* NOTE - There is similar code to this block below for handling
2387 TRIE nodes on a re-study. If you change stuff here check there
2388 too. */
2389 I32 max1 = 0, min1 = I32_MAX, num = 0;
2390 struct regnode_charclass_class accum;
2391 regnode * const startbranch=scan;
2392
2393 if (flags & SCF_DO_SUBSTR)
2394 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2395 if (flags & SCF_DO_STCLASS)
2396 cl_init_zero(pRExC_state, &accum);
2397
2398 while (OP(scan) == code) {
2399 I32 deltanext, minnext, f = 0, fake;
2400 struct regnode_charclass_class this_class;
2401
2402 num++;
2403 data_fake.flags = 0;
2404 if (data) {
2405 data_fake.whilem_c = data->whilem_c;
2406 data_fake.last_closep = data->last_closep;
2407 }
2408 else
2409 data_fake.last_closep = &fake;
2410 next = regnext(scan);
2411 scan = NEXTOPER(scan);
2412 if (code != BRANCH)
c277df42 2413 scan = NEXTOPER(scan);
8aa23a47
YO
2414 if (flags & SCF_DO_STCLASS) {
2415 cl_init(pRExC_state, &this_class);
2416 data_fake.start_class = &this_class;
2417 f = SCF_DO_STCLASS_AND;
2418 }
2419 if (flags & SCF_WHILEM_VISITED_POS)
2420 f |= SCF_WHILEM_VISITED_POS;
2421
2422 /* we suppose the run is continuous, last=next...*/
2423 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2424 next, &data_fake,
2425 stopparen, recursed, NULL, f,depth+1);
2426 if (min1 > minnext)
2427 min1 = minnext;
2428 if (max1 < minnext + deltanext)
2429 max1 = minnext + deltanext;
2430 if (deltanext == I32_MAX)
2431 is_inf = is_inf_internal = 1;
2432 scan = next;
2433 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2434 pars++;
2435 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2436 if ( stopmin > minnext)
2437 stopmin = min + min1;
2438 flags &= ~SCF_DO_SUBSTR;
2439 if (data)
2440 data->flags |= SCF_SEEN_ACCEPT;
2441 }
2442 if (data) {
2443 if (data_fake.flags & SF_HAS_EVAL)
2444 data->flags |= SF_HAS_EVAL;
2445 data->whilem_c = data_fake.whilem_c;
3dab1dad 2446 }
8aa23a47
YO
2447 if (flags & SCF_DO_STCLASS)
2448 cl_or(pRExC_state, &accum, &this_class);
2449 }
2450 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2451 min1 = 0;
2452 if (flags & SCF_DO_SUBSTR) {
2453 data->pos_min += min1;
2454 data->pos_delta += max1 - min1;
2455 if (max1 != min1 || is_inf)
2456 data->longest = &(data->longest_float);
2457 }
2458 min += min1;
2459 delta += max1 - min1;
2460 if (flags & SCF_DO_STCLASS_OR) {
2461 cl_or(pRExC_state, data->start_class, &accum);
2462 if (min1) {
2463 cl_and(data->start_class, and_withp);
2464 flags &= ~SCF_DO_STCLASS;
653099ff 2465 }
8aa23a47
YO
2466 }
2467 else if (flags & SCF_DO_STCLASS_AND) {
2468 if (min1) {
2469 cl_and(data->start_class, &accum);
2470 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2471 }
8aa23a47
YO
2472 else {
2473 /* Switch to OR mode: cache the old value of
2474 * data->start_class */
2475 INIT_AND_WITHP;
2476 StructCopy(data->start_class, and_withp,
2477 struct regnode_charclass_class);
2478 flags &= ~SCF_DO_STCLASS_AND;
2479 StructCopy(&accum, data->start_class,
2480 struct regnode_charclass_class);
2481 flags |= SCF_DO_STCLASS_OR;
2482 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2483 }
8aa23a47 2484 }
a3621e74 2485
8aa23a47
YO
2486 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2487 /* demq.
a3621e74 2488
8aa23a47
YO
2489 Assuming this was/is a branch we are dealing with: 'scan' now
2490 points at the item that follows the branch sequence, whatever
2491 it is. We now start at the beginning of the sequence and look
2492 for subsequences of
a3621e74 2493
8aa23a47
YO
2494 BRANCH->EXACT=>x1
2495 BRANCH->EXACT=>x2
2496 tail
a3621e74 2497
8aa23a47 2498 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2499
8aa23a47
YO
2500 If we can find such a subseqence we need to turn the first
2501 element into a trie and then add the subsequent branch exact
2502 strings to the trie.
a3621e74 2503
8aa23a47 2504 We have two cases
a3621e74 2505
8aa23a47 2506 1. patterns where the whole set of branch can be converted.
a3621e74 2507
8aa23a47 2508 2. patterns where only a subset can be converted.
a3621e74 2509
8aa23a47
YO
2510 In case 1 we can replace the whole set with a single regop
2511 for the trie. In case 2 we need to keep the start and end
2512 branchs so
a3621e74 2513
8aa23a47
YO
2514 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2515 becomes BRANCH TRIE; BRANCH X;
786e8c11 2516
8aa23a47
YO
2517 There is an additional case, that being where there is a
2518 common prefix, which gets split out into an EXACT like node
2519 preceding the TRIE node.
a3621e74 2520
8aa23a47
YO
2521 If x(1..n)==tail then we can do a simple trie, if not we make
2522 a "jump" trie, such that when we match the appropriate word
2523 we "jump" to the appopriate tail node. Essentailly we turn
2524 a nested if into a case structure of sorts.
b515a41d 2525
8aa23a47
YO
2526 */
2527
2528 int made=0;
2529 if (!re_trie_maxbuff) {
2530 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2531 if (!SvIOK(re_trie_maxbuff))
2532 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2533 }
2534 if ( SvIV(re_trie_maxbuff)>=0 ) {
2535 regnode *cur;
2536 regnode *first = (regnode *)NULL;
2537 regnode *last = (regnode *)NULL;
2538 regnode *tail = scan;
2539 U8 optype = 0;
2540 U32 count=0;
a3621e74
YO
2541
2542#ifdef DEBUGGING
8aa23a47 2543 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2544#endif
8aa23a47
YO
2545 /* var tail is used because there may be a TAIL
2546 regop in the way. Ie, the exacts will point to the
2547 thing following the TAIL, but the last branch will
2548 point at the TAIL. So we advance tail. If we
2549 have nested (?:) we may have to move through several
2550 tails.
2551 */
2552
2553 while ( OP( tail ) == TAIL ) {
2554 /* this is the TAIL generated by (?:) */
2555 tail = regnext( tail );
2556 }
a3621e74 2557
8aa23a47
YO
2558
2559 DEBUG_OPTIMISE_r({
2560 regprop(RExC_rx, mysv, tail );
2561 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2562 (int)depth * 2 + 2, "",
2563 "Looking for TRIE'able sequences. Tail node is: ",
2564 SvPV_nolen_const( mysv )
2565 );
2566 });
2567
2568 /*
2569
2570 step through the branches, cur represents each
2571 branch, noper is the first thing to be matched
2572 as part of that branch and noper_next is the
2573 regnext() of that node. if noper is an EXACT
2574 and noper_next is the same as scan (our current
2575 position in the regex) then the EXACT branch is
2576 a possible optimization target. Once we have
2577 two or more consequetive such branches we can
2578 create a trie of the EXACT's contents and stich
2579 it in place. If the sequence represents all of
2580 the branches we eliminate the whole thing and
2581 replace it with a single TRIE. If it is a
2582 subsequence then we need to stitch it in. This
2583 means the first branch has to remain, and needs
2584 to be repointed at the item on the branch chain
2585 following the last branch optimized. This could
2586 be either a BRANCH, in which case the
2587 subsequence is internal, or it could be the
2588 item following the branch sequence in which
2589 case the subsequence is at the end.
2590
2591 */
2592
2593 /* dont use tail as the end marker for this traverse */
2594 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2595 regnode * const noper = NEXTOPER( cur );
b515a41d 2596#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2597 regnode * const noper_next = regnext( noper );
b515a41d
YO
2598#endif
2599
8aa23a47
YO
2600 DEBUG_OPTIMISE_r({
2601 regprop(RExC_rx, mysv, cur);
2602 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2603 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2604
2605 regprop(RExC_rx, mysv, noper);
2606 PerlIO_printf( Perl_debug_log, " -> %s",
2607 SvPV_nolen_const(mysv));
2608
2609 if ( noper_next ) {
2610 regprop(RExC_rx, mysv, noper_next );
2611 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2612 SvPV_nolen_const(mysv));
2613 }
2614 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2615 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2616 });
2617 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2618 : PL_regkind[ OP( noper ) ] == EXACT )
2619 || OP(noper) == NOTHING )
786e8c11 2620#ifdef NOJUMPTRIE
8aa23a47 2621 && noper_next == tail
786e8c11 2622#endif
8aa23a47
YO
2623 && count < U16_MAX)
2624 {
2625 count++;
2626 if ( !first || optype == NOTHING ) {
2627 if (!first) first = cur;
2628 optype = OP( noper );
2629 } else {
2630 last = cur;
2631 }
2632 } else {
2633 if ( last ) {
2634 make_trie( pRExC_state,
2635 startbranch, first, cur, tail, count,
2636 optype, depth+1 );
2637 }
2638 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2639#ifdef NOJUMPTRIE
8aa23a47 2640 && noper_next == tail
786e8c11 2641#endif
8aa23a47
YO
2642 ){
2643 count = 1;
2644 first = cur;
2645 optype = OP( noper );
2646 } else {
2647 count = 0;
2648 first = NULL;
2649 optype = 0;
2650 }
2651 last = NULL;
2652 }
2653 }
2654 DEBUG_OPTIMISE_r({
2655 regprop(RExC_rx, mysv, cur);
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2658 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2659
2660 });
2661 if ( last ) {
2662 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2663#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2664 if ( ((made == MADE_EXACT_TRIE &&
2665 startbranch == first)
2666 || ( first_non_open == first )) &&
2667 depth==0 ) {
2668 flags |= SCF_TRIE_RESTUDY;
2669 if ( startbranch == first
2670 && scan == tail )
2671 {
2672 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2673 }
2674 }
3dab1dad 2675#endif
8aa23a47
YO
2676 }
2677 }
2678
2679 } /* do trie */
2680
653099ff 2681 }
8aa23a47
YO
2682 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2683 scan = NEXTOPER(NEXTOPER(scan));
2684 } else /* single branch is optimized. */
2685 scan = NEXTOPER(scan);
2686 continue;
2687 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2688 scan_frame *newframe = NULL;
2689 I32 paren;
2690 regnode *start;
2691 regnode *end;
2692
2693 if (OP(scan) != SUSPEND) {
2694 /* set the pointer */
2695 if (OP(scan) == GOSUB) {
2696 paren = ARG(scan);
2697 RExC_recurse[ARG2L(scan)] = scan;
2698 start = RExC_open_parens[paren-1];
2699 end = RExC_close_parens[paren-1];
2700 } else {
2701 paren = 0;
2702 start = RExC_rx->program + 1;
2703 end = RExC_opend;
2704 }
2705 if (!recursed) {
2706 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2707 SAVEFREEPV(recursed);
2708 }
2709 if (!PAREN_TEST(recursed,paren+1)) {
2710 PAREN_SET(recursed,paren+1);
2711 Newx(newframe,1,scan_frame);
2712 } else {
2713 if (flags & SCF_DO_SUBSTR) {
2714 scan_commit(pRExC_state,data,minlenp);
2715 data->longest = &(data->longest_float);
2716 }
2717 is_inf = is_inf_internal = 1;
2718 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2719 cl_anything(pRExC_state, data->start_class);
2720 flags &= ~SCF_DO_STCLASS;
2721 }
2722 } else {
2723 Newx(newframe,1,scan_frame);
2724 paren = stopparen;
2725 start = scan+2;
2726 end = regnext(scan);
2727 }
2728 if (newframe) {
2729 assert(start);
2730 assert(end);
2731 SAVEFREEPV(newframe);
2732 newframe->next = regnext(scan);
2733 newframe->last = last;
2734 newframe->stop = stopparen;
2735 newframe->prev = frame;
2736
2737 frame = newframe;
2738 scan = start;
2739 stopparen = paren;
2740 last = end;
2741
2742 continue;
2743 }
2744 }
2745 else if (OP(scan) == EXACT) {
2746 I32 l = STR_LEN(scan);
2747 UV uc;
2748 if (UTF) {
2749 const U8 * const s = (U8*)STRING(scan);
2750 l = utf8_length(s, s + l);
2751 uc = utf8_to_uvchr(s, NULL);
2752 } else {
2753 uc = *((U8*)STRING(scan));
2754 }
2755 min += l;
2756 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2757 /* The code below prefers earlier match for fixed
2758 offset, later match for variable offset. */
2759 if (data->last_end == -1) { /* Update the start info. */
2760 data->last_start_min = data->pos_min;
2761 data->last_start_max = is_inf
2762 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2763 }
8aa23a47
YO
2764 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2765 if (UTF)
2766 SvUTF8_on(data->last_found);
2767 {
2768 SV * const sv = data->last_found;
2769 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2770 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2771 if (mg && mg->mg_len >= 0)
2772 mg->mg_len += utf8_length((U8*)STRING(scan),
2773 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2774 }
8aa23a47
YO
2775 data->last_end = data->pos_min + l;
2776 data->pos_min += l; /* As in the first entry. */
2777 data->flags &= ~SF_BEFORE_EOL;
2778 }
2779 if (flags & SCF_DO_STCLASS_AND) {
2780 /* Check whether it is compatible with what we know already! */
2781 int compat = 1;
2782
2783 if (uc >= 0x100 ||
2784 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2785 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2786 && (!(data->start_class->flags & ANYOF_FOLD)
2787 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2788 )
2789 compat = 0;
2790 ANYOF_CLASS_ZERO(data->start_class);
2791 ANYOF_BITMAP_ZERO(data->start_class);
2792 if (compat)
2793 ANYOF_BITMAP_SET(data->start_class, uc);
2794 data->start_class->flags &= ~ANYOF_EOS;
2795 if (uc < 0x100)
2796 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2797 }
2798 else if (flags & SCF_DO_STCLASS_OR) {
2799 /* false positive possible if the class is case-folded */
2800 if (uc < 0x100)
2801 ANYOF_BITMAP_SET(data->start_class, uc);
2802 else
2803 data->start_class->flags |= ANYOF_UNICODE_ALL;
2804 data->start_class->flags &= ~ANYOF_EOS;
2805 cl_and(data->start_class, and_withp);
2806 }
2807 flags &= ~SCF_DO_STCLASS;
2808 }
2809 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2810 I32 l = STR_LEN(scan);
2811 UV uc = *((U8*)STRING(scan));
2812
2813 /* Search for fixed substrings supports EXACT only. */
2814 if (flags & SCF_DO_SUBSTR) {
2815 assert(data);
2816 scan_commit(pRExC_state, data, minlenp);
2817 }
2818 if (UTF) {
2819 const U8 * const s = (U8 *)STRING(scan);
2820 l = utf8_length(s, s + l);
2821 uc = utf8_to_uvchr(s, NULL);
2822 }
2823 min += l;
2824 if (flags & SCF_DO_SUBSTR)
2825 data->pos_min += l;
2826 if (flags & SCF_DO_STCLASS_AND) {
2827 /* Check whether it is compatible with what we know already! */
2828 int compat = 1;
2829
2830 if (uc >= 0x100 ||
2831 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2832 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2833 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2834 compat = 0;
2835 ANYOF_CLASS_ZERO(data->start_class);
2836 ANYOF_BITMAP_ZERO(data->start_class);
2837 if (compat) {
2838 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2839 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2840 data->start_class->flags |= ANYOF_FOLD;
2841 if (OP(scan) == EXACTFL)
2842 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2843 }
8aa23a47
YO
2844 }
2845 else if (flags & SCF_DO_STCLASS_OR) {
2846 if (data->start_class->flags & ANYOF_FOLD) {
2847 /* false positive possible if the class is case-folded.
2848 Assume that the locale settings are the same... */
1aa99e6b
IH
2849 if (uc < 0x100)
2850 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2851 data->start_class->flags &= ~ANYOF_EOS;
2852 }
8aa23a47 2853 cl_and(data->start_class, and_withp);
653099ff 2854 }
8aa23a47
YO
2855 flags &= ~SCF_DO_STCLASS;
2856 }
2857 else if (strchr((const char*)PL_varies,OP(scan))) {
2858 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2859 I32 f = flags, pos_before = 0;
2860 regnode * const oscan = scan;
2861 struct regnode_charclass_class this_class;
2862 struct regnode_charclass_class *oclass = NULL;
2863 I32 next_is_eval = 0;
2864
2865 switch (PL_regkind[OP(scan)]) {
2866 case WHILEM: /* End of (?:...)* . */
2867 scan = NEXTOPER(scan);
2868 goto finish;
2869 case PLUS:
2870 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2871 next = NEXTOPER(scan);
2872 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2873 mincount = 1;
2874 maxcount = REG_INFTY;
2875 next = regnext(scan);
2876 scan = NEXTOPER(scan);
2877 goto do_curly;
2878 }
2879 }
2880 if (flags & SCF_DO_SUBSTR)
2881 data->pos_min++;
2882 min++;
2883 /* Fall through. */
2884 case STAR:
2885 if (flags & SCF_DO_STCLASS) {
2886 mincount = 0;
2887 maxcount = REG_INFTY;
2888 next = regnext(scan);
2889 scan = NEXTOPER(scan);
2890 goto do_curly;
2891 }
2892 is_inf = is_inf_internal = 1;
2893 scan = regnext(scan);
c277df42 2894 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2895 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2896 data->longest = &(data->longest_float);
c277df42 2897 }
8aa23a47
YO
2898 goto optimize_curly_tail;
2899 case CURLY:
2900 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2901 && (scan->flags == stopparen))
2902 {
2903 mincount = 1;
2904 maxcount = 1;
2905 } else {
2906 mincount = ARG1(scan);
2907 maxcount = ARG2(scan);
653099ff 2908 }
8aa23a47
YO
2909 next = regnext(scan);
2910 if (OP(scan) == CURLYX) {
2911 I32 lp = (data ? *(data->last_closep) : 0);
2912 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2913 }
8aa23a47
YO
2914 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2915 next_is_eval = (OP(scan) == EVAL);
2916 do_curly:
2917 if (flags & SCF_DO_SUBSTR) {
2918 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2919 pos_before = data->pos_min;
b45f050a 2920 }
8aa23a47
YO
2921 if (data) {
2922 fl = data->flags;
2923 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2924 if (is_inf)
2925 data->flags |= SF_IS_INF;
2926 }
2927 if (flags & SCF_DO_STCLASS) {
2928 cl_init(pRExC_state, &this_class);
2929 oclass = data->start_class;
2930 data->start_class = &this_class;
2931 f |= SCF_DO_STCLASS_AND;
2932 f &= ~SCF_DO_STCLASS_OR;
2933 }
2934 /* These are the cases when once a subexpression
2935 fails at a particular position, it cannot succeed
2936 even after backtracking at the enclosing scope.
2937
2938 XXXX what if minimal match and we are at the
2939 initial run of {n,m}? */
2940 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2941 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2942
8aa23a47
YO
2943 /* This will finish on WHILEM, setting scan, or on NULL: */
2944 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2945 last, data, stopparen, recursed, NULL,
2946 (mincount == 0
2947 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2948
8aa23a47
YO
2949 if (flags & SCF_DO_STCLASS)
2950 data->start_class = oclass;
2951 if (mincount == 0 || minnext == 0) {
2952 if (flags & SCF_DO_STCLASS_OR) {
2953 cl_or(pRExC_state, data->start_class, &this_class);
2954 }
2955 else if (flags & SCF_DO_STCLASS_AND) {
2956 /* Switch to OR mode: cache the old value of
2957 * data->start_class */
2958 INIT_AND_WITHP;
2959 StructCopy(data->start_class, and_withp,
2960 struct regnode_charclass_class);
2961 flags &= ~SCF_DO_STCLASS_AND;
2962 StructCopy(&this_class, data->start_class,
2963 struct regnode_charclass_class);
2964 flags |= SCF_DO_STCLASS_OR;
2965 data->start_class->flags |= ANYOF_EOS;
2966 }
2967 } else { /* Non-zero len */
2968 if (flags & SCF_DO_STCLASS_OR) {
2969 cl_or(pRExC_state, data->start_class, &this_class);
2970 cl_and(data->start_class, and_withp);
2971 }
2972 else if (flags & SCF_DO_STCLASS_AND)
2973 cl_and(data->start_class, &this_class);
2974 flags &= ~SCF_DO_STCLASS;
2975 }
2976 if (!scan) /* It was not CURLYX, but CURLY. */
2977 scan = next;
2978 if ( /* ? quantifier ok, except for (?{ ... }) */
2979 (next_is_eval || !(mincount == 0 && maxcount == 1))
2980 && (minnext == 0) && (deltanext == 0)
2981 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2982 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2983 && ckWARN(WARN_REGEXP))
2984 {
2985 vWARN(RExC_parse,
2986 "Quantifier unexpected on zero-length expression");
2987 }
2988
2989 min += minnext * mincount;
2990 is_inf_internal |= ((maxcount == REG_INFTY
2991 && (minnext + deltanext) > 0)
2992 || deltanext == I32_MAX);
2993 is_inf |= is_inf_internal;
2994 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2995
2996 /* Try powerful optimization CURLYX => CURLYN. */
2997 if ( OP(oscan) == CURLYX && data
2998 && data->flags & SF_IN_PAR
2999 && !(data->flags & SF_HAS_EVAL)
3000 && !deltanext && minnext == 1 ) {
3001 /* Try to optimize to CURLYN. */
3002 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3003 regnode * const nxt1 = nxt;
497b47a8 3004#ifdef DEBUGGING
8aa23a47 3005 regnode *nxt2;
497b47a8 3006#endif
c277df42 3007
8aa23a47
YO
3008 /* Skip open. */
3009 nxt = regnext(nxt);
3010 if (!strchr((const char*)PL_simple,OP(nxt))
3011 && !(PL_regkind[OP(nxt)] == EXACT
3012 && STR_LEN(nxt) == 1))
3013 goto nogo;
497b47a8 3014#ifdef DEBUGGING
8aa23a47 3015 nxt2 = nxt;
497b47a8 3016#endif
8aa23a47
YO
3017 nxt = regnext(nxt);
3018 if (OP(nxt) != CLOSE)
3019 goto nogo;
3020 if (RExC_open_parens) {
3021 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3022 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3023 }
3024 /* Now we know that nxt2 is the only contents: */
3025 oscan->flags = (U8)ARG(nxt);
3026 OP(oscan) = CURLYN;
3027 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3028
c277df42 3029#ifdef DEBUGGING
8aa23a47
YO
3030 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3031 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3032 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3033 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3034 OP(nxt + 1) = OPTIMIZED; /* was count. */
3035 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3036#endif
8aa23a47
YO
3037 }
3038 nogo:
3039
3040 /* Try optimization CURLYX => CURLYM. */
3041 if ( OP(oscan) == CURLYX && data
3042 && !(data->flags & SF_HAS_PAR)
3043 && !(data->flags & SF_HAS_EVAL)
3044 && !deltanext /* atom is fixed width */
3045 && minnext != 0 /* CURLYM can't handle zero width */
3046 ) {
3047 /* XXXX How to optimize if data == 0? */
3048 /* Optimize to a simpler form. */
3049 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3050 regnode *nxt2;
3051
3052 OP(oscan) = CURLYM;
3053 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3054 && (OP(nxt2) != WHILEM))
3055 nxt = nxt2;
3056 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3057 /* Need to optimize away parenths. */
3058 if (data->flags & SF_IN_PAR) {
3059 /* Set the parenth number. */
3060 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3061
3062 if (OP(nxt) != CLOSE)
3063 FAIL("Panic opt close");
3064 oscan->flags = (U8)ARG(nxt);
3065 if (RExC_open_parens) {
3066 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3067 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3068 }
8aa23a47
YO
3069 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3070 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3071
c277df42 3072#ifdef DEBUGGING
8aa23a47
YO
3073 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3074 OP(nxt + 1) = OPTIMIZED; /* was count. */
3075 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3076 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3077#endif
c277df42 3078#if 0
8aa23a47
YO
3079 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3080 regnode *nnxt = regnext(nxt1);
3081
3082 if (nnxt == nxt) {
3083 if (reg_off_by_arg[OP(nxt1)])
3084 ARG_SET(nxt1, nxt2 - nxt1);
3085 else if (nxt2 - nxt1 < U16_MAX)
3086 NEXT_OFF(nxt1) = nxt2 - nxt1;
3087 else
3088 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3089 }
8aa23a47 3090 nxt1 = nnxt;
c277df42 3091 }
5d1c421c 3092#endif
8aa23a47
YO
3093 /* Optimize again: */
3094 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3095 NULL, stopparen, recursed, NULL, 0,depth+1);
3096 }
3097 else
3098 oscan->flags = 0;
3099 }
3100 else if ((OP(oscan) == CURLYX)
3101 && (flags & SCF_WHILEM_VISITED_POS)
3102 /* See the comment on a similar expression above.
3103 However, this time it not a subexpression
3104 we care about, but the expression itself. */
3105 && (maxcount == REG_INFTY)
3106 && data && ++data->whilem_c < 16) {
3107 /* This stays as CURLYX, we can put the count/of pair. */
3108 /* Find WHILEM (as in regexec.c) */
3109 regnode *nxt = oscan + NEXT_OFF(oscan);
3110
3111 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3112 nxt += ARG(nxt);
3113 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3114 | (RExC_whilem_seen << 4)); /* On WHILEM */
3115 }
3116 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3117 pars++;
3118 if (flags & SCF_DO_SUBSTR) {
3119 SV *last_str = NULL;
3120 int counted = mincount != 0;
a0ed51b3 3121
8aa23a47
YO
3122 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3123#if defined(SPARC64_GCC_WORKAROUND)
3124 I32 b = 0;
3125 STRLEN l = 0;
3126 const char *s = NULL;
3127 I32 old = 0;
b515a41d 3128
8aa23a47
YO
3129 if (pos_before >= data->last_start_min)
3130 b = pos_before;
3131 else
3132 b = data->last_start_min;
b515a41d 3133
8aa23a47
YO
3134 l = 0;
3135 s = SvPV_const(data->last_found, l);
3136 old = b - data->last_start_min;
3137
3138#else
3139 I32 b = pos_before >= data->last_start_min
3140 ? pos_before : data->last_start_min;
3141 STRLEN l;
3142 const char * const s = SvPV_const(data->last_found, l);
3143 I32 old = b - data->last_start_min;
3144#endif
3145
3146 if (UTF)
3147 old = utf8_hop((U8*)s, old) - (U8*)s;
3148
3149 l -= old;
3150 /* Get the added string: */
3151 last_str = newSVpvn(s + old, l);
3152 if (UTF)
3153 SvUTF8_on(last_str);
3154 if (deltanext == 0 && pos_before == b) {
3155 /* What was added is a constant string */
3156 if (mincount > 1) {
3157 SvGROW(last_str, (mincount * l) + 1);
3158 repeatcpy(SvPVX(last_str) + l,
3159 SvPVX_const(last_str), l, mincount - 1);
3160 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3161 /* Add additional parts. */
3162 SvCUR_set(data->last_found,
3163 SvCUR(data->last_found) - l);
3164 sv_catsv(data->last_found, last_str);
3165 {
3166 SV * sv = data->last_found;
3167 MAGIC *mg =
3168 SvUTF8(sv) && SvMAGICAL(sv) ?
3169 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3170 if (mg && mg->mg_len >= 0)
3171 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3172 }
8aa23a47 3173 data->last_end += l * (mincount - 1);
b515a41d 3174 }
8aa23a47
YO
3175 } else {
3176 /* start offset must point into the last copy */
3177 data->last_start_min += minnext * (mincount - 1);
3178 data->last_start_max += is_inf ? I32_MAX
3179 : (maxcount - 1) * (minnext + data->pos_delta);
3180 }
c277df42 3181 }
8aa23a47
YO
3182 /* It is counted once already... */
3183 data->pos_min += minnext * (mincount - counted);
3184 data->pos_delta += - counted * deltanext +
3185 (minnext + deltanext) * maxcount - minnext * mincount;
3186 if (mincount != maxcount) {
3187 /* Cannot extend fixed substrings found inside
3188 the group. */
3189 scan_commit(pRExC_state,data,minlenp);
3190 if (mincount && last_str) {
3191 SV * const sv = data->last_found;
3192 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3193 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3194
3195 if (mg)
3196 mg->mg_len = -1;
3197 sv_setsv(sv, last_str);
3198 data->last_end = data->pos_min;
3199 data->last_start_min =
3200 data->pos_min - CHR_SVLEN(last_str);
3201 data->last_start_max = is_inf
3202 ? I32_MAX
3203 : data->pos_min + data->pos_delta
3204 - CHR_SVLEN(last_str);
3205 }
3206 data->longest = &(data->longest_float);
3207 }
3208 SvREFCNT_dec(last_str);
c277df42 3209 }
8aa23a47
YO
3210 if (data && (fl & SF_HAS_EVAL))
3211 data->flags |= SF_HAS_EVAL;
3212 optimize_curly_tail:
3213 if (OP(oscan) != CURLYX) {
3214 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3215 && NEXT_OFF(next))
3216 NEXT_OFF(oscan) += NEXT_OFF(next);
3217 }
3218 continue;
3219 default: /* REF and CLUMP only? */
3220 if (flags & SCF_DO_SUBSTR) {
3221 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3222 data->longest = &(data->longest_float);
3223 }
3224 is_inf = is_inf_internal = 1;
3225 if (flags & SCF_DO_STCLASS_OR)
3226 cl_anything(pRExC_state, data->start_class);
3227 flags &= ~SCF_DO_STCLASS;
3228 break;
c277df42 3229 }
8aa23a47
YO
3230 }
3231 else if (strchr((const char*)PL_simple,OP(scan))) {
3232 int value = 0;
653099ff 3233
8aa23a47
YO
3234 if (flags & SCF_DO_SUBSTR) {
3235 scan_commit(pRExC_state,data,minlenp);
3236 data->pos_min++;
3237 }
3238 min++;
3239 if (flags & SCF_DO_STCLASS) {
3240 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3241
8aa23a47
YO
3242 /* Some of the logic below assumes that switching
3243 locale on will only add false positives. */
3244 switch (PL_regkind[OP(scan)]) {
3245 case SANY:
3246 default:
3247 do_default:
3248 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3249 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3250 cl_anything(pRExC_state, data->start_class);
3251 break;
3252 case REG_ANY:
3253 if (OP(scan) == SANY)
3254 goto do_default;
3255 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3256 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3257 || (data->start_class->flags & ANYOF_CLASS));
3258 cl_anything(pRExC_state, data->start_class);
653099ff 3259 }
8aa23a47
YO
3260 if (flags & SCF_DO_STCLASS_AND || !value)
3261 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3262 break;
3263 case ANYOF:
3264 if (flags & SCF_DO_STCLASS_AND)
3265 cl_and(data->start_class,
3266 (struct regnode_charclass_class*)scan);
653099ff 3267 else
8aa23a47
YO
3268 cl_or(pRExC_state, data->start_class,
3269 (struct regnode_charclass_class*)scan);
3270 break;
3271 case ALNUM:
3272 if (flags & SCF_DO_STCLASS_AND) {
3273 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3274 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3275 for (value = 0; value < 256; value++)
3276 if (!isALNUM(value))
3277 ANYOF_BITMAP_CLEAR(data->start_class, value);
3278 }
653099ff 3279 }
8aa23a47
YO
3280 else {
3281 if (data->start_class->flags & ANYOF_LOCALE)
3282 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3283 else {
3284 for (value = 0; value < 256; value++)
3285 if (isALNUM(value))
3286 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3287 }
8aa23a47
YO
3288 }
3289 break;
3290 case ALNUML:
3291 if (flags & SCF_DO_STCLASS_AND) {
3292 if (data->start_class->flags & ANYOF_LOCALE)
3293 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3294 }
3295 else {
3296 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3297 data->start_class->flags |= ANYOF_LOCALE;
3298 }
3299 break;
3300 case NALNUM:
3301 if (flags & SCF_DO_STCLASS_AND) {
3302 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3303 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3304 for (value = 0; value < 256; value++)
3305 if (isALNUM(value))
3306 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3307 }
3308 }
8aa23a47
YO
3309 else {
3310 if (data->start_class->flags & ANYOF_LOCALE)
3311 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3312 else {
3313 for (value = 0; value < 256; value++)
3314 if (!isALNUM(value))
3315 ANYOF_BITMAP_SET(data->start_class, value);
3316 }
653099ff 3317 }
8aa23a47
YO
3318 break;
3319 case NALNUML:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 if (data->start_class->flags & ANYOF_LOCALE)
3322 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3323 }
8aa23a47
YO
3324 else {
3325 data->start_class->flags |= ANYOF_LOCALE;
3326 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3327 }
3328 break;
3329 case SPACE:
3330 if (flags & SCF_DO_STCLASS_AND) {
3331 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3333 for (value = 0; value < 256; value++)
3334 if (!isSPACE(value))
3335 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3336 }
3337 }
8aa23a47
YO
3338 else {
3339 if (data->start_class->flags & ANYOF_LOCALE)
3340 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3341 else {
3342 for (value = 0; value < 256; value++)
3343 if (isSPACE(value))
3344 ANYOF_BITMAP_SET(data->start_class, value);
3345 }
653099ff 3346 }
8aa23a47
YO
3347 break;
3348 case SPACEL:
3349 if (flags & SCF_DO_STCLASS_AND) {
3350 if (data->start_class->flags & ANYOF_LOCALE)
3351 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3352 }
3353 else {
3354 data->start_class->flags |= ANYOF_LOCALE;
3355 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3356 }
3357 break;
3358 case NSPACE:
3359 if (flags & SCF_DO_STCLASS_AND) {
3360 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3362 for (value = 0; value < 256; value++)
3363 if (isSPACE(value))
3364 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3365 }
8aa23a47
YO
3366 }
3367 else {
3368 if (data->start_class->flags & ANYOF_LOCALE)
3369 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3370 else {
3371 for (value = 0; value < 256; value++)
3372 if (!isSPACE(value))
3373 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3374 }
3375 }
8aa23a47
YO
3376 break;
3377 case NSPACEL:
3378 if (flags & SCF_DO_STCLASS_AND) {
3379 if (data->start_class->flags & ANYOF_LOCALE) {
3380 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3381 for (value = 0; value < 256; value++)
3382 if (!isSPACE(value))
3383 ANYOF_BITMAP_CLEAR(data->start_class, value);
3384 }
653099ff 3385 }
8aa23a47
YO
3386 else {
3387 data->start_class->flags |= ANYOF_LOCALE;
3388 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3389 }
3390 break;
3391 case DIGIT:
3392 if (flags & SCF_DO_STCLASS_AND) {
3393 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3394 for (value = 0; value < 256; value++)
3395 if (!isDIGIT(value))
3396 ANYOF_BITMAP_CLEAR(data->start_class, value);
3397 }
3398 else {
3399 if (data->start_class->flags & ANYOF_LOCALE)
3400 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3401 else {
3402 for (value = 0; value < 256; value++)
3403 if (isDIGIT(value))
3404 ANYOF_BITMAP_SET(data->start_class, value);
3405 }
3406 }
3407 break;
3408 case NDIGIT:
3409 if (flags & SCF_DO_STCLASS_AND) {
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3411 for (value = 0; value < 256; value++)
3412 if (isDIGIT(value))
3413 ANYOF_BITMAP_CLEAR(data->start_class, value);
3414 }
3415 else {
3416 if (data->start_class->flags & ANYOF_LOCALE)
3417 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3418 else {
3419 for (value = 0; value < 256; value++)
3420 if (!isDIGIT(value))
3421 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3422 }
3423 }
8aa23a47
YO
3424 break;
3425 }
3426 if (flags & SCF_DO_STCLASS_OR)
3427 cl_and(data->start_class, and_withp);
3428 flags &= ~SCF_DO_STCLASS;
3429 }
3430 }
3431 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3432 data->flags |= (OP(scan) == MEOL
3433 ? SF_BEFORE_MEOL
3434 : SF_BEFORE_SEOL);
3435 }
3436 else if ( PL_regkind[OP(scan)] == BRANCHJ
3437 /* Lookbehind, or need to calculate parens/evals/stclass: */
3438 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3439 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3440 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3441 || OP(scan) == UNLESSM )
3442 {
3443 /* Negative Lookahead/lookbehind
3444 In this case we can't do fixed string optimisation.
3445 */
1de06328 3446
8aa23a47
YO
3447 I32 deltanext, minnext, fake = 0;
3448 regnode *nscan;
3449 struct regnode_charclass_class intrnl;
3450 int f = 0;
1de06328 3451
8aa23a47
YO
3452 data_fake.flags = 0;
3453 if (data) {
3454 data_fake.whilem_c = data->whilem_c;
3455 data_fake.last_closep = data->last_closep;
c277df42 3456 }
8aa23a47
YO
3457 else
3458 data_fake.last_closep = &fake;
3459 if ( flags & SCF_DO_STCLASS && !scan->flags
3460 && OP(scan) == IFMATCH ) { /* Lookahead */
3461 cl_init(pRExC_state, &intrnl);
3462 data_fake.start_class = &intrnl;
3463 f |= SCF_DO_STCLASS_AND;
3464 }
3465 if (flags & SCF_WHILEM_VISITED_POS)
3466 f |= SCF_WHILEM_VISITED_POS;
3467 next = regnext(scan);
3468 nscan = NEXTOPER(NEXTOPER(scan));
3469 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3470 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3471 if (scan->flags) {
3472 if (deltanext) {
3473 vFAIL("Variable length lookbehind not implemented");
3474 }
3475 else if (minnext > (I32)U8_MAX) {
3476 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3477 }
3478 scan->flags = (U8)minnext;
3479 }
3480 if (data) {
3481 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3482 pars++;
3483 if (data_fake.flags & SF_HAS_EVAL)
3484 data->flags |= SF_HAS_EVAL;
3485 data->whilem_c = data_fake.whilem_c;
3486 }
3487 if (f & SCF_DO_STCLASS_AND) {
3488 const int was = (data->start_class->flags & ANYOF_EOS);
3489
3490 cl_and(data->start_class, &intrnl);
3491 if (was)
3492 data->start_class->flags |= ANYOF_EOS;
3493 }
cb434fcc 3494 }
8aa23a47
YO
3495#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3496 else {
3497 /* Positive Lookahead/lookbehind
3498 In this case we can do fixed string optimisation,
3499 but we must be careful about it. Note in the case of
3500 lookbehind the positions will be offset by the minimum
3501 length of the pattern, something we won't know about
3502 until after the recurse.
3503 */
3504 I32 deltanext, fake = 0;
3505 regnode *nscan;
3506 struct regnode_charclass_class intrnl;
3507 int f = 0;
3508 /* We use SAVEFREEPV so that when the full compile
3509 is finished perl will clean up the allocated
3510 minlens when its all done. This was we don't
3511 have to worry about freeing them when we know
3512 they wont be used, which would be a pain.
3513 */
3514 I32 *minnextp;
3515 Newx( minnextp, 1, I32 );
3516 SAVEFREEPV(minnextp);
3517
3518 if (data) {
3519 StructCopy(data, &data_fake, scan_data_t);
3520 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3521 f |= SCF_DO_SUBSTR;
3522 if (scan->flags)
3523 scan_commit(pRExC_state, &data_fake,minlenp);
3524 data_fake.last_found=newSVsv(data->last_found);
3525 }
3526 }
3527 else
3528 data_fake.last_closep = &fake;
3529 data_fake.flags = 0;
3530 if (is_inf)
3531 data_fake.flags |= SF_IS_INF;
3532 if ( flags & SCF_DO_STCLASS && !scan->flags
3533 && OP(scan) == IFMATCH ) { /* Lookahead */
3534 cl_init(pRExC_state, &intrnl);
3535 data_fake.start_class = &intrnl;
3536 f |= SCF_DO_STCLASS_AND;
3537 }
3538 if (flags & SCF_WHILEM_VISITED_POS)
3539 f |= SCF_WHILEM_VISITED_POS;
3540 next = regnext(scan);
3541 nscan = NEXTOPER(NEXTOPER(scan));
3542
3543 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3544 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3545 if (scan->flags) {
3546 if (deltanext) {
3547 vFAIL("Variable length lookbehind not implemented");
3548 }
3549 else if (*minnextp > (I32)U8_MAX) {
3550 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3551 }
3552 scan->flags = (U8)*minnextp;
3553 }
3554
3555 *minnextp += min;
3556
3557 if (f & SCF_DO_STCLASS_AND) {
3558 const int was = (data->start_class->flags & ANYOF_EOS);
3559
3560 cl_and(data->start_class, &intrnl);
3561 if (was)
3562 data->start_class->flags |= ANYOF_EOS;
3563 }
3564 if (data) {
3565 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3566 pars++;
3567 if (data_fake.flags & SF_HAS_EVAL)
3568 data->flags |= SF_HAS_EVAL;
3569 data->whilem_c = data_fake.whilem_c;
3570 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3571 if (RExC_rx->minlen<*minnextp)
3572 RExC_rx->minlen=*minnextp;
3573 scan_commit(pRExC_state, &data_fake, minnextp);
3574 SvREFCNT_dec(data_fake.last_found);
3575
3576 if ( data_fake.minlen_fixed != minlenp )
3577 {
3578 data->offset_fixed= data_fake.offset_fixed;
3579 data->minlen_fixed= data_fake.minlen_fixed;
3580 data->lookbehind_fixed+= scan->flags;
3581 }
3582 if ( data_fake.minlen_float != minlenp )
3583 {
3584 data->minlen_float= data_fake.minlen_float;
3585 data->offset_float_min=data_fake.offset_float_min;
3586 data->offset_float_max=data_fake.offset_float_max;
3587 data->lookbehind_float+= scan->flags;
3588 }
3589 }
3590 }
3591
3592
40d049e4 3593 }
8aa23a47
YO
3594#endif
3595 }
3596 else if (OP(scan) == OPEN) {
3597 if (stopparen != (I32)ARG(scan))
3598 pars++;
3599 }
3600 else if (OP(scan) == CLOSE) {
3601 if (stopparen == (I32)ARG(scan)) {
3602 break;
3603 }
3604 if ((I32)ARG(scan) == is_par) {
3605 next = regnext(scan);
b515a41d 3606
8aa23a47
YO
3607 if ( next && (OP(next) != WHILEM) && next < last)
3608 is_par = 0; /* Disable optimization */
40d049e4 3609 }
8aa23a47
YO
3610 if (data)
3611 *(data->last_closep) = ARG(scan);
3612 }
3613 else if (OP(scan) == EVAL) {
c277df42
IZ
3614 if (data)
3615 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3616 }
3617 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3618 if (flags & SCF_DO_SUBSTR) {
3619 scan_commit(pRExC_state,data,minlenp);
3620 flags &= ~SCF_DO_SUBSTR;
40d049e4 3621 }
8aa23a47
YO
3622 if (data && OP(scan)==ACCEPT) {
3623 data->flags |= SCF_SEEN_ACCEPT;
3624 if (stopmin > min)
3625 stopmin = min;
e2e6a0f1 3626 }
8aa23a47
YO
3627 }
3628 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3629 {
0f5d15d6 3630 if (flags & SCF_DO_SUBSTR) {
1de06328 3631 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3632 data->longest = &(data->longest_float);
3633 }
3634 is_inf = is_inf_internal = 1;
653099ff 3635 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3636 cl_anything(pRExC_state, data->start_class);
96776eda 3637 flags &= ~SCF_DO_STCLASS;
8aa23a47 3638 }
786e8c11 3639#ifdef TRIE_STUDY_OPT
40d049e4 3640#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3641 else if (PL_regkind[OP(scan)] == TRIE) {
3642 /* NOTE - There is similar code to this block above for handling
3643 BRANCH nodes on the initial study. If you change stuff here
3644 check there too. */
3645 regnode *trie_node= scan;
3646 regnode *tail= regnext(scan);
3647 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3648 I32 max1 = 0, min1 = I32_MAX;
3649 struct regnode_charclass_class accum;
3650
3651 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3652 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3653 if (flags & SCF_DO_STCLASS)
3654 cl_init_zero(pRExC_state, &accum);
3655
3656 if (!trie->jump) {
3657 min1= trie->minlen;
3658 max1= trie->maxlen;
3659 } else {
3660 const regnode *nextbranch= NULL;
3661 U32 word;
3662
3663 for ( word=1 ; word <= trie->wordcount ; word++)
3664 {
3665 I32 deltanext=0, minnext=0, f = 0, fake;
3666 struct regnode_charclass_class this_class;
3667
3668 data_fake.flags = 0;
3669 if (data) {
3670 data_fake.whilem_c = data->whilem_c;
3671 data_fake.last_closep = data->last_closep;
3672 }
3673 else
3674 data_fake.last_closep = &fake;
3675
3676 if (flags & SCF_DO_STCLASS) {
3677 cl_init(pRExC_state, &this_class);
3678 data_fake.start_class = &this_class;
3679 f = SCF_DO_STCLASS_AND;
3680 }
3681 if (flags & SCF_WHILEM_VISITED_POS)
3682 f |= SCF_WHILEM_VISITED_POS;
3683
3684 if (trie->jump[word]) {
3685 if (!nextbranch)
3686 nextbranch = trie_node + trie->jump[0];
3687 scan= trie_node + trie->jump[word];
3688 /* We go from the jump point to the branch that follows
3689 it. Note this means we need the vestigal unused branches
3690 even though they arent otherwise used.
3691 */
3692 minnext = study_chunk(pRExC_state, &scan, minlenp,
3693 &deltanext, (regnode *)nextbranch, &data_fake,
3694 stopparen, recursed, NULL, f,depth+1);
3695 }
3696 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3697 nextbranch= regnext((regnode*)nextbranch);
3698
3699 if (min1 > (I32)(minnext + trie->minlen))
3700 min1 = minnext + trie->minlen;
3701 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3702 max1 = minnext + deltanext + trie->maxlen;
3703 if (deltanext == I32_MAX)
3704 is_inf = is_inf_internal = 1;
3705
3706 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3707 pars++;
3708 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3709 if ( stopmin > min + min1)
3710 stopmin = min + min1;
3711 flags &= ~SCF_DO_SUBSTR;
3712 if (data)
3713 data->flags |= SCF_SEEN_ACCEPT;
3714 }
3715 if (data) {
3716 if (data_fake.flags & SF_HAS_EVAL)
3717 data->flags |= SF_HAS_EVAL;
3718 data->whilem_c = data_fake.whilem_c;
3719 }
3720 if (flags & SCF_DO_STCLASS)
3721 cl_or(pRExC_state, &accum, &this_class);
3722 }
3723 }
3724 if (flags & SCF_DO_SUBSTR) {
3725 data->pos_min += min1;
3726 data->pos_delta += max1 - min1;
3727 if (max1 != min1 || is_inf)
3728 data->longest = &(data->longest_float);
3729 }
3730 min += min1;
3731 delta += max1 - min1;
3732 if (flags & SCF_DO_STCLASS_OR) {
3733 cl_or(pRExC_state, data->start_class, &accum);
3734 if (min1) {
3735 cl_and(data->start_class, and_withp);
3736 flags &= ~SCF_DO_STCLASS;
3737 }
3738 }
3739 else if (flags & SCF_DO_STCLASS_AND) {
3740 if (min1) {
3741 cl_and(data->start_class, &accum);
3742 flags &= ~SCF_DO_STCLASS;
3743 }
3744 else {
3745 /* Switch to OR mode: cache the old value of
3746 * data->start_class */
3747 INIT_AND_WITHP;
3748 StructCopy(data->start_class, and_withp,
3749 struct regnode_charclass_class);
3750 flags &= ~SCF_DO_STCLASS_AND;
3751 StructCopy(&accum, data->start_class,
3752 struct regnode_charclass_class);
3753 flags |= SCF_DO_STCLASS_OR;
3754 data->start_class->flags |= ANYOF_EOS;
3755 }
3756 }
3757 scan= tail;
3758 continue;
3759 }
786e8c11 3760#else
8aa23a47
YO
3761 else if (PL_regkind[OP(scan)] == TRIE) {
3762 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3763 U8*bang=NULL;
3764
3765 min += trie->minlen;
3766 delta += (trie->maxlen - trie->minlen);
3767 flags &= ~SCF_DO_STCLASS; /* xxx */
3768 if (flags & SCF_DO_SUBSTR) {
3769 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3770 data->pos_min += trie->minlen;
3771 data->pos_delta += (trie->maxlen - trie->minlen);
3772 if (trie->maxlen != trie->minlen)
3773 data->longest = &(data->longest_float);
3774 }
3775 if (trie->jump) /* no more substrings -- for now /grr*/
3776 flags &= ~SCF_DO_SUBSTR;
b515a41d 3777 }
8aa23a47
YO
3778#endif /* old or new */
3779#endif /* TRIE_STUDY_OPT */
3780 /* Else: zero-length, ignore. */
3781 scan = regnext(scan);
3782 }
3783 if (frame) {
3784 last = frame->last;
3785 scan = frame->next;
3786 stopparen = frame->stop;
3787 frame = frame->prev;
3788 goto fake_study_recurse;
c277df42
IZ
3789 }
3790
3791 finish:
8aa23a47
YO
3792 assert(!frame);
3793
c277df42 3794 *scanp = scan;
aca2d497 3795 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3796 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3797 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3798 if (is_par > (I32)U8_MAX)
c277df42
IZ
3799 is_par = 0;
3800 if (is_par && pars==1 && data) {
3801 data->flags |= SF_IN_PAR;
3802 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3803 }
3804 else if (pars && data) {
c277df42
IZ
3805 data->flags |= SF_HAS_PAR;
3806 data->flags &= ~SF_IN_PAR;
3807 }
653099ff 3808 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3809 cl_and(data->start_class, and_withp);
786e8c11
YO
3810 if (flags & SCF_TRIE_RESTUDY)
3811 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3812
3813 DEBUG_STUDYDATA(data,depth);
3814
e2e6a0f1 3815 return min < stopmin ? min : stopmin;
c277df42
IZ
3816}
3817
76e3520e 3818STATIC I32
5f66b61c 3819S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3820{
830247a4 3821 if (RExC_rx->data) {
2eb97020 3822 const U32 count = RExC_rx->data->count;
b81d288d 3823 Renewc(RExC_rx->data,
2eb97020 3824 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3825 char, struct reg_data);
2eb97020 3826 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3827 RExC_rx->data->count += n;
a0ed51b3
LW
3828 }
3829 else {
a02a5408 3830 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3831 char, struct reg_data);
a02a5408 3832 Newx(RExC_rx->data->what, n, U8);
830247a4 3833 RExC_rx->data->count = n;
c277df42 3834 }
830247a4
IZ
3835 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3836 return RExC_rx->data->count - n;
c277df42
IZ
3837}
3838
76234dfb 3839#ifndef PERL_IN_XSUB_RE
d88dccdf 3840void
864dbfa3 3841Perl_reginitcolors(pTHX)
d88dccdf 3842{
97aff369 3843 dVAR;
1df70142 3844 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3845 if (s) {
1df70142
AL
3846 char *t = savepv(s);
3847 int i = 0;
3848 PL_colors[0] = t;
d88dccdf 3849 while (++i < 6) {
1df70142
AL
3850 t = strchr(t, '\t');
3851 if (t) {
3852 *t = '\0';
3853 PL_colors[i] = ++t;
d88dccdf
IZ
3854 }
3855 else
1df70142 3856 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3857 }
3858 } else {
1df70142 3859 int i = 0;
b81d288d 3860 while (i < 6)
06b5626a 3861 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3862 }
3863 PL_colorset = 1;
3864}
76234dfb 3865#endif
8615cb43 3866
07be1b83 3867
786e8c11
YO
3868#ifdef TRIE_STUDY_OPT
3869#define CHECK_RESTUDY_GOTO \
3870 if ( \
3871 (data.flags & SCF_TRIE_RESTUDY) \
3872 && ! restudied++ \
3873 ) goto reStudy
3874#else
3875#define CHECK_RESTUDY_GOTO
3876#endif
f9f4320a 3877
a687059c 3878/*
e50aee73 3879 - pregcomp - compile a regular expression into internal code
a687059c
LW
3880 *
3881 * We can't allocate space until we know how big the compiled form will be,
3882 * but we can't compile it (and thus know how big it is) until we've got a
3883 * place to put the code. So we cheat: we compile it twice, once with code
3884 * generation turned off and size counting turned on, and once "for real".
3885 * This also means that we don't allocate space until we are sure that the
3886 * thing really will compile successfully, and we never have to move the
3887 * code and thus invalidate pointers into it. (Note that it has to be in
3888 * one piece because free() must be able to free it all.) [NB: not true in perl]
3889 *
3890 * Beware that the optimization-preparation code in here knows about some
3891 * of the structure of the compiled regexp. [I'll say.]
3892 */
b9b4dddf
YO
3893
3894
3895
f9f4320a 3896#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3897#define RE_ENGINE_PTR &PL_core_reg_engine
3898#else
f9f4320a
YO
3899extern const struct regexp_engine my_reg_engine;
3900#define RE_ENGINE_PTR &my_reg_engine
3901#endif
b9b4dddf
YO
3902/* these make a few things look better, to avoid indentation */
3903#define BEGIN_BLOCK {
f9f4320a
YO
3904#define END_BLOCK }
3905
a687059c 3906regexp *
864dbfa3 3907Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3908{
97aff369 3909 dVAR;
f9f4320a
YO
3910 GET_RE_DEBUG_FLAGS_DECL;
3911 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3912#ifndef PERL_IN_XSUB_RE
3913 BEGIN_BLOCK
f9f4320a
YO
3914 /* Dispatch a request to compile a regexp to correct
3915 regexp engine. */
3916 HV * const table = GvHV(PL_hintgv);
3917 if (table) {
3918 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3919 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3920 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3921 DEBUG_COMPILE_r({
8d8756e7 3922 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3923 SvIV(*ptr));
3924 });
f2f78491 3925 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3926 }
b9b4dddf
YO
3927 }
3928 END_BLOCK
3929#endif
3930 BEGIN_BLOCK
a0d0e21e 3931 register regexp *r;
c277df42 3932 regnode *scan;
c277df42 3933 regnode *first;
a0d0e21e 3934 I32 flags;
a0d0e21e
LW
3935 I32 minlen = 0;
3936 I32 sawplus = 0;
3937 I32 sawopen = 0;
2c2d71f5 3938 scan_data_t data;
830247a4 3939 RExC_state_t RExC_state;
be8e71aa 3940 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3941#ifdef TRIE_STUDY_OPT
3942 int restudied= 0;
3943 RExC_state_t copyRExC_state;
3944#endif
a0d0e21e 3945 if (exp == NULL)
c277df42 3946 FAIL("NULL regexp argument");
a0d0e21e 3947
a5961de5 3948 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3949
5cfc7842 3950 RExC_precomp = exp;
a3621e74 3951 DEBUG_COMPILE_r({
ab3bbdeb
YO
3952 SV *dsv= sv_newmortal();
3953 RE_PV_QUOTED_DECL(s, RExC_utf8,
3954 dsv, RExC_precomp, (xend - exp), 60);
3955 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3956 PL_colors[4],PL_colors[5],s);
a5961de5 3957 });
e2509266 3958 RExC_flags = pm->op_pmflags;
830247a4 3959 RExC_sawback = 0;
bbce6d69 3960
830247a4
IZ
3961 RExC_seen = 0;
3962 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3963 RExC_seen_evals = 0;
3964 RExC_extralen = 0;
c277df42 3965
bbce6d69 3966 /* First pass: determine size, legality. */
830247a4 3967 RExC_parse = exp;
fac92740 3968 RExC_start = exp;
830247a4
IZ
3969 RExC_end = xend;
3970 RExC_naughty = 0;
3971 RExC_npar = 1;
c74340f9 3972 RExC_cpar = 1;
e2e6a0f1 3973 RExC_nestroot = 0;
830247a4
IZ
3974 RExC_size = 0L;
3975 RExC_emit = &PL_regdummy;
3976 RExC_whilem_seen = 0;
fc8cd66c 3977 RExC_charnames = NULL;
40d049e4
YO
3978 RExC_open_parens = NULL;
3979 RExC_close_parens = NULL;
3980 RExC_opend = NULL;
81714fb9 3981 RExC_paren_names = NULL;
40d049e4
YO
3982 RExC_recurse = NULL;
3983 RExC_recurse_count = 0;
81714fb9 3984
85ddcde9
JH
3985#if 0 /* REGC() is (currently) a NOP at the first pass.
3986 * Clever compilers notice this and complain. --jhi */
830247a4 3987 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3988#endif
3dab1dad
YO
3989 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3990 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3991 RExC_precomp = NULL;
a0d0e21e
LW
3992 return(NULL);
3993 }
07be1b83 3994 DEBUG_PARSE_r({
81714fb9
YO
3995 PerlIO_printf(Perl_debug_log,
3996 "Required size %"IVdf" nodes\n"
3997 "Starting second pass (creation)\n",
3998 (IV)RExC_size);
07be1b83
YO
3999 RExC_lastnum=0;
4000 RExC_lastparse=NULL;
4001 });
c277df42
IZ
4002 /* Small enough for pointer-storage convention?
4003 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4004 if (RExC_size >= 0x10000L && RExC_extralen)
4005 RExC_size += RExC_extralen;
c277df42 4006 else
830247a4
IZ
4007 RExC_extralen = 0;
4008 if (RExC_whilem_seen > 15)
4009 RExC_whilem_seen = 15;
a0d0e21e 4010
e2e6a0f1
YO
4011#ifdef DEBUGGING
4012 /* Make room for a sentinel value at the end of the program */
4013 RExC_size++;
4014#endif
4015
f9f4320a
YO
4016 /* Allocate space and zero-initialize. Note, the two step process
4017 of zeroing when in debug mode, thus anything assigned has to
4018 happen after that */
a02a5408 4019 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 4020 char, regexp);
a0d0e21e 4021 if (r == NULL)
b45f050a 4022 FAIL("Regexp out of space");
0f79a09d
GS
4023#ifdef DEBUGGING
4024 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 4025 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 4026#endif
f9f4320a
YO
4027 /* initialization begins here */
4028 r->engine= RE_ENGINE_PTR;
c277df42 4029 r->refcnt = 1;
bbce6d69 4030 r->prelen = xend - exp;
5cfc7842 4031 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 4032 r->subbeg = NULL;
f8c7b90f 4033#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 4034 r->saved_copy = NULL;
ed252734 4035#endif
cf93c79d 4036 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4037 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 4038 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
4039
4040 r->substrs = 0; /* Useful during FAIL. */
4041 r->startp = 0; /* Useful during FAIL. */
6bda09f9 4042 r->endp = 0;
c74340f9 4043 r->swap = NULL;
81714fb9
YO
4044 r->paren_names = 0;
4045
6bda09f9 4046 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4047 Newxz(RExC_open_parens, RExC_npar,regnode *);
4048 SAVEFREEPV(RExC_open_parens);
4049 Newxz(RExC_close_parens,RExC_npar,regnode *);
4050 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4051 }
4052
4053 /* Useful during FAIL. */
a02a5408 4054 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4055 if (r->offsets) {
2af232bd 4056 r->offsets[0] = RExC_size;
fac92740 4057 }
a3621e74 4058 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4059 "%s %"UVuf" bytes for offset annotations.\n",
4060 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4061 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4062
830247a4 4063 RExC_rx = r;
bbce6d69 4064
4065 /* Second pass: emit code. */
e2509266 4066 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4067 RExC_parse = exp;
4068 RExC_end = xend;
4069 RExC_naughty = 0;
4070 RExC_npar = 1;
c74340f9 4071 RExC_cpar = 1;
fac92740 4072 RExC_emit_start = r->program;
830247a4 4073 RExC_emit = r->program;
e2e6a0f1
YO
4074#ifdef DEBUGGING
4075 /* put a sentinal on the end of the program so we can check for
4076 overwrites */
4077 r->program[RExC_size].type = 255;
4078#endif
2cd61cdb 4079 /* Store the count of eval-groups for security checks: */
786e8c11 4080 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4081 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4082 r->data = 0;
3dab1dad 4083 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4084 return(NULL);
6bda09f9 4085
07be1b83
YO
4086 /* XXXX To minimize changes to RE engine we always allocate
4087 3-units-long substrs field. */
4088 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4089 if (RExC_recurse_count) {
4090 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4091 SAVEFREEPV(RExC_recurse);
4092 }
a0d0e21e 4093
07be1b83 4094reStudy:
1de06328 4095 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4096 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4097
07be1b83
YO
4098#ifdef TRIE_STUDY_OPT
4099 if ( restudied ) {
5d458dd8 4100 U32 seen=RExC_seen;
07be1b83 4101 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4102
4103 RExC_state = copyRExC_state;
4104 if (seen & REG_TOP_LEVEL_BRANCHES)
4105 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4106 else
4107 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4108 if (data.last_found) {
07be1b83 4109 SvREFCNT_dec(data.longest_fixed);
07be1b83 4110 SvREFCNT_dec(data.longest_float);
07be1b83 4111 SvREFCNT_dec(data.last_found);
1de06328 4112 }
40d049e4 4113 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4114 } else {
40d049e4 4115 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4116 copyRExC_state = RExC_state;
07be1b83 4117 }
40d049e4
YO
4118#else
4119 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4120#endif
fc8cd66c 4121
a0d0e21e 4122 /* Dig out information for optimizations. */
cf93c79d 4123 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4124 pm->op_pmflags = RExC_flags;
a0ed51b3 4125 if (UTF)
5ff6fc6d 4126 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4127 r->regstclass = NULL;
830247a4 4128 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4129 r->reganch |= ROPT_NAUGHTY;
c277df42 4130 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4131
1de06328
YO
4132 /* testing for BRANCH here tells us whether there is "must appear"
4133 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4134 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4135 I32 fake;
c5254dd6 4136 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4137 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4138 int stclass_flag;
07be1b83 4139 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4140
4141 first = scan;
c277df42 4142 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4143 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4144 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4145 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4146 /* for now we can't handle lookbehind IFMATCH*/
4147 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4148 (OP(first) == PLUS) ||
4149 (OP(first) == MINMOD) ||
653099ff 4150 /* An {n,m} with n>0 */
07be1b83
YO
4151 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4152 {
786e8c11 4153
a0d0e21e
LW
4154 if (OP(first) == PLUS)
4155 sawplus = 1;
4156 else
3dab1dad 4157 first += regarglen[OP(first)];
07be1b83
YO
4158 if (OP(first) == IFMATCH) {
4159 first = NEXTOPER(first);
4160 first += EXTRA_STEP_2ARGS;
7c167cea 4161 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4162 first = NEXTOPER(first);
a687059c
LW
4163 }
4164
a0d0e21e
LW
4165 /* Starting-point info. */
4166 again:
786e8c11 4167 DEBUG_PEEP("first:",first,0);
07be1b83 4168 /* Ignore EXACT as we deal with it later. */
3dab1dad 4169 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4170 if (OP(first) == EXACT)
6f207bd3 4171 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4172 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4173 r->regstclass = first;
4174 }
07be1b83 4175#ifdef TRIE_STCLASS
786e8c11 4176 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4177 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4178 {
786e8c11 4179 regnode *trie_op;
07be1b83 4180 /* this can happen only on restudy */
786e8c11
YO
4181 if ( OP(first) == TRIE ) {
4182 struct regnode_1 *trieop;
4183 Newxz(trieop,1,struct regnode_1);
4184 StructCopy(first,trieop,struct regnode_1);
4185 trie_op=(regnode *)trieop;
4186 } else {
4187 struct regnode_charclass *trieop;
4188 Newxz(trieop,1,struct regnode_charclass);
4189 StructCopy(first,trieop,struct regnode_charclass);
4190 trie_op=(regnode *)trieop;
4191 }
1de06328 4192 OP(trie_op)+=2;
786e8c11
YO
4193 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4194 r->regstclass = trie_op;
07be1b83
YO
4195 }
4196#endif
bfed75c6 4197 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4198 r->regstclass = first;
3dab1dad
YO
4199 else if (PL_regkind[OP(first)] == BOUND ||
4200 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4201 r->regstclass = first;
3dab1dad 4202 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4203 r->reganch |= (OP(first) == MBOL
4204 ? ROPT_ANCH_MBOL
4205 : (OP(first) == SBOL
4206 ? ROPT_ANCH_SBOL
4207 : ROPT_ANCH_BOL));
a0d0e21e 4208 first = NEXTOPER(first);
774d564b 4209 goto again;
4210 }
4211 else if (OP(first) == GPOS) {
4212 r->reganch |= ROPT_ANCH_GPOS;
4213 first = NEXTOPER(first);
4214 goto again;
a0d0e21e 4215 }
cf2a2b69
YO
4216 else if ((!sawopen || !RExC_sawback) &&
4217 (OP(first) == STAR &&
3dab1dad 4218 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4219 !(r->reganch & ROPT_ANCH) )
4220 {
4221 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4222 const int type =
4223 (OP(NEXTOPER(first)) == REG_ANY)
4224 ? ROPT_ANCH_MBOL
4225 : ROPT_ANCH_SBOL;
cad2e5aa 4226 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4227 first = NEXTOPER(first);
774d564b 4228 goto again;
a0d0e21e 4229 }
b81d288d 4230 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4231 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4232 /* x+ must match at the 1st pos of run of x's */
4233 r->reganch |= ROPT_SKIP;
a0d0e21e 4234
c277df42 4235 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4236#ifdef TRIE_STUDY_OPT
81714fb9 4237 DEBUG_PARSE_r(
be8e71aa
YO
4238 if (!restudied)
4239 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4240 (IV)(first - scan + 1))
4241 );
4242#else
81714fb9 4243 DEBUG_PARSE_r(
be8e71aa
YO
4244 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4245 (IV)(first - scan + 1))
4246 );
4247#endif
4248
4249
a0d0e21e
LW
4250 /*
4251 * If there's something expensive in the r.e., find the
4252 * longest literal string that must appear and make it the
4253 * regmust. Resolve ties in favor of later strings, since
4254 * the regstart check works with the beginning of the r.e.
4255 * and avoiding duplication strengthens checking. Not a
4256 * strong reason, but sufficient in the absence of others.
4257 * [Now we resolve ties in favor of the earlier string if
c277df42 4258 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4259 * earlier string may buy us something the later one won't.]
4260 */
de8c5301 4261
396482e1
GA
4262 data.longest_fixed = newSVpvs("");
4263 data.longest_float = newSVpvs("");
4264 data.last_found = newSVpvs("");
c277df42
IZ
4265 data.longest = &(data.longest_fixed);
4266 first = scan;
653099ff 4267 if (!r->regstclass) {
830247a4 4268 cl_init(pRExC_state, &ch_class);
653099ff
GS
4269 data.start_class = &ch_class;
4270 stclass_flag = SCF_DO_STCLASS_AND;
4271 } else /* XXXX Check for BOUND? */
4272 stclass_flag = 0;
cb434fcc 4273 data.last_closep = &last_close;
de8c5301 4274
1de06328 4275 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4276 &data, -1, NULL, NULL,
4277 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4278
07be1b83 4279
786e8c11
YO
4280 CHECK_RESTUDY_GOTO;
4281
4282
830247a4 4283 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4284 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4285 && !RExC_seen_zerolen
4286 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4287 r->reganch |= ROPT_CHECK_ALL;
1de06328 4288 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4289 SvREFCNT_dec(data.last_found);
4290
1de06328
YO
4291 /* Note that code very similar to this but for anchored string
4292 follows immediately below, changes may need to be made to both.
4293 Be careful.
4294 */
a0ed51b3 4295 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4296 if (longest_float_length
c277df42
IZ
4297 || (data.flags & SF_FL_BEFORE_EOL
4298 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4299 || (RExC_flags & PMf_MULTILINE))))
4300 {
1182767e 4301 I32 t,ml;
cf93c79d 4302
1de06328 4303 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4304 && data.offset_fixed == data.offset_float_min
4305 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4306 goto remove_float; /* As in (a)+. */
4307
1de06328
YO
4308 /* copy the information about the longest float from the reg_scan_data
4309 over to the program. */
33b8afdf
JH
4310 if (SvUTF8(data.longest_float)) {
4311 r->float_utf8 = data.longest_float;
c445ea15 4312 r->float_substr = NULL;
33b8afdf
JH
4313 } else {
4314 r->float_substr = data.longest_float;
c445ea15 4315 r->float_utf8 = NULL;
33b8afdf 4316 }
1de06328
YO
4317 /* float_end_shift is how many chars that must be matched that
4318 follow this item. We calculate it ahead of time as once the
4319 lookbehind offset is added in we lose the ability to correctly
4320 calculate it.*/
4321 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4322 : (I32)longest_float_length;
1de06328
YO
4323 r->float_end_shift = ml - data.offset_float_min
4324 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4325 + data.lookbehind_float;
4326 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4327 r->float_max_offset = data.offset_float_max;
1182767e 4328 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4329 r->float_max_offset -= data.lookbehind_float;
4330
cf93c79d
IZ
4331 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4332 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4333 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4334 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4335 }
4336 else {
aca2d497 4337 remove_float:
c445ea15 4338 r->float_substr = r->float_utf8 = NULL;
c277df42 4339 SvREFCNT_dec(data.longest_float);
c5254dd6 4340 longest_float_length = 0;
a0d0e21e 4341 }
c277df42 4342
1de06328
YO
4343 /* Note that code very similar to this but for floating string
4344 is immediately above, changes may need to be made to both.
4345 Be careful.
4346 */
a0ed51b3 4347 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4348 if (longest_fixed_length
c277df42
IZ
4349 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4350 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4351 || (RExC_flags & PMf_MULTILINE))))
4352 {
1182767e 4353 I32 t,ml;
cf93c79d 4354
1de06328
YO
4355 /* copy the information about the longest fixed
4356 from the reg_scan_data over to the program. */
33b8afdf
JH
4357 if (SvUTF8(data.longest_fixed)) {
4358 r->anchored_utf8 = data.longest_fixed;
c445ea15 4359 r->anchored_substr = NULL;
33b8afdf
JH
4360 } else {
4361 r->anchored_substr = data.longest_fixed;
c445ea15 4362 r->anchored_utf8 = NULL;
33b8afdf 4363 }
1de06328
YO
4364 /* fixed_end_shift is how many chars that must be matched that
4365 follow this item. We calculate it ahead of time as once the
4366 lookbehind offset is added in we lose the ability to correctly
4367 calculate it.*/
4368 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4369 : (I32)longest_fixed_length;
1de06328
YO
4370 r->anchored_end_shift = ml - data.offset_fixed
4371 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4372 + data.lookbehind_fixed;
4373 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4374
cf93c79d
IZ
4375 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4376 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4377 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4378 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4379 }
4380 else {
c445ea15 4381 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4382 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4383 longest_fixed_length = 0;
a0d0e21e 4384 }
b81d288d 4385 if (r->regstclass
ffc61ed2 4386 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4387 r->regstclass = NULL;
33b8afdf
JH
4388 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4389 && stclass_flag
653099ff 4390 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4391 && !cl_is_anything(data.start_class))
4392 {
1df70142 4393 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4394
a02a5408 4395 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4396 struct regnode_charclass_class);
4397 StructCopy(data.start_class,
830247a4 4398 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4399 struct regnode_charclass_class);
830247a4 4400 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4401 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4402 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4403 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4404 PerlIO_printf(Perl_debug_log,
a0288114 4405 "synthetic stclass \"%s\".\n",
3f7c398e 4406 SvPVX_const(sv));});
653099ff 4407 }
c277df42
IZ
4408
4409 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4410 if (longest_fixed_length > longest_float_length) {
1de06328 4411 r->check_end_shift = r->anchored_end_shift;
c277df42 4412 r->check_substr = r->anchored_substr;
33b8afdf 4413 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4414 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4415 if (r->reganch & ROPT_ANCH_SINGLE)
4416 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4417 }
4418 else {
1de06328 4419 r->check_end_shift = r->float_end_shift;
c277df42 4420 r->check_substr = r->float_substr;
33b8afdf 4421 r->check_utf8 = r->float_utf8;
1de06328
YO
4422 r->check_offset_min = r->float_min_offset;
4423 r->check_offset_max = r->float_max_offset;
a0d0e21e 4424 }
30382c73
IZ
4425 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4426 This should be changed ASAP! */
33b8afdf 4427 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4428 r->reganch |= RE_USE_INTUIT;
33b8afdf 4429 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4430 r->reganch |= RE_INTUIT_TAIL;
4431 }
1de06328
YO
4432 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4433 if ( (STRLEN)minlen < longest_float_length )
4434 minlen= longest_float_length;
4435 if ( (STRLEN)minlen < longest_fixed_length )
4436 minlen= longest_fixed_length;
4437 */
a0ed51b3
LW
4438 }
4439 else {
c277df42
IZ
4440 /* Several toplevels. Best we can is to set minlen. */
4441 I32 fake;
653099ff 4442 struct regnode_charclass_class ch_class;
cb434fcc 4443 I32 last_close = 0;
c277df42 4444
5d458dd8 4445 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4446
c277df42 4447 scan = r->program + 1;
830247a4 4448 cl_init(pRExC_state, &ch_class);
653099ff 4449 data.start_class = &ch_class;
cb434fcc 4450 data.last_closep = &last_close;
07be1b83 4451
de8c5301 4452
1de06328 4453 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4454 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4455
786e8c11 4456 CHECK_RESTUDY_GOTO;
07be1b83 4457
33b8afdf 4458 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4459 = r->float_substr = r->float_utf8 = NULL;
653099ff 4460 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4461 && !cl_is_anything(data.start_class))
4462 {
1df70142 4463 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4464
a02a5408 4465 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4466 struct regnode_charclass_class);
4467 StructCopy(data.start_class,
830247a4 4468 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4469 struct regnode_charclass_class);
830247a4 4470 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4471 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4472 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4473 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4474 PerlIO_printf(Perl_debug_log,
a0288114 4475 "synthetic stclass \"%s\".\n",
3f7c398e 4476 SvPVX_const(sv));});
653099ff 4477 }
a0d0e21e
LW
4478 }
4479
1de06328
YO
4480 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4481 the "real" pattern. */
cf9788e3
RGS
4482 DEBUG_OPTIMISE_r({
4483 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4484 minlen, r->minlen);
4485 });
de8c5301 4486 r->minlenret = minlen;
1de06328
YO
4487 if (r->minlen < minlen)
4488 r->minlen = minlen;
4489
b81d288d 4490 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4491 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4492 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4493 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4494 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4495 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4496 if (RExC_seen & REG_SEEN_CANY)
4497 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4498 if (RExC_seen & REG_SEEN_VERBARG)
4499 r->reganch |= ROPT_VERBARG_SEEN;
5d458dd8
YO
4500 if (RExC_seen & REG_SEEN_CUTGROUP)
4501 r->reganch |= ROPT_CUTGROUP_SEEN;
81714fb9
YO
4502 if (RExC_paren_names)
4503 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4504 else
4505 r->paren_names = NULL;
4506
40d049e4
YO
4507 if (RExC_recurse_count) {
4508 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4509 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4510 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4511 }
4512 }
a02a5408
JC
4513 Newxz(r->startp, RExC_npar, I32);
4514 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4515 /* assume we don't need to swap parens around before we match */
4516
be8e71aa
YO
4517 DEBUG_DUMP_r({
4518 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4519 regdump(r);
4520 });
8e9a8a48
YO
4521 DEBUG_OFFSETS_r(if (r->offsets) {
4522 const U32 len = r->offsets[0];
4523 U32 i;
4524 GET_RE_DEBUG_FLAGS_DECL;
4525 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4526 for (i = 1; i <= len; i++) {
4527 if (r->offsets[i*2-1] || r->offsets[i*2])
4528 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4529 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4530 }
4531 PerlIO_printf(Perl_debug_log, "\n");
4532 });
a0d0e21e 4533 return(r);
f9f4320a 4534 END_BLOCK
a687059c
LW
4535}
4536
f9f4320a
YO
4537#undef CORE_ONLY_BLOCK
4538#undef END_BLOCK
4539#undef RE_ENGINE_PTR
3dab1dad 4540
9af228c6 4541#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4542SV*
4543Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4544{
4545 I32 parno = 0; /* no match */
4546 if (PL_curpm) {
4547 const REGEXP * const rx = PM_GETRE(PL_curpm);
4548 if (rx && rx->paren_names) {
4549 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4550 if (he_str) {
4551 IV i;
4552 SV* sv_dat=HeVAL(he_str);
4553 I32 *nums=(I32*)SvPVX(sv_dat);
4554 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4555 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4556 rx->endp[nums[i]] != -1)
4557 {
4558 parno = nums[i];
4559 break;
4560 }
4561 }
4562 }
4563 }
4564 }
4565 if ( !parno ) {
4566 return 0;
4567 } else {
4568 GV *gv_paren;
4569 SV *sv= sv_newmortal();
4570 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4571 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4572 return GvSVn(gv_paren);
4573 }
4574}
9af228c6 4575#endif
0a4db386 4576
894be9b7 4577/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4578 * If flags is REG_RSN_RETURN_NULL returns null.
4579 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4580 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4581 * to the parsed name as looked up in the RExC_paren_names hash.
4582 * If there is an error throws a vFAIL().. type exception.
894be9b7 4583 */
0a4db386
YO
4584
4585#define REG_RSN_RETURN_NULL 0
4586#define REG_RSN_RETURN_NAME 1
4587#define REG_RSN_RETURN_DATA 2
4588
894be9b7
YO
4589STATIC SV*
4590S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4591 char *name_start = RExC_parse;
0a4db386 4592 if ( UTF ) {
894be9b7 4593 STRLEN numlen;
0a4db386
YO
4594 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4595 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4596 {
4597 RExC_parse += numlen;
4598 }
4599 } else {
4600 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4601 RExC_parse++;
4602 }
0a4db386
YO
4603 if ( flags ) {
4604 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4605 (int)(RExC_parse - name_start)));
894be9b7 4606 if (UTF)
0a4db386
YO
4607 SvUTF8_on(sv_name);
4608 if ( flags == REG_RSN_RETURN_NAME)
4609 return sv_name;
4610 else if (flags==REG_RSN_RETURN_DATA) {
4611 HE *he_str = NULL;
4612 SV *sv_dat = NULL;
4613 if ( ! sv_name ) /* should not happen*/
4614 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4615 if (RExC_paren_names)
4616 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4617 if ( he_str )
4618 sv_dat = HeVAL(he_str);
4619 if ( ! sv_dat )
4620 vFAIL("Reference to nonexistent named group");
4621 return sv_dat;
4622 }
4623 else {
4624 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4625 }
4626 /* NOT REACHED */
894be9b7 4627 }
0a4db386 4628 return NULL;
894be9b7
YO
4629}
4630
3dab1dad
YO
4631#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4632 int rem=(int)(RExC_end - RExC_parse); \
4633 int cut; \
4634 int num; \
4635 int iscut=0; \
4636 if (rem>10) { \
4637 rem=10; \
4638 iscut=1; \
4639 } \
4640 cut=10-rem; \
4641 if (RExC_lastparse!=RExC_parse) \
4642 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4643 rem, RExC_parse, \
4644 cut + 4, \
4645 iscut ? "..." : "<" \
4646 ); \
4647 else \
4648 PerlIO_printf(Perl_debug_log,"%16s",""); \
4649 \
4650 if (SIZE_ONLY) \
4651 num=RExC_size; \
4652 else \
4653 num=REG_NODE_NUM(RExC_emit); \
4654 if (RExC_lastnum!=num) \
0a4db386 4655 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4656 else \
0a4db386 4657 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4658 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4659 (int)((depth*2)), "", \
3dab1dad
YO
4660 (funcname) \
4661 ); \
4662 RExC_lastnum=num; \
4663 RExC_lastparse=RExC_parse; \
4664})
4665
07be1b83
YO
4666
4667
3dab1dad
YO
4668#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4669 DEBUG_PARSE_MSG((funcname)); \
4670 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4671})
6bda09f9
YO
4672#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4673 DEBUG_PARSE_MSG((funcname)); \
4674 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4675})
a687059c
LW
4676/*
4677 - reg - regular expression, i.e. main body or parenthesized thing
4678 *
4679 * Caller must absorb opening parenthesis.
4680 *
4681 * Combining parenthesis handling with the base level of regular expression
4682 * is a trifle forced, but the need to tie the tails of the branches to what
4683 * follows makes it hard to avoid.
4684 */
07be1b83
YO
4685#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4686#ifdef DEBUGGING
4687#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4688#else
4689#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4690#endif
3dab1dad 4691
e2e6a0f1
YO
4692/* this idea is borrowed from STR_WITH_LEN in handy.h */
4693#define CHECK_WORD(s,v,l) \
4694 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4695
76e3520e 4696STATIC regnode *
3dab1dad 4697S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4698 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4699{
27da23d5 4700 dVAR;
c277df42
IZ
4701 register regnode *ret; /* Will be the head of the group. */
4702 register regnode *br;
4703 register regnode *lastbr;
cbbf8932 4704 register regnode *ender = NULL;
a0d0e21e 4705 register I32 parno = 0;
cbbf8932
AL
4706 I32 flags;
4707 const I32 oregflags = RExC_flags;
6136c704
AL
4708 bool have_branch = 0;
4709 bool is_open = 0;
9d1d55b5
JP
4710
4711 /* for (?g), (?gc), and (?o) warnings; warning
4712 about (?c) will warn about (?g) -- japhy */
4713
6136c704
AL
4714#define WASTED_O 0x01
4715#define WASTED_G 0x02
4716#define WASTED_C 0x04
4717#define WASTED_GC (0x02|0x04)
cbbf8932 4718 I32 wastedflags = 0x00;
9d1d55b5 4719
fac92740 4720 char * parse_start = RExC_parse; /* MJD */
a28509cc 4721 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4722
3dab1dad
YO
4723 GET_RE_DEBUG_FLAGS_DECL;
4724 DEBUG_PARSE("reg ");
4725
4726
821b33a5 4727 *flagp = 0; /* Tentatively. */
a0d0e21e 4728
9d1d55b5 4729
a0d0e21e
LW
4730 /* Make an OPEN node, if parenthesized. */
4731 if (paren) {
e2e6a0f1
YO
4732 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4733 char *start_verb = RExC_parse;
4734 STRLEN verb_len = 0;
4735 char *start_arg = NULL;
4736 unsigned char op = 0;
4737 int argok = 1;
4738 int internal_argval = 0; /* internal_argval is only useful if !argok */
4739 while ( *RExC_parse && *RExC_parse != ')' ) {
4740 if ( *RExC_parse == ':' ) {
4741 start_arg = RExC_parse + 1;
4742 break;
4743 }
4744 RExC_parse++;
4745 }
4746 ++start_verb;
4747 verb_len = RExC_parse - start_verb;
4748 if ( start_arg ) {
4749 RExC_parse++;
4750 while ( *RExC_parse && *RExC_parse != ')' )
4751 RExC_parse++;
4752 if ( *RExC_parse != ')' )
4753 vFAIL("Unterminated verb pattern argument");
4754 if ( RExC_parse == start_arg )
4755 start_arg = NULL;
4756 } else {
4757 if ( *RExC_parse != ')' )
4758 vFAIL("Unterminated verb pattern");
4759 }
5d458dd8 4760
e2e6a0f1
YO
4761 switch ( *start_verb ) {
4762 case 'A': /* (*ACCEPT) */
4763 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4764 op = ACCEPT;
4765 internal_argval = RExC_nestroot;
4766 }
4767 break;
4768 case 'C': /* (*COMMIT) */
4769 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4770 op = COMMIT;
e2e6a0f1
YO
4771 break;
4772 case 'F': /* (*FAIL) */
4773 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4774 op = OPFAIL;
4775 argok = 0;
4776 }
4777 break;
5d458dd8
YO
4778 case ':': /* (*:NAME) */
4779 case 'M': /* (*MARK:NAME) */
4780 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4781 op = MARKPOINT;
5d458dd8
YO
4782 argok = -1;
4783 }
4784 break;
4785 case 'P': /* (*PRUNE) */
4786 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4787 op = PRUNE;
e2e6a0f1 4788 break;
5d458dd8
YO
4789 case 'S': /* (*SKIP) */
4790 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4791 op = SKIP;
4792 break;
4793 case 'T': /* (*THEN) */
4794 /* [19:06] <TimToady> :: is then */
4795 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4796 op = CUTGROUP;
4797 RExC_seen |= REG_SEEN_CUTGROUP;
4798 }
e2e6a0f1
YO
4799 break;
4800 }
4801 if ( ! op ) {
4802 RExC_parse++;
4803 vFAIL3("Unknown verb pattern '%.*s'",
4804 verb_len, start_verb);
4805 }
4806 if ( argok ) {
4807 if ( start_arg && internal_argval ) {
4808 vFAIL3("Verb pattern '%.*s' may not have an argument",
4809 verb_len, start_verb);
4810 } else if ( argok < 0 && !start_arg ) {
4811 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4812 verb_len, start_verb);
4813 } else {
4814 ret = reganode(pRExC_state, op, internal_argval);
4815 if ( ! internal_argval && ! SIZE_ONLY ) {
4816 if (start_arg) {
4817 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4818 ARG(ret) = add_data( pRExC_state, 1, "S" );
4819 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4820 ret->flags = 0;
4821 } else {
4822 ret->flags = 1;
4823 }
4824 }
4825 }
4826 if (!internal_argval)
4827 RExC_seen |= REG_SEEN_VERBARG;
4828 } else if ( start_arg ) {
4829 vFAIL3("Verb pattern '%.*s' may not have an argument",
4830 verb_len, start_verb);
4831 } else {
4832 ret = reg_node(pRExC_state, op);
4833 }
4834 nextchar(pRExC_state);
4835 return ret;
4836 } else
fac92740 4837 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4838 U32 posflags = 0, negflags = 0;
4839 U32 *flagsp = &posflags;
6136c704 4840 bool is_logical = 0;
a28509cc 4841 const char * const seqstart = RExC_parse;
ca9dfc88 4842
830247a4
IZ
4843 RExC_parse++;
4844 paren = *RExC_parse++;
c277df42 4845 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4846 switch (paren) {
894be9b7 4847
fac92740 4848 case '<': /* (?<...) */
b81d288d 4849 if (*RExC_parse == '!')
c277df42 4850 paren = ',';
0a4db386
YO
4851 else if (*RExC_parse != '=')
4852 { /* (?<...>) */
81714fb9 4853 char *name_start;
894be9b7 4854 SV *svname;
81714fb9
YO
4855 paren= '>';
4856 case '\'': /* (?'...') */
4857 name_start= RExC_parse;
0a4db386
YO
4858 svname = reg_scan_name(pRExC_state,
4859 SIZE_ONLY ? /* reverse test from the others */
4860 REG_RSN_RETURN_NAME :
4861 REG_RSN_RETURN_NULL);
81714fb9
YO
4862 if (RExC_parse == name_start)
4863 goto unknown;
4864 if (*RExC_parse != paren)
4865 vFAIL2("Sequence (?%c... not terminated",
4866 paren=='>' ? '<' : paren);
4867 if (SIZE_ONLY) {
e62cc96a
YO
4868 HE *he_str;
4869 SV *sv_dat = NULL;
894be9b7
YO
4870 if (!svname) /* shouldnt happen */
4871 Perl_croak(aTHX_
4872 "panic: reg_scan_name returned NULL");
81714fb9
YO
4873 if (!RExC_paren_names) {
4874 RExC_paren_names= newHV();
4875 sv_2mortal((SV*)RExC_paren_names);
4876 }
4877 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4878 if ( he_str )
81714fb9 4879 sv_dat = HeVAL(he_str);
e62cc96a 4880 if ( ! sv_dat ) {
81714fb9 4881 /* croak baby croak */
e62cc96a
YO
4882 Perl_croak(aTHX_
4883 "panic: paren_name hash element allocation failed");
4884 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4885 IV count=SvIV(sv_dat);
4886 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4887 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4888 pv[count]=RExC_npar;
4889 SvIVX(sv_dat)++;
4890 } else {
4891 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4892 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4893 SvIOK_on(sv_dat);
4894 SvIVX(sv_dat)= 1;
e62cc96a
YO
4895 }
4896
81714fb9
YO
4897 /*sv_dump(sv_dat);*/
4898 }
4899 nextchar(pRExC_state);
4900 paren = 1;
4901 goto capturing_parens;
4902 }
4903 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4904 RExC_parse++;
fac92740
MJD
4905 case '=': /* (?=...) */
4906 case '!': /* (?!...) */
830247a4 4907 RExC_seen_zerolen++;
e2e6a0f1
YO
4908 if (*RExC_parse == ')') {
4909 ret=reg_node(pRExC_state, OPFAIL);
4910 nextchar(pRExC_state);
4911 return ret;
4912 }
fac92740
MJD
4913 case ':': /* (?:...) */
4914 case '>': /* (?>...) */
a0d0e21e 4915 break;
fac92740
MJD
4916 case '$': /* (?$...) */
4917 case '@': /* (?@...) */
8615cb43 4918 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4919 break;
fac92740 4920 case '#': /* (?#...) */
830247a4
IZ
4921 while (*RExC_parse && *RExC_parse != ')')
4922 RExC_parse++;
4923 if (*RExC_parse != ')')
c277df42 4924 FAIL("Sequence (?#... not terminated");
830247a4 4925 nextchar(pRExC_state);
a0d0e21e
LW
4926 *flagp = TRYAGAIN;
4927 return NULL;
894be9b7
YO
4928 case '0' : /* (?0) */
4929 case 'R' : /* (?R) */
4930 if (*RExC_parse != ')')
6bda09f9 4931 FAIL("Sequence (?R) not terminated");
1a147d38 4932 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4933 nextchar(pRExC_state);
4934 return ret;
4935 /*notreached*/
894be9b7
YO
4936 { /* named and numeric backreferences */
4937 I32 num;
4938 char * parse_start;
4939 case '&': /* (?&NAME) */
4940 parse_start = RExC_parse - 1;
4941 {
0a4db386
YO
4942 SV *sv_dat = reg_scan_name(pRExC_state,
4943 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4944 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4945 }
4946 goto gen_recurse_regop;
4947 /* NOT REACHED */
542fa716
YO
4948 case '+':
4949 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4950 RExC_parse++;
4951 vFAIL("Illegal pattern");
4952 }
4953 goto parse_recursion;
4954 /* NOT REACHED*/
4955 case '-': /* (?-1) */
4956 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4957 RExC_parse--; /* rewind to let it be handled later */
4958 goto parse_flags;
4959 }
4960 /*FALLTHROUGH */
6bda09f9
YO
4961 case '1': case '2': case '3': case '4': /* (?1) */
4962 case '5': case '6': case '7': case '8': case '9':
4963 RExC_parse--;
542fa716 4964 parse_recursion:
894be9b7
YO
4965 num = atoi(RExC_parse);
4966 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
4967 if (*RExC_parse == '-')
4968 RExC_parse++;
6bda09f9
YO
4969 while (isDIGIT(*RExC_parse))
4970 RExC_parse++;
4971 if (*RExC_parse!=')')
4972 vFAIL("Expecting close bracket");
894be9b7
YO
4973
4974 gen_recurse_regop:
542fa716
YO
4975 if ( paren == '-' ) {
4976 /*
4977 Diagram of capture buffer numbering.
4978 Top line is the normal capture buffer numbers
4979 Botton line is the negative indexing as from
4980 the X (the (?-2))
4981
4982 + 1 2 3 4 5 X 6 7
4983 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4984 - 5 4 3 2 1 X x x
4985
4986 */
4987 num = RExC_npar + num;
4988 if (num < 1) {
4989 RExC_parse++;
4990 vFAIL("Reference to nonexistent group");
4991 }
4992 } else if ( paren == '+' ) {
4993 num = RExC_npar + num - 1;
4994 }
4995
1a147d38 4996 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4997 if (!SIZE_ONLY) {
4998 if (num > (I32)RExC_rx->nparens) {
4999 RExC_parse++;
5000 vFAIL("Reference to nonexistent group");
5001 }
40d049e4 5002 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5003 RExC_emit++;
226de585 5004 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5005 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5006 } else {
6bda09f9 5007 RExC_size++;
6bda09f9 5008 }
0a4db386 5009 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5010 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5011 Set_Node_Offset(ret, parse_start); /* MJD */
5012
6bda09f9
YO
5013 nextchar(pRExC_state);
5014 return ret;
894be9b7
YO
5015 } /* named and numeric backreferences */
5016 /* NOT REACHED */
5017
fac92740 5018 case 'p': /* (?p...) */
9014280d 5019 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5020 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5021 /* FALL THROUGH*/
fac92740 5022 case '?': /* (??...) */
6136c704 5023 is_logical = 1;
438a3801
YST
5024 if (*RExC_parse != '{')
5025 goto unknown;
830247a4 5026 paren = *RExC_parse++;
0f5d15d6 5027 /* FALL THROUGH */
fac92740 5028 case '{': /* (?{...}) */
c277df42 5029 {
c277df42
IZ
5030 I32 count = 1, n = 0;
5031 char c;
830247a4 5032 char *s = RExC_parse;
c277df42 5033
830247a4
IZ
5034 RExC_seen_zerolen++;
5035 RExC_seen |= REG_SEEN_EVAL;
5036 while (count && (c = *RExC_parse)) {
6136c704
AL
5037 if (c == '\\') {
5038 if (RExC_parse[1])
5039 RExC_parse++;
5040 }
b81d288d 5041 else if (c == '{')
c277df42 5042 count++;
b81d288d 5043 else if (c == '}')
c277df42 5044 count--;
830247a4 5045 RExC_parse++;
c277df42 5046 }
6136c704 5047 if (*RExC_parse != ')') {
b81d288d 5048 RExC_parse = s;
b45f050a
JF
5049 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5050 }
c277df42 5051 if (!SIZE_ONLY) {
f3548bdc 5052 PAD *pad;
6136c704
AL
5053 OP_4tree *sop, *rop;
5054 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5055
569233ed
SB
5056 ENTER;
5057 Perl_save_re_context(aTHX);
f3548bdc 5058 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5059 sop->op_private |= OPpREFCOUNTED;
5060 /* re_dup will OpREFCNT_inc */
5061 OpREFCNT_set(sop, 1);
569233ed 5062 LEAVE;
c277df42 5063
830247a4
IZ
5064 n = add_data(pRExC_state, 3, "nop");
5065 RExC_rx->data->data[n] = (void*)rop;
5066 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 5067 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 5068 SvREFCNT_dec(sv);
a0ed51b3 5069 }
e24b16f9 5070 else { /* First pass */
830247a4 5071 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5072 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5073 /* No compiled RE interpolated, has runtime
5074 components ===> unsafe. */
5075 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5076 if (PL_tainting && PL_tainted)
cc6b7395 5077 FAIL("Eval-group in insecure regular expression");
54df2634 5078#if PERL_VERSION > 8
923e4eb5 5079 if (IN_PERL_COMPILETIME)
b5c19bd7 5080 PL_cv_has_eval = 1;
54df2634 5081#endif
c277df42 5082 }
b5c19bd7 5083
830247a4 5084 nextchar(pRExC_state);
6136c704 5085 if (is_logical) {
830247a4 5086 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5087 if (!SIZE_ONLY)
5088 ret->flags = 2;
3dab1dad 5089 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5090 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5091 return ret;
5092 }
ccb2c380
MP
5093 ret = reganode(pRExC_state, EVAL, n);
5094 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5095 Set_Node_Offset(ret, parse_start);
5096 return ret;
c277df42 5097 }
fac92740 5098 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5099 {
0a4db386 5100 int is_define= 0;
fac92740 5101 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5102 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5103 || RExC_parse[1] == '<'
830247a4 5104 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5105 I32 flag;
5106
830247a4 5107 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5108 if (!SIZE_ONLY)
5109 ret->flags = 1;
3dab1dad 5110 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5111 goto insert_if;
b81d288d 5112 }
a0ed51b3 5113 }
0a4db386
YO
5114 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5115 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5116 {
5117 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5118 char *name_start= RExC_parse++;
5119 I32 num = 0;
5120 SV *sv_dat=reg_scan_name(pRExC_state,
5121 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5122 if (RExC_parse == name_start || *RExC_parse != ch)
5123 vFAIL2("Sequence (?(%c... not terminated",
5124 (ch == '>' ? '<' : ch));
5125 RExC_parse++;
5126 if (!SIZE_ONLY) {
5127 num = add_data( pRExC_state, 1, "S" );
5128 RExC_rx->data->data[num]=(void*)sv_dat;
5129 SvREFCNT_inc(sv_dat);
5130 }
5131 ret = reganode(pRExC_state,NGROUPP,num);
5132 goto insert_if_check_paren;
5133 }
5134 else if (RExC_parse[0] == 'D' &&
5135 RExC_parse[1] == 'E' &&
5136 RExC_parse[2] == 'F' &&
5137 RExC_parse[3] == 'I' &&
5138 RExC_parse[4] == 'N' &&
5139 RExC_parse[5] == 'E')
5140 {
5141 ret = reganode(pRExC_state,DEFINEP,0);
5142 RExC_parse +=6 ;
5143 is_define = 1;
5144 goto insert_if_check_paren;
5145 }
5146 else if (RExC_parse[0] == 'R') {
5147 RExC_parse++;
5148 parno = 0;
5149 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5150 parno = atoi(RExC_parse++);
5151 while (isDIGIT(*RExC_parse))
5152 RExC_parse++;
5153 } else if (RExC_parse[0] == '&') {
5154 SV *sv_dat;
5155 RExC_parse++;
5156 sv_dat = reg_scan_name(pRExC_state,
5157 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5158 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5159 }
1a147d38 5160 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5161 goto insert_if_check_paren;
5162 }
830247a4 5163 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5164 /* (?(1)...) */
6136c704 5165 char c;
830247a4 5166 parno = atoi(RExC_parse++);
c277df42 5167
830247a4
IZ
5168 while (isDIGIT(*RExC_parse))
5169 RExC_parse++;
fac92740 5170 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5171
0a4db386 5172 insert_if_check_paren:
830247a4 5173 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5174 vFAIL("Switch condition not recognized");
c277df42 5175 insert_if:
3dab1dad
YO
5176 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5177 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5178 if (br == NULL)
830247a4 5179 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5180 else
3dab1dad 5181 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5182 c = *nextchar(pRExC_state);
d1b80229
IZ
5183 if (flags&HASWIDTH)
5184 *flagp |= HASWIDTH;
c277df42 5185 if (c == '|') {
0a4db386
YO
5186 if (is_define)
5187 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5188 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5189 regbranch(pRExC_state, &flags, 1,depth+1);
5190 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5191 if (flags&HASWIDTH)
5192 *flagp |= HASWIDTH;
830247a4 5193 c = *nextchar(pRExC_state);
a0ed51b3
LW
5194 }
5195 else
c277df42
IZ
5196 lastbr = NULL;
5197 if (c != ')')
8615cb43 5198 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5199 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5200 REGTAIL(pRExC_state, br, ender);
c277df42 5201 if (lastbr) {
3dab1dad
YO
5202 REGTAIL(pRExC_state, lastbr, ender);
5203 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5204 }
5205 else
3dab1dad 5206 REGTAIL(pRExC_state, ret, ender);
c277df42 5207 return ret;
a0ed51b3
LW
5208 }
5209 else {
830247a4 5210 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5211 }
5212 }
1b1626e4 5213 case 0:
830247a4 5214 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5215 vFAIL("Sequence (? incomplete");
1b1626e4 5216 break;
a0d0e21e 5217 default:
830247a4 5218 --RExC_parse;
fac92740 5219 parse_flags: /* (?i) */
830247a4 5220 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5221 /* (?g), (?gc) and (?o) are useless here
5222 and must be globally applied -- japhy */
5223
5224 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5225 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5226 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5227 if (! (wastedflags & wflagbit) ) {
5228 wastedflags |= wflagbit;
5229 vWARN5(
5230 RExC_parse + 1,
5231 "Useless (%s%c) - %suse /%c modifier",
5232 flagsp == &negflags ? "?-" : "?",
5233 *RExC_parse,
5234 flagsp == &negflags ? "don't " : "",
5235 *RExC_parse
5236 );
5237 }
5238 }
5239 }
5240 else if (*RExC_parse == 'c') {
5241 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5242 if (! (wastedflags & WASTED_C) ) {
5243 wastedflags |= WASTED_GC;
9d1d55b5
JP
5244 vWARN3(
5245 RExC_parse + 1,
5246 "Useless (%sc) - %suse /gc modifier",
5247 flagsp == &negflags ? "?-" : "?",
5248 flagsp == &negflags ? "don't " : ""
5249 );
5250 }
5251 }
5252 }
5253 else { pmflag(flagsp, *RExC_parse); }
5254
830247a4 5255 ++RExC_parse;
ca9dfc88 5256 }
830247a4 5257 if (*RExC_parse == '-') {
ca9dfc88 5258 flagsp = &negflags;
9d1d55b5 5259 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5260 ++RExC_parse;
ca9dfc88 5261 goto parse_flags;
48c036b1 5262 }
e2509266
JH
5263 RExC_flags |= posflags;
5264 RExC_flags &= ~negflags;
830247a4
IZ
5265 if (*RExC_parse == ':') {
5266 RExC_parse++;
ca9dfc88
IZ
5267 paren = ':';
5268 break;
5269 }
c277df42 5270 unknown:
830247a4
IZ
5271 if (*RExC_parse != ')') {
5272 RExC_parse++;
5273 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5274 }
830247a4 5275 nextchar(pRExC_state);
a0d0e21e
LW
5276 *flagp = TRYAGAIN;
5277 return NULL;
5278 }
5279 }
fac92740 5280 else { /* (...) */
81714fb9 5281 capturing_parens:
830247a4
IZ
5282 parno = RExC_npar;
5283 RExC_npar++;
e2e6a0f1 5284
830247a4 5285 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5286 if (!SIZE_ONLY ){
5287 if (!RExC_nestroot)
5288 RExC_nestroot = parno;
5289 if (RExC_seen & REG_SEEN_RECURSE) {
5290 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5291 "Setting open paren #%"IVdf" to %d\n",
5292 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5293 RExC_open_parens[parno-1]= ret;
5294 }
6bda09f9 5295 }
fac92740
MJD
5296 Set_Node_Length(ret, 1); /* MJD */
5297 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5298 is_open = 1;
a0d0e21e 5299 }
a0ed51b3 5300 }
fac92740 5301 else /* ! paren */
a0d0e21e
LW
5302 ret = NULL;
5303
5304 /* Pick up the branches, linking them together. */
fac92740 5305 parse_start = RExC_parse; /* MJD */
3dab1dad 5306 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5307 /* branch_len = (paren != 0); */
2af232bd 5308
a0d0e21e
LW
5309 if (br == NULL)
5310 return(NULL);
830247a4
IZ
5311 if (*RExC_parse == '|') {
5312 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5313 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5314 }
fac92740 5315 else { /* MJD */
6bda09f9 5316 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5317 Set_Node_Length(br, paren != 0);
5318 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5319 }
c277df42
IZ
5320 have_branch = 1;
5321 if (SIZE_ONLY)
830247a4 5322 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5323 }
5324 else if (paren == ':') {
c277df42
IZ
5325 *flagp |= flags&SIMPLE;
5326 }
6136c704 5327 if (is_open) { /* Starts with OPEN. */
3dab1dad 5328 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5329 }
5330 else if (paren != '?') /* Not Conditional */
a0d0e21e 5331 ret = br;
32a0ca98 5332 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5333 lastbr = br;
830247a4
IZ
5334 while (*RExC_parse == '|') {
5335 if (!SIZE_ONLY && RExC_extralen) {
5336 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5337 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5338 }
5339 if (SIZE_ONLY)
830247a4
IZ
5340 RExC_extralen += 2; /* Account for LONGJMP. */
5341 nextchar(pRExC_state);
3dab1dad 5342 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5343
a687059c 5344 if (br == NULL)
a0d0e21e 5345 return(NULL);
3dab1dad 5346 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5347 lastbr = br;
821b33a5
IZ
5348 if (flags&HASWIDTH)
5349 *flagp |= HASWIDTH;
a687059c 5350 *flagp |= flags&SPSTART;
a0d0e21e
LW
5351 }
5352
c277df42
IZ
5353 if (have_branch || paren != ':') {
5354 /* Make a closing node, and hook it on the end. */
5355 switch (paren) {
5356 case ':':
830247a4 5357 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5358 break;
5359 case 1:
c74340f9 5360 RExC_cpar++;
830247a4 5361 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5362 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5363 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5364 "Setting close paren #%"IVdf" to %d\n",
5365 (IV)parno, REG_NODE_NUM(ender)));
5366 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5367 if (RExC_nestroot == parno)
5368 RExC_nestroot = 0;
40d049e4 5369 }
fac92740
MJD
5370 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5371 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5372 break;
5373 case '<':
c277df42
IZ
5374 case ',':
5375 case '=':
5376 case '!':
c277df42 5377 *flagp &= ~HASWIDTH;
821b33a5
IZ
5378 /* FALL THROUGH */
5379 case '>':
830247a4 5380 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5381 break;
5382 case 0:
830247a4 5383 ender = reg_node(pRExC_state, END);
40d049e4
YO
5384 if (!SIZE_ONLY) {
5385 assert(!RExC_opend); /* there can only be one! */
5386 RExC_opend = ender;
5387 }
c277df42
IZ
5388 break;
5389 }
eaf3ca90 5390 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5391
9674d46a 5392 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5393 if (depth==1)
5394 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5395
c277df42 5396 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5397 for (br = ret; br; br = regnext(br)) {
5398 const U8 op = PL_regkind[OP(br)];
5399 if (op == BRANCH) {
07be1b83 5400 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5401 }
5402 else if (op == BRANCHJ) {
07be1b83 5403 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5404 }
c277df42
IZ
5405 }
5406 }
a0d0e21e 5407 }
c277df42
IZ
5408
5409 {
e1ec3a88
AL
5410 const char *p;
5411 static const char parens[] = "=!<,>";
c277df42
IZ
5412
5413 if (paren && (p = strchr(parens, paren))) {
eb160463 5414 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5415 int flag = (p - parens) > 1;
5416
5417 if (paren == '>')
5418 node = SUSPEND, flag = 0;
6bda09f9 5419 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5420 Set_Node_Cur_Length(ret);
5421 Set_Node_Offset(ret, parse_start + 1);
c277df42 5422 ret->flags = flag;
07be1b83 5423 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5424 }
a0d0e21e
LW
5425 }
5426
5427 /* Check for proper termination. */
ce3e6498 5428 if (paren) {
e2509266 5429 RExC_flags = oregflags;
830247a4
IZ
5430 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5431 RExC_parse = oregcomp_parse;
380a0633 5432 vFAIL("Unmatched (");
ce3e6498 5433 }
a0ed51b3 5434 }
830247a4
IZ
5435 else if (!paren && RExC_parse < RExC_end) {
5436 if (*RExC_parse == ')') {
5437 RExC_parse++;
380a0633 5438 vFAIL("Unmatched )");
a0ed51b3
LW
5439 }
5440 else
b45f050a 5441 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5442 /* NOTREACHED */
5443 }
a687059c 5444
a0d0e21e 5445 return(ret);
a687059c
LW
5446}
5447
5448/*
5449 - regbranch - one alternative of an | operator
5450 *
5451 * Implements the concatenation operator.
5452 */
76e3520e 5453STATIC regnode *
3dab1dad 5454S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5455{
97aff369 5456 dVAR;
c277df42
IZ
5457 register regnode *ret;
5458 register regnode *chain = NULL;
5459 register regnode *latest;
5460 I32 flags = 0, c = 0;
3dab1dad
YO
5461 GET_RE_DEBUG_FLAGS_DECL;
5462 DEBUG_PARSE("brnc");
b81d288d 5463 if (first)
c277df42
IZ
5464 ret = NULL;
5465 else {
b81d288d 5466 if (!SIZE_ONLY && RExC_extralen)
830247a4 5467 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5468 else {
830247a4 5469 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5470 Set_Node_Length(ret, 1);
5471 }
c277df42
IZ
5472 }
5473
b81d288d 5474 if (!first && SIZE_ONLY)
830247a4 5475 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5476
c277df42 5477 *flagp = WORST; /* Tentatively. */
a0d0e21e 5478
830247a4
IZ
5479 RExC_parse--;
5480 nextchar(pRExC_state);
5481 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5482 flags &= ~TRYAGAIN;
3dab1dad 5483 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5484 if (latest == NULL) {
5485 if (flags & TRYAGAIN)
5486 continue;
5487 return(NULL);
a0ed51b3
LW
5488 }
5489 else if (ret == NULL)
c277df42 5490 ret = latest;
a0d0e21e 5491 *flagp |= flags&HASWIDTH;
c277df42 5492 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5493 *flagp |= flags&SPSTART;
5494 else {
830247a4 5495 RExC_naughty++;
3dab1dad 5496 REGTAIL(pRExC_state, chain, latest);
a687059c 5497 }
a0d0e21e 5498 chain = latest;
c277df42
IZ
5499 c++;
5500 }
5501 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5502 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5503 if (ret == NULL)
5504 ret = chain;
5505 }
5506 if (c == 1) {
5507 *flagp |= flags&SIMPLE;
a0d0e21e 5508 }
a687059c 5509
d4c19fe8 5510 return ret;
a687059c
LW
5511}
5512
5513/*
5514 - regpiece - something followed by possible [*+?]
5515 *
5516 * Note that the branching code sequences used for ? and the general cases
5517 * of * and + are somewhat optimized: they use the same NOTHING node as
5518 * both the endmarker for their branch list and the body of the last branch.
5519 * It might seem that this node could be dispensed with entirely, but the
5520 * endmarker role is not redundant.
5521 */
76e3520e 5522STATIC regnode *
3dab1dad 5523S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5524{
97aff369 5525 dVAR;
c277df42 5526 register regnode *ret;
a0d0e21e
LW
5527 register char op;
5528 register char *next;
5529 I32 flags;
1df70142 5530 const char * const origparse = RExC_parse;
a0d0e21e 5531 I32 min;
c277df42 5532 I32 max = REG_INFTY;
fac92740 5533 char *parse_start;
10edeb5d 5534 const char *maxpos = NULL;
3dab1dad
YO
5535 GET_RE_DEBUG_FLAGS_DECL;
5536 DEBUG_PARSE("piec");
a0d0e21e 5537
3dab1dad 5538 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5539 if (ret == NULL) {
5540 if (flags & TRYAGAIN)
5541 *flagp |= TRYAGAIN;
5542 return(NULL);
5543 }
5544
830247a4 5545 op = *RExC_parse;
a0d0e21e 5546
830247a4 5547 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5548 maxpos = NULL;
fac92740 5549 parse_start = RExC_parse; /* MJD */
830247a4 5550 next = RExC_parse + 1;
a0d0e21e
LW
5551 while (isDIGIT(*next) || *next == ',') {
5552 if (*next == ',') {
5553 if (maxpos)
5554 break;
5555 else
5556 maxpos = next;
a687059c 5557 }
a0d0e21e
LW
5558 next++;
5559 }
5560 if (*next == '}') { /* got one */
5561 if (!maxpos)
5562 maxpos = next;
830247a4
IZ
5563 RExC_parse++;
5564 min = atoi(RExC_parse);
a0d0e21e
LW
5565 if (*maxpos == ',')
5566 maxpos++;
5567 else
830247a4 5568 maxpos = RExC_parse;
a0d0e21e
LW
5569 max = atoi(maxpos);
5570 if (!max && *maxpos != '0')
c277df42
IZ
5571 max = REG_INFTY; /* meaning "infinity" */
5572 else if (max >= REG_INFTY)
8615cb43 5573 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5574 RExC_parse = next;
5575 nextchar(pRExC_state);
a0d0e21e
LW
5576
5577 do_curly:
5578 if ((flags&SIMPLE)) {
830247a4 5579 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5580 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5581 Set_Node_Offset(ret, parse_start+1); /* MJD */
5582 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5583 }
5584 else {
3dab1dad 5585 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5586
5587 w->flags = 0;
3dab1dad 5588 REGTAIL(pRExC_state, ret, w);
830247a4 5589 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5590 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5591 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5592 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5593 }
6bda09f9 5594 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5595 /* MJD hk */
5596 Set_Node_Offset(ret, parse_start+1);
2af232bd 5597 Set_Node_Length(ret,
fac92740 5598 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5599
830247a4 5600 if (!SIZE_ONLY && RExC_extralen)
c277df42 5601 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5602 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5603 if (SIZE_ONLY)
830247a4
IZ
5604 RExC_whilem_seen++, RExC_extralen += 3;
5605 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5606 }
c277df42 5607 ret->flags = 0;
a0d0e21e
LW
5608
5609 if (min > 0)
821b33a5
IZ
5610 *flagp = WORST;
5611 if (max > 0)
5612 *flagp |= HASWIDTH;
a0d0e21e 5613 if (max && max < min)
8615cb43 5614 vFAIL("Can't do {n,m} with n > m");
c277df42 5615 if (!SIZE_ONLY) {
eb160463
GS
5616 ARG1_SET(ret, (U16)min);
5617 ARG2_SET(ret, (U16)max);
a687059c 5618 }
a687059c 5619
a0d0e21e 5620 goto nest_check;
a687059c 5621 }
a0d0e21e 5622 }
a687059c 5623
a0d0e21e
LW
5624 if (!ISMULT1(op)) {
5625 *flagp = flags;
a687059c 5626 return(ret);
a0d0e21e 5627 }
bb20fd44 5628
c277df42 5629#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5630
5631 /* if this is reinstated, don't forget to put this back into perldiag:
5632
5633 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5634
5635 (F) The part of the regexp subject to either the * or + quantifier
5636 could match an empty string. The {#} shows in the regular
5637 expression about where the problem was discovered.
5638
5639 */
5640
bb20fd44 5641 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5642 vFAIL("Regexp *+ operand could be empty");
b81d288d 5643#endif
bb20fd44 5644
fac92740 5645 parse_start = RExC_parse;
830247a4 5646 nextchar(pRExC_state);
a0d0e21e 5647
821b33a5 5648 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5649
5650 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5651 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5652 ret->flags = 0;
830247a4 5653 RExC_naughty += 4;
a0d0e21e
LW
5654 }
5655 else if (op == '*') {
5656 min = 0;
5657 goto do_curly;
a0ed51b3
LW
5658 }
5659 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5660 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5661 ret->flags = 0;
830247a4 5662 RExC_naughty += 3;
a0d0e21e
LW
5663 }
5664 else if (op == '+') {
5665 min = 1;
5666 goto do_curly;
a0ed51b3
LW
5667 }
5668 else if (op == '?') {
a0d0e21e
LW
5669 min = 0; max = 1;
5670 goto do_curly;
5671 }
5672 nest_check:
041457d9 5673 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5674 vWARN3(RExC_parse,
b45f050a 5675 "%.*s matches null string many times",
afd78fd5 5676 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5677 origparse);
a0d0e21e
LW
5678 }
5679
b9b4dddf 5680 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5681 nextchar(pRExC_state);
6bda09f9 5682 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5683 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5684 }
b9b4dddf
YO
5685#ifndef REG_ALLOW_MINMOD_SUSPEND
5686 else
5687#endif
5688 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5689 regnode *ender;
5690 nextchar(pRExC_state);
5691 ender = reg_node(pRExC_state, SUCCEED);
5692 REGTAIL(pRExC_state, ret, ender);
5693 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5694 ret->flags = 0;
5695 ender = reg_node(pRExC_state, TAIL);
5696 REGTAIL(pRExC_state, ret, ender);
5697 /*ret= ender;*/
5698 }
5699
5700 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5701 RExC_parse++;
b45f050a
JF
5702 vFAIL("Nested quantifiers");
5703 }
a0d0e21e
LW
5704
5705 return(ret);
a687059c
LW
5706}
5707
fc8cd66c
YO
5708
5709/* reg_namedseq(pRExC_state,UVp)
5710
5711 This is expected to be called by a parser routine that has
5712 recognized'\N' and needs to handle the rest. RExC_parse is
5713 expected to point at the first char following the N at the time
5714 of the call.
5715
5716 If valuep is non-null then it is assumed that we are parsing inside
5717 of a charclass definition and the first codepoint in the resolved
5718 string is returned via *valuep and the routine will return NULL.
5719 In this mode if a multichar string is returned from the charnames
5720 handler a warning will be issued, and only the first char in the
5721 sequence will be examined. If the string returned is zero length
5722 then the value of *valuep is undefined and NON-NULL will
5723 be returned to indicate failure. (This will NOT be a valid pointer
5724 to a regnode.)
5725
5726 If value is null then it is assumed that we are parsing normal text
5727 and inserts a new EXACT node into the program containing the resolved
5728 string and returns a pointer to the new node. If the string is
5729 zerolength a NOTHING node is emitted.
5730
5731 On success RExC_parse is set to the char following the endbrace.
5732 Parsing failures will generate a fatal errorvia vFAIL(...)
5733
5734 NOTE: We cache all results from the charnames handler locally in
5735 the RExC_charnames hash (created on first use) to prevent a charnames
5736 handler from playing silly-buggers and returning a short string and
5737 then a long string for a given pattern. Since the regexp program
5738 size is calculated during an initial parse this would result
5739 in a buffer overrun so we cache to prevent the charname result from
5740 changing during the course of the parse.
5741
5742 */
5743STATIC regnode *
5744S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5745{
5746 char * name; /* start of the content of the name */
5747 char * endbrace; /* endbrace following the name */
5748 SV *sv_str = NULL;
5749 SV *sv_name = NULL;
5750 STRLEN len; /* this has various purposes throughout the code */
5751 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5752 regnode *ret = NULL;
5753
5754 if (*RExC_parse != '{') {
5755 vFAIL("Missing braces on \\N{}");
5756 }
5757 name = RExC_parse+1;
5758 endbrace = strchr(RExC_parse, '}');
5759 if ( ! endbrace ) {
5760 RExC_parse++;
5761 vFAIL("Missing right brace on \\N{}");
5762 }
5763 RExC_parse = endbrace + 1;
5764
5765
5766 /* RExC_parse points at the beginning brace,
5767 endbrace points at the last */
5768 if ( name[0]=='U' && name[1]=='+' ) {
5769 /* its a "unicode hex" notation {U+89AB} */
5770 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5771 | PERL_SCAN_DISALLOW_PREFIX
5772 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5773 UV cp;
196f1508 5774 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5775 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5776 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5777 cp = 0xFFFD;
5778 }
5779 if (cp > 0xff)
5780 RExC_utf8 = 1;
5781 if ( valuep ) {
5782 *valuep = cp;
5783 return NULL;
5784 }
5785 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5786 } else {
5787 /* fetch the charnames handler for this scope */
5788 HV * const table = GvHV(PL_hintgv);
5789 SV **cvp= table ?
5790 hv_fetchs(table, "charnames", FALSE) :
5791 NULL;
5792 SV *cv= cvp ? *cvp : NULL;
5793 HE *he_str;
5794 int count;
5795 /* create an SV with the name as argument */
5796 sv_name = newSVpvn(name, endbrace - name);
5797
5798 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5799 vFAIL2("Constant(\\N{%s}) unknown: "
5800 "(possibly a missing \"use charnames ...\")",
5801 SvPVX(sv_name));
5802 }
5803 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5804 vFAIL2("Constant(\\N{%s}): "
5805 "$^H{charnames} is not defined",SvPVX(sv_name));
5806 }
5807
5808
5809
5810 if (!RExC_charnames) {
5811 /* make sure our cache is allocated */
5812 RExC_charnames = newHV();
6bda09f9 5813 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5814 }
5815 /* see if we have looked this one up before */
5816 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5817 if ( he_str ) {
5818 sv_str = HeVAL(he_str);
5819 cached = 1;
5820 } else {
5821 dSP ;
5822
5823 ENTER ;
5824 SAVETMPS ;
5825 PUSHMARK(SP) ;
5826
5827 XPUSHs(sv_name);
5828
5829 PUTBACK ;
5830
5831 count= call_sv(cv, G_SCALAR);
5832
5833 if (count == 1) { /* XXXX is this right? dmq */
5834 sv_str = POPs;
5835 SvREFCNT_inc_simple_void(sv_str);
5836 }
5837
5838 SPAGAIN ;
5839 PUTBACK ;
5840 FREETMPS ;
5841 LEAVE ;
5842
5843 if ( !sv_str || !SvOK(sv_str) ) {
5844 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5845 "did not return a defined value",SvPVX(sv_name));
5846 }
5847 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5848 cached = 1;
5849 }
5850 }
5851 if (valuep) {
5852 char *p = SvPV(sv_str, len);
5853 if (len) {
5854 STRLEN numlen = 1;
5855 if ( SvUTF8(sv_str) ) {
196f1508 5856 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5857 if (*valuep > 0x7F)
5858 RExC_utf8 = 1;
5859 /* XXXX
5860 We have to turn on utf8 for high bit chars otherwise
5861 we get failures with
5862
5863 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5864 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5865
5866 This is different from what \x{} would do with the same
5867 codepoint, where the condition is > 0xFF.
5868 - dmq
5869 */
5870
5871
5872 } else {
5873 *valuep = (UV)*p;
5874 /* warn if we havent used the whole string? */
5875 }
5876 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5877 vWARN2(RExC_parse,
5878 "Ignoring excess chars from \\N{%s} in character class",
5879 SvPVX(sv_name)
5880 );
5881 }
5882 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5883 vWARN2(RExC_parse,
5884 "Ignoring zero length \\N{%s} in character class",
5885 SvPVX(sv_name)
5886 );
5887 }
5888 if (sv_name)
5889 SvREFCNT_dec(sv_name);
5890 if (!cached)
5891 SvREFCNT_dec(sv_str);
5892 return len ? NULL : (regnode *)&len;
5893 } else if(SvCUR(sv_str)) {
5894
5895 char *s;
5896 char *p, *pend;
5897 STRLEN charlen = 1;
5898 char * parse_start = name-3; /* needed for the offsets */
5899 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5900
5901 ret = reg_node(pRExC_state,
5902 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5903 s= STRING(ret);
5904
5905 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5906 sv_utf8_upgrade(sv_str);
5907 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5908 RExC_utf8= 1;
5909 }
5910
5911 p = SvPV(sv_str, len);
5912 pend = p + len;
5913 /* len is the length written, charlen is the size the char read */
5914 for ( len = 0; p < pend; p += charlen ) {
5915 if (UTF) {
196f1508 5916 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5917 if (FOLD) {
5918 STRLEN foldlen,numlen;
5919 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5920 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5921 /* Emit all the Unicode characters. */
5922
5923 for (foldbuf = tmpbuf;
5924 foldlen;
5925 foldlen -= numlen)
5926 {
5927 uvc = utf8_to_uvchr(foldbuf, &numlen);
5928 if (numlen > 0) {
5929 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5930 s += unilen;
5931 len += unilen;
5932 /* In EBCDIC the numlen
5933 * and unilen can differ. */
5934 foldbuf += numlen;
5935 if (numlen >= foldlen)
5936 break;
5937 }
5938 else
5939 break; /* "Can't happen." */
5940 }
5941 } else {
5942 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5943 if (unilen > 0) {
5944 s += unilen;
5945 len += unilen;
5946 }
5947 }
5948 } else {
5949 len++;
5950 REGC(*p, s++);
5951 }
5952 }
5953 if (SIZE_ONLY) {
5954 RExC_size += STR_SZ(len);
5955 } else {
5956 STR_LEN(ret) = len;
5957 RExC_emit += STR_SZ(len);
5958 }
5959 Set_Node_Cur_Length(ret); /* MJD */
5960 RExC_parse--;
5961 nextchar(pRExC_state);
5962 } else {
5963 ret = reg_node(pRExC_state,NOTHING);
5964 }
5965 if (!cached) {
5966 SvREFCNT_dec(sv_str);
5967 }
5968 if (sv_name) {
5969 SvREFCNT_dec(sv_name);
5970 }
5971 return ret;
5972
5973}
5974
5975
9e08bc66
TS
5976/*
5977 * reg_recode
5978 *
5979 * It returns the code point in utf8 for the value in *encp.
5980 * value: a code value in the source encoding
5981 * encp: a pointer to an Encode object
5982 *
5983 * If the result from Encode is not a single character,
5984 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5985 */
5986STATIC UV
5987S_reg_recode(pTHX_ const char value, SV **encp)
5988{
5989 STRLEN numlen = 1;
5990 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5991 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5992 : SvPVX(sv);
5993 const STRLEN newlen = SvCUR(sv);
5994 UV uv = UNICODE_REPLACEMENT;
5995
5996 if (newlen)
5997 uv = SvUTF8(sv)
5998 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5999 : *(U8*)s;
6000
6001 if (!newlen || numlen != newlen) {
6002 uv = UNICODE_REPLACEMENT;
6003 if (encp)
6004 *encp = NULL;
6005 }
6006 return uv;
6007}
6008
fc8cd66c 6009
a687059c
LW
6010/*
6011 - regatom - the lowest level
6012 *
6013 * Optimization: gobbles an entire sequence of ordinary characters so that
6014 * it can turn them into a single node, which is smaller to store and
6015 * faster to run. Backslashed characters are exceptions, each becoming a
6016 * separate node; the code is simpler that way and it's not worth fixing.
6017 *
7f6f358c
YO
6018 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6019 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6020 */
76e3520e 6021STATIC regnode *
3dab1dad 6022S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6023{
97aff369 6024 dVAR;
cbbf8932 6025 register regnode *ret = NULL;
a0d0e21e 6026 I32 flags;
45948336 6027 char *parse_start = RExC_parse;
3dab1dad
YO
6028 GET_RE_DEBUG_FLAGS_DECL;
6029 DEBUG_PARSE("atom");
a0d0e21e
LW
6030 *flagp = WORST; /* Tentatively. */
6031
6032tryagain:
830247a4 6033 switch (*RExC_parse) {
a0d0e21e 6034 case '^':
830247a4
IZ
6035 RExC_seen_zerolen++;
6036 nextchar(pRExC_state);
e2509266 6037 if (RExC_flags & PMf_MULTILINE)
830247a4 6038 ret = reg_node(pRExC_state, MBOL);
e2509266 6039 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6040 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6041 else
830247a4 6042 ret = reg_node(pRExC_state, BOL);
fac92740 6043 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6044 break;
6045 case '$':
830247a4 6046 nextchar(pRExC_state);
b81d288d 6047 if (*RExC_parse)
830247a4 6048 RExC_seen_zerolen++;
e2509266 6049 if (RExC_flags & PMf_MULTILINE)
830247a4 6050 ret = reg_node(pRExC_state, MEOL);
e2509266 6051 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6052 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6053 else
830247a4 6054 ret = reg_node(pRExC_state, EOL);
fac92740 6055 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6056 break;
6057 case '.':
830247a4 6058 nextchar(pRExC_state);
e2509266 6059 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
6060 ret = reg_node(pRExC_state, SANY);
6061 else
6062 ret = reg_node(pRExC_state, REG_ANY);
6063 *flagp |= HASWIDTH|SIMPLE;
830247a4 6064 RExC_naughty++;
fac92740 6065 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6066 break;
6067 case '[':
b45f050a 6068 {
3dab1dad
YO
6069 char * const oregcomp_parse = ++RExC_parse;
6070 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6071 if (*RExC_parse != ']') {
6072 RExC_parse = oregcomp_parse;
b45f050a
JF
6073 vFAIL("Unmatched [");
6074 }
830247a4 6075 nextchar(pRExC_state);
a0d0e21e 6076 *flagp |= HASWIDTH|SIMPLE;
fac92740 6077 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6078 break;
b45f050a 6079 }
a0d0e21e 6080 case '(':
830247a4 6081 nextchar(pRExC_state);
3dab1dad 6082 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6083 if (ret == NULL) {
bf93d4cc 6084 if (flags & TRYAGAIN) {
830247a4 6085 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6086 /* Make parent create an empty node if needed. */
6087 *flagp |= TRYAGAIN;
6088 return(NULL);
6089 }
a0d0e21e 6090 goto tryagain;
bf93d4cc 6091 }
a0d0e21e
LW
6092 return(NULL);
6093 }
c277df42 6094 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6095 break;
6096 case '|':
6097 case ')':
6098 if (flags & TRYAGAIN) {
6099 *flagp |= TRYAGAIN;
6100 return NULL;
6101 }
b45f050a 6102 vFAIL("Internal urp");
a0d0e21e
LW
6103 /* Supposed to be caught earlier. */
6104 break;
85afd4ae 6105 case '{':
830247a4
IZ
6106 if (!regcurly(RExC_parse)) {
6107 RExC_parse++;
85afd4ae
CS
6108 goto defchar;
6109 }
6110 /* FALL THROUGH */
a0d0e21e
LW
6111 case '?':
6112 case '+':
6113 case '*':
830247a4 6114 RExC_parse++;
b45f050a 6115 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6116 break;
6117 case '\\':
830247a4 6118 switch (*++RExC_parse) {
a0d0e21e 6119 case 'A':
830247a4
IZ
6120 RExC_seen_zerolen++;
6121 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6122 *flagp |= SIMPLE;
830247a4 6123 nextchar(pRExC_state);
fac92740 6124 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6125 break;
6126 case 'G':
830247a4
IZ
6127 ret = reg_node(pRExC_state, GPOS);
6128 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6129 *flagp |= SIMPLE;
830247a4 6130 nextchar(pRExC_state);
fac92740 6131 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6132 break;
6133 case 'Z':
830247a4 6134 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6135 *flagp |= SIMPLE;
a1917ab9 6136 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6137 nextchar(pRExC_state);
a0d0e21e 6138 break;
b85d18e9 6139 case 'z':
830247a4 6140 ret = reg_node(pRExC_state, EOS);
b85d18e9 6141 *flagp |= SIMPLE;
830247a4
IZ
6142 RExC_seen_zerolen++; /* Do not optimize RE away */
6143 nextchar(pRExC_state);
fac92740 6144 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6145 break;
4a2d328f 6146 case 'C':
f33976b4
DB
6147 ret = reg_node(pRExC_state, CANY);
6148 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6149 *flagp |= HASWIDTH|SIMPLE;
830247a4 6150 nextchar(pRExC_state);
fac92740 6151 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6152 break;
6153 case 'X':
830247a4 6154 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6155 *flagp |= HASWIDTH;
830247a4 6156 nextchar(pRExC_state);
fac92740 6157 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6158 break;
a0d0e21e 6159 case 'w':
eb160463 6160 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6161 *flagp |= HASWIDTH|SIMPLE;
830247a4 6162 nextchar(pRExC_state);
fac92740 6163 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6164 break;
6165 case 'W':
eb160463 6166 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6167 *flagp |= HASWIDTH|SIMPLE;
830247a4 6168 nextchar(pRExC_state);
fac92740 6169 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6170 break;
6171 case 'b':
830247a4
IZ
6172 RExC_seen_zerolen++;
6173 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6174 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6175 *flagp |= SIMPLE;
830247a4 6176 nextchar(pRExC_state);
fac92740 6177 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6178 break;
6179 case 'B':
830247a4
IZ
6180 RExC_seen_zerolen++;
6181 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6182 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6183 *flagp |= SIMPLE;
830247a4 6184 nextchar(pRExC_state);
fac92740 6185 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6186 break;
6187 case 's':
eb160463 6188 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6189 *flagp |= HASWIDTH|SIMPLE;
830247a4 6190 nextchar(pRExC_state);
fac92740 6191 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6192 break;
6193 case 'S':
eb160463 6194 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6195 *flagp |= HASWIDTH|SIMPLE;
830247a4 6196 nextchar(pRExC_state);
fac92740 6197 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6198 break;
6199 case 'd':
ffc61ed2 6200 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6201 *flagp |= HASWIDTH|SIMPLE;
830247a4 6202 nextchar(pRExC_state);
fac92740 6203 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6204 break;
6205 case 'D':
ffc61ed2 6206 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6207 *flagp |= HASWIDTH|SIMPLE;
830247a4 6208 nextchar(pRExC_state);
fac92740 6209 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6210 break;
a14b48bc
LW
6211 case 'p':
6212 case 'P':
3568d838 6213 {
3dab1dad 6214 char* const oldregxend = RExC_end;
ccb2c380 6215 char* parse_start = RExC_parse - 2;
a14b48bc 6216
830247a4 6217 if (RExC_parse[1] == '{') {
3568d838 6218 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6219 RExC_end = strchr(RExC_parse, '}');
6220 if (!RExC_end) {
3dab1dad 6221 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6222 RExC_parse += 2;
6223 RExC_end = oldregxend;
0da60cf5 6224 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6225 }
830247a4 6226 RExC_end++;
a14b48bc 6227 }
af6f566e 6228 else {
830247a4 6229 RExC_end = RExC_parse + 2;
af6f566e
HS
6230 if (RExC_end > oldregxend)
6231 RExC_end = oldregxend;
6232 }
830247a4 6233 RExC_parse--;
a14b48bc 6234
3dab1dad 6235 ret = regclass(pRExC_state,depth+1);
a14b48bc 6236
830247a4
IZ
6237 RExC_end = oldregxend;
6238 RExC_parse--;
ccb2c380
MP
6239
6240 Set_Node_Offset(ret, parse_start + 2);
6241 Set_Node_Cur_Length(ret);
830247a4 6242 nextchar(pRExC_state);
a14b48bc
LW
6243 *flagp |= HASWIDTH|SIMPLE;
6244 }
6245 break;
fc8cd66c
YO
6246 case 'N':
6247 /* Handle \N{NAME} here and not below because it can be
6248 multicharacter. join_exact() will join them up later on.
6249 Also this makes sure that things like /\N{BLAH}+/ and
6250 \N{BLAH} being multi char Just Happen. dmq*/
6251 ++RExC_parse;
6252 ret= reg_namedseq(pRExC_state, NULL);
6253 break;
0a4db386 6254 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6255 {
6256 char ch= RExC_parse[1];
6257 if (ch != '<' && ch != '\'') {
6258 if (SIZE_ONLY)
6259 vWARN( RExC_parse + 1,
6260 "Possible broken named back reference treated as literal k");
6261 parse_start--;
6262 goto defchar;
6263 } else {
6264 char* name_start = (RExC_parse += 2);
6265 I32 num = 0;
0a4db386
YO
6266 SV *sv_dat = reg_scan_name(pRExC_state,
6267 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6268 ch= (ch == '<') ? '>' : '\'';
6269
81714fb9
YO
6270 if (RExC_parse == name_start || *RExC_parse != ch)
6271 vFAIL2("Sequence \\k%c... not terminated",
6272 (ch == '>' ? '<' : ch));
6273
6274 RExC_sawback = 1;
6275 ret = reganode(pRExC_state,
6276 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6277 num);
6278 *flagp |= HASWIDTH;
6279
6280
6281 if (!SIZE_ONLY) {
81714fb9
YO
6282 num = add_data( pRExC_state, 1, "S" );
6283 ARG_SET(ret,num);
6284 RExC_rx->data->data[num]=(void*)sv_dat;
6285 SvREFCNT_inc(sv_dat);
6286 }
6287 /* override incorrect value set in reganode MJD */
6288 Set_Node_Offset(ret, parse_start+1);
6289 Set_Node_Cur_Length(ret); /* MJD */
6290 nextchar(pRExC_state);
6291
6292 }
6293 break;
6294 }
a0d0e21e
LW
6295 case 'n':
6296 case 'r':
6297 case 't':
6298 case 'f':
6299 case 'e':
6300 case 'a':
6301 case 'x':
6302 case 'c':
6303 case '0':
6304 goto defchar;
c74340f9 6305 case 'R':
a0d0e21e
LW
6306 case '1': case '2': case '3': case '4':
6307 case '5': case '6': case '7': case '8': case '9':
6308 {
c74340f9
YO
6309 I32 num;
6310 bool isrel=(*RExC_parse=='R');
6311 if (isrel)
6312 RExC_parse++;
6313 num = atoi(RExC_parse);
6314 if (isrel) {
6315 num = RExC_cpar - num;
6316 if (num < 1)
6317 vFAIL("Reference to nonexistent or unclosed group");
6318 }
830247a4 6319 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6320 goto defchar;
6321 else {
3dab1dad 6322 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6323 while (isDIGIT(*RExC_parse))
6324 RExC_parse++;
b45f050a 6325
c74340f9
YO
6326 if (!SIZE_ONLY) {
6327 if (num > (I32)RExC_rx->nparens)
6328 vFAIL("Reference to nonexistent group");
6329 /* People make this error all the time apparently.
6330 So we cant fail on it, even though we should
6331
6332 else if (num >= RExC_cpar)
6333 vFAIL("Reference to unclosed group will always match");
6334 */
6335 }
830247a4 6336 RExC_sawback = 1;
eb160463
GS
6337 ret = reganode(pRExC_state,
6338 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6339 num);
a0d0e21e 6340 *flagp |= HASWIDTH;
2af232bd 6341
fac92740 6342 /* override incorrect value set in reganode MJD */
2af232bd 6343 Set_Node_Offset(ret, parse_start+1);
fac92740 6344 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6345 RExC_parse--;
6346 nextchar(pRExC_state);
a0d0e21e
LW
6347 }
6348 }
6349 break;
6350 case '\0':
830247a4 6351 if (RExC_parse >= RExC_end)
b45f050a 6352 FAIL("Trailing \\");
a0d0e21e
LW
6353 /* FALL THROUGH */
6354 default:
a0288114 6355 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6356 back into the quick-grab loop below */
45948336 6357 parse_start--;
a0d0e21e
LW
6358 goto defchar;
6359 }
6360 break;
4633a7c4
LW
6361
6362 case '#':
e2509266 6363 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6364 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6365 RExC_parse++;
830247a4 6366 if (RExC_parse < RExC_end)
4633a7c4
LW
6367 goto tryagain;
6368 }
6369 /* FALL THROUGH */
6370
a0d0e21e 6371 default: {
ba210ebe 6372 register STRLEN len;
58ae7d3f 6373 register UV ender;
a0d0e21e 6374 register char *p;
3dab1dad 6375 char *s;
80aecb99 6376 STRLEN foldlen;
89ebb4a3 6377 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6378
6379 parse_start = RExC_parse - 1;
a0d0e21e 6380
830247a4 6381 RExC_parse++;
a0d0e21e
LW
6382
6383 defchar:
58ae7d3f 6384 ender = 0;
eb160463
GS
6385 ret = reg_node(pRExC_state,
6386 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6387 s = STRING(ret);
830247a4
IZ
6388 for (len = 0, p = RExC_parse - 1;
6389 len < 127 && p < RExC_end;
a0d0e21e
LW
6390 len++)
6391 {
3dab1dad 6392 char * const oldp = p;
5b5a24f7 6393
e2509266 6394 if (RExC_flags & PMf_EXTENDED)
830247a4 6395 p = regwhite(p, RExC_end);
a0d0e21e
LW
6396 switch (*p) {
6397 case '^':
6398 case '$':
6399 case '.':
6400 case '[':
6401 case '(':
6402 case ')':
6403 case '|':
6404 goto loopdone;
6405 case '\\':
6406 switch (*++p) {
6407 case 'A':
1ed8eac0
JF
6408 case 'C':
6409 case 'X':
a0d0e21e
LW
6410 case 'G':
6411 case 'Z':
b85d18e9 6412 case 'z':
a0d0e21e
LW
6413 case 'w':
6414 case 'W':
6415 case 'b':
6416 case 'B':
6417 case 's':
6418 case 'S':
6419 case 'd':
6420 case 'D':
a14b48bc
LW
6421 case 'p':
6422 case 'P':
fc8cd66c 6423 case 'N':
c74340f9 6424 case 'R':
a0d0e21e
LW
6425 --p;
6426 goto loopdone;
6427 case 'n':
6428 ender = '\n';
6429 p++;
a687059c 6430 break;
a0d0e21e
LW
6431 case 'r':
6432 ender = '\r';
6433 p++;
a687059c 6434 break;
a0d0e21e
LW
6435 case 't':
6436 ender = '\t';
6437 p++;
a687059c 6438 break;
a0d0e21e
LW
6439 case 'f':
6440 ender = '\f';
6441 p++;
a687059c 6442 break;
a0d0e21e 6443 case 'e':
c7f1f016 6444 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6445 p++;
a687059c 6446 break;
a0d0e21e 6447 case 'a':
c7f1f016 6448 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6449 p++;
a687059c 6450 break;
a0d0e21e 6451 case 'x':
a0ed51b3 6452 if (*++p == '{') {
1df70142 6453 char* const e = strchr(p, '}');
b81d288d 6454
b45f050a 6455 if (!e) {
830247a4 6456 RExC_parse = p + 1;
b45f050a
JF
6457 vFAIL("Missing right brace on \\x{}");
6458 }
de5f0749 6459 else {
a4c04bdc
NC
6460 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6461 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6462 STRLEN numlen = e - p - 1;
53305cf1 6463 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6464 if (ender > 0xff)
6465 RExC_utf8 = 1;
a0ed51b3
LW
6466 p = e + 1;
6467 }
a0ed51b3
LW
6468 }
6469 else {
a4c04bdc 6470 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6471 STRLEN numlen = 2;
53305cf1 6472 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6473 p += numlen;
6474 }
9e08bc66
TS
6475 if (PL_encoding && ender < 0x100)
6476 goto recode_encoding;
a687059c 6477 break;
a0d0e21e
LW
6478 case 'c':
6479 p++;
bbce6d69 6480 ender = UCHARAT(p++);
6481 ender = toCTRL(ender);
a687059c 6482 break;
a0d0e21e
LW
6483 case '0': case '1': case '2': case '3':case '4':
6484 case '5': case '6': case '7': case '8':case '9':
6485 if (*p == '0' ||
830247a4 6486 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6487 I32 flags = 0;
1df70142 6488 STRLEN numlen = 3;
53305cf1 6489 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6490 p += numlen;
6491 }
6492 else {
6493 --p;
6494 goto loopdone;
a687059c 6495 }
9e08bc66
TS
6496 if (PL_encoding && ender < 0x100)
6497 goto recode_encoding;
6498 break;
6499 recode_encoding:
6500 {
6501 SV* enc = PL_encoding;
6502 ender = reg_recode((const char)(U8)ender, &enc);
6503 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6504 vWARN(p, "Invalid escape in the specified encoding");
6505 RExC_utf8 = 1;
6506 }
a687059c 6507 break;
a0d0e21e 6508 case '\0':
830247a4 6509 if (p >= RExC_end)
b45f050a 6510 FAIL("Trailing \\");
a687059c 6511 /* FALL THROUGH */
a0d0e21e 6512 default:
041457d9 6513 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6514 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6515 goto normal_default;
a0d0e21e
LW
6516 }
6517 break;
a687059c 6518 default:
a0ed51b3 6519 normal_default:
fd400ab9 6520 if (UTF8_IS_START(*p) && UTF) {
1df70142 6521 STRLEN numlen;
5e12f4fb 6522 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6523 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6524 p += numlen;
6525 }
6526 else
6527 ender = *p++;
a0d0e21e 6528 break;
a687059c 6529 }
e2509266 6530 if (RExC_flags & PMf_EXTENDED)
830247a4 6531 p = regwhite(p, RExC_end);
60a8b682
JH
6532 if (UTF && FOLD) {
6533 /* Prime the casefolded buffer. */
ac7e0132 6534 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6535 }
a0d0e21e
LW
6536 if (ISMULT2(p)) { /* Back off on ?+*. */
6537 if (len)
6538 p = oldp;
16ea2a2e 6539 else if (UTF) {
80aecb99 6540 if (FOLD) {
60a8b682 6541 /* Emit all the Unicode characters. */
1df70142 6542 STRLEN numlen;
80aecb99
JH
6543 for (foldbuf = tmpbuf;
6544 foldlen;
6545 foldlen -= numlen) {
6546 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6547 if (numlen > 0) {
71207a34 6548 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6549 s += unilen;
6550 len += unilen;
6551 /* In EBCDIC the numlen
6552 * and unilen can differ. */
9dc45d57 6553 foldbuf += numlen;
47654450
JH
6554 if (numlen >= foldlen)
6555 break;
9dc45d57
JH
6556 }
6557 else
6558 break; /* "Can't happen." */
80aecb99
JH
6559 }
6560 }
6561 else {
71207a34 6562 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6563 if (unilen > 0) {
0ebc6274
JH
6564 s += unilen;
6565 len += unilen;
9dc45d57 6566 }
80aecb99 6567 }
a0ed51b3 6568 }
a0d0e21e
LW
6569 else {
6570 len++;
eb160463 6571 REGC((char)ender, s++);
a0d0e21e
LW
6572 }
6573 break;
a687059c 6574 }
16ea2a2e 6575 if (UTF) {
80aecb99 6576 if (FOLD) {
60a8b682 6577 /* Emit all the Unicode characters. */
1df70142 6578 STRLEN numlen;
80aecb99
JH
6579 for (foldbuf = tmpbuf;
6580 foldlen;
6581 foldlen -= numlen) {
6582 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6583 if (numlen > 0) {
71207a34 6584 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6585 len += unilen;
6586 s += unilen;
6587 /* In EBCDIC the numlen
6588 * and unilen can differ. */
9dc45d57 6589 foldbuf += numlen;
47654450
JH
6590 if (numlen >= foldlen)
6591 break;
9dc45d57
JH
6592 }
6593 else
6594 break;
80aecb99
JH
6595 }
6596 }
6597 else {
71207a34 6598 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6599 if (unilen > 0) {
0ebc6274
JH
6600 s += unilen;
6601 len += unilen;
9dc45d57 6602 }
80aecb99
JH
6603 }
6604 len--;
a0ed51b3
LW
6605 }
6606 else
eb160463 6607 REGC((char)ender, s++);
a0d0e21e
LW
6608 }
6609 loopdone:
830247a4 6610 RExC_parse = p - 1;
fac92740 6611 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6612 nextchar(pRExC_state);
793db0cb
JH
6613 {
6614 /* len is STRLEN which is unsigned, need to copy to signed */
6615 IV iv = len;
6616 if (iv < 0)
6617 vFAIL("Internal disaster");
6618 }
a0d0e21e
LW
6619 if (len > 0)
6620 *flagp |= HASWIDTH;
090f7165 6621 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6622 *flagp |= SIMPLE;
3dab1dad 6623
cd439c50 6624 if (SIZE_ONLY)
830247a4 6625 RExC_size += STR_SZ(len);
3dab1dad
YO
6626 else {
6627 STR_LEN(ret) = len;
830247a4 6628 RExC_emit += STR_SZ(len);
07be1b83 6629 }
3dab1dad 6630 }
a0d0e21e
LW
6631 break;
6632 }
a687059c 6633
a0d0e21e 6634 return(ret);
a687059c
LW
6635}
6636
873ef191 6637STATIC char *
5f66b61c 6638S_regwhite(char *p, const char *e)
5b5a24f7
CS
6639{
6640 while (p < e) {
6641 if (isSPACE(*p))
6642 ++p;
6643 else if (*p == '#') {
6644 do {
6645 p++;
6646 } while (p < e && *p != '\n');
6647 }
6648 else
6649 break;
6650 }
6651 return p;
6652}
6653
b8c5462f
JH
6654/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6655 Character classes ([:foo:]) can also be negated ([:^foo:]).
6656 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6657 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6658 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6659
6660#define POSIXCC_DONE(c) ((c) == ':')
6661#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6662#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6663
b8c5462f 6664STATIC I32
830247a4 6665S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6666{
97aff369 6667 dVAR;
936ed897 6668 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6669
830247a4 6670 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6671 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6672 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6673 const char c = UCHARAT(RExC_parse);
097eb12c 6674 char* const s = RExC_parse++;
b81d288d 6675
9a86a77b 6676 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6677 RExC_parse++;
6678 if (RExC_parse == RExC_end)
620e46c5 6679 /* Grandfather lone [:, [=, [. */
830247a4 6680 RExC_parse = s;
620e46c5 6681 else {
3dab1dad 6682 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6683 assert(*t == c);
6684
9a86a77b 6685 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6686 const char *posixcc = s + 1;
830247a4 6687 RExC_parse++; /* skip over the ending ] */
3dab1dad 6688
b8c5462f 6689 if (*s == ':') {
1df70142
AL
6690 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6691 const I32 skip = t - posixcc;
80916619
NC
6692
6693 /* Initially switch on the length of the name. */
6694 switch (skip) {
6695 case 4:
3dab1dad
YO
6696 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6697 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6698 break;
80916619
NC
6699 case 5:
6700 /* Names all of length 5. */
6701 /* alnum alpha ascii blank cntrl digit graph lower
6702 print punct space upper */
6703 /* Offset 4 gives the best switch position. */
6704 switch (posixcc[4]) {
6705 case 'a':
3dab1dad
YO
6706 if (memEQ(posixcc, "alph", 4)) /* alpha */
6707 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6708 break;
6709 case 'e':
3dab1dad
YO
6710 if (memEQ(posixcc, "spac", 4)) /* space */
6711 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6712 break;
6713 case 'h':
3dab1dad
YO
6714 if (memEQ(posixcc, "grap", 4)) /* graph */
6715 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6716 break;
6717 case 'i':
3dab1dad
YO
6718 if (memEQ(posixcc, "asci", 4)) /* ascii */
6719 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6720 break;
6721 case 'k':
3dab1dad
YO
6722 if (memEQ(posixcc, "blan", 4)) /* blank */
6723 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6724 break;
6725 case 'l':
3dab1dad
YO
6726 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6727 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6728 break;
6729 case 'm':
3dab1dad
YO
6730 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6731 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6732 break;
6733 case 'r':
3dab1dad
YO
6734 if (memEQ(posixcc, "lowe", 4)) /* lower */
6735 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6736 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6737 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6738 break;
6739 case 't':
3dab1dad
YO
6740 if (memEQ(posixcc, "digi", 4)) /* digit */
6741 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6742 else if (memEQ(posixcc, "prin", 4)) /* print */
6743 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6744 else if (memEQ(posixcc, "punc", 4)) /* punct */
6745 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6746 break;
b8c5462f
JH
6747 }
6748 break;
80916619 6749 case 6:
3dab1dad
YO
6750 if (memEQ(posixcc, "xdigit", 6))
6751 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6752 break;
6753 }
80916619
NC
6754
6755 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6756 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6757 t - s - 1, s + 1);
80916619
NC
6758 assert (posixcc[skip] == ':');
6759 assert (posixcc[skip+1] == ']');
b45f050a 6760 } else if (!SIZE_ONLY) {
b8c5462f 6761 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6762
830247a4 6763 /* adjust RExC_parse so the warning shows after
b45f050a 6764 the class closes */
9a86a77b 6765 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6766 RExC_parse++;
b45f050a
JF
6767 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6768 }
b8c5462f
JH
6769 } else {
6770 /* Maternal grandfather:
6771 * "[:" ending in ":" but not in ":]" */
830247a4 6772 RExC_parse = s;
767d463e 6773 }
620e46c5
JH
6774 }
6775 }
6776
b8c5462f
JH
6777 return namedclass;
6778}
6779
6780STATIC void
830247a4 6781S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6782{
97aff369 6783 dVAR;
3dab1dad 6784 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6785 const char *s = RExC_parse;
6786 const char c = *s++;
b8c5462f 6787
3dab1dad 6788 while (isALNUM(*s))
b8c5462f
JH
6789 s++;
6790 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6791 if (ckWARN(WARN_REGEXP))
6792 vWARN3(s+2,
6793 "POSIX syntax [%c %c] belongs inside character classes",
6794 c, c);
b45f050a
JF
6795
6796 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6797 if (POSIXCC_NOTYET(c)) {
830247a4 6798 /* adjust RExC_parse so the error shows after
b45f050a 6799 the class closes */
9a86a77b 6800 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6801 NOOP;
b45f050a
JF
6802 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6803 }
b8c5462f
JH
6804 }
6805 }
620e46c5
JH
6806}
6807
7f6f358c
YO
6808
6809/*
6810 parse a class specification and produce either an ANYOF node that
6811 matches the pattern. If the pattern matches a single char only and
6812 that char is < 256 then we produce an EXACT node instead.
6813*/
76e3520e 6814STATIC regnode *
3dab1dad 6815S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6816{
97aff369 6817 dVAR;
9ef43ace 6818 register UV value = 0;
9a86a77b 6819 register UV nextvalue;
3568d838 6820 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6821 register IV range = 0;
c277df42 6822 register regnode *ret;
ba210ebe 6823 STRLEN numlen;
ffc61ed2 6824 IV namedclass;
cbbf8932 6825 char *rangebegin = NULL;
936ed897 6826 bool need_class = 0;
c445ea15 6827 SV *listsv = NULL;
ffc61ed2 6828 UV n;
9e55ce06 6829 bool optimize_invert = TRUE;
cbbf8932 6830 AV* unicode_alternate = NULL;
1b2d223b
JH
6831#ifdef EBCDIC
6832 UV literal_endpoint = 0;
6833#endif
7f6f358c 6834 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6835
3dab1dad 6836 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6837 case we need to change the emitted regop to an EXACT. */
07be1b83 6838 const char * orig_parse = RExC_parse;
72f13be8 6839 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6840#ifndef DEBUGGING
6841 PERL_UNUSED_ARG(depth);
6842#endif
72f13be8 6843
3dab1dad 6844 DEBUG_PARSE("clas");
7f6f358c
YO
6845
6846 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6847 ret = reganode(pRExC_state, ANYOF, 0);
6848
6849 if (!SIZE_ONLY)
6850 ANYOF_FLAGS(ret) = 0;
6851
9a86a77b 6852 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6853 RExC_naughty++;
6854 RExC_parse++;
6855 if (!SIZE_ONLY)
6856 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6857 }
a0d0e21e 6858
73060fc4 6859 if (SIZE_ONLY) {
830247a4 6860 RExC_size += ANYOF_SKIP;
73060fc4
JH
6861 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6862 }
936ed897 6863 else {
830247a4 6864 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6865 if (FOLD)
6866 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6867 if (LOC)
6868 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6869 ANYOF_BITMAP_ZERO(ret);
396482e1 6870 listsv = newSVpvs("# comment\n");
a0d0e21e 6871 }
b8c5462f 6872
9a86a77b
JH
6873 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6874
b938889d 6875 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6876 checkposixcc(pRExC_state);
b8c5462f 6877
f064b6ad
HS
6878 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6879 if (UCHARAT(RExC_parse) == ']')
6880 goto charclassloop;
ffc61ed2 6881
fc8cd66c 6882parseit:
9a86a77b 6883 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6884
6885 charclassloop:
6886
6887 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6888
73b437c8 6889 if (!range)
830247a4 6890 rangebegin = RExC_parse;
ffc61ed2 6891 if (UTF) {
5e12f4fb 6892 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6893 RExC_end - RExC_parse,
9f7f3913 6894 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6895 RExC_parse += numlen;
6896 }
6897 else
6898 value = UCHARAT(RExC_parse++);
7f6f358c 6899
9a86a77b
JH
6900 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6901 if (value == '[' && POSIXCC(nextvalue))
830247a4 6902 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6903 else if (value == '\\') {
ffc61ed2 6904 if (UTF) {
5e12f4fb 6905 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6906 RExC_end - RExC_parse,
9f7f3913 6907 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6908 RExC_parse += numlen;
6909 }
6910 else
6911 value = UCHARAT(RExC_parse++);
470c3474 6912 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6913 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6914 * be a problem later if we want switch on Unicode.
6915 * A similar issue a little bit later when switching on
6916 * namedclass. --jhi */
ffc61ed2 6917 switch ((I32)value) {
b8c5462f
JH
6918 case 'w': namedclass = ANYOF_ALNUM; break;
6919 case 'W': namedclass = ANYOF_NALNUM; break;
6920 case 's': namedclass = ANYOF_SPACE; break;
6921 case 'S': namedclass = ANYOF_NSPACE; break;
6922 case 'd': namedclass = ANYOF_DIGIT; break;
6923 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6924 case 'N': /* Handle \N{NAME} in class */
6925 {
6926 /* We only pay attention to the first char of
6927 multichar strings being returned. I kinda wonder
6928 if this makes sense as it does change the behaviour
6929 from earlier versions, OTOH that behaviour was broken
6930 as well. */
6931 UV v; /* value is register so we cant & it /grrr */
6932 if (reg_namedseq(pRExC_state, &v)) {
6933 goto parseit;
6934 }
6935 value= v;
6936 }
6937 break;
ffc61ed2
JH
6938 case 'p':
6939 case 'P':
3dab1dad
YO
6940 {
6941 char *e;
af6f566e 6942 if (RExC_parse >= RExC_end)
2a4859cd 6943 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6944 if (*RExC_parse == '{') {
1df70142 6945 const U8 c = (U8)value;
ffc61ed2
JH
6946 e = strchr(RExC_parse++, '}');
6947 if (!e)
0da60cf5 6948 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6949 while (isSPACE(UCHARAT(RExC_parse)))
6950 RExC_parse++;
6951 if (e == RExC_parse)
0da60cf5 6952 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6953 n = e - RExC_parse;
ab13f0c7
JH
6954 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6955 n--;
ffc61ed2
JH
6956 }
6957 else {
6958 e = RExC_parse;
6959 n = 1;
6960 }
6961 if (!SIZE_ONLY) {
ab13f0c7
JH
6962 if (UCHARAT(RExC_parse) == '^') {
6963 RExC_parse++;
6964 n--;
6965 value = value == 'p' ? 'P' : 'p'; /* toggle */
6966 while (isSPACE(UCHARAT(RExC_parse))) {
6967 RExC_parse++;
6968 n--;
6969 }
6970 }
097eb12c
AL
6971 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6972 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6973 }
6974 RExC_parse = e + 1;
6975 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6976 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6977 }
f81125e2 6978 break;
b8c5462f
JH
6979 case 'n': value = '\n'; break;
6980 case 'r': value = '\r'; break;
6981 case 't': value = '\t'; break;
6982 case 'f': value = '\f'; break;
6983 case 'b': value = '\b'; break;
c7f1f016
NIS
6984 case 'e': value = ASCII_TO_NATIVE('\033');break;
6985 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6986 case 'x':
ffc61ed2 6987 if (*RExC_parse == '{') {
a4c04bdc
NC
6988 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6989 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6990 char * const e = strchr(RExC_parse++, '}');
b81d288d 6991 if (!e)
ffc61ed2 6992 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6993
6994 numlen = e - RExC_parse;
6995 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6996 RExC_parse = e + 1;
6997 }
6998 else {
a4c04bdc 6999 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7000 numlen = 2;
7001 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7002 RExC_parse += numlen;
7003 }
9e08bc66
TS
7004 if (PL_encoding && value < 0x100)
7005 goto recode_encoding;
b8c5462f
JH
7006 break;
7007 case 'c':
830247a4 7008 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7009 value = toCTRL(value);
7010 break;
7011 case '0': case '1': case '2': case '3': case '4':
7012 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7013 {
7014 I32 flags = 0;
7015 numlen = 3;
7016 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7017 RExC_parse += numlen;
7018 if (PL_encoding && value < 0x100)
7019 goto recode_encoding;
7020 break;
7021 }
7022 recode_encoding:
7023 {
7024 SV* enc = PL_encoding;
7025 value = reg_recode((const char)(U8)value, &enc);
7026 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7027 vWARN(RExC_parse,
7028 "Invalid escape in the specified encoding");
7029 break;
7030 }
1028017a 7031 default:
041457d9 7032 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7033 vWARN2(RExC_parse,
7034 "Unrecognized escape \\%c in character class passed through",
7035 (int)value);
1028017a 7036 break;
b8c5462f 7037 }
ffc61ed2 7038 } /* end of \blah */
1b2d223b
JH
7039#ifdef EBCDIC
7040 else
7041 literal_endpoint++;
7042#endif
ffc61ed2
JH
7043
7044 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7045
7046 if (!SIZE_ONLY && !need_class)
936ed897 7047 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7048
936ed897 7049 need_class = 1;
ffc61ed2
JH
7050
7051 /* a bad range like a-\d, a-[:digit:] ? */
7052 if (range) {
73b437c8 7053 if (!SIZE_ONLY) {
afd78fd5 7054 if (ckWARN(WARN_REGEXP)) {
097eb12c 7055 const int w =
afd78fd5
JH
7056 RExC_parse >= rangebegin ?
7057 RExC_parse - rangebegin : 0;
830247a4 7058 vWARN4(RExC_parse,
b45f050a 7059 "False [] range \"%*.*s\"",
097eb12c 7060 w, w, rangebegin);
afd78fd5 7061 }
3568d838
JH
7062 if (prevvalue < 256) {
7063 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7064 ANYOF_BITMAP_SET(ret, '-');
7065 }
7066 else {
7067 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7068 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7069 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7070 }
b8c5462f 7071 }
ffc61ed2
JH
7072
7073 range = 0; /* this was not a true range */
73b437c8 7074 }
ffc61ed2 7075
73b437c8 7076 if (!SIZE_ONLY) {
c49a72a9
NC
7077 const char *what = NULL;
7078 char yesno = 0;
7079
3568d838
JH
7080 if (namedclass > OOB_NAMEDCLASS)
7081 optimize_invert = FALSE;
e2962f66
JH
7082 /* Possible truncation here but in some 64-bit environments
7083 * the compiler gets heartburn about switch on 64-bit values.
7084 * A similar issue a little earlier when switching on value.
98f323fa 7085 * --jhi */
e2962f66 7086 switch ((I32)namedclass) {
73b437c8
JH
7087 case ANYOF_ALNUM:
7088 if (LOC)
936ed897 7089 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7090 else {
7091 for (value = 0; value < 256; value++)
7092 if (isALNUM(value))
936ed897 7093 ANYOF_BITMAP_SET(ret, value);
73b437c8 7094 }
c49a72a9
NC
7095 yesno = '+';
7096 what = "Word";
73b437c8
JH
7097 break;
7098 case ANYOF_NALNUM:
7099 if (LOC)
936ed897 7100 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7101 else {
7102 for (value = 0; value < 256; value++)
7103 if (!isALNUM(value))
936ed897 7104 ANYOF_BITMAP_SET(ret, value);
73b437c8 7105 }
c49a72a9
NC
7106 yesno = '!';
7107 what = "Word";
73b437c8 7108 break;
ffc61ed2 7109 case ANYOF_ALNUMC:
73b437c8 7110 if (LOC)
ffc61ed2 7111 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7112 else {
7113 for (value = 0; value < 256; value++)
ffc61ed2 7114 if (isALNUMC(value))
936ed897 7115 ANYOF_BITMAP_SET(ret, value);
73b437c8 7116 }
c49a72a9
NC
7117 yesno = '+';
7118 what = "Alnum";
73b437c8
JH
7119 break;
7120 case ANYOF_NALNUMC:
7121 if (LOC)
936ed897 7122 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7123 else {
7124 for (value = 0; value < 256; value++)
7125 if (!isALNUMC(value))
936ed897 7126 ANYOF_BITMAP_SET(ret, value);
73b437c8 7127 }
c49a72a9
NC
7128 yesno = '!';
7129 what = "Alnum";
73b437c8
JH
7130 break;
7131 case ANYOF_ALPHA:
7132 if (LOC)
936ed897 7133 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7134 else {
7135 for (value = 0; value < 256; value++)
7136 if (isALPHA(value))
936ed897 7137 ANYOF_BITMAP_SET(ret, value);
73b437c8 7138 }
c49a72a9
NC
7139 yesno = '+';
7140 what = "Alpha";
73b437c8
JH
7141 break;
7142 case ANYOF_NALPHA:
7143 if (LOC)
936ed897 7144 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7145 else {
7146 for (value = 0; value < 256; value++)
7147 if (!isALPHA(value))
936ed897 7148 ANYOF_BITMAP_SET(ret, value);
73b437c8 7149 }
c49a72a9
NC
7150 yesno = '!';
7151 what = "Alpha";
73b437c8
JH
7152 break;
7153 case ANYOF_ASCII:
7154 if (LOC)
936ed897 7155 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7156 else {
c7f1f016 7157#ifndef EBCDIC
1ba5c669
JH
7158 for (value = 0; value < 128; value++)
7159 ANYOF_BITMAP_SET(ret, value);
7160#else /* EBCDIC */
ffbc6a93 7161 for (value = 0; value < 256; value++) {
3a3c4447
JH
7162 if (isASCII(value))
7163 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7164 }
1ba5c669 7165#endif /* EBCDIC */
73b437c8 7166 }
c49a72a9
NC
7167 yesno = '+';
7168 what = "ASCII";
73b437c8
JH
7169 break;
7170 case ANYOF_NASCII:
7171 if (LOC)
936ed897 7172 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7173 else {
c7f1f016 7174#ifndef EBCDIC
1ba5c669
JH
7175 for (value = 128; value < 256; value++)
7176 ANYOF_BITMAP_SET(ret, value);
7177#else /* EBCDIC */
ffbc6a93 7178 for (value = 0; value < 256; value++) {
3a3c4447
JH
7179 if (!isASCII(value))
7180 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7181 }
1ba5c669 7182#endif /* EBCDIC */
73b437c8 7183 }
c49a72a9
NC
7184 yesno = '!';
7185 what = "ASCII";
73b437c8 7186 break;
aaa51d5e
JF
7187 case ANYOF_BLANK:
7188 if (LOC)
7189 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7190 else {
7191 for (value = 0; value < 256; value++)
7192 if (isBLANK(value))
7193 ANYOF_BITMAP_SET(ret, value);
7194 }
c49a72a9
NC
7195 yesno = '+';
7196 what = "Blank";
aaa51d5e
JF
7197 break;
7198 case ANYOF_NBLANK:
7199 if (LOC)
7200 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7201 else {
7202 for (value = 0; value < 256; value++)
7203 if (!isBLANK(value))
7204 ANYOF_BITMAP_SET(ret, value);
7205 }
c49a72a9
NC
7206 yesno = '!';
7207 what = "Blank";
aaa51d5e 7208 break;
73b437c8
JH
7209 case ANYOF_CNTRL:
7210 if (LOC)
936ed897 7211 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7212 else {
7213 for (value = 0; value < 256; value++)
7214 if (isCNTRL(value))
936ed897 7215 ANYOF_BITMAP_SET(ret, value);
73b437c8 7216 }
c49a72a9
NC
7217 yesno = '+';
7218 what = "Cntrl";
73b437c8
JH
7219 break;
7220 case ANYOF_NCNTRL:
7221 if (LOC)
936ed897 7222 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7223 else {
7224 for (value = 0; value < 256; value++)
7225 if (!isCNTRL(value))
936ed897 7226 ANYOF_BITMAP_SET(ret, value);
73b437c8 7227 }
c49a72a9
NC
7228 yesno = '!';
7229 what = "Cntrl";
ffc61ed2
JH
7230 break;
7231 case ANYOF_DIGIT:
7232 if (LOC)
7233 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7234 else {
7235 /* consecutive digits assumed */
7236 for (value = '0'; value <= '9'; value++)
7237 ANYOF_BITMAP_SET(ret, value);
7238 }
c49a72a9
NC
7239 yesno = '+';
7240 what = "Digit";
ffc61ed2
JH
7241 break;
7242 case ANYOF_NDIGIT:
7243 if (LOC)
7244 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7245 else {
7246 /* consecutive digits assumed */
7247 for (value = 0; value < '0'; value++)
7248 ANYOF_BITMAP_SET(ret, value);
7249 for (value = '9' + 1; value < 256; value++)
7250 ANYOF_BITMAP_SET(ret, value);
7251 }
c49a72a9
NC
7252 yesno = '!';
7253 what = "Digit";
73b437c8
JH
7254 break;
7255 case ANYOF_GRAPH:
7256 if (LOC)
936ed897 7257 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7258 else {
7259 for (value = 0; value < 256; value++)
7260 if (isGRAPH(value))
936ed897 7261 ANYOF_BITMAP_SET(ret, value);
73b437c8 7262 }
c49a72a9
NC
7263 yesno = '+';
7264 what = "Graph";
73b437c8
JH
7265 break;
7266 case ANYOF_NGRAPH:
7267 if (LOC)
936ed897 7268 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7269 else {
7270 for (value = 0; value < 256; value++)
7271 if (!isGRAPH(value))
936ed897 7272 ANYOF_BITMAP_SET(ret, value);
73b437c8 7273 }
c49a72a9
NC
7274 yesno = '!';
7275 what = "Graph";
73b437c8
JH
7276 break;
7277 case ANYOF_LOWER:
7278 if (LOC)
936ed897 7279 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7280 else {
7281 for (value = 0; value < 256; value++)
7282 if (isLOWER(value))
936ed897 7283 ANYOF_BITMAP_SET(ret, value);
73b437c8 7284 }
c49a72a9
NC
7285 yesno = '+';
7286 what = "Lower";
73b437c8
JH
7287 break;
7288 case ANYOF_NLOWER:
7289 if (LOC)
936ed897 7290 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7291 else {
7292 for (value = 0; value < 256; value++)
7293 if (!isLOWER(value))
936ed897 7294 ANYOF_BITMAP_SET(ret, value);
73b437c8 7295 }
c49a72a9
NC
7296 yesno = '!';
7297 what = "Lower";
73b437c8
JH
7298 break;
7299 case ANYOF_PRINT:
7300 if (LOC)
936ed897 7301 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7302 else {
7303 for (value = 0; value < 256; value++)
7304 if (isPRINT(value))
936ed897 7305 ANYOF_BITMAP_SET(ret, value);
73b437c8 7306 }
c49a72a9
NC
7307 yesno = '+';
7308 what = "Print";
73b437c8
JH
7309 break;
7310 case ANYOF_NPRINT:
7311 if (LOC)
936ed897 7312 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7313 else {
7314 for (value = 0; value < 256; value++)
7315 if (!isPRINT(value))
936ed897 7316 ANYOF_BITMAP_SET(ret, value);
73b437c8 7317 }
c49a72a9
NC
7318 yesno = '!';
7319 what = "Print";
73b437c8 7320 break;
aaa51d5e
JF
7321 case ANYOF_PSXSPC:
7322 if (LOC)
7323 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7324 else {
7325 for (value = 0; value < 256; value++)
7326 if (isPSXSPC(value))
7327 ANYOF_BITMAP_SET(ret, value);
7328 }
c49a72a9
NC
7329 yesno = '+';
7330 what = "Space";
aaa51d5e
JF
7331 break;
7332 case ANYOF_NPSXSPC:
7333 if (LOC)
7334 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7335 else {
7336 for (value = 0; value < 256; value++)
7337 if (!isPSXSPC(value))
7338 ANYOF_BITMAP_SET(ret, value);
7339 }
c49a72a9
NC
7340 yesno = '!';
7341 what = "Space";
aaa51d5e 7342 break;
73b437c8
JH
7343 case ANYOF_PUNCT:
7344 if (LOC)
936ed897 7345 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7346 else {
7347 for (value = 0; value < 256; value++)
7348 if (isPUNCT(value))
936ed897 7349 ANYOF_BITMAP_SET(ret, value);
73b437c8 7350 }
c49a72a9
NC
7351 yesno = '+';
7352 what = "Punct";
73b437c8
JH
7353 break;
7354 case ANYOF_NPUNCT:
7355 if (LOC)
936ed897 7356 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7357 else {
7358 for (value = 0; value < 256; value++)
7359 if (!isPUNCT(value))
936ed897 7360 ANYOF_BITMAP_SET(ret, value);
73b437c8 7361 }
c49a72a9
NC
7362 yesno = '!';
7363 what = "Punct";
ffc61ed2
JH
7364 break;
7365 case ANYOF_SPACE:
7366 if (LOC)
7367 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7368 else {
7369 for (value = 0; value < 256; value++)
7370 if (isSPACE(value))
7371 ANYOF_BITMAP_SET(ret, value);
7372 }
c49a72a9
NC
7373 yesno = '+';
7374 what = "SpacePerl";
ffc61ed2
JH
7375 break;
7376 case ANYOF_NSPACE:
7377 if (LOC)
7378 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7379 else {
7380 for (value = 0; value < 256; value++)
7381 if (!isSPACE(value))
7382 ANYOF_BITMAP_SET(ret, value);
7383 }
c49a72a9
NC
7384 yesno = '!';
7385 what = "SpacePerl";
73b437c8
JH
7386 break;
7387 case ANYOF_UPPER:
7388 if (LOC)
936ed897 7389 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7390 else {
7391 for (value = 0; value < 256; value++)
7392 if (isUPPER(value))
936ed897 7393 ANYOF_BITMAP_SET(ret, value);
73b437c8 7394 }
c49a72a9
NC
7395 yesno = '+';
7396 what = "Upper";
73b437c8
JH
7397 break;
7398 case ANYOF_NUPPER:
7399 if (LOC)
936ed897 7400 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7401 else {
7402 for (value = 0; value < 256; value++)
7403 if (!isUPPER(value))
936ed897 7404 ANYOF_BITMAP_SET(ret, value);
73b437c8 7405 }
c49a72a9
NC
7406 yesno = '!';
7407 what = "Upper";
73b437c8
JH
7408 break;
7409 case ANYOF_XDIGIT:
7410 if (LOC)
936ed897 7411 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7412 else {
7413 for (value = 0; value < 256; value++)
7414 if (isXDIGIT(value))
936ed897 7415 ANYOF_BITMAP_SET(ret, value);
73b437c8 7416 }
c49a72a9
NC
7417 yesno = '+';
7418 what = "XDigit";
73b437c8
JH
7419 break;
7420 case ANYOF_NXDIGIT:
7421 if (LOC)
936ed897 7422 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7423 else {
7424 for (value = 0; value < 256; value++)
7425 if (!isXDIGIT(value))
936ed897 7426 ANYOF_BITMAP_SET(ret, value);
73b437c8 7427 }
c49a72a9
NC
7428 yesno = '!';
7429 what = "XDigit";
73b437c8 7430 break;
f81125e2
JP
7431 case ANYOF_MAX:
7432 /* this is to handle \p and \P */
7433 break;
73b437c8 7434 default:
b45f050a 7435 vFAIL("Invalid [::] class");
73b437c8 7436 break;
b8c5462f 7437 }
c49a72a9
NC
7438 if (what) {
7439 /* Strings such as "+utf8::isWord\n" */
7440 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7441 }
b8c5462f 7442 if (LOC)
936ed897 7443 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7444 continue;
a0d0e21e 7445 }
ffc61ed2
JH
7446 } /* end of namedclass \blah */
7447
a0d0e21e 7448 if (range) {
eb160463 7449 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7450 const int w = RExC_parse - rangebegin;
7451 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7452 range = 0; /* not a valid range */
73b437c8 7453 }
a0d0e21e
LW
7454 }
7455 else {
3568d838 7456 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7457 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7458 RExC_parse[1] != ']') {
7459 RExC_parse++;
ffc61ed2
JH
7460
7461 /* a bad range like \w-, [:word:]- ? */
7462 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7463 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7464 const int w =
afd78fd5
JH
7465 RExC_parse >= rangebegin ?
7466 RExC_parse - rangebegin : 0;
830247a4 7467 vWARN4(RExC_parse,
b45f050a 7468 "False [] range \"%*.*s\"",
097eb12c 7469 w, w, rangebegin);
afd78fd5 7470 }
73b437c8 7471 if (!SIZE_ONLY)
936ed897 7472 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7473 } else
ffc61ed2
JH
7474 range = 1; /* yeah, it's a range! */
7475 continue; /* but do it the next time */
a0d0e21e 7476 }
a687059c 7477 }
ffc61ed2 7478
93733859 7479 /* now is the next time */
07be1b83 7480 /*stored += (value - prevvalue + 1);*/
ae5c130c 7481 if (!SIZE_ONLY) {
3568d838 7482 if (prevvalue < 256) {
1df70142 7483 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7484 IV i;
3568d838 7485#ifdef EBCDIC
1b2d223b
JH
7486 /* In EBCDIC [\x89-\x91] should include
7487 * the \x8e but [i-j] should not. */
7488 if (literal_endpoint == 2 &&
7489 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7490 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7491 {
3568d838
JH
7492 if (isLOWER(prevvalue)) {
7493 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7494 if (isLOWER(i))
7495 ANYOF_BITMAP_SET(ret, i);
7496 } else {
3568d838 7497 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7498 if (isUPPER(i))
7499 ANYOF_BITMAP_SET(ret, i);
7500 }
8ada0baa 7501 }
ffc61ed2 7502 else
8ada0baa 7503#endif
07be1b83
YO
7504 for (i = prevvalue; i <= ceilvalue; i++) {
7505 if (!ANYOF_BITMAP_TEST(ret,i)) {
7506 stored++;
7507 ANYOF_BITMAP_SET(ret, i);
7508 }
7509 }
3568d838 7510 }
a5961de5 7511 if (value > 255 || UTF) {
1df70142
AL
7512 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7513 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7514 stored+=2; /* can't optimize this class */
ffc61ed2 7515 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7516 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7517 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7518 prevnatvalue, natvalue);
7519 }
7520 else if (prevnatvalue == natvalue) {
7521 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7522 if (FOLD) {
89ebb4a3 7523 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7524 STRLEN foldlen;
1df70142 7525 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7526
e294cc5d
JH
7527#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7528 if (RExC_precomp[0] == ':' &&
7529 RExC_precomp[1] == '[' &&
7530 (f == 0xDF || f == 0x92)) {
7531 f = NATIVE_TO_UNI(f);
7532 }
7533#endif
c840d2a2
JH
7534 /* If folding and foldable and a single
7535 * character, insert also the folded version
7536 * to the charclass. */
9e55ce06 7537 if (f != value) {
e294cc5d
JH
7538#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7539 if ((RExC_precomp[0] == ':' &&
7540 RExC_precomp[1] == '[' &&
7541 (f == 0xA2 &&
7542 (value == 0xFB05 || value == 0xFB06))) ?
7543 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7544 foldlen == (STRLEN)UNISKIP(f) )
7545#else
eb160463 7546 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7547#endif
9e55ce06
JH
7548 Perl_sv_catpvf(aTHX_ listsv,
7549 "%04"UVxf"\n", f);
7550 else {
7551 /* Any multicharacter foldings
7552 * require the following transform:
7553 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7554 * where E folds into "pq" and F folds
7555 * into "rst", all other characters
7556 * fold to single characters. We save
7557 * away these multicharacter foldings,
7558 * to be later saved as part of the
7559 * additional "s" data. */
7560 SV *sv;
7561
7562 if (!unicode_alternate)
7563 unicode_alternate = newAV();
7564 sv = newSVpvn((char*)foldbuf, foldlen);
7565 SvUTF8_on(sv);
7566 av_push(unicode_alternate, sv);
7567 }
7568 }
254ba52a 7569
60a8b682
JH
7570 /* If folding and the value is one of the Greek
7571 * sigmas insert a few more sigmas to make the
7572 * folding rules of the sigmas to work right.
7573 * Note that not all the possible combinations
7574 * are handled here: some of them are handled
9e55ce06
JH
7575 * by the standard folding rules, and some of
7576 * them (literal or EXACTF cases) are handled
7577 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7578 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7579 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7580 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7581 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7582 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7583 }
7584 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7585 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7586 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7587 }
7588 }
ffc61ed2 7589 }
1b2d223b
JH
7590#ifdef EBCDIC
7591 literal_endpoint = 0;
7592#endif
8ada0baa 7593 }
ffc61ed2
JH
7594
7595 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7596 }
ffc61ed2 7597
936ed897 7598 if (need_class) {
4f66b38d 7599 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7600 if (SIZE_ONLY)
830247a4 7601 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7602 else
830247a4 7603 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7604 }
ffc61ed2 7605
7f6f358c
YO
7606
7607 if (SIZE_ONLY)
7608 return ret;
7609 /****** !SIZE_ONLY AFTER HERE *********/
7610
7611 if( stored == 1 && value < 256
7612 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7613 ) {
7614 /* optimize single char class to an EXACT node
7615 but *only* when its not a UTF/high char */
07be1b83
YO
7616 const char * cur_parse= RExC_parse;
7617 RExC_emit = (regnode *)orig_emit;
7618 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7619 ret = reg_node(pRExC_state,
7620 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7621 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7622 *STRING(ret)= (char)value;
7623 STR_LEN(ret)= 1;
7624 RExC_emit += STR_SZ(1);
7625 return ret;
7626 }
ae5c130c 7627 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7628 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7629 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7630 ) {
a0ed51b3 7631 for (value = 0; value < 256; ++value) {
936ed897 7632 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7633 UV fold = PL_fold[value];
ffc61ed2
JH
7634
7635 if (fold != value)
7636 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7637 }
7638 }
936ed897 7639 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7640 }
ffc61ed2 7641
ae5c130c 7642 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7643 if (optimize_invert &&
ffc61ed2
JH
7644 /* If the only flag is inversion. */
7645 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7646 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7647 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7648 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7649 }
7f6f358c 7650 {
097eb12c 7651 AV * const av = newAV();
ffc61ed2 7652 SV *rv;
9e55ce06 7653 /* The 0th element stores the character class description
6a0407ee 7654 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7655 * to initialize the appropriate swash (which gets stored in
7656 * the 1st element), and also useful for dumping the regnode.
7657 * The 2nd element stores the multicharacter foldings,
6a0407ee 7658 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7659 av_store(av, 0, listsv);
7660 av_store(av, 1, NULL);
9e55ce06 7661 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7662 rv = newRV_noinc((SV*)av);
19860706 7663 n = add_data(pRExC_state, 1, "s");
830247a4 7664 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7665 ARG_SET(ret, n);
a0ed51b3 7666 }
a0ed51b3
LW
7667 return ret;
7668}
7669
76e3520e 7670STATIC char*
830247a4 7671S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7672{
097eb12c 7673 char* const retval = RExC_parse++;
a0d0e21e 7674
4633a7c4 7675 for (;;) {
830247a4
IZ
7676 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7677 RExC_parse[2] == '#') {
e994fd66
AE
7678 while (*RExC_parse != ')') {
7679 if (RExC_parse == RExC_end)
7680 FAIL("Sequence (?#... not terminated");
830247a4 7681 RExC_parse++;
e994fd66 7682 }
830247a4 7683 RExC_parse++;
4633a7c4
LW
7684 continue;
7685 }
e2509266 7686 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7687 if (isSPACE(*RExC_parse)) {
7688 RExC_parse++;
748a9306
LW
7689 continue;
7690 }
830247a4 7691 else if (*RExC_parse == '#') {
e994fd66
AE
7692 while (RExC_parse < RExC_end)
7693 if (*RExC_parse++ == '\n') break;
748a9306
LW
7694 continue;
7695 }
748a9306 7696 }
4633a7c4 7697 return retval;
a0d0e21e 7698 }
a687059c
LW
7699}
7700
7701/*
c277df42 7702- reg_node - emit a node
a0d0e21e 7703*/
76e3520e 7704STATIC regnode * /* Location. */
830247a4 7705S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7706{
97aff369 7707 dVAR;
c277df42 7708 register regnode *ptr;
504618e9 7709 regnode * const ret = RExC_emit;
07be1b83 7710 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7711
c277df42 7712 if (SIZE_ONLY) {
830247a4
IZ
7713 SIZE_ALIGN(RExC_size);
7714 RExC_size += 1;
a0d0e21e
LW
7715 return(ret);
7716 }
e2e6a0f1
YO
7717#ifdef DEBUGGING
7718 if (OP(RExC_emit) == 255)
7719 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7720 reg_name[op], OP(RExC_emit));
7721#endif
c277df42 7722 NODE_ALIGN_FILL(ret);
a0d0e21e 7723 ptr = ret;
c277df42 7724 FILL_ADVANCE_NODE(ptr, op);
fac92740 7725 if (RExC_offsets) { /* MJD */
07be1b83 7726 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7727 "reg_node", __LINE__,
7728 reg_name[op],
07be1b83
YO
7729 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7730 ? "Overwriting end of array!\n" : "OK",
7731 (UV)(RExC_emit - RExC_emit_start),
7732 (UV)(RExC_parse - RExC_start),
7733 (UV)RExC_offsets[0]));
ccb2c380 7734 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7735 }
07be1b83 7736
830247a4 7737 RExC_emit = ptr;
a0d0e21e 7738 return(ret);
a687059c
LW
7739}
7740
7741/*
a0d0e21e
LW
7742- reganode - emit a node with an argument
7743*/
76e3520e 7744STATIC regnode * /* Location. */
830247a4 7745S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7746{
97aff369 7747 dVAR;
c277df42 7748 register regnode *ptr;
504618e9 7749 regnode * const ret = RExC_emit;
07be1b83 7750 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7751
c277df42 7752 if (SIZE_ONLY) {
830247a4
IZ
7753 SIZE_ALIGN(RExC_size);
7754 RExC_size += 2;
6bda09f9
YO
7755 /*
7756 We can't do this:
7757
7758 assert(2==regarglen[op]+1);
7759
7760 Anything larger than this has to allocate the extra amount.
7761 If we changed this to be:
7762
7763 RExC_size += (1 + regarglen[op]);
7764
7765 then it wouldn't matter. Its not clear what side effect
7766 might come from that so its not done so far.
7767 -- dmq
7768 */
a0d0e21e
LW
7769 return(ret);
7770 }
e2e6a0f1
YO
7771#ifdef DEBUGGING
7772 if (OP(RExC_emit) == 255)
7773 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7774#endif
c277df42 7775 NODE_ALIGN_FILL(ret);
a0d0e21e 7776 ptr = ret;
c277df42 7777 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7778 if (RExC_offsets) { /* MJD */
07be1b83 7779 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7780 "reganode",
ccb2c380
MP
7781 __LINE__,
7782 reg_name[op],
07be1b83 7783 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7784 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7785 (UV)(RExC_emit - RExC_emit_start),
7786 (UV)(RExC_parse - RExC_start),
7787 (UV)RExC_offsets[0]));
ccb2c380 7788 Set_Cur_Node_Offset;
fac92740
MJD
7789 }
7790
830247a4 7791 RExC_emit = ptr;
a0d0e21e 7792 return(ret);
fe14fcc3
LW
7793}
7794
7795/*
cd439c50 7796- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7797*/
71207a34
AL
7798STATIC STRLEN
7799S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7800{
97aff369 7801 dVAR;
71207a34 7802 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7803}
7804
7805/*
a0d0e21e
LW
7806- reginsert - insert an operator in front of already-emitted operand
7807*
7808* Means relocating the operand.
7809*/
76e3520e 7810STATIC void
6bda09f9 7811S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7812{
97aff369 7813 dVAR;
c277df42
IZ
7814 register regnode *src;
7815 register regnode *dst;
7816 register regnode *place;
504618e9 7817 const int offset = regarglen[(U8)op];
6bda09f9 7818 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7819 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7820/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7821 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7822 if (SIZE_ONLY) {
6bda09f9 7823 RExC_size += size;
a0d0e21e
LW
7824 return;
7825 }
a687059c 7826
830247a4 7827 src = RExC_emit;
6bda09f9 7828 RExC_emit += size;
830247a4 7829 dst = RExC_emit;
40d049e4 7830 if (RExC_open_parens) {
6bda09f9 7831 int paren;
6d99fb9b 7832 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7833 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7834 if ( RExC_open_parens[paren] >= opnd ) {
7835 DEBUG_PARSE_FMT("open"," - %d",size);
7836 RExC_open_parens[paren] += size;
7837 } else {
7838 DEBUG_PARSE_FMT("open"," - %s","ok");
7839 }
7840 if ( RExC_close_parens[paren] >= opnd ) {
7841 DEBUG_PARSE_FMT("close"," - %d",size);
7842 RExC_close_parens[paren] += size;
7843 } else {
7844 DEBUG_PARSE_FMT("close"," - %s","ok");
7845 }
7846 }
6bda09f9 7847 }
40d049e4 7848
fac92740 7849 while (src > opnd) {
c277df42 7850 StructCopy(--src, --dst, regnode);
fac92740 7851 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7852 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7853 "reg_insert",
ccb2c380
MP
7854 __LINE__,
7855 reg_name[op],
07be1b83
YO
7856 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7857 ? "Overwriting end of array!\n" : "OK",
7858 (UV)(src - RExC_emit_start),
7859 (UV)(dst - RExC_emit_start),
7860 (UV)RExC_offsets[0]));
ccb2c380
MP
7861 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7862 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7863 }
7864 }
7865
a0d0e21e
LW
7866
7867 place = opnd; /* Op node, where operand used to be. */
fac92740 7868 if (RExC_offsets) { /* MJD */
07be1b83 7869 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7870 "reginsert",
ccb2c380
MP
7871 __LINE__,
7872 reg_name[op],
07be1b83 7873 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7874 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7875 (UV)(place - RExC_emit_start),
7876 (UV)(RExC_parse - RExC_start),
786e8c11 7877 (UV)RExC_offsets[0]));
ccb2c380 7878 Set_Node_Offset(place, RExC_parse);
45948336 7879 Set_Node_Length(place, 1);
fac92740 7880 }
c277df42
IZ
7881 src = NEXTOPER(place);
7882 FILL_ADVANCE_NODE(place, op);
7883 Zero(src, offset, regnode);
a687059c
LW
7884}
7885
7886/*
c277df42 7887- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7888- SEE ALSO: regtail_study
a0d0e21e 7889*/
097eb12c 7890/* TODO: All three parms should be const */
76e3520e 7891STATIC void
3dab1dad 7892S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7893{
97aff369 7894 dVAR;
c277df42 7895 register regnode *scan;
72f13be8 7896 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7897#ifndef DEBUGGING
7898 PERL_UNUSED_ARG(depth);
7899#endif
a0d0e21e 7900
c277df42 7901 if (SIZE_ONLY)
a0d0e21e
LW
7902 return;
7903
7904 /* Find last node. */
7905 scan = p;
7906 for (;;) {
504618e9 7907 regnode * const temp = regnext(scan);
3dab1dad
YO
7908 DEBUG_PARSE_r({
7909 SV * const mysv=sv_newmortal();
7910 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7911 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7912 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7913 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7914 (temp == NULL ? "->" : ""),
7915 (temp == NULL ? reg_name[OP(val)] : "")
7916 );
3dab1dad
YO
7917 });
7918 if (temp == NULL)
7919 break;
7920 scan = temp;
7921 }
7922
7923 if (reg_off_by_arg[OP(scan)]) {
7924 ARG_SET(scan, val - scan);
7925 }
7926 else {
7927 NEXT_OFF(scan) = val - scan;
7928 }
7929}
7930
07be1b83 7931#ifdef DEBUGGING
3dab1dad
YO
7932/*
7933- regtail_study - set the next-pointer at the end of a node chain of p to val.
7934- Look for optimizable sequences at the same time.
7935- currently only looks for EXACT chains.
07be1b83
YO
7936
7937This is expermental code. The idea is to use this routine to perform
7938in place optimizations on branches and groups as they are constructed,
7939with the long term intention of removing optimization from study_chunk so
7940that it is purely analytical.
7941
7942Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7943to control which is which.
7944
3dab1dad
YO
7945*/
7946/* TODO: All four parms should be const */
07be1b83 7947
3dab1dad
YO
7948STATIC U8
7949S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7950{
7951 dVAR;
7952 register regnode *scan;
07be1b83
YO
7953 U8 exact = PSEUDO;
7954#ifdef EXPERIMENTAL_INPLACESCAN
7955 I32 min = 0;
7956#endif
7957
3dab1dad
YO
7958 GET_RE_DEBUG_FLAGS_DECL;
7959
07be1b83 7960
3dab1dad
YO
7961 if (SIZE_ONLY)
7962 return exact;
7963
7964 /* Find last node. */
7965
7966 scan = p;
7967 for (;;) {
7968 regnode * const temp = regnext(scan);
07be1b83
YO
7969#ifdef EXPERIMENTAL_INPLACESCAN
7970 if (PL_regkind[OP(scan)] == EXACT)
7971 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7972 return EXACT;
7973#endif
3dab1dad
YO
7974 if ( exact ) {
7975 switch (OP(scan)) {
7976 case EXACT:
7977 case EXACTF:
7978 case EXACTFL:
7979 if( exact == PSEUDO )
7980 exact= OP(scan);
07be1b83
YO
7981 else if ( exact != OP(scan) )
7982 exact= 0;
3dab1dad
YO
7983 case NOTHING:
7984 break;
7985 default:
7986 exact= 0;
7987 }
7988 }
7989 DEBUG_PARSE_r({
7990 SV * const mysv=sv_newmortal();
7991 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7992 regprop(RExC_rx, mysv, scan);
eaf3ca90 7993 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7994 SvPV_nolen_const(mysv),
eaf3ca90
YO
7995 REG_NODE_NUM(scan),
7996 reg_name[exact]);
3dab1dad 7997 });
a0d0e21e
LW
7998 if (temp == NULL)
7999 break;
8000 scan = temp;
8001 }
07be1b83
YO
8002 DEBUG_PARSE_r({
8003 SV * const mysv_val=sv_newmortal();
8004 DEBUG_PARSE_MSG("");
8005 regprop(RExC_rx, mysv_val, val);
8006 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8007 SvPV_nolen_const(mysv_val),
8008 REG_NODE_NUM(val),
8009 val - scan
8010 );
8011 });
c277df42
IZ
8012 if (reg_off_by_arg[OP(scan)]) {
8013 ARG_SET(scan, val - scan);
a0ed51b3
LW
8014 }
8015 else {
c277df42
IZ
8016 NEXT_OFF(scan) = val - scan;
8017 }
3dab1dad
YO
8018
8019 return exact;
a687059c 8020}
07be1b83 8021#endif
a687059c
LW
8022
8023/*
a687059c
LW
8024 - regcurly - a little FSA that accepts {\d+,?\d*}
8025 */
79072805 8026STATIC I32
5f66b61c 8027S_regcurly(register const char *s)
a687059c
LW
8028{
8029 if (*s++ != '{')
8030 return FALSE;
f0fcb552 8031 if (!isDIGIT(*s))
a687059c 8032 return FALSE;
f0fcb552 8033 while (isDIGIT(*s))
a687059c
LW
8034 s++;
8035 if (*s == ',')
8036 s++;
f0fcb552 8037 while (isDIGIT(*s))
a687059c
LW
8038 s++;
8039 if (*s != '}')
8040 return FALSE;
8041 return TRUE;
8042}
8043
a687059c
LW
8044
8045/*
fd181c75 8046 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8047 */
8048void
097eb12c 8049Perl_regdump(pTHX_ const regexp *r)
a687059c 8050{
35ff7856 8051#ifdef DEBUGGING
97aff369 8052 dVAR;
c445ea15 8053 SV * const sv = sv_newmortal();
ab3bbdeb 8054 SV *dsv= sv_newmortal();
a687059c 8055
786e8c11 8056 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8057
8058 /* Header fields of interest. */
ab3bbdeb
YO
8059 if (r->anchored_substr) {
8060 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8061 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8062 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8063 "anchored %s%s at %"IVdf" ",
8064 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8065 (IV)r->anchored_offset);
ab3bbdeb
YO
8066 } else if (r->anchored_utf8) {
8067 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8068 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8069 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8070 "anchored utf8 %s%s at %"IVdf" ",
8071 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8072 (IV)r->anchored_offset);
ab3bbdeb
YO
8073 }
8074 if (r->float_substr) {
8075 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8076 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8077 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8078 "floating %s%s at %"IVdf"..%"UVuf" ",
8079 s, RE_SV_TAIL(r->float_substr),
7b0972df 8080 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8081 } else if (r->float_utf8) {
8082 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8083 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8084 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8085 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8086 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8087 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8088 }
33b8afdf 8089 if (r->check_substr || r->check_utf8)
b81d288d 8090 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8091 (const char *)
8092 (r->check_substr == r->float_substr
8093 && r->check_utf8 == r->float_utf8
8094 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
8095 if (r->reganch & ROPT_NOSCAN)
8096 PerlIO_printf(Perl_debug_log, " noscan");
8097 if (r->reganch & ROPT_CHECK_ALL)
8098 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8099 if (r->check_substr || r->check_utf8)
c277df42
IZ
8100 PerlIO_printf(Perl_debug_log, ") ");
8101
46fc3d4c 8102 if (r->regstclass) {
32fc9b6a 8103 regprop(r, sv, r->regstclass);
1de06328 8104 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8105 }
774d564b 8106 if (r->reganch & ROPT_ANCH) {
8107 PerlIO_printf(Perl_debug_log, "anchored");
8108 if (r->reganch & ROPT_ANCH_BOL)
8109 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
8110 if (r->reganch & ROPT_ANCH_MBOL)
8111 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
8112 if (r->reganch & ROPT_ANCH_SBOL)
8113 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 8114 if (r->reganch & ROPT_ANCH_GPOS)
8115 PerlIO_printf(Perl_debug_log, "(GPOS)");
8116 PerlIO_putc(Perl_debug_log, ' ');
8117 }
c277df42
IZ
8118 if (r->reganch & ROPT_GPOS_SEEN)
8119 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 8120 if (r->reganch & ROPT_SKIP)
760ac839 8121 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8122 if (r->reganch & ROPT_IMPLICIT)
760ac839 8123 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8124 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8125 if (r->reganch & ROPT_EVAL_SEEN)
8126 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8127 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8128#else
96a5add6 8129 PERL_UNUSED_CONTEXT;
65e66c80 8130 PERL_UNUSED_ARG(r);
17c3b450 8131#endif /* DEBUGGING */
a687059c
LW
8132}
8133
8134/*
a0d0e21e
LW
8135- regprop - printable representation of opcode
8136*/
46fc3d4c 8137void
32fc9b6a 8138Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8139{
35ff7856 8140#ifdef DEBUGGING
97aff369 8141 dVAR;
9b155405 8142 register int k;
1de06328 8143 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8144
54dc92de 8145 sv_setpvn(sv, "", 0);
8aa23a47 8146
03363afd 8147 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8148 /* It would be nice to FAIL() here, but this may be called from
8149 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8150 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8151 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8152
3dab1dad 8153 k = PL_regkind[OP(o)];
9b155405 8154
2a782b5b 8155 if (k == EXACT) {
396482e1 8156 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8157 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8158 * is a crude hack but it may be the best for now since
8159 * we have no flag "this EXACTish node was UTF-8"
8160 * --jhi */
8161 const char * const s =
ddc5bc0f 8162 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8163 PL_colors[0], PL_colors[1],
8164 PERL_PV_ESCAPE_UNI_DETECT |
8165 PERL_PV_PRETTY_ELIPSES |
8166 PERL_PV_PRETTY_LTGT
8167 );
8168 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8169 } else if (k == TRIE) {
3dab1dad 8170 /* print the details of the trie in dumpuntil instead, as
4f639d21 8171 * prog->data isn't available here */
1de06328
YO
8172 const char op = OP(o);
8173 const I32 n = ARG(o);
8174 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8175 (reg_ac_data *)prog->data->data[n] :
8176 NULL;
8177 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8178 (reg_trie_data*)prog->data->data[n] :
8179 ac->trie;
8180
8181 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8182 DEBUG_TRIE_COMPILE_r(
8183 Perl_sv_catpvf(aTHX_ sv,
8184 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8185 (UV)trie->startstate,
1e2e3d02 8186 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8187 (UV)trie->wordcount,
8188 (UV)trie->minlen,
8189 (UV)trie->maxlen,
8190 (UV)TRIE_CHARCOUNT(trie),
8191 (UV)trie->uniquecharcount
8192 )
8193 );
8194 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8195 int i;
8196 int rangestart = -1;
f46cb337 8197 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8198 Perl_sv_catpvf(aTHX_ sv, "[");
8199 for (i = 0; i <= 256; i++) {
8200 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8201 if (rangestart == -1)
8202 rangestart = i;
8203 } else if (rangestart != -1) {
8204 if (i <= rangestart + 3)
8205 for (; rangestart < i; rangestart++)
8206 put_byte(sv, rangestart);
8207 else {
8208 put_byte(sv, rangestart);
8209 sv_catpvs(sv, "-");
8210 put_byte(sv, i - 1);
8211 }
8212 rangestart = -1;
8213 }
8214 }
8215 Perl_sv_catpvf(aTHX_ sv, "]");
8216 }
8217
a3621e74 8218 } else if (k == CURLY) {
cb434fcc 8219 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8220 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8221 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8222 }
2c2d71f5
JH
8223 else if (k == WHILEM && o->flags) /* Ordinal/of */
8224 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8225 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8226 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8227 else if (k == GOSUB)
6bda09f9 8228 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8229 else if (k == VERB) {
8230 if (!o->flags)
8231 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8232 (SV*)prog->data->data[ ARG( o ) ]);
8233 } else if (k == LOGICAL)
04ebc1ab 8234 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8235 else if (k == ANYOF) {
8236 int i, rangestart = -1;
2d03de9c 8237 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8238
8239 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8240 static const char * const anyofs[] = {
653099ff
GS
8241 "\\w",
8242 "\\W",
8243 "\\s",
8244 "\\S",
8245 "\\d",
8246 "\\D",
8247 "[:alnum:]",
8248 "[:^alnum:]",
8249 "[:alpha:]",
8250 "[:^alpha:]",
8251 "[:ascii:]",
8252 "[:^ascii:]",
8253 "[:ctrl:]",
8254 "[:^ctrl:]",
8255 "[:graph:]",
8256 "[:^graph:]",
8257 "[:lower:]",
8258 "[:^lower:]",
8259 "[:print:]",
8260 "[:^print:]",
8261 "[:punct:]",
8262 "[:^punct:]",
8263 "[:upper:]",
aaa51d5e 8264 "[:^upper:]",
653099ff 8265 "[:xdigit:]",
aaa51d5e
JF
8266 "[:^xdigit:]",
8267 "[:space:]",
8268 "[:^space:]",
8269 "[:blank:]",
8270 "[:^blank:]"
653099ff
GS
8271 };
8272
19860706 8273 if (flags & ANYOF_LOCALE)
396482e1 8274 sv_catpvs(sv, "{loc}");
19860706 8275 if (flags & ANYOF_FOLD)
396482e1 8276 sv_catpvs(sv, "{i}");
653099ff 8277 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8278 if (flags & ANYOF_INVERT)
396482e1 8279 sv_catpvs(sv, "^");
ffc61ed2
JH
8280 for (i = 0; i <= 256; i++) {
8281 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8282 if (rangestart == -1)
8283 rangestart = i;
8284 } else if (rangestart != -1) {
8285 if (i <= rangestart + 3)
8286 for (; rangestart < i; rangestart++)
653099ff 8287 put_byte(sv, rangestart);
ffc61ed2
JH
8288 else {
8289 put_byte(sv, rangestart);
396482e1 8290 sv_catpvs(sv, "-");
ffc61ed2 8291 put_byte(sv, i - 1);
653099ff 8292 }
ffc61ed2 8293 rangestart = -1;
653099ff 8294 }
847a199f 8295 }
ffc61ed2
JH
8296
8297 if (o->flags & ANYOF_CLASS)
bb7a0f54 8298 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8299 if (ANYOF_CLASS_TEST(o,i))
8300 sv_catpv(sv, anyofs[i]);
8301
8302 if (flags & ANYOF_UNICODE)
396482e1 8303 sv_catpvs(sv, "{unicode}");
1aa99e6b 8304 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8305 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8306
8307 {
8308 SV *lv;
32fc9b6a 8309 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8310
ffc61ed2
JH
8311 if (lv) {
8312 if (sw) {
89ebb4a3 8313 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8314
ffc61ed2 8315 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8316 uvchr_to_utf8(s, i);
ffc61ed2 8317
3568d838 8318 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8319 if (rangestart == -1)
8320 rangestart = i;
8321 } else if (rangestart != -1) {
ffc61ed2
JH
8322 if (i <= rangestart + 3)
8323 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8324 const U8 * const e = uvchr_to_utf8(s,rangestart);
8325 U8 *p;
8326 for(p = s; p < e; p++)
ffc61ed2
JH
8327 put_byte(sv, *p);
8328 }
8329 else {
2d03de9c
AL
8330 const U8 *e = uvchr_to_utf8(s,rangestart);
8331 U8 *p;
8332 for (p = s; p < e; p++)
ffc61ed2 8333 put_byte(sv, *p);
396482e1 8334 sv_catpvs(sv, "-");
2d03de9c
AL
8335 e = uvchr_to_utf8(s, i-1);
8336 for (p = s; p < e; p++)
1df70142 8337 put_byte(sv, *p);
ffc61ed2
JH
8338 }
8339 rangestart = -1;
8340 }
19860706 8341 }
ffc61ed2 8342
396482e1 8343 sv_catpvs(sv, "..."); /* et cetera */
19860706 8344 }
fde631ed 8345
ffc61ed2 8346 {
2e0de35c 8347 char *s = savesvpv(lv);
c445ea15 8348 char * const origs = s;
b81d288d 8349
3dab1dad
YO
8350 while (*s && *s != '\n')
8351 s++;
b81d288d 8352
ffc61ed2 8353 if (*s == '\n') {
2d03de9c 8354 const char * const t = ++s;
ffc61ed2
JH
8355
8356 while (*s) {
8357 if (*s == '\n')
8358 *s = ' ';
8359 s++;
8360 }
8361 if (s[-1] == ' ')
8362 s[-1] = 0;
8363
8364 sv_catpv(sv, t);
fde631ed 8365 }
b81d288d 8366
ffc61ed2 8367 Safefree(origs);
fde631ed
JH
8368 }
8369 }
653099ff 8370 }
ffc61ed2 8371
653099ff
GS
8372 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8373 }
9b155405 8374 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8375 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8376#else
96a5add6 8377 PERL_UNUSED_CONTEXT;
65e66c80
SP
8378 PERL_UNUSED_ARG(sv);
8379 PERL_UNUSED_ARG(o);
f9049ba1 8380 PERL_UNUSED_ARG(prog);
17c3b450 8381#endif /* DEBUGGING */
35ff7856 8382}
a687059c 8383
cad2e5aa
JH
8384SV *
8385Perl_re_intuit_string(pTHX_ regexp *prog)
8386{ /* Assume that RE_INTUIT is set */
97aff369 8387 dVAR;
a3621e74 8388 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8389 PERL_UNUSED_CONTEXT;
8390
a3621e74 8391 DEBUG_COMPILE_r(
cfd0369c 8392 {
2d03de9c 8393 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8394 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8395
8396 if (!PL_colorset) reginitcolors();
8397 PerlIO_printf(Perl_debug_log,
a0288114 8398 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8399 PL_colors[4],
8400 prog->check_substr ? "" : "utf8 ",
8401 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8402 s,
8403 PL_colors[1],
8404 (strlen(s) > 60 ? "..." : ""));
8405 } );
8406
33b8afdf 8407 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8408}
8409
84da74a7
YO
8410/*
8411 pregfree - free a regexp
8412
8413 See regdupe below if you change anything here.
8414*/
8415
2b69d0c2 8416void
864dbfa3 8417Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8418{
27da23d5 8419 dVAR;
0df25f3d 8420
fc32ee4a 8421 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8422
7821416a
IZ
8423 if (!r || (--r->refcnt > 0))
8424 return;
ab3bbdeb 8425 DEBUG_COMPILE_r({
0df25f3d
YO
8426 if (!PL_colorset)
8427 reginitcolors();
5c9f2f80
YO
8428 {
8429 SV *dsv= sv_newmortal();
ab3bbdeb
YO
8430 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8431 dsv, r->precomp, r->prelen, 60);
8432 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8433 PL_colors[4],PL_colors[5],s);
8434 }
9e55ce06 8435 });
cad2e5aa 8436
43c5f42d
NC
8437 /* gcov results gave these as non-null 100% of the time, so there's no
8438 optimisation in checking them before calling Safefree */
8439 Safefree(r->precomp);
8440 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8441 RX_MATCH_COPY_FREE(r);
f8c7b90f 8442#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8443 if (r->saved_copy)
8444 SvREFCNT_dec(r->saved_copy);
8445#endif
a193d654
GS
8446 if (r->substrs) {
8447 if (r->anchored_substr)
8448 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8449 if (r->anchored_utf8)
8450 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8451 if (r->float_substr)
8452 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8453 if (r->float_utf8)
8454 SvREFCNT_dec(r->float_utf8);
2779dcf1 8455 Safefree(r->substrs);
a193d654 8456 }
81714fb9
YO
8457 if (r->paren_names)
8458 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8459 if (r->data) {
8460 int n = r->data->count;
f3548bdc
DM
8461 PAD* new_comppad = NULL;
8462 PAD* old_comppad;
4026c95a 8463 PADOFFSET refcnt;
dfad63ad 8464
c277df42 8465 while (--n >= 0) {
261faec3 8466 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8467 switch (r->data->what[n]) {
8468 case 's':
81714fb9 8469 case 'S':
c277df42
IZ
8470 SvREFCNT_dec((SV*)r->data->data[n]);
8471 break;
653099ff
GS
8472 case 'f':
8473 Safefree(r->data->data[n]);
8474 break;
dfad63ad
HS
8475 case 'p':
8476 new_comppad = (AV*)r->data->data[n];
8477 break;
c277df42 8478 case 'o':
dfad63ad 8479 if (new_comppad == NULL)
cea2e8a9 8480 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8481 PAD_SAVE_LOCAL(old_comppad,
8482 /* Watch out for global destruction's random ordering. */
c445ea15 8483 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8484 );
b34c0dd4 8485 OP_REFCNT_LOCK;
4026c95a
SH
8486 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8487 OP_REFCNT_UNLOCK;
8488 if (!refcnt)
9b978d73 8489 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8490
f3548bdc 8491 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8492 SvREFCNT_dec((SV*)new_comppad);
8493 new_comppad = NULL;
c277df42
IZ
8494 break;
8495 case 'n':
9e55ce06 8496 break;
07be1b83 8497 case 'T':
be8e71aa
YO
8498 { /* Aho Corasick add-on structure for a trie node.
8499 Used in stclass optimization only */
07be1b83
YO
8500 U32 refcount;
8501 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8502 OP_REFCNT_LOCK;
8503 refcount = --aho->refcount;
8504 OP_REFCNT_UNLOCK;
8505 if ( !refcount ) {
8506 Safefree(aho->states);
8507 Safefree(aho->fail);
8508 aho->trie=NULL; /* not necessary to free this as it is
8509 handled by the 't' case */
8510 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8511 Safefree(r->regstclass);
07be1b83
YO
8512 }
8513 }
8514 break;
a3621e74 8515 case 't':
07be1b83 8516 {
be8e71aa 8517 /* trie structure. */
07be1b83
YO
8518 U32 refcount;
8519 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8520 OP_REFCNT_LOCK;
8521 refcount = --trie->refcount;
8522 OP_REFCNT_UNLOCK;
8523 if ( !refcount ) {
8524 Safefree(trie->charmap);
8525 if (trie->widecharmap)
8526 SvREFCNT_dec((SV*)trie->widecharmap);
8527 Safefree(trie->states);
8528 Safefree(trie->trans);
8529 if (trie->bitmap)
8530 Safefree(trie->bitmap);
8531 if (trie->wordlen)
8532 Safefree(trie->wordlen);
786e8c11
YO
8533 if (trie->jump)
8534 Safefree(trie->jump);
8535 if (trie->nextword)
8536 Safefree(trie->nextword);
a3621e74 8537#ifdef DEBUGGING
5c9f2f80
YO
8538 if (trie->words)
8539 SvREFCNT_dec((SV*)trie->words);
8540 if (trie->revcharmap)
8541 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8542#endif
07be1b83 8543 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8544 }
07be1b83
YO
8545 }
8546 break;
c277df42 8547 default:
830247a4 8548 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8549 }
8550 }
8551 Safefree(r->data->what);
8552 Safefree(r->data);
a0d0e21e
LW
8553 }
8554 Safefree(r->startp);
8555 Safefree(r->endp);
c74340f9
YO
8556 if (r->swap) {
8557 Safefree(r->swap->startp);
8558 Safefree(r->swap->endp);
8559 Safefree(r->swap);
8560 }
a0d0e21e 8561 Safefree(r);
a687059c 8562}
c277df42 8563
84da74a7
YO
8564#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8565#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8566#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8567#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8568
8569/*
8570 regdupe - duplicate a regexp.
8571
8572 This routine is called by sv.c's re_dup and is expected to clone a
8573 given regexp structure. It is a no-op when not under USE_ITHREADS.
8574 (Originally this *was* re_dup() for change history see sv.c)
8575
8576 See pregfree() above if you change anything here.
8577*/
a3c0e9ca 8578#if defined(USE_ITHREADS)
84da74a7
YO
8579regexp *
8580Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8581{
84da74a7
YO
8582 dVAR;
8583 REGEXP *ret;
8584 int i, len, npar;
8585 struct reg_substr_datum *s;
8586
8587 if (!r)
8588 return (REGEXP *)NULL;
8589
8590 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8591 return ret;
8592
8593 len = r->offsets[0];
8594 npar = r->nparens+1;
8595
8596 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8597 Copy(r->program, ret->program, len+1, regnode);
8598
8599 Newx(ret->startp, npar, I32);
8600 Copy(r->startp, ret->startp, npar, I32);
8601 Newx(ret->endp, npar, I32);
8602 Copy(r->startp, ret->startp, npar, I32);
c74340f9
YO
8603 if(r->swap) {
8604 Newx(ret->swap, 1, regexp_paren_ofs);
8605 /* no need to copy these */
8606 Newx(ret->swap->startp, npar, I32);
8607 Newx(ret->swap->endp, npar, I32);
8608 } else {
8609 ret->swap = NULL;
8610 }
84da74a7
YO
8611
8612 Newx(ret->substrs, 1, struct reg_substr_data);
8613 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8614 s->min_offset = r->substrs->data[i].min_offset;
8615 s->max_offset = r->substrs->data[i].max_offset;
8616 s->end_shift = r->substrs->data[i].end_shift;
8617 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8618 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8619 }
8620
8621 ret->regstclass = NULL;
8622 if (r->data) {
8623 struct reg_data *d;
8624 const int count = r->data->count;
8625 int i;
8626
8627 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8628 char, struct reg_data);
8629 Newx(d->what, count, U8);
8630
8631 d->count = count;
8632 for (i = 0; i < count; i++) {
8633 d->what[i] = r->data->what[i];
8634 switch (d->what[i]) {
e2e6a0f1 8635 /* legal options are one of: sSfpont
84da74a7
YO
8636 see also regcomp.h and pregfree() */
8637 case 's':
81714fb9 8638 case 'S':
84da74a7
YO
8639 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8640 break;
8641 case 'p':
8642 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8643 break;
8644 case 'f':
8645 /* This is cheating. */
8646 Newx(d->data[i], 1, struct regnode_charclass_class);
8647 StructCopy(r->data->data[i], d->data[i],
8648 struct regnode_charclass_class);
8649 ret->regstclass = (regnode*)d->data[i];
8650 break;
8651 case 'o':
8652 /* Compiled op trees are readonly, and can thus be
8653 shared without duplication. */
8654 OP_REFCNT_LOCK;
8655 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8656 OP_REFCNT_UNLOCK;
8657 break;
8658 case 'n':
8659 d->data[i] = r->data->data[i];
8660 break;
8661 case 't':
8662 d->data[i] = r->data->data[i];
8663 OP_REFCNT_LOCK;
8664 ((reg_trie_data*)d->data[i])->refcount++;
8665 OP_REFCNT_UNLOCK;
8666 break;
8667 case 'T':
8668 d->data[i] = r->data->data[i];
8669 OP_REFCNT_LOCK;
8670 ((reg_ac_data*)d->data[i])->refcount++;
8671 OP_REFCNT_UNLOCK;
8672 /* Trie stclasses are readonly and can thus be shared
8673 * without duplication. We free the stclass in pregfree
8674 * when the corresponding reg_ac_data struct is freed.
8675 */
8676 ret->regstclass= r->regstclass;
8677 break;
8678 default:
8679 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8680 }
8681 }
8682
8683 ret->data = d;
8684 }
8685 else
8686 ret->data = NULL;
8687
8688 Newx(ret->offsets, 2*len+1, U32);
8689 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8690
8691 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8692 ret->refcnt = r->refcnt;
8693 ret->minlen = r->minlen;
de8c5301 8694 ret->minlenret = r->minlenret;
84da74a7
YO
8695 ret->prelen = r->prelen;
8696 ret->nparens = r->nparens;
8697 ret->lastparen = r->lastparen;
8698 ret->lastcloseparen = r->lastcloseparen;
8699 ret->reganch = r->reganch;
8700
8701 ret->sublen = r->sublen;
8702
f9f4320a 8703 ret->engine = r->engine;
81714fb9
YO
8704
8705 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8706
84da74a7
YO
8707 if (RX_MATCH_COPIED(ret))
8708 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8709 else
8710 ret->subbeg = NULL;
8711#ifdef PERL_OLD_COPY_ON_WRITE
8712 ret->saved_copy = NULL;
8713#endif
8714
8715 ptr_table_store(PL_ptr_table, r, ret);
8716 return ret;
84da74a7 8717}
a3c0e9ca 8718#endif
84da74a7 8719
de8c5301
YO
8720/*
8721 reg_stringify()
8722
8723 converts a regexp embedded in a MAGIC struct to its stringified form,
8724 caching the converted form in the struct and returns the cached
8725 string.
8726
8727 If lp is nonnull then it is used to return the length of the
8728 resulting string
8729
8730 If flags is nonnull and the returned string contains UTF8 then
8731 (flags & 1) will be true.
8732
8733 If haseval is nonnull then it is used to return whether the pattern
8734 contains evals.
8735
8736 Normally called via macro:
8737
8738 CALLREG_STRINGIFY(mg,0,0);
8739
8740 And internally with
8741
8742 CALLREG_AS_STR(mg,lp,flags,haseval)
8743
8744 See sv_2pv_flags() in sv.c for an example of internal usage.
8745
8746 */
8747
8748char *
8749Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8750 dVAR;
8751 const regexp * const re = (regexp *)mg->mg_obj;
8752
8753 if (!mg->mg_ptr) {
8754 const char *fptr = "msix";
8755 char reflags[6];
8756 char ch;
8757 int left = 0;
8758 int right = 4;
8759 bool need_newline = 0;
8760 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8761
8762 while((ch = *fptr++)) {
8763 if(reganch & 1) {
8764 reflags[left++] = ch;
8765 }
8766 else {
8767 reflags[right--] = ch;
8768 }
8769 reganch >>= 1;
8770 }
8771 if(left != 4) {
8772 reflags[left] = '-';
8773 left = 5;
8774 }
8775
8776 mg->mg_len = re->prelen + 4 + left;
8777 /*
8778 * If /x was used, we have to worry about a regex ending with a
8779 * comment later being embedded within another regex. If so, we don't
8780 * want this regex's "commentization" to leak out to the right part of
8781 * the enclosing regex, we must cap it with a newline.
8782 *
8783 * So, if /x was used, we scan backwards from the end of the regex. If
8784 * we find a '#' before we find a newline, we need to add a newline
8785 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8786 * we don't need to add anything. -jfriedl
8787 */
8788 if (PMf_EXTENDED & re->reganch) {
8789 const char *endptr = re->precomp + re->prelen;
8790 while (endptr >= re->precomp) {
8791 const char c = *(endptr--);
8792 if (c == '\n')
8793 break; /* don't need another */
8794 if (c == '#') {
8795 /* we end while in a comment, so we need a newline */
8796 mg->mg_len++; /* save space for it */
8797 need_newline = 1; /* note to add it */
8798 break;
8799 }
8800 }
8801 }
8802
8803 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8804 mg->mg_ptr[0] = '(';
8805 mg->mg_ptr[1] = '?';
8806 Copy(reflags, mg->mg_ptr+2, left, char);
8807 *(mg->mg_ptr+left+2) = ':';
8808 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8809 if (need_newline)
8810 mg->mg_ptr[mg->mg_len - 2] = '\n';
8811 mg->mg_ptr[mg->mg_len - 1] = ')';
8812 mg->mg_ptr[mg->mg_len] = 0;
8813 }
8814 if (haseval)
8815 *haseval = re->program[0].next_off;
8816 if (flags)
8817 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8818
8819 if (lp)
8820 *lp = mg->mg_len;
8821 return mg->mg_ptr;
8822}
8823
8824
76234dfb 8825#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8826/*
8827 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8828 */
8829regnode *
864dbfa3 8830Perl_regnext(pTHX_ register regnode *p)
c277df42 8831{
97aff369 8832 dVAR;
c277df42
IZ
8833 register I32 offset;
8834
3280af22 8835 if (p == &PL_regdummy)
c277df42
IZ
8836 return(NULL);
8837
8838 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8839 if (offset == 0)
8840 return(NULL);
8841
c277df42 8842 return(p+offset);
c277df42 8843}
76234dfb 8844#endif
c277df42 8845
01f988be 8846STATIC void
cea2e8a9 8847S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8848{
8849 va_list args;
8850 STRLEN l1 = strlen(pat1);
8851 STRLEN l2 = strlen(pat2);
8852 char buf[512];
06bf62c7 8853 SV *msv;
73d840c0 8854 const char *message;
c277df42
IZ
8855
8856 if (l1 > 510)
8857 l1 = 510;
8858 if (l1 + l2 > 510)
8859 l2 = 510 - l1;
8860 Copy(pat1, buf, l1 , char);
8861 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8862 buf[l1 + l2] = '\n';
8863 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8864#ifdef I_STDARG
8865 /* ANSI variant takes additional second argument */
c277df42 8866 va_start(args, pat2);
8736538c
AS
8867#else
8868 va_start(args);
8869#endif
5a844595 8870 msv = vmess(buf, &args);
c277df42 8871 va_end(args);
cfd0369c 8872 message = SvPV_const(msv,l1);
c277df42
IZ
8873 if (l1 > 512)
8874 l1 = 512;
8875 Copy(message, buf, l1 , char);
197cf9b9 8876 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8877 Perl_croak(aTHX_ "%s", buf);
c277df42 8878}
a0ed51b3
LW
8879
8880/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8881
76234dfb 8882#ifndef PERL_IN_XSUB_RE
a0ed51b3 8883void
864dbfa3 8884Perl_save_re_context(pTHX)
b81d288d 8885{
97aff369 8886 dVAR;
1ade1aa1
NC
8887
8888 struct re_save_state *state;
8889
8890 SAVEVPTR(PL_curcop);
8891 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8892
8893 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8894 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8895 SSPUSHINT(SAVEt_RE_STATE);
8896
46ab3289 8897 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8898
a0ed51b3 8899 PL_reg_start_tmp = 0;
a0ed51b3 8900 PL_reg_start_tmpl = 0;
c445ea15 8901 PL_reg_oldsaved = NULL;
a5db57d6 8902 PL_reg_oldsavedlen = 0;
a5db57d6 8903 PL_reg_maxiter = 0;
a5db57d6 8904 PL_reg_leftiter = 0;
c445ea15 8905 PL_reg_poscache = NULL;
a5db57d6 8906 PL_reg_poscache_size = 0;
1ade1aa1
NC
8907#ifdef PERL_OLD_COPY_ON_WRITE
8908 PL_nrs = NULL;
8909#endif
ada6e8a9 8910
c445ea15
AL
8911 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8912 if (PL_curpm) {
8913 const REGEXP * const rx = PM_GETRE(PL_curpm);
8914 if (rx) {
1df70142 8915 U32 i;
ada6e8a9 8916 for (i = 1; i <= rx->nparens; i++) {
1df70142 8917 char digits[TYPE_CHARS(long)];
d9fad198 8918 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8919 GV *const *const gvp
8920 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8921
b37c2d43
AL
8922 if (gvp) {
8923 GV * const gv = *gvp;
8924 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8925 save_scalar(gv);
49f27e4b 8926 }
ada6e8a9
AMS
8927 }
8928 }
8929 }
a0ed51b3 8930}
76234dfb 8931#endif
51371543 8932
51371543 8933static void
acfe0abc 8934clear_re(pTHX_ void *r)
51371543 8935{
97aff369 8936 dVAR;
51371543
GS
8937 ReREFCNT_dec((regexp *)r);
8938}
ffbc6a93 8939
a28509cc
AL
8940#ifdef DEBUGGING
8941
8942STATIC void
8943S_put_byte(pTHX_ SV *sv, int c)
8944{
8945 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8946 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8947 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8948 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8949 else
8950 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8951}
8952
786e8c11 8953
3dab1dad
YO
8954#define CLEAR_OPTSTART \
8955 if (optstart) STMT_START { \
07be1b83 8956 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8957 optstart=NULL; \
8958 } STMT_END
8959
786e8c11 8960#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8961
b5a2f8d8
NC
8962STATIC const regnode *
8963S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8964 const regnode *last, const regnode *plast,
8965 SV* sv, I32 indent, U32 depth)
a28509cc 8966{
97aff369 8967 dVAR;
786e8c11 8968 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8969 register const regnode *next;
3dab1dad
YO
8970 const regnode *optstart= NULL;
8971 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8972
786e8c11
YO
8973#ifdef DEBUG_DUMPUNTIL
8974 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8975 last ? last-start : 0,plast ? plast-start : 0);
8976#endif
8977
8978 if (plast && plast < last)
8979 last= plast;
8980
8981 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8982 /* While that wasn't END last time... */
8983
8984 NODE_ALIGN(node);
8985 op = OP(node);
8986 if (op == CLOSE)
786e8c11 8987 indent--;
b5a2f8d8 8988 next = regnext((regnode *)node);
07be1b83 8989
a28509cc 8990 /* Where, what. */
8e11feef 8991 if (OP(node) == OPTIMIZED) {
e68ec53f 8992 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8993 optstart = node;
3dab1dad 8994 else
8e11feef 8995 goto after_print;
3dab1dad
YO
8996 } else
8997 CLEAR_OPTSTART;
07be1b83 8998
32fc9b6a 8999 regprop(r, sv, node);
a28509cc 9000 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9001 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9002
9003 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9004 if (next == NULL) /* Next ptr. */
9005 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9006 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9007 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9008 else
9009 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9010
1de06328 9011 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9012 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9013 }
9014
a28509cc
AL
9015 after_print:
9016 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9017 assert(next);
9018 {
9019 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9020 ? regnext((regnode *)next)
9021 : next);
be8e71aa
YO
9022 if (last && nnode > last)
9023 nnode = last;
786e8c11 9024 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9025 }
a28509cc
AL
9026 }
9027 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9028 assert(next);
786e8c11 9029 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9030 }
9031 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9032 const regnode *this_trie = node;
1de06328 9033 const char op = OP(node);
a28509cc 9034 const I32 n = ARG(node);
1de06328
YO
9035 const reg_ac_data * const ac = op>=AHOCORASICK ?
9036 (reg_ac_data *)r->data->data[n] :
9037 NULL;
9038 const reg_trie_data * const trie = op<AHOCORASICK ?
9039 (reg_trie_data*)r->data->data[n] :
9040 ac->trie;
786e8c11 9041 const regnode *nextbranch= NULL;
a28509cc 9042 I32 word_idx;
1de06328 9043 sv_setpvn(sv, "", 0);
786e8c11 9044 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9045 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9046
9047 PerlIO_printf(Perl_debug_log, "%*s%s ",
9048 (int)(2*(indent+3)), "",
9049 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9050 PL_colors[0], PL_colors[1],
9051 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9052 PERL_PV_PRETTY_ELIPSES |
7f69552c 9053 PERL_PV_PRETTY_LTGT
786e8c11
YO
9054 )
9055 : "???"
9056 );
9057 if (trie->jump) {
40d049e4 9058 U16 dist= trie->jump[word_idx+1];
7f69552c 9059 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 9060 (dist ? this_trie + dist : next) - start);
786e8c11
YO
9061 if (dist) {
9062 if (!nextbranch)
24b23f37 9063 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9064 DUMPUNTIL(this_trie + dist, nextbranch);
9065 }
786e8c11
YO
9066 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9067 nextbranch= regnext((regnode *)nextbranch);
9068 } else {
9069 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9070 }
786e8c11
YO
9071 }
9072 if (last && next > last)
9073 node= last;
9074 else
9075 node= next;
a28509cc 9076 }
786e8c11
YO
9077 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9078 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9079 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9080 }
9081 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9082 assert(next);
786e8c11 9083 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9084 }
9085 else if ( op == PLUS || op == STAR) {
786e8c11 9086 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9087 }
9088 else if (op == ANYOF) {
9089 /* arglen 1 + class block */
9090 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9091 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9092 node = NEXTOPER(node);
9093 }
9094 else if (PL_regkind[(U8)op] == EXACT) {
9095 /* Literal string, where present. */
9096 node += NODE_SZ_STR(node) - 1;
9097 node = NEXTOPER(node);
9098 }
9099 else {
9100 node = NEXTOPER(node);
9101 node += regarglen[(U8)op];
9102 }
9103 if (op == CURLYX || op == OPEN)
786e8c11 9104 indent++;
a28509cc 9105 else if (op == WHILEM)
786e8c11 9106 indent--;
a28509cc 9107 }
3dab1dad 9108 CLEAR_OPTSTART;
786e8c11
YO
9109#ifdef DEBUG_DUMPUNTIL
9110 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9111#endif
1de06328 9112 return node;
a28509cc
AL
9113}
9114
9115#endif /* DEBUGGING */
9116
241d1a3b
NC
9117/*
9118 * Local variables:
9119 * c-indentation-style: bsd
9120 * c-basic-offset: 4
9121 * indent-tabs-mode: t
9122 * End:
9123 *
37442d52
RGS
9124 * ex: set ts=8 sts=4 sw=4 noet:
9125 */