This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update skip count for VMS.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 553 PerlIO_printf(Perl_debug_log, \
b515a41d 554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1897 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1898 DEBUG_OPTIMISE_r(
8e11feef
RGS
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
786e8c11 1902 (UV)state, (UV)idx, ch)
3dab1dad
YO
1903 );
1904 if ( state==1 ) {
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1907 STR_LEN(convert)=0;
1908 }
1909 *str++=*ch;
1910 STR_LEN(convert)++;
a3621e74 1911
8e11feef 1912 } else {
f9049ba1 1913#ifdef DEBUGGING
8e11feef
RGS
1914 if (state>1)
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1916#endif
8e11feef
RGS
1917 break;
1918 }
1919 }
3dab1dad 1920 if (str) {
8e11feef 1921 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1923 trie->startstate = state;
07be1b83
YO
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1926 DEBUG_r({
1927 regnode *fix = convert;
1928 mjd_nodelen++;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1932 }
1933 });
8e11feef
RGS
1934 if (trie->maxlen) {
1935 convert = n;
1936 } else {
3dab1dad 1937 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1938 DEBUG_r(optimize= n);
3dab1dad
YO
1939 }
1940 }
1941 }
a5ca303d
YO
1942 if (!jumper)
1943 jumper = last;
3dab1dad 1944 if ( trie->maxlen ) {
8e11feef
RGS
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
786e8c11
YO
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1950 if (trie->jump)
7f69552c 1951 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1952
786e8c11
YO
1953 /* XXXX */
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1956 {
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1960 trie->bitmap= NULL;
1961 } else
1962 OP( convert ) = TRIE;
a3621e74 1963
3dab1dad
YO
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
a5ca303d
YO
1966 DEBUG_r({
1967 optimize = convert
1968 + NODE_STEP_REGNODE
1969 + regarglen[ OP( convert ) ];
1970 });
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
3dab1dad 1973 }
a3621e74 1974 /* needed for dumping*/
e62cc96a 1975 DEBUG_r(if (optimize) {
07be1b83 1976 regnode *opt = convert;
e62cc96a 1977 while ( ++opt < optimize) {
07be1b83
YO
1978 Set_Node_Offset_Length(opt,0,0);
1979 }
786e8c11
YO
1980 /*
1981 Try to clean up some of the debris left after the
1982 optimisation.
a3621e74 1983 */
786e8c11 1984 while( optimize < jumper ) {
07be1b83 1985 mjd_nodelen += Node_Length((optimize));
a3621e74 1986 OP( optimize ) = OPTIMIZED;
07be1b83 1987 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1988 optimize++;
1989 }
07be1b83 1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1991 });
1992 } /* end node insert */
07be1b83 1993#ifndef DEBUGGING
6e8b4190 1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1995#endif
786e8c11
YO
1996 return trie->jump
1997 ? MADE_JUMP_TRIE
1998 : trie->startstate>1
1999 ? MADE_EXACT_TRIE
2000 : MADE_TRIE;
2001}
2002
2003STATIC void
2004S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2005{
2006/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2007
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 ISBN 0-201-10088-6
2011
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2017 Consider
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2022 */
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2025 U32 *q;
2026 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2027 const U32 numstates = trie->statecount;
786e8c11
YO
2028 const U32 ubound = trie->lasttrans + ucharcount;
2029 U32 q_read = 0;
2030 U32 q_write = 0;
2031 U32 charid;
2032 U32 base = trie->states[ 1 ].trans.base;
2033 U32 *fail;
2034 reg_ac_data *aho;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2037#ifndef DEBUGGING
2038 PERL_UNUSED_ARG(depth);
2039#endif
2040
2041
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2045 aho->trie=trie;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2047 numstates * sizeof(reg_trie_state));
786e8c11
YO
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2050 aho->refcount = 1;
2051 fail = aho->fail;
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2055
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2058 if ( newstate ) {
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2062 }
2063 }
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2067
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2070 if (ch_state) {
2071 U32 fail_state = cur;
2072 U32 fail_base;
2073 do {
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2077
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2081 {
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2083 }
2084 q[ q_write++ % numstates] = ch_state;
2085 }
2086 }
2087 }
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2093 */
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2099 );
786e8c11
YO
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2102 }
2103 PerlIO_printf(Perl_debug_log, "\n");
2104 });
2105 Safefree(q);
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2107}
2108
786e8c11 2109
a3621e74 2110/*
5d1c421c
JH
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2113 */
2114#if defined(__sparc64__) && defined(__GNUC__)
2115# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116# undef SPARC64_GCC_WORKAROUND
2117# define SPARC64_GCC_WORKAROUND 1
2118# endif
2119#endif
2120
07be1b83 2121#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2122 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
7f69552c 2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2129 }});
07be1b83 2130
1de06328
YO
2131
2132
2133
2134
07be1b83
YO
2135#define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138
be8e71aa 2139STATIC U32
07be1b83
YO
2140S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2143 U32 stringok = 1;
2144 regnode *next = scan + NODE_SZ_STR(scan);
2145 U32 merged = 0;
2146 U32 stopnow = 0;
2147#ifdef DEBUGGING
2148 regnode *stop = scan;
72f13be8 2149 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2150#else
d47053eb
RGS
2151 PERL_UNUSED_ARG(depth);
2152#endif
2153#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
07be1b83 2156#endif
07be1b83
YO
2157 DEBUG_PEEP("join",scan,depth);
2158
2159 /* Skip NOTHING, merge EXACT*. */
2160 while (n &&
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2163 && NEXT_OFF(n)
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2165
2166 if (OP(n) == TAIL || n > next)
2167 stringok = 0;
2168 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2172#ifdef DEBUGGING
2173 if (stringok)
2174 stop = n;
2175#endif
2176 n = regnext(n);
2177 }
2178 else if (stringok) {
786e8c11 2179 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2180 regnode * const nnext = regnext(n);
2181
2182 DEBUG_PEEP("merg",n,depth);
2183
2184 merged++;
2185 if (oldl + STR_LEN(n) > U8_MAX)
2186 break;
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2192#ifdef DEBUGGING
2193 stop = next - 1;
2194#endif
2195 n = nnext;
2196 if (stopnow) break;
2197 }
2198
d47053eb
RGS
2199#ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2204 }
2205 else {
2206 NEXT_OFF(n) = val - n;
2207 }
2208 stopnow = 1;
2209 }
07be1b83
YO
2210#endif
2211 }
2212
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2214 /*
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2216
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2219
2220 which casefold to
2221
2222 Unicode UTF-8
2223
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2226
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2232
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2236
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2240
2241 */
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
e294cc5d
JH
2245#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2247#else
07be1b83 2248 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2249#endif
07be1b83
YO
2250 const char * const t1 = t0 + 3;
2251
2252 for (s = s0 + 2;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 s = t + 4) {
e294cc5d
JH
2255#ifdef EBCDIC
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2258#else
07be1b83
YO
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2261#endif
07be1b83
YO
2262 *min -= 4;
2263 }
2264 }
2265
2266#ifdef DEBUGGING
2267 /* Allow dumping */
2268 n = scan + NODE_SZ_STR(scan);
2269 while (n <= stop) {
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2271 OP(n) = OPTIMIZED;
2272 NEXT_OFF(n) = 0;
2273 }
2274 n++;
2275 }
2276#endif
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 return stopnow;
2279}
2280
653099ff
GS
2281/* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2283
a0288114 2284/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2285 to the position after last scanned or to NULL. */
2286
40d049e4
YO
2287#define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
07be1b83 2291
b515a41d
YO
2292/* this is a chain of data about sub patterns we are processing that
2293 need to be handled seperately/specially in study_chunk. Its so
2294 we can simulate recursion without losing state. */
2295struct scan_frame;
2296typedef struct scan_frame {
2297 regnode *last; /* last node to process in this frame */
2298 regnode *next; /* next node to process when last is reached */
2299 struct scan_frame *prev; /*previous frame*/
2300 I32 stop; /* what stopparen do we use */
2301} scan_frame;
2302
76e3520e 2303STATIC I32
40d049e4 2304S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2305 I32 *minlenp, I32 *deltap,
40d049e4
YO
2306 regnode *last,
2307 scan_data_t *data,
2308 I32 stopparen,
2309 U8* recursed,
2310 struct regnode_charclass_class *and_withp,
2311 U32 flags, U32 depth)
c277df42
IZ
2312 /* scanp: Start here (read-write). */
2313 /* deltap: Write maxlen-minlen here. */
2314 /* last: Stop before this one. */
40d049e4
YO
2315 /* data: string data about the pattern */
2316 /* stopparen: treat close N as END */
2317 /* recursed: which subroutines have we recursed into */
2318 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2319{
97aff369 2320 dVAR;
c277df42
IZ
2321 I32 min = 0, pars = 0, code;
2322 regnode *scan = *scanp, *next;
2323 I32 delta = 0;
2324 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2325 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2326 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2327 scan_data_t data_fake;
a3621e74 2328 SV *re_trie_maxbuff = NULL;
786e8c11 2329 regnode *first_non_open = scan;
e2e6a0f1 2330 I32 stopmin = I32_MAX;
b515a41d
YO
2331 scan_frame last_frame= { last, NULL, NULL, stopparen };
2332 scan_frame *frame=&last_frame;
2333
a3621e74 2334 GET_RE_DEBUG_FLAGS_DECL;
b515a41d 2335
13a24bad 2336#ifdef DEBUGGING
40d049e4 2337 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2338#endif
40d049e4 2339
786e8c11 2340 if ( depth == 0 ) {
40d049e4 2341 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2342 first_non_open=regnext(first_non_open);
2343 }
2344
b515a41d
YO
2345 while (frame) {
2346
2347 DEBUG_PEEP("FBEG",scan,depth);
2348 while ( scan && OP(scan) != END && scan < frame->last ) {
2349 /* Peephole optimizer: */
2350 DEBUG_STUDYDATA(data,depth);
2351 DEBUG_PEEP("Peep",scan,depth);
2352 JOIN_EXACT(scan,&min,0);
2353
2354 /* Follow the next-chain of the current node and optimize
2355 away all the NOTHINGs from it. */
2356 if (OP(scan) != CURLYX) {
2357 const int max = (reg_off_by_arg[OP(scan)]
2358 ? I32_MAX
2359 /* I32 may be smaller than U16 on CRAYs! */
2360 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2361 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2362 int noff;
2363 regnode *n = scan;
2364
2365 /* Skip NOTHING and LONGJMP. */
2366 while ((n = regnext(n))
2367 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2368 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2369 && off + noff < max)
2370 off += noff;
2371 if (reg_off_by_arg[OP(scan)])
2372 ARG(scan) = off;
2373 else
2374 NEXT_OFF(scan) = off;
2375 }
b81d288d 2376
b515a41d
YO
2377 /* The principal pseudo-switch. Cannot be a switch, since we
2378 look into several different things. */
2379 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2380 || OP(scan) == IFTHEN) {
2381 next = regnext(scan);
2382 code = OP(scan);
2383 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2384
2385 if (OP(next) == code || code == IFTHEN) {
2386 /* NOTE - There is similar code to this block below for handling
2387 TRIE nodes on a re-study. If you change stuff here check there
2388 too. */
2389 I32 max1 = 0, min1 = I32_MAX, num = 0;
2390 struct regnode_charclass_class accum;
2391 regnode * const startbranch=scan;
2392
2393 if (flags & SCF_DO_SUBSTR)
2394 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2395 if (flags & SCF_DO_STCLASS)
2396 cl_init_zero(pRExC_state, &accum);
a3621e74 2397
b515a41d
YO
2398 while (OP(scan) == code) {
2399 I32 deltanext, minnext, f = 0, fake;
2400 struct regnode_charclass_class this_class;
c277df42 2401
b515a41d
YO
2402 num++;
2403 data_fake.flags = 0;
2404 if (data) {
2405 data_fake.whilem_c = data->whilem_c;
2406 data_fake.last_closep = data->last_closep;
2407 }
2408 else
2409 data_fake.last_closep = &fake;
2410 next = regnext(scan);
c277df42 2411 scan = NEXTOPER(scan);
b515a41d
YO
2412 if (code != BRANCH)
2413 scan = NEXTOPER(scan);
2414 if (flags & SCF_DO_STCLASS) {
2415 cl_init(pRExC_state, &this_class);
2416 data_fake.start_class = &this_class;
2417 f = SCF_DO_STCLASS_AND;
2418 }
2419 if (flags & SCF_WHILEM_VISITED_POS)
2420 f |= SCF_WHILEM_VISITED_POS;
2421
2422 /* we suppose the run is continuous, last=next...*/
2423 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2424 next, &data_fake,
2425 stopparen, recursed, NULL, f,depth+1);
2426 if (min1 > minnext)
2427 min1 = minnext;
2428 if (max1 < minnext + deltanext)
2429 max1 = minnext + deltanext;
2430 if (deltanext == I32_MAX)
2431 is_inf = is_inf_internal = 1;
2432 scan = next;
2433 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2434 pars++;
2435 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2436 if ( stopmin > minnext)
2437 stopmin = min + min1;
2438 flags &= ~SCF_DO_SUBSTR;
2439 if (data)
2440 data->flags |= SCF_SEEN_ACCEPT;
2441 }
2442 if (data) {
2443 if (data_fake.flags & SF_HAS_EVAL)
2444 data->flags |= SF_HAS_EVAL;
2445 data->whilem_c = data_fake.whilem_c;
2446 }
2447 if (flags & SCF_DO_STCLASS)
2448 cl_or(pRExC_state, &accum, &this_class);
3dab1dad 2449 }
b515a41d
YO
2450 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2451 min1 = 0;
2452 if (flags & SCF_DO_SUBSTR) {
2453 data->pos_min += min1;
2454 data->pos_delta += max1 - min1;
2455 if (max1 != min1 || is_inf)
2456 data->longest = &(data->longest_float);
653099ff 2457 }
b515a41d
YO
2458 min += min1;
2459 delta += max1 - min1;
2460 if (flags & SCF_DO_STCLASS_OR) {
2461 cl_or(pRExC_state, data->start_class, &accum);
2462 if (min1) {
2463 cl_and(data->start_class, and_withp);
2464 flags &= ~SCF_DO_STCLASS;
2465 }
de0c8cb8 2466 }
b515a41d
YO
2467 else if (flags & SCF_DO_STCLASS_AND) {
2468 if (min1) {
2469 cl_and(data->start_class, &accum);
2470 flags &= ~SCF_DO_STCLASS;
2471 }
2472 else {
2473 /* Switch to OR mode: cache the old value of
2474 * data->start_class */
2475 INIT_AND_WITHP;
2476 StructCopy(data->start_class, and_withp,
2477 struct regnode_charclass_class);
2478 flags &= ~SCF_DO_STCLASS_AND;
2479 StructCopy(&accum, data->start_class,
2480 struct regnode_charclass_class);
2481 flags |= SCF_DO_STCLASS_OR;
2482 data->start_class->flags |= ANYOF_EOS;
2483 }
de0c8cb8 2484 }
a3621e74 2485
b515a41d
YO
2486 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2487 /* demq.
a3621e74 2488
b515a41d
YO
2489 Assuming this was/is a branch we are dealing with: 'scan' now
2490 points at the item that follows the branch sequence, whatever
2491 it is. We now start at the beginning of the sequence and look
2492 for subsequences of
a3621e74 2493
b515a41d
YO
2494 BRANCH->EXACT=>x1
2495 BRANCH->EXACT=>x2
2496 tail
a3621e74 2497
b515a41d 2498 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2499
b515a41d
YO
2500 If we can find such a subseqence we need to turn the first
2501 element into a trie and then add the subsequent branch exact
2502 strings to the trie.
a3621e74 2503
b515a41d 2504 We have two cases
a3621e74 2505
b515a41d 2506 1. patterns where the whole set of branch can be converted.
a3621e74 2507
b515a41d 2508 2. patterns where only a subset can be converted.
a3621e74 2509
b515a41d
YO
2510 In case 1 we can replace the whole set with a single regop
2511 for the trie. In case 2 we need to keep the start and end
2512 branchs so
a3621e74 2513
b515a41d
YO
2514 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2515 becomes BRANCH TRIE; BRANCH X;
a3621e74 2516
b515a41d
YO
2517 There is an additional case, that being where there is a
2518 common prefix, which gets split out into an EXACT like node
2519 preceding the TRIE node.
786e8c11 2520
b515a41d
YO
2521 If x(1..n)==tail then we can do a simple trie, if not we make
2522 a "jump" trie, such that when we match the appropriate word
2523 we "jump" to the appopriate tail node. Essentailly we turn
2524 a nested if into a case structure of sorts.
a3621e74 2525
b515a41d
YO
2526 */
2527
2528 int made=0;
2529 if (!re_trie_maxbuff) {
2530 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2531 if (!SvIOK(re_trie_maxbuff))
2532 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2533 }
2534 if ( SvIV(re_trie_maxbuff)>=0 ) {
2535 regnode *cur;
2536 regnode *first = (regnode *)NULL;
2537 regnode *last = (regnode *)NULL;
2538 regnode *tail = scan;
2539 U8 optype = 0;
2540 U32 count=0;
a3621e74
YO
2541
2542#ifdef DEBUGGING
b515a41d 2543 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2544#endif
b515a41d
YO
2545 /* var tail is used because there may be a TAIL
2546 regop in the way. Ie, the exacts will point to the
2547 thing following the TAIL, but the last branch will
2548 point at the TAIL. So we advance tail. If we
2549 have nested (?:) we may have to move through several
2550 tails.
2551 */
2552
2553 while ( OP( tail ) == TAIL ) {
2554 /* this is the TAIL generated by (?:) */
2555 tail = regnext( tail );
2556 }
a3621e74 2557
a3621e74 2558
b515a41d
YO
2559 DEBUG_OPTIMISE_r({
2560 regprop(RExC_rx, mysv, tail );
2561 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2562 (int)depth * 2 + 2, "",
2563 "Looking for TRIE'able sequences. Tail node is: ",
2564 SvPV_nolen_const( mysv )
2565 );
2566 });
2567
2568 /*
2569
2570 step through the branches, cur represents each
2571 branch, noper is the first thing to be matched
2572 as part of that branch and noper_next is the
2573 regnext() of that node. if noper is an EXACT
2574 and noper_next is the same as scan (our current
2575 position in the regex) then the EXACT branch is
2576 a possible optimization target. Once we have
2577 two or more consequetive such branches we can
2578 create a trie of the EXACT's contents and stich
2579 it in place. If the sequence represents all of
2580 the branches we eliminate the whole thing and
2581 replace it with a single TRIE. If it is a
2582 subsequence then we need to stitch it in. This
2583 means the first branch has to remain, and needs
2584 to be repointed at the item on the branch chain
2585 following the last branch optimized. This could
2586 be either a BRANCH, in which case the
2587 subsequence is internal, or it could be the
2588 item following the branch sequence in which
2589 case the subsequence is at the end.
a3621e74 2590
b515a41d 2591*/
a3621e74 2592
b515a41d
YO
2593 /* dont use tail as the end marker for this traverse */
2594 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2595 regnode * const noper = NEXTOPER( cur );
2596#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2597 regnode * const noper_next = regnext( noper );
2598#endif
2599
2600 DEBUG_OPTIMISE_r({
2601 regprop(RExC_rx, mysv, cur);
2602 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2603 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2604
2605 regprop(RExC_rx, mysv, noper);
2606 PerlIO_printf( Perl_debug_log, " -> %s",
2607 SvPV_nolen_const(mysv));
2608
2609 if ( noper_next ) {
2610 regprop(RExC_rx, mysv, noper_next );
2611 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2612 SvPV_nolen_const(mysv));
2613 }
2614 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2615 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2616 });
2617 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2618 : PL_regkind[ OP( noper ) ] == EXACT )
2619 || OP(noper) == NOTHING )
786e8c11 2620#ifdef NOJUMPTRIE
b515a41d 2621 && noper_next == tail
786e8c11 2622#endif
b515a41d
YO
2623 && count < U16_MAX)
2624 {
2625 count++;
2626 if ( !first || optype == NOTHING ) {
2627 if (!first) first = cur;
2628 optype = OP( noper );
2629 } else {
2630 last = cur;
2631 }
2632 } else {
2633 if ( last ) {
2634 make_trie( pRExC_state,
2635 startbranch, first, cur, tail, count,
2636 optype, depth+1 );
2637 }
2638 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2639#ifdef NOJUMPTRIE
b515a41d 2640 && noper_next == tail
786e8c11 2641#endif
b515a41d
YO
2642 ) {
2643 count = 1;
2644 first = cur;
2645 optype = OP( noper );
2646 } else {
2647 count = 0;
2648 first = NULL;
2649 optype = 0;
2650 }
2651 last = NULL;
2652 }
2653 }
2654 DEBUG_OPTIMISE_r({
2655 regprop(RExC_rx, mysv, cur);
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2658 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2659
2660 });
2661 if ( last ) {
2662 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2663#ifdef TRIE_STUDY_OPT
b515a41d
YO
2664 if ( ((made == MADE_EXACT_TRIE &&
2665 startbranch == first)
2666 || ( first_non_open == first )) &&
2667 depth==0 ) {
2668 flags |= SCF_TRIE_RESTUDY;
2669 if ( startbranch == first
2670 && scan == tail )
2671 {
2672 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2673 }
2674 }
3dab1dad 2675#endif
b515a41d
YO
2676 }
2677 }
2678
2679 } /* do trie */
2680
c277df42 2681 }
b515a41d
YO
2682 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2683 scan = NEXTOPER(NEXTOPER(scan));
2684 } else /* single branch is optimized. */
2685 scan = NEXTOPER(scan);
2686 continue;
2687 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2688 scan_frame *newframe = NULL;
2689 I32 paren;
2690 regnode *start;
2691 regnode *end;
2692
2693 if (OP(scan) != SUSPEND) {
2694 /* set the pointer */
2695 if (OP(scan) == GOSUB) {
2696 paren = ARG(scan);
2697 RExC_recurse[ARG2L(scan)] = scan;
2698 start = RExC_open_parens[paren-1];
2699 end = RExC_close_parens[paren-1];
2700 } else {
2701 paren = 0;
2702 start = RExC_rx->program + 1;
2703 end = RExC_opend;
2704 }
2705 if (!recursed) {
2706 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2707 SAVEFREEPV(recursed);
2708 }
2709 if (!PAREN_TEST(recursed,paren+1)) {
2710 PAREN_SET(recursed,paren+1);
2711 Newx(newframe,1,scan_frame);
2712 } else {
2713 if (flags & SCF_DO_SUBSTR) {
2714 scan_commit(pRExC_state,data,minlenp);
2715 data->longest = &(data->longest_float);
2716 }
2717 is_inf = is_inf_internal = 1;
2718 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2719 cl_anything(pRExC_state, data->start_class);
2720 flags &= ~SCF_DO_STCLASS;
2721 }
2722 } else {
2723 Newx(newframe,1,scan_frame);
2724 paren = stopparen;
2725 start = scan+2;
2726 end = regnext(scan);
0eda9292 2727 }
b515a41d
YO
2728 if (newframe) {
2729 assert(start);
2730 assert(end);
2731 SAVEFREEPV(newframe);
2732 newframe->next = regnext(scan);
2733 newframe->last = end;
2734 newframe->stop = stopparen;
2735 newframe->prev = frame;
2736 frame = newframe;
2737 scan = start;
2738 stopparen = paren;
2739 continue;
2740 }
653099ff 2741 }
b515a41d
YO
2742 else if (OP(scan) == EXACT) {
2743 I32 l = STR_LEN(scan);
2744 UV uc;
2745 if (UTF) {
2746 const U8 * const s = (U8*)STRING(scan);
2747 l = utf8_length(s, s + l);
2748 uc = utf8_to_uvchr(s, NULL);
2749 } else {
2750 uc = *((U8*)STRING(scan));
2751 }
2752 min += l;
2753 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2754 /* The code below prefers earlier match for fixed
2755 offset, later match for variable offset. */
2756 if (data->last_end == -1) { /* Update the start info. */
2757 data->last_start_min = data->pos_min;
2758 data->last_start_max = is_inf
2759 ? I32_MAX : data->pos_min + data->pos_delta;
2760 }
2761 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2762 if (UTF)
2763 SvUTF8_on(data->last_found);
2764 {
2765 SV * const sv = data->last_found;
2766 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2767 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2768 if (mg && mg->mg_len >= 0)
2769 mg->mg_len += utf8_length((U8*)STRING(scan),
2770 (U8*)STRING(scan)+STR_LEN(scan));
2771 }
2772 data->last_end = data->pos_min + l;
2773 data->pos_min += l; /* As in the first entry. */
2774 data->flags &= ~SF_BEFORE_EOL;
2775 }
2776 if (flags & SCF_DO_STCLASS_AND) {
2777 /* Check whether it is compatible with what we know already! */
2778 int compat = 1;
2779
2780 if (uc >= 0x100 ||
2781 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2782 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2783 && (!(data->start_class->flags & ANYOF_FOLD)
2784 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2785 )
2786 compat = 0;
2787 ANYOF_CLASS_ZERO(data->start_class);
2788 ANYOF_BITMAP_ZERO(data->start_class);
2789 if (compat)
2790 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2791 data->start_class->flags &= ~ANYOF_EOS;
b515a41d
YO
2792 if (uc < 0x100)
2793 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff 2794 }
b515a41d
YO
2795 else if (flags & SCF_DO_STCLASS_OR) {
2796 /* false positive possible if the class is case-folded */
1aa99e6b
IH
2797 if (uc < 0x100)
2798 ANYOF_BITMAP_SET(data->start_class, uc);
b515a41d
YO
2799 else
2800 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2801 data->start_class->flags &= ~ANYOF_EOS;
b515a41d 2802 cl_and(data->start_class, and_withp);
653099ff 2803 }
b515a41d 2804 flags &= ~SCF_DO_STCLASS;
653099ff 2805 }
b515a41d
YO
2806 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2807 I32 l = STR_LEN(scan);
2808 UV uc = *((U8*)STRING(scan));
2809
2810 /* Search for fixed substrings supports EXACT only. */
c277df42 2811 if (flags & SCF_DO_SUBSTR) {
b515a41d
YO
2812 assert(data);
2813 scan_commit(pRExC_state, data, minlenp);
c277df42 2814 }
b515a41d
YO
2815 if (UTF) {
2816 const U8 * const s = (U8 *)STRING(scan);
2817 l = utf8_length(s, s + l);
2818 uc = utf8_to_uvchr(s, NULL);
653099ff 2819 }
b515a41d
YO
2820 min += l;
2821 if (flags & SCF_DO_SUBSTR)
2822 data->pos_min += l;
2823 if (flags & SCF_DO_STCLASS_AND) {
2824 /* Check whether it is compatible with what we know already! */
2825 int compat = 1;
2826
2827 if (uc >= 0x100 ||
2828 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2829 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2830 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2831 compat = 0;
2832 ANYOF_CLASS_ZERO(data->start_class);
2833 ANYOF_BITMAP_ZERO(data->start_class);
2834 if (compat) {
2835 ANYOF_BITMAP_SET(data->start_class, uc);
2836 data->start_class->flags &= ~ANYOF_EOS;
2837 data->start_class->flags |= ANYOF_FOLD;
2838 if (OP(scan) == EXACTFL)
2839 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2840 }
653099ff 2841 }
b515a41d
YO
2842 else if (flags & SCF_DO_STCLASS_OR) {
2843 if (data->start_class->flags & ANYOF_FOLD) {
2844 /* false positive possible if the class is case-folded.
2845 Assume that the locale settings are the same... */
2846 if (uc < 0x100)
2847 ANYOF_BITMAP_SET(data->start_class, uc);
2848 data->start_class->flags &= ~ANYOF_EOS;
2849 }
2850 cl_and(data->start_class, and_withp);
b45f050a 2851 }
b515a41d
YO
2852 flags &= ~SCF_DO_STCLASS;
2853 }
2854 else if (strchr((const char*)PL_varies,OP(scan))) {
2855 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2856 I32 f = flags, pos_before = 0;
2857 regnode * const oscan = scan;
2858 struct regnode_charclass_class this_class;
2859 struct regnode_charclass_class *oclass = NULL;
2860 I32 next_is_eval = 0;
b45f050a 2861
b515a41d
YO
2862 switch (PL_regkind[OP(scan)]) {
2863 case WHILEM: /* End of (?:...)* . */
2864 scan = NEXTOPER(scan);
2865 goto finish;
2866 case PLUS:
2867 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2868 next = NEXTOPER(scan);
2869 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2870 mincount = 1;
2871 maxcount = REG_INFTY;
2872 next = regnext(scan);
2873 scan = NEXTOPER(scan);
2874 goto do_curly;
2875 }
2876 }
2877 if (flags & SCF_DO_SUBSTR)
2878 data->pos_min++;
2879 min++;
2880 /* Fall through. */
2881 case STAR:
2882 if (flags & SCF_DO_STCLASS) {
2883 mincount = 0;
2884 maxcount = REG_INFTY;
2885 next = regnext(scan);
2886 scan = NEXTOPER(scan);
2887 goto do_curly;
2888 }
2889 is_inf = is_inf_internal = 1;
2890 scan = regnext(scan);
2891 if (flags & SCF_DO_SUBSTR) {
2892 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2893 data->longest = &(data->longest_float);
2894 }
2895 goto optimize_curly_tail;
2896 case CURLY:
2897 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2898 && (scan->flags == stopparen))
2899 {
2900 mincount = 1;
2901 maxcount = 1;
2902 } else {
2903 mincount = ARG1(scan);
2904 maxcount = ARG2(scan);
2905 }
2906 next = regnext(scan);
2907 if (OP(scan) == CURLYX) {
2908 I32 lp = (data ? *(data->last_closep) : 0);
2909 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2910 }
2911 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2912 next_is_eval = (OP(scan) == EVAL);
2913do_curly:
2914 if (flags & SCF_DO_SUBSTR) {
2915 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2916 pos_before = data->pos_min;
2917 }
2918 if (data) {
2919 fl = data->flags;
2920 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2921 if (is_inf)
2922 data->flags |= SF_IS_INF;
2923 }
2924 if (flags & SCF_DO_STCLASS) {
2925 cl_init(pRExC_state, &this_class);
2926 oclass = data->start_class;
2927 data->start_class = &this_class;
2928 f |= SCF_DO_STCLASS_AND;
2929 f &= ~SCF_DO_STCLASS_OR;
2930 }
2931 /* These are the cases when once a subexpression
2932 fails at a particular position, it cannot succeed
2933 even after backtracking at the enclosing scope.
2934
2935 XXXX what if minimal match and we are at the
2936 initial run of {n,m}? */
2937 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2938 f &= ~SCF_WHILEM_VISITED_POS;
2939
2940 /* This will finish on WHILEM, setting scan, or on NULL: */
2941 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2942 last, data, stopparen, recursed, NULL,
2943 (mincount == 0
2944 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2945
2946 if (flags & SCF_DO_STCLASS)
2947 data->start_class = oclass;
2948 if (mincount == 0 || minnext == 0) {
2949 if (flags & SCF_DO_STCLASS_OR) {
2950 cl_or(pRExC_state, data->start_class, &this_class);
2951 }
2952 else if (flags & SCF_DO_STCLASS_AND) {
2953 /* Switch to OR mode: cache the old value of
2954 * data->start_class */
2955 INIT_AND_WITHP;
2956 StructCopy(data->start_class, and_withp,
2957 struct regnode_charclass_class);
2958 flags &= ~SCF_DO_STCLASS_AND;
2959 StructCopy(&this_class, data->start_class,
2960 struct regnode_charclass_class);
2961 flags |= SCF_DO_STCLASS_OR;
2962 data->start_class->flags |= ANYOF_EOS;
2963 }
2964 } else { /* Non-zero len */
2965 if (flags & SCF_DO_STCLASS_OR) {
2966 cl_or(pRExC_state, data->start_class, &this_class);
2967 cl_and(data->start_class, and_withp);
2968 }
2969 else if (flags & SCF_DO_STCLASS_AND)
2970 cl_and(data->start_class, &this_class);
2971 flags &= ~SCF_DO_STCLASS;
2972 }
2973 if (!scan) /* It was not CURLYX, but CURLY. */
2974 scan = next;
2975 if ( /* ? quantifier ok, except for (?{ ... }) */
2976 (next_is_eval || !(mincount == 0 && maxcount == 1))
2977 && (minnext == 0) && (deltanext == 0)
2978 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2979 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2980 && ckWARN(WARN_REGEXP))
2981 {
2982 vWARN(RExC_parse,
2983 "Quantifier unexpected on zero-length expression");
2984 }
2985
2986 min += minnext * mincount;
2987 is_inf_internal |= ((maxcount == REG_INFTY
2988 && (minnext + deltanext) > 0)
2989 || deltanext == I32_MAX);
2990 is_inf |= is_inf_internal;
2991 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2992
2993 /* Try powerful optimization CURLYX => CURLYN. */
2994 if ( OP(oscan) == CURLYX && data
2995 && data->flags & SF_IN_PAR
2996 && !(data->flags & SF_HAS_EVAL)
2997 && !deltanext && minnext == 1 ) {
2998 /* Try to optimize to CURLYN. */
2999 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3000 regnode * const nxt1 = nxt;
497b47a8 3001#ifdef DEBUGGING
b515a41d 3002 regnode *nxt2;
497b47a8 3003#endif
c277df42 3004
b515a41d
YO
3005 /* Skip open. */
3006 nxt = regnext(nxt);
3007 if (!strchr((const char*)PL_simple,OP(nxt))
3008 && !(PL_regkind[OP(nxt)] == EXACT
3009 && STR_LEN(nxt) == 1))
3010 goto nogo;
497b47a8 3011#ifdef DEBUGGING
b515a41d 3012 nxt2 = nxt;
497b47a8 3013#endif
b515a41d
YO
3014 nxt = regnext(nxt);
3015 if (OP(nxt) != CLOSE)
3016 goto nogo;
3017 if (RExC_open_parens) {
3018 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3019 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3020 }
3021 /* Now we know that nxt2 is the only contents: */
3022 oscan->flags = (U8)ARG(nxt);
3023 OP(oscan) = CURLYN;
3024 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3025
c277df42 3026#ifdef DEBUGGING
b515a41d
YO
3027 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3028 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3029 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3030 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3031 OP(nxt + 1) = OPTIMIZED; /* was count. */
3032 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3033#endif
40d049e4 3034 }
b515a41d
YO
3035nogo:
3036
3037 /* Try optimization CURLYX => CURLYM. */
3038 if ( OP(oscan) == CURLYX && data
3039 && !(data->flags & SF_HAS_PAR)
3040 && !(data->flags & SF_HAS_EVAL)
3041 && !deltanext /* atom is fixed width */
3042 && minnext != 0 /* CURLYM can't handle zero width */
3043 ) {
3044 /* XXXX How to optimize if data == 0? */
3045 /* Optimize to a simpler form. */
3046 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3047 regnode *nxt2;
3048
3049 OP(oscan) = CURLYM;
3050 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3051 && (OP(nxt2) != WHILEM))
3052 nxt = nxt2;
3053 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3054 /* Need to optimize away parenths. */
3055 if (data->flags & SF_IN_PAR) {
3056 /* Set the parenth number. */
3057 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3058
3059 if (OP(nxt) != CLOSE)
3060 FAIL("Panic opt close");
3061 oscan->flags = (U8)ARG(nxt);
3062 if (RExC_open_parens) {
3063 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3064 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3065 }
3066 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3067 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3068
c277df42 3069#ifdef DEBUGGING
b515a41d
YO
3070 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3071 OP(nxt + 1) = OPTIMIZED; /* was count. */
3072 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3073 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3074#endif
c277df42 3075#if 0
b515a41d
YO
3076 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3077 regnode *nnxt = regnext(nxt1);
3078
3079 if (nnxt == nxt) {
3080 if (reg_off_by_arg[OP(nxt1)])
3081 ARG_SET(nxt1, nxt2 - nxt1);
3082 else if (nxt2 - nxt1 < U16_MAX)
3083 NEXT_OFF(nxt1) = nxt2 - nxt1;
3084 else
3085 OP(nxt) = NOTHING; /* Cannot beautify */
3086 }
3087 nxt1 = nnxt;
3088 }
3089#endif
3090 /* Optimize again: */
3091 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3092 NULL, stopparen, recursed, NULL, 0,depth+1);
c277df42 3093 }
b515a41d
YO
3094 else
3095 oscan->flags = 0;
c277df42 3096 }
b515a41d
YO
3097 else if ((OP(oscan) == CURLYX)
3098 && (flags & SCF_WHILEM_VISITED_POS)
3099 /* See the comment on a similar expression above.
3100 However, this time it not a subexpression
3101 we care about, but the expression itself. */
3102 && (maxcount == REG_INFTY)
3103 && data && ++data->whilem_c < 16) {
3104 /* This stays as CURLYX, we can put the count/of pair. */
3105 /* Find WHILEM (as in regexec.c) */
3106 regnode *nxt = oscan + NEXT_OFF(oscan);
3107
3108 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3109 nxt += ARG(nxt);
3110 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3111 | (RExC_whilem_seen << 4)); /* On WHILEM */
3112 }
3113 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3114 pars++;
3115 if (flags & SCF_DO_SUBSTR) {
3116 SV *last_str = NULL;
3117 int counted = mincount != 0;
c277df42 3118
b515a41d 3119 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c 3120#if defined(SPARC64_GCC_WORKAROUND)
b515a41d
YO
3121 I32 b = 0;
3122 STRLEN l = 0;
3123 const char *s = NULL;
3124 I32 old = 0;
5d1c421c 3125
b515a41d
YO
3126 if (pos_before >= data->last_start_min)
3127 b = pos_before;
3128 else
3129 b = data->last_start_min;
5d1c421c 3130
b515a41d
YO
3131 l = 0;
3132 s = SvPV_const(data->last_found, l);
3133 old = b - data->last_start_min;
5d1c421c
JH
3134
3135#else
b515a41d
YO
3136 I32 b = pos_before >= data->last_start_min
3137 ? pos_before : data->last_start_min;
3138 STRLEN l;
3139 const char * const s = SvPV_const(data->last_found, l);
3140 I32 old = b - data->last_start_min;
5d1c421c 3141#endif
a0ed51b3 3142
b515a41d
YO
3143 if (UTF)
3144 old = utf8_hop((U8*)s, old) - (U8*)s;
3145
3146 l -= old;
3147 /* Get the added string: */
3148 last_str = newSVpvn(s + old, l);
3149 if (UTF)
3150 SvUTF8_on(last_str);
3151 if (deltanext == 0 && pos_before == b) {
3152 /* What was added is a constant string */
3153 if (mincount > 1) {
3154 SvGROW(last_str, (mincount * l) + 1);
3155 repeatcpy(SvPVX(last_str) + l,
3156 SvPVX_const(last_str), l, mincount - 1);
3157 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3158 /* Add additional parts. */
3159 SvCUR_set(data->last_found,
3160 SvCUR(data->last_found) - l);
3161 sv_catsv(data->last_found, last_str);
3162 {
3163 SV * sv = data->last_found;
3164 MAGIC *mg =
3165 SvUTF8(sv) && SvMAGICAL(sv) ?
3166 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3167 if (mg && mg->mg_len >= 0)
3168 mg->mg_len += CHR_SVLEN(last_str);
3169 }
3170 data->last_end += l * (mincount - 1);
3171 }
3172 } else {
3173 /* start offset must point into the last copy */
3174 data->last_start_min += minnext * (mincount - 1);
3175 data->last_start_max += is_inf ? I32_MAX
3176 : (maxcount - 1) * (minnext + data->pos_delta);
3177 }
3178 }
3179 /* It is counted once already... */
3180 data->pos_min += minnext * (mincount - counted);
3181 data->pos_delta += - counted * deltanext +
3182 (minnext + deltanext) * maxcount - minnext * mincount;
3183 if (mincount != maxcount) {
3184 /* Cannot extend fixed substrings found inside
3185 the group. */
3186 scan_commit(pRExC_state,data,minlenp);
3187 if (mincount && last_str) {
3188 SV * const sv = data->last_found;
3189 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292 3190 mg_find(sv, PERL_MAGIC_utf8) : NULL;
b515a41d
YO
3191
3192 if (mg)
3193 mg->mg_len = -1;
3194 sv_setsv(sv, last_str);
3195 data->last_end = data->pos_min;
3196 data->last_start_min =
3197 data->pos_min - CHR_SVLEN(last_str);
3198 data->last_start_max = is_inf
3199 ? I32_MAX
3200 : data->pos_min + data->pos_delta
3201 - CHR_SVLEN(last_str);
0eda9292 3202 }
b515a41d 3203 data->longest = &(data->longest_float);
c277df42 3204 }
b515a41d 3205 SvREFCNT_dec(last_str);
c277df42 3206 }
b515a41d
YO
3207 if (data && (fl & SF_HAS_EVAL))
3208 data->flags |= SF_HAS_EVAL;
3209optimize_curly_tail:
3210 if (OP(oscan) != CURLYX) {
3211 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3212 && NEXT_OFF(next))
3213 NEXT_OFF(oscan) += NEXT_OFF(next);
c277df42 3214 }
b515a41d
YO
3215 continue;
3216 default: /* REF and CLUMP only? */
3217 if (flags & SCF_DO_SUBSTR) {
3218 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3219 data->longest = &(data->longest_float);
3220 }
3221 is_inf = is_inf_internal = 1;
3222 if (flags & SCF_DO_STCLASS_OR)
3223 cl_anything(pRExC_state, data->start_class);
3224 flags &= ~SCF_DO_STCLASS;
3225 break;
3226 }
3227 }
3228 else if (strchr((const char*)PL_simple,OP(scan))) {
3229 int value = 0;
3230
3231 if (flags & SCF_DO_SUBSTR) {
3232 scan_commit(pRExC_state,data,minlenp);
3233 data->pos_min++;
3234 }
3235 min++;
3236 if (flags & SCF_DO_STCLASS) {
3237 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3238
3239 /* Some of the logic below assumes that switching
3240 locale on will only add false positives. */
3241 switch (PL_regkind[OP(scan)]) {
3242 case SANY:
3243 default:
3244do_default:
3245 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3246 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3247 cl_anything(pRExC_state, data->start_class);
3248 break;
3249 case REG_ANY:
3250 if (OP(scan) == SANY)
3251 goto do_default;
3252 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3253 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3254 || (data->start_class->flags & ANYOF_CLASS));
3255 cl_anything(pRExC_state, data->start_class);
3256 }
3257 if (flags & SCF_DO_STCLASS_AND || !value)
3258 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3259 break;
3260 case ANYOF:
3261 if (flags & SCF_DO_STCLASS_AND)
3262 cl_and(data->start_class,
3263 (struct regnode_charclass_class*)scan);
3264 else
3265 cl_or(pRExC_state, data->start_class,
3266 (struct regnode_charclass_class*)scan);
3267 break;
3268 case ALNUM:
3269 if (flags & SCF_DO_STCLASS_AND) {
3270 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3271 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3272 for (value = 0; value < 256; value++)
3273 if (!isALNUM(value))
3274 ANYOF_BITMAP_CLEAR(data->start_class, value);
3275 }
3276 }
3277 else {
3278 if (data->start_class->flags & ANYOF_LOCALE)
3279 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3280 else {
3281 for (value = 0; value < 256; value++)
3282 if (isALNUM(value))
3283 ANYOF_BITMAP_SET(data->start_class, value);
3284 }
3285 }
3286 break;
3287 case ALNUML:
3288 if (flags & SCF_DO_STCLASS_AND) {
3289 if (data->start_class->flags & ANYOF_LOCALE)
3290 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3291 }
3292 else {
3293 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3294 data->start_class->flags |= ANYOF_LOCALE;
3295 }
3296 break;
3297 case NALNUM:
3298 if (flags & SCF_DO_STCLASS_AND) {
3299 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3300 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3301 for (value = 0; value < 256; value++)
3302 if (isALNUM(value))
3303 ANYOF_BITMAP_CLEAR(data->start_class, value);
3304 }
3305 }
3306 else {
3307 if (data->start_class->flags & ANYOF_LOCALE)
3308 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3309 else {
3310 for (value = 0; value < 256; value++)
3311 if (!isALNUM(value))
3312 ANYOF_BITMAP_SET(data->start_class, value);
3313 }
3314 }
3315 break;
3316 case NALNUML:
3317 if (flags & SCF_DO_STCLASS_AND) {
3318 if (data->start_class->flags & ANYOF_LOCALE)
3319 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3320 }
3321 else {
3322 data->start_class->flags |= ANYOF_LOCALE;
3323 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3324 }
3325 break;
3326 case SPACE:
3327 if (flags & SCF_DO_STCLASS_AND) {
3328 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3329 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3330 for (value = 0; value < 256; value++)
3331 if (!isSPACE(value))
3332 ANYOF_BITMAP_CLEAR(data->start_class, value);
3333 }
3334 }
3335 else {
3336 if (data->start_class->flags & ANYOF_LOCALE)
3337 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3338 else {
3339 for (value = 0; value < 256; value++)
3340 if (isSPACE(value))
3341 ANYOF_BITMAP_SET(data->start_class, value);
3342 }
3343 }
3344 break;
3345 case SPACEL:
3346 if (flags & SCF_DO_STCLASS_AND) {
3347 if (data->start_class->flags & ANYOF_LOCALE)
3348 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3349 }
3350 else {
3351 data->start_class->flags |= ANYOF_LOCALE;
3352 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3353 }
3354 break;
3355 case NSPACE:
3356 if (flags & SCF_DO_STCLASS_AND) {
3357 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3358 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3359 for (value = 0; value < 256; value++)
3360 if (isSPACE(value))
3361 ANYOF_BITMAP_CLEAR(data->start_class, value);
3362 }
3363 }
3364 else {
3365 if (data->start_class->flags & ANYOF_LOCALE)
3366 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3367 else {
3368 for (value = 0; value < 256; value++)
3369 if (!isSPACE(value))
3370 ANYOF_BITMAP_SET(data->start_class, value);
3371 }
3372 }
3373 break;
3374 case NSPACEL:
3375 if (flags & SCF_DO_STCLASS_AND) {
3376 if (data->start_class->flags & ANYOF_LOCALE) {
3377 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3378 for (value = 0; value < 256; value++)
3379 if (!isSPACE(value))
3380 ANYOF_BITMAP_CLEAR(data->start_class, value);
3381 }
3382 }
3383 else {
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3386 }
3387 break;
3388 case DIGIT:
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3391 for (value = 0; value < 256; value++)
3392 if (!isDIGIT(value))
3393 ANYOF_BITMAP_CLEAR(data->start_class, value);
3394 }
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (isDIGIT(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
3402 }
3403 }
3404 break;
3405 case NDIGIT:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3408 for (value = 0; value < 256; value++)
3409 if (isDIGIT(value))
3410 ANYOF_BITMAP_CLEAR(data->start_class, value);
3411 }
3412 else {
3413 if (data->start_class->flags & ANYOF_LOCALE)
3414 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3415 else {
3416 for (value = 0; value < 256; value++)
3417 if (!isDIGIT(value))
3418 ANYOF_BITMAP_SET(data->start_class, value);
3419 }
3420 }
3421 break;
c277df42 3422 }
b515a41d
YO
3423 if (flags & SCF_DO_STCLASS_OR)
3424 cl_and(data->start_class, and_withp);
3425 flags &= ~SCF_DO_STCLASS;
c277df42 3426 }
c277df42 3427 }
b515a41d
YO
3428 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3429 data->flags |= (OP(scan) == MEOL
3430 ? SF_BEFORE_MEOL
3431 : SF_BEFORE_SEOL);
c277df42 3432 }
b515a41d
YO
3433 else if ( PL_regkind[OP(scan)] == BRANCHJ
3434 /* Lookbehind, or need to calculate parens/evals/stclass: */
3435 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3436 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3437 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3438 || OP(scan) == UNLESSM )
3439 {
3440 /* Negative Lookahead/lookbehind
3441 In this case we can't do fixed string optimisation.
3442 */
653099ff 3443
b515a41d
YO
3444 I32 deltanext, minnext, fake = 0;
3445 regnode *nscan;
3446 struct regnode_charclass_class intrnl;
3447 int f = 0;
3448
3449 data_fake.flags = 0;
3450 if (data) {
3451 data_fake.whilem_c = data->whilem_c;
3452 data_fake.last_closep = data->last_closep;
653099ff 3453 }
653099ff 3454 else
b515a41d
YO
3455 data_fake.last_closep = &fake;
3456 if ( flags & SCF_DO_STCLASS && !scan->flags
3457 && OP(scan) == IFMATCH ) { /* Lookahead */
3458 cl_init(pRExC_state, &intrnl);
3459 data_fake.start_class = &intrnl;
3460 f |= SCF_DO_STCLASS_AND;
653099ff 3461 }
b515a41d
YO
3462 if (flags & SCF_WHILEM_VISITED_POS)
3463 f |= SCF_WHILEM_VISITED_POS;
3464 next = regnext(scan);
3465 nscan = NEXTOPER(NEXTOPER(scan));
3466 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3467 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3468 if (scan->flags) {
3469 if (deltanext) {
3470 vFAIL("Variable length lookbehind not implemented");
653099ff 3471 }
b515a41d
YO
3472 else if (minnext > (I32)U8_MAX) {
3473 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
653099ff 3474 }
b515a41d 3475 scan->flags = (U8)minnext;
653099ff 3476 }
b515a41d
YO
3477 if (data) {
3478 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3479 pars++;
3480 if (data_fake.flags & SF_HAS_EVAL)
3481 data->flags |= SF_HAS_EVAL;
3482 data->whilem_c = data_fake.whilem_c;
653099ff 3483 }
b515a41d
YO
3484 if (f & SCF_DO_STCLASS_AND) {
3485 const int was = (data->start_class->flags & ANYOF_EOS);
3486
3487 cl_and(data->start_class, &intrnl);
3488 if (was)
3489 data->start_class->flags |= ANYOF_EOS;
653099ff 3490 }
b515a41d
YO
3491 }
3492#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3493 else {
3494 /* Positive Lookahead/lookbehind
3495 In this case we can do fixed string optimisation,
3496 but we must be careful about it. Note in the case of
3497 lookbehind the positions will be offset by the minimum
3498 length of the pattern, something we won't know about
3499 until after the recurse.
3500 */
3501 I32 deltanext, fake = 0;
3502 regnode *nscan;
3503 struct regnode_charclass_class intrnl;
3504 int f = 0;
3505 /* We use SAVEFREEPV so that when the full compile
3506 is finished perl will clean up the allocated
3507 minlens when its all done. This was we don't
3508 have to worry about freeing them when we know
3509 they wont be used, which would be a pain.
3510 */
3511 I32 *minnextp;
3512 Newx( minnextp, 1, I32 );
3513 SAVEFREEPV(minnextp);
3514
3515 if (data) {
3516 StructCopy(data, &data_fake, scan_data_t);
3517 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3518 f |= SCF_DO_SUBSTR;
3519 if (scan->flags)
3520 scan_commit(pRExC_state, &data_fake,minlenp);
3521 data_fake.last_found=newSVsv(data->last_found);
653099ff
GS
3522 }
3523 }
b515a41d
YO
3524 else
3525 data_fake.last_closep = &fake;
3526 data_fake.flags = 0;
3527 if (is_inf)
3528 data_fake.flags |= SF_IS_INF;
3529 if ( flags & SCF_DO_STCLASS && !scan->flags
3530 && OP(scan) == IFMATCH ) { /* Lookahead */
3531 cl_init(pRExC_state, &intrnl);
3532 data_fake.start_class = &intrnl;
3533 f |= SCF_DO_STCLASS_AND;
653099ff 3534 }
b515a41d
YO
3535 if (flags & SCF_WHILEM_VISITED_POS)
3536 f |= SCF_WHILEM_VISITED_POS;
3537 next = regnext(scan);
3538 nscan = NEXTOPER(NEXTOPER(scan));
3539
3540 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3541 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3542 if (scan->flags) {
3543 if (deltanext) {
3544 vFAIL("Variable length lookbehind not implemented");
653099ff 3545 }
b515a41d
YO
3546 else if (*minnextp > (I32)U8_MAX) {
3547 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
653099ff 3548 }
b515a41d 3549 scan->flags = (U8)*minnextp;
653099ff 3550 }
b515a41d
YO
3551
3552 *minnextp += min;
3553
3554 if (f & SCF_DO_STCLASS_AND) {
3555 const int was = (data->start_class->flags & ANYOF_EOS);
3556
3557 cl_and(data->start_class, &intrnl);
3558 if (was)
3559 data->start_class->flags |= ANYOF_EOS;
653099ff 3560 }
b515a41d
YO
3561 if (data) {
3562 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3563 pars++;
3564 if (data_fake.flags & SF_HAS_EVAL)
3565 data->flags |= SF_HAS_EVAL;
3566 data->whilem_c = data_fake.whilem_c;
3567 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3568 if (RExC_rx->minlen<*minnextp)
3569 RExC_rx->minlen=*minnextp;
3570 scan_commit(pRExC_state, &data_fake, minnextp);
3571 SvREFCNT_dec(data_fake.last_found);
3572
3573 if ( data_fake.minlen_fixed != minlenp )
3574 {
3575 data->offset_fixed= data_fake.offset_fixed;
3576 data->minlen_fixed= data_fake.minlen_fixed;
3577 data->lookbehind_fixed+= scan->flags;
3578 }
3579 if ( data_fake.minlen_float != minlenp )
3580 {
3581 data->minlen_float= data_fake.minlen_float;
3582 data->offset_float_min=data_fake.offset_float_min;
3583 data->offset_float_max=data_fake.offset_float_max;
3584 data->lookbehind_float+= scan->flags;
3585 }
653099ff
GS
3586 }
3587 }
1de06328 3588
1de06328 3589
c277df42 3590 }
1de06328 3591#endif
cb434fcc 3592 }
b515a41d
YO
3593 else if (OP(scan) == OPEN) {
3594 if (stopparen != (I32)ARG(scan))
3595 pars++;
40d049e4 3596 }
b515a41d
YO
3597 else if (OP(scan) == CLOSE) {
3598 if (stopparen == (I32)ARG(scan)) {
3599 break;
40d049e4 3600 }
b515a41d
YO
3601 if ((I32)ARG(scan) == is_par) {
3602 next = regnext(scan);
3603
3604 if ( next && (OP(next) != WHILEM) && next < last)
3605 is_par = 0; /* Disable optimization */
40d049e4 3606 }
b515a41d
YO
3607 if (data)
3608 *(data->last_closep) = ARG(scan);
40d049e4 3609 }
b515a41d 3610 else if (OP(scan) == EVAL) {
c277df42
IZ
3611 if (data)
3612 data->flags |= SF_HAS_EVAL;
40d049e4 3613 }
b515a41d
YO
3614 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3615 if (flags & SCF_DO_SUBSTR) {
3616 scan_commit(pRExC_state,data,minlenp);
3617 flags &= ~SCF_DO_SUBSTR;
3618 }
3619 if (data && OP(scan)==ACCEPT) {
3620 data->flags |= SCF_SEEN_ACCEPT;
3621 if (stopmin > min)
3622 stopmin = min;
3623 }
e2e6a0f1 3624 }
b515a41d
YO
3625 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3626 {
0f5d15d6 3627 if (flags & SCF_DO_SUBSTR) {
1de06328 3628 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3629 data->longest = &(data->longest_float);
3630 }
3631 is_inf = is_inf_internal = 1;
653099ff 3632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3633 cl_anything(pRExC_state, data->start_class);
96776eda 3634 flags &= ~SCF_DO_STCLASS;
b515a41d 3635 }
786e8c11 3636#ifdef TRIE_STUDY_OPT
40d049e4 3637#ifdef FULL_TRIE_STUDY
b515a41d
YO
3638 else if (PL_regkind[OP(scan)] == TRIE) {
3639 /* NOTE - There is similar code to this block above for handling
3640 BRANCH nodes on the initial study. If you change stuff here
3641 check there too. */
3642 regnode *trie_node= scan;
3643 regnode *tail= regnext(scan);
3644 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3645 I32 max1 = 0, min1 = I32_MAX;
3646 struct regnode_charclass_class accum;
3647
3648 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3649 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3650 if (flags & SCF_DO_STCLASS)
3651 cl_init_zero(pRExC_state, &accum);
3652
3653 if (!trie->jump) {
3654 min1= trie->minlen;
3655 max1= trie->maxlen;
3656 } else {
3657 const regnode *nextbranch= NULL;
3658 U32 word;
3659
3660 for ( word=1 ; word <= trie->wordcount ; word++)
3661 {
3662 I32 deltanext=0, minnext=0, f = 0, fake;
3663 struct regnode_charclass_class this_class;
3664
3665 data_fake.flags = 0;
3666 if (data) {
3667 data_fake.whilem_c = data->whilem_c;
3668 data_fake.last_closep = data->last_closep;
3669 }
3670 else
3671 data_fake.last_closep = &fake;
3672
3673 if (flags & SCF_DO_STCLASS) {
3674 cl_init(pRExC_state, &this_class);
3675 data_fake.start_class = &this_class;
3676 f = SCF_DO_STCLASS_AND;
3677 }
3678 if (flags & SCF_WHILEM_VISITED_POS)
3679 f |= SCF_WHILEM_VISITED_POS;
3680
3681 if (trie->jump[word]) {
3682 if (!nextbranch)
3683 nextbranch = trie_node + trie->jump[0];
3684 scan= trie_node + trie->jump[word];
3685 /* We go from the jump point to the branch that follows
3686 it. Note this means we need the vestigal unused branches
3687 even though they arent otherwise used.
3688 */
3689 minnext = study_chunk(pRExC_state, &scan, minlenp,
3690 &deltanext, (regnode *)nextbranch, &data_fake,
3691 stopparen, recursed, NULL, f,depth+1);
3692 }
3693 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3694 nextbranch= regnext((regnode*)nextbranch);
3695
3696 if (min1 > (I32)(minnext + trie->minlen))
3697 min1 = minnext + trie->minlen;
3698 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3699 max1 = minnext + deltanext + trie->maxlen;
3700 if (deltanext == I32_MAX)
3701 is_inf = is_inf_internal = 1;
3702
3703 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3704 pars++;
3705 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3706 if ( stopmin > min + min1)
3707 stopmin = min + min1;
3708 flags &= ~SCF_DO_SUBSTR;
3709 if (data)
3710 data->flags |= SCF_SEEN_ACCEPT;
3711 }
3712 if (data) {
3713 if (data_fake.flags & SF_HAS_EVAL)
3714 data->flags |= SF_HAS_EVAL;
3715 data->whilem_c = data_fake.whilem_c;
3716 }
3717 if (flags & SCF_DO_STCLASS)
3718 cl_or(pRExC_state, &accum, &this_class);
3719 }
3720 }
3721 if (flags & SCF_DO_SUBSTR) {
3722 data->pos_min += min1;
3723 data->pos_delta += max1 - min1;
3724 if (max1 != min1 || is_inf)
3725 data->longest = &(data->longest_float);
3726 }
3727 min += min1;
3728 delta += max1 - min1;
3729 if (flags & SCF_DO_STCLASS_OR) {
3730 cl_or(pRExC_state, data->start_class, &accum);
3731 if (min1) {
3732 cl_and(data->start_class, and_withp);
3733 flags &= ~SCF_DO_STCLASS;
3734 }
3735 }
3736 else if (flags & SCF_DO_STCLASS_AND) {
3737 if (min1) {
3738 cl_and(data->start_class, &accum);
3739 flags &= ~SCF_DO_STCLASS;
3740 }
3741 else {
3742 /* Switch to OR mode: cache the old value of
3743 * data->start_class */
3744 INIT_AND_WITHP;
3745 StructCopy(data->start_class, and_withp,
3746 struct regnode_charclass_class);
3747 flags &= ~SCF_DO_STCLASS_AND;
3748 StructCopy(&accum, data->start_class,
3749 struct regnode_charclass_class);
3750 flags |= SCF_DO_STCLASS_OR;
3751 data->start_class->flags |= ANYOF_EOS;
3752 }
3753 }
3754 scan= tail;
3755 continue;
3756 }
786e8c11 3757#else
b515a41d
YO
3758 else if (PL_regkind[OP(scan)] == TRIE) {
3759 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3760 U8*bang=NULL;
3761
3762 min += trie->minlen;
3763 delta += (trie->maxlen - trie->minlen);
3764 flags &= ~SCF_DO_STCLASS; /* xxx */
3765 if (flags & SCF_DO_SUBSTR) {
3766 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3767 data->pos_min += trie->minlen;
3768 data->pos_delta += (trie->maxlen - trie->minlen);
3769 if (trie->maxlen != trie->minlen)
3770 data->longest = &(data->longest_float);
3771 }
3772 if (trie->jump) /* no more substrings -- for now /grr*/
3773 flags &= ~SCF_DO_SUBSTR;
3774 }
786e8c11 3775#endif /* old or new */
b515a41d
YO
3776#endif /* TRIE_STUDY_OPT */
3777 /* Else: zero-length, ignore. */
3778 scan = regnext(scan);
3779 }
3780 DEBUG_PEEP("FEND",scan,depth);
3781 scan = frame->next;
3782 stopparen = frame->stop;
3783 frame = frame->prev;
3784
c277df42
IZ
3785 }
3786
3787 finish:
3788 *scanp = scan;
aca2d497 3789 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3790 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3791 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3792 if (is_par > (I32)U8_MAX)
c277df42
IZ
3793 is_par = 0;
3794 if (is_par && pars==1 && data) {
3795 data->flags |= SF_IN_PAR;
3796 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3797 }
3798 else if (pars && data) {
c277df42
IZ
3799 data->flags |= SF_HAS_PAR;
3800 data->flags &= ~SF_IN_PAR;
3801 }
653099ff 3802 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3803 cl_and(data->start_class, and_withp);
786e8c11
YO
3804 if (flags & SCF_TRIE_RESTUDY)
3805 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3806
3807 DEBUG_STUDYDATA(data,depth);
3808
e2e6a0f1 3809 return min < stopmin ? min : stopmin;
c277df42
IZ
3810}
3811
76e3520e 3812STATIC I32
5f66b61c 3813S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3814{
830247a4 3815 if (RExC_rx->data) {
2eb97020 3816 const U32 count = RExC_rx->data->count;
b81d288d 3817 Renewc(RExC_rx->data,
2eb97020 3818 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3819 char, struct reg_data);
2eb97020 3820 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3821 RExC_rx->data->count += n;
a0ed51b3
LW
3822 }
3823 else {
a02a5408 3824 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3825 char, struct reg_data);
a02a5408 3826 Newx(RExC_rx->data->what, n, U8);
830247a4 3827 RExC_rx->data->count = n;
c277df42 3828 }
830247a4
IZ
3829 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3830 return RExC_rx->data->count - n;
c277df42
IZ
3831}
3832
76234dfb 3833#ifndef PERL_IN_XSUB_RE
d88dccdf 3834void
864dbfa3 3835Perl_reginitcolors(pTHX)
d88dccdf 3836{
97aff369 3837 dVAR;
1df70142 3838 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3839 if (s) {
1df70142
AL
3840 char *t = savepv(s);
3841 int i = 0;
3842 PL_colors[0] = t;
d88dccdf 3843 while (++i < 6) {
1df70142
AL
3844 t = strchr(t, '\t');
3845 if (t) {
3846 *t = '\0';
3847 PL_colors[i] = ++t;
d88dccdf
IZ
3848 }
3849 else
1df70142 3850 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3851 }
3852 } else {
1df70142 3853 int i = 0;
b81d288d 3854 while (i < 6)
06b5626a 3855 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3856 }
3857 PL_colorset = 1;
3858}
76234dfb 3859#endif
8615cb43 3860
07be1b83 3861
786e8c11
YO
3862#ifdef TRIE_STUDY_OPT
3863#define CHECK_RESTUDY_GOTO \
3864 if ( \
3865 (data.flags & SCF_TRIE_RESTUDY) \
3866 && ! restudied++ \
3867 ) goto reStudy
3868#else
3869#define CHECK_RESTUDY_GOTO
3870#endif
f9f4320a 3871
a687059c 3872/*
e50aee73 3873 - pregcomp - compile a regular expression into internal code
a687059c
LW
3874 *
3875 * We can't allocate space until we know how big the compiled form will be,
3876 * but we can't compile it (and thus know how big it is) until we've got a
3877 * place to put the code. So we cheat: we compile it twice, once with code
3878 * generation turned off and size counting turned on, and once "for real".
3879 * This also means that we don't allocate space until we are sure that the
3880 * thing really will compile successfully, and we never have to move the
3881 * code and thus invalidate pointers into it. (Note that it has to be in
3882 * one piece because free() must be able to free it all.) [NB: not true in perl]
3883 *
3884 * Beware that the optimization-preparation code in here knows about some
3885 * of the structure of the compiled regexp. [I'll say.]
3886 */
b9b4dddf
YO
3887
3888
3889
f9f4320a 3890#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3891#define RE_ENGINE_PTR &PL_core_reg_engine
3892#else
f9f4320a
YO
3893extern const struct regexp_engine my_reg_engine;
3894#define RE_ENGINE_PTR &my_reg_engine
3895#endif
b9b4dddf
YO
3896/* these make a few things look better, to avoid indentation */
3897#define BEGIN_BLOCK {
f9f4320a
YO
3898#define END_BLOCK }
3899
a687059c 3900regexp *
864dbfa3 3901Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3902{
97aff369 3903 dVAR;
f9f4320a
YO
3904 GET_RE_DEBUG_FLAGS_DECL;
3905 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3906#ifndef PERL_IN_XSUB_RE
3907 BEGIN_BLOCK
f9f4320a
YO
3908 /* Dispatch a request to compile a regexp to correct
3909 regexp engine. */
3910 HV * const table = GvHV(PL_hintgv);
3911 if (table) {
3912 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3913 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3914 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3915 DEBUG_COMPILE_r({
8d8756e7 3916 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3917 SvIV(*ptr));
3918 });
f2f78491 3919 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3920 }
b9b4dddf
YO
3921 }
3922 END_BLOCK
3923#endif
3924 BEGIN_BLOCK
a0d0e21e 3925 register regexp *r;
c277df42 3926 regnode *scan;
c277df42 3927 regnode *first;
a0d0e21e 3928 I32 flags;
a0d0e21e
LW
3929 I32 minlen = 0;
3930 I32 sawplus = 0;
3931 I32 sawopen = 0;
2c2d71f5 3932 scan_data_t data;
830247a4 3933 RExC_state_t RExC_state;
be8e71aa 3934 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3935#ifdef TRIE_STUDY_OPT
3936 int restudied= 0;
3937 RExC_state_t copyRExC_state;
3938#endif
a0d0e21e 3939 if (exp == NULL)
c277df42 3940 FAIL("NULL regexp argument");
a0d0e21e 3941
a5961de5 3942 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3943
5cfc7842 3944 RExC_precomp = exp;
a3621e74 3945 DEBUG_COMPILE_r({
ab3bbdeb
YO
3946 SV *dsv= sv_newmortal();
3947 RE_PV_QUOTED_DECL(s, RExC_utf8,
3948 dsv, RExC_precomp, (xend - exp), 60);
3949 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3950 PL_colors[4],PL_colors[5],s);
a5961de5 3951 });
e2509266 3952 RExC_flags = pm->op_pmflags;
830247a4 3953 RExC_sawback = 0;
bbce6d69 3954
830247a4
IZ
3955 RExC_seen = 0;
3956 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3957 RExC_seen_evals = 0;
3958 RExC_extralen = 0;
c277df42 3959
bbce6d69 3960 /* First pass: determine size, legality. */
830247a4 3961 RExC_parse = exp;
fac92740 3962 RExC_start = exp;
830247a4
IZ
3963 RExC_end = xend;
3964 RExC_naughty = 0;
3965 RExC_npar = 1;
c74340f9 3966 RExC_cpar = 1;
e2e6a0f1 3967 RExC_nestroot = 0;
830247a4
IZ
3968 RExC_size = 0L;
3969 RExC_emit = &PL_regdummy;
3970 RExC_whilem_seen = 0;
fc8cd66c 3971 RExC_charnames = NULL;
40d049e4
YO
3972 RExC_open_parens = NULL;
3973 RExC_close_parens = NULL;
3974 RExC_opend = NULL;
81714fb9 3975 RExC_paren_names = NULL;
40d049e4
YO
3976 RExC_recurse = NULL;
3977 RExC_recurse_count = 0;
81714fb9 3978
85ddcde9
JH
3979#if 0 /* REGC() is (currently) a NOP at the first pass.
3980 * Clever compilers notice this and complain. --jhi */
830247a4 3981 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3982#endif
3dab1dad
YO
3983 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3984 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3985 RExC_precomp = NULL;
a0d0e21e
LW
3986 return(NULL);
3987 }
07be1b83 3988 DEBUG_PARSE_r({
81714fb9
YO
3989 PerlIO_printf(Perl_debug_log,
3990 "Required size %"IVdf" nodes\n"
3991 "Starting second pass (creation)\n",
3992 (IV)RExC_size);
07be1b83
YO
3993 RExC_lastnum=0;
3994 RExC_lastparse=NULL;
3995 });
c277df42
IZ
3996 /* Small enough for pointer-storage convention?
3997 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3998 if (RExC_size >= 0x10000L && RExC_extralen)
3999 RExC_size += RExC_extralen;
c277df42 4000 else
830247a4
IZ
4001 RExC_extralen = 0;
4002 if (RExC_whilem_seen > 15)
4003 RExC_whilem_seen = 15;
a0d0e21e 4004
e2e6a0f1
YO
4005#ifdef DEBUGGING
4006 /* Make room for a sentinel value at the end of the program */
4007 RExC_size++;
4008#endif
4009
f9f4320a
YO
4010 /* Allocate space and zero-initialize. Note, the two step process
4011 of zeroing when in debug mode, thus anything assigned has to
4012 happen after that */
a02a5408 4013 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 4014 char, regexp);
a0d0e21e 4015 if (r == NULL)
b45f050a 4016 FAIL("Regexp out of space");
0f79a09d
GS
4017#ifdef DEBUGGING
4018 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 4019 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 4020#endif
f9f4320a
YO
4021 /* initialization begins here */
4022 r->engine= RE_ENGINE_PTR;
c277df42 4023 r->refcnt = 1;
bbce6d69 4024 r->prelen = xend - exp;
5cfc7842 4025 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 4026 r->subbeg = NULL;
f8c7b90f 4027#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 4028 r->saved_copy = NULL;
ed252734 4029#endif
cf93c79d 4030 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4031 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 4032 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
4033
4034 r->substrs = 0; /* Useful during FAIL. */
4035 r->startp = 0; /* Useful during FAIL. */
6bda09f9 4036 r->endp = 0;
c74340f9 4037 r->swap = NULL;
81714fb9
YO
4038 r->paren_names = 0;
4039
6bda09f9 4040 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4041 Newxz(RExC_open_parens, RExC_npar,regnode *);
4042 SAVEFREEPV(RExC_open_parens);
4043 Newxz(RExC_close_parens,RExC_npar,regnode *);
4044 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4045 }
4046
4047 /* Useful during FAIL. */
a02a5408 4048 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4049 if (r->offsets) {
2af232bd 4050 r->offsets[0] = RExC_size;
fac92740 4051 }
a3621e74 4052 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4053 "%s %"UVuf" bytes for offset annotations.\n",
4054 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4055 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4056
830247a4 4057 RExC_rx = r;
bbce6d69 4058
4059 /* Second pass: emit code. */
e2509266 4060 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4061 RExC_parse = exp;
4062 RExC_end = xend;
4063 RExC_naughty = 0;
4064 RExC_npar = 1;
c74340f9 4065 RExC_cpar = 1;
fac92740 4066 RExC_emit_start = r->program;
830247a4 4067 RExC_emit = r->program;
e2e6a0f1
YO
4068#ifdef DEBUGGING
4069 /* put a sentinal on the end of the program so we can check for
4070 overwrites */
4071 r->program[RExC_size].type = 255;
4072#endif
2cd61cdb 4073 /* Store the count of eval-groups for security checks: */
786e8c11 4074 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4075 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4076 r->data = 0;
3dab1dad 4077 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4078 return(NULL);
6bda09f9 4079
07be1b83
YO
4080 /* XXXX To minimize changes to RE engine we always allocate
4081 3-units-long substrs field. */
4082 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4083 if (RExC_recurse_count) {
4084 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4085 SAVEFREEPV(RExC_recurse);
4086 }
a0d0e21e 4087
07be1b83 4088reStudy:
1de06328 4089 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4090 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4091
07be1b83
YO
4092#ifdef TRIE_STUDY_OPT
4093 if ( restudied ) {
5d458dd8 4094 U32 seen=RExC_seen;
07be1b83 4095 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4096
4097 RExC_state = copyRExC_state;
4098 if (seen & REG_TOP_LEVEL_BRANCHES)
4099 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4100 else
4101 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4102 if (data.last_found) {
07be1b83 4103 SvREFCNT_dec(data.longest_fixed);
07be1b83 4104 SvREFCNT_dec(data.longest_float);
07be1b83 4105 SvREFCNT_dec(data.last_found);
1de06328 4106 }
40d049e4 4107 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4108 } else {
40d049e4 4109 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4110 copyRExC_state = RExC_state;
07be1b83 4111 }
40d049e4
YO
4112#else
4113 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4114#endif
fc8cd66c 4115
a0d0e21e 4116 /* Dig out information for optimizations. */
cf93c79d 4117 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4118 pm->op_pmflags = RExC_flags;
a0ed51b3 4119 if (UTF)
5ff6fc6d 4120 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4121 r->regstclass = NULL;
830247a4 4122 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4123 r->reganch |= ROPT_NAUGHTY;
c277df42 4124 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4125
1de06328
YO
4126 /* testing for BRANCH here tells us whether there is "must appear"
4127 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4128 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4129 I32 fake;
c5254dd6 4130 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4131 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4132 int stclass_flag;
07be1b83 4133 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4134
4135 first = scan;
c277df42 4136 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4137 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4138 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4139 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4140 /* for now we can't handle lookbehind IFMATCH*/
4141 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4142 (OP(first) == PLUS) ||
4143 (OP(first) == MINMOD) ||
653099ff 4144 /* An {n,m} with n>0 */
07be1b83
YO
4145 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4146 {
786e8c11 4147
a0d0e21e
LW
4148 if (OP(first) == PLUS)
4149 sawplus = 1;
4150 else
3dab1dad 4151 first += regarglen[OP(first)];
07be1b83
YO
4152 if (OP(first) == IFMATCH) {
4153 first = NEXTOPER(first);
4154 first += EXTRA_STEP_2ARGS;
7c167cea 4155 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4156 first = NEXTOPER(first);
a687059c
LW
4157 }
4158
a0d0e21e
LW
4159 /* Starting-point info. */
4160 again:
786e8c11 4161 DEBUG_PEEP("first:",first,0);
07be1b83 4162 /* Ignore EXACT as we deal with it later. */
3dab1dad 4163 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4164 if (OP(first) == EXACT)
6f207bd3 4165 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4166 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4167 r->regstclass = first;
4168 }
07be1b83 4169#ifdef TRIE_STCLASS
786e8c11 4170 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4171 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4172 {
786e8c11 4173 regnode *trie_op;
07be1b83 4174 /* this can happen only on restudy */
786e8c11
YO
4175 if ( OP(first) == TRIE ) {
4176 struct regnode_1 *trieop;
4177 Newxz(trieop,1,struct regnode_1);
4178 StructCopy(first,trieop,struct regnode_1);
4179 trie_op=(regnode *)trieop;
4180 } else {
4181 struct regnode_charclass *trieop;
4182 Newxz(trieop,1,struct regnode_charclass);
4183 StructCopy(first,trieop,struct regnode_charclass);
4184 trie_op=(regnode *)trieop;
4185 }
1de06328 4186 OP(trie_op)+=2;
786e8c11
YO
4187 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4188 r->regstclass = trie_op;
07be1b83
YO
4189 }
4190#endif
bfed75c6 4191 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4192 r->regstclass = first;
3dab1dad
YO
4193 else if (PL_regkind[OP(first)] == BOUND ||
4194 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4195 r->regstclass = first;
3dab1dad 4196 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4197 r->reganch |= (OP(first) == MBOL
4198 ? ROPT_ANCH_MBOL
4199 : (OP(first) == SBOL
4200 ? ROPT_ANCH_SBOL
4201 : ROPT_ANCH_BOL));
a0d0e21e 4202 first = NEXTOPER(first);
774d564b 4203 goto again;
4204 }
4205 else if (OP(first) == GPOS) {
4206 r->reganch |= ROPT_ANCH_GPOS;
4207 first = NEXTOPER(first);
4208 goto again;
a0d0e21e 4209 }
cf2a2b69
YO
4210 else if ((!sawopen || !RExC_sawback) &&
4211 (OP(first) == STAR &&
3dab1dad 4212 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4213 !(r->reganch & ROPT_ANCH) )
4214 {
4215 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4216 const int type =
4217 (OP(NEXTOPER(first)) == REG_ANY)
4218 ? ROPT_ANCH_MBOL
4219 : ROPT_ANCH_SBOL;
cad2e5aa 4220 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4221 first = NEXTOPER(first);
774d564b 4222 goto again;
a0d0e21e 4223 }
b81d288d 4224 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4225 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4226 /* x+ must match at the 1st pos of run of x's */
4227 r->reganch |= ROPT_SKIP;
a0d0e21e 4228
c277df42 4229 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4230#ifdef TRIE_STUDY_OPT
81714fb9 4231 DEBUG_PARSE_r(
be8e71aa
YO
4232 if (!restudied)
4233 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4234 (IV)(first - scan + 1))
4235 );
4236#else
81714fb9 4237 DEBUG_PARSE_r(
be8e71aa
YO
4238 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4239 (IV)(first - scan + 1))
4240 );
4241#endif
4242
4243
a0d0e21e
LW
4244 /*
4245 * If there's something expensive in the r.e., find the
4246 * longest literal string that must appear and make it the
4247 * regmust. Resolve ties in favor of later strings, since
4248 * the regstart check works with the beginning of the r.e.
4249 * and avoiding duplication strengthens checking. Not a
4250 * strong reason, but sufficient in the absence of others.
4251 * [Now we resolve ties in favor of the earlier string if
c277df42 4252 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4253 * earlier string may buy us something the later one won't.]
4254 */
de8c5301 4255
396482e1
GA
4256 data.longest_fixed = newSVpvs("");
4257 data.longest_float = newSVpvs("");
4258 data.last_found = newSVpvs("");
c277df42
IZ
4259 data.longest = &(data.longest_fixed);
4260 first = scan;
653099ff 4261 if (!r->regstclass) {
830247a4 4262 cl_init(pRExC_state, &ch_class);
653099ff
GS
4263 data.start_class = &ch_class;
4264 stclass_flag = SCF_DO_STCLASS_AND;
4265 } else /* XXXX Check for BOUND? */
4266 stclass_flag = 0;
cb434fcc 4267 data.last_closep = &last_close;
de8c5301 4268
1de06328 4269 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4270 &data, -1, NULL, NULL,
4271 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4272
07be1b83 4273
786e8c11
YO
4274 CHECK_RESTUDY_GOTO;
4275
4276
830247a4 4277 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4278 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4279 && !RExC_seen_zerolen
4280 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4281 r->reganch |= ROPT_CHECK_ALL;
1de06328 4282 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4283 SvREFCNT_dec(data.last_found);
4284
1de06328
YO
4285 /* Note that code very similar to this but for anchored string
4286 follows immediately below, changes may need to be made to both.
4287 Be careful.
4288 */
a0ed51b3 4289 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4290 if (longest_float_length
c277df42
IZ
4291 || (data.flags & SF_FL_BEFORE_EOL
4292 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4293 || (RExC_flags & PMf_MULTILINE))))
4294 {
1182767e 4295 I32 t,ml;
cf93c79d 4296
1de06328 4297 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4298 && data.offset_fixed == data.offset_float_min
4299 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4300 goto remove_float; /* As in (a)+. */
4301
1de06328
YO
4302 /* copy the information about the longest float from the reg_scan_data
4303 over to the program. */
33b8afdf
JH
4304 if (SvUTF8(data.longest_float)) {
4305 r->float_utf8 = data.longest_float;
c445ea15 4306 r->float_substr = NULL;
33b8afdf
JH
4307 } else {
4308 r->float_substr = data.longest_float;
c445ea15 4309 r->float_utf8 = NULL;
33b8afdf 4310 }
1de06328
YO
4311 /* float_end_shift is how many chars that must be matched that
4312 follow this item. We calculate it ahead of time as once the
4313 lookbehind offset is added in we lose the ability to correctly
4314 calculate it.*/
4315 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4316 : (I32)longest_float_length;
1de06328
YO
4317 r->float_end_shift = ml - data.offset_float_min
4318 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4319 + data.lookbehind_float;
4320 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4321 r->float_max_offset = data.offset_float_max;
1182767e 4322 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4323 r->float_max_offset -= data.lookbehind_float;
4324
cf93c79d
IZ
4325 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4326 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4327 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4328 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4329 }
4330 else {
aca2d497 4331 remove_float:
c445ea15 4332 r->float_substr = r->float_utf8 = NULL;
c277df42 4333 SvREFCNT_dec(data.longest_float);
c5254dd6 4334 longest_float_length = 0;
a0d0e21e 4335 }
c277df42 4336
1de06328
YO
4337 /* Note that code very similar to this but for floating string
4338 is immediately above, changes may need to be made to both.
4339 Be careful.
4340 */
a0ed51b3 4341 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4342 if (longest_fixed_length
c277df42
IZ
4343 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4344 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4345 || (RExC_flags & PMf_MULTILINE))))
4346 {
1182767e 4347 I32 t,ml;
cf93c79d 4348
1de06328
YO
4349 /* copy the information about the longest fixed
4350 from the reg_scan_data over to the program. */
33b8afdf
JH
4351 if (SvUTF8(data.longest_fixed)) {
4352 r->anchored_utf8 = data.longest_fixed;
c445ea15 4353 r->anchored_substr = NULL;
33b8afdf
JH
4354 } else {
4355 r->anchored_substr = data.longest_fixed;
c445ea15 4356 r->anchored_utf8 = NULL;
33b8afdf 4357 }
1de06328
YO
4358 /* fixed_end_shift is how many chars that must be matched that
4359 follow this item. We calculate it ahead of time as once the
4360 lookbehind offset is added in we lose the ability to correctly
4361 calculate it.*/
4362 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4363 : (I32)longest_fixed_length;
1de06328
YO
4364 r->anchored_end_shift = ml - data.offset_fixed
4365 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4366 + data.lookbehind_fixed;
4367 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4368
cf93c79d
IZ
4369 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4370 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4371 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4372 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4373 }
4374 else {
c445ea15 4375 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4376 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4377 longest_fixed_length = 0;
a0d0e21e 4378 }
b81d288d 4379 if (r->regstclass
ffc61ed2 4380 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4381 r->regstclass = NULL;
33b8afdf
JH
4382 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4383 && stclass_flag
653099ff 4384 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4385 && !cl_is_anything(data.start_class))
4386 {
1df70142 4387 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4388
a02a5408 4389 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4390 struct regnode_charclass_class);
4391 StructCopy(data.start_class,
830247a4 4392 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4393 struct regnode_charclass_class);
830247a4 4394 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4395 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4396 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4397 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4398 PerlIO_printf(Perl_debug_log,
a0288114 4399 "synthetic stclass \"%s\".\n",
3f7c398e 4400 SvPVX_const(sv));});
653099ff 4401 }
c277df42
IZ
4402
4403 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4404 if (longest_fixed_length > longest_float_length) {
1de06328 4405 r->check_end_shift = r->anchored_end_shift;
c277df42 4406 r->check_substr = r->anchored_substr;
33b8afdf 4407 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4408 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4409 if (r->reganch & ROPT_ANCH_SINGLE)
4410 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4411 }
4412 else {
1de06328 4413 r->check_end_shift = r->float_end_shift;
c277df42 4414 r->check_substr = r->float_substr;
33b8afdf 4415 r->check_utf8 = r->float_utf8;
1de06328
YO
4416 r->check_offset_min = r->float_min_offset;
4417 r->check_offset_max = r->float_max_offset;
a0d0e21e 4418 }
30382c73
IZ
4419 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4420 This should be changed ASAP! */
33b8afdf 4421 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4422 r->reganch |= RE_USE_INTUIT;
33b8afdf 4423 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4424 r->reganch |= RE_INTUIT_TAIL;
4425 }
1de06328
YO
4426 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4427 if ( (STRLEN)minlen < longest_float_length )
4428 minlen= longest_float_length;
4429 if ( (STRLEN)minlen < longest_fixed_length )
4430 minlen= longest_fixed_length;
4431 */
a0ed51b3
LW
4432 }
4433 else {
c277df42
IZ
4434 /* Several toplevels. Best we can is to set minlen. */
4435 I32 fake;
653099ff 4436 struct regnode_charclass_class ch_class;
cb434fcc 4437 I32 last_close = 0;
c277df42 4438
5d458dd8 4439 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4440
c277df42 4441 scan = r->program + 1;
830247a4 4442 cl_init(pRExC_state, &ch_class);
653099ff 4443 data.start_class = &ch_class;
cb434fcc 4444 data.last_closep = &last_close;
07be1b83 4445
de8c5301 4446
1de06328 4447 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4448 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4449
786e8c11 4450 CHECK_RESTUDY_GOTO;
07be1b83 4451
33b8afdf 4452 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4453 = r->float_substr = r->float_utf8 = NULL;
653099ff 4454 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4455 && !cl_is_anything(data.start_class))
4456 {
1df70142 4457 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4458
a02a5408 4459 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4460 struct regnode_charclass_class);
4461 StructCopy(data.start_class,
830247a4 4462 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4463 struct regnode_charclass_class);
830247a4 4464 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4465 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4466 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4467 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4468 PerlIO_printf(Perl_debug_log,
a0288114 4469 "synthetic stclass \"%s\".\n",
3f7c398e 4470 SvPVX_const(sv));});
653099ff 4471 }
a0d0e21e
LW
4472 }
4473
1de06328
YO
4474 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4475 the "real" pattern. */
cf9788e3
RGS
4476 DEBUG_OPTIMISE_r({
4477 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4478 minlen, r->minlen);
4479 });
de8c5301 4480 r->minlenret = minlen;
1de06328
YO
4481 if (r->minlen < minlen)
4482 r->minlen = minlen;
4483
b81d288d 4484 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4485 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4486 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4487 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4488 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4489 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4490 if (RExC_seen & REG_SEEN_CANY)
4491 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4492 if (RExC_seen & REG_SEEN_VERBARG)
4493 r->reganch |= ROPT_VERBARG_SEEN;
5d458dd8
YO
4494 if (RExC_seen & REG_SEEN_CUTGROUP)
4495 r->reganch |= ROPT_CUTGROUP_SEEN;
81714fb9
YO
4496 if (RExC_paren_names)
4497 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4498 else
4499 r->paren_names = NULL;
4500
40d049e4
YO
4501 if (RExC_recurse_count) {
4502 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4503 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4504 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4505 }
4506 }
a02a5408
JC
4507 Newxz(r->startp, RExC_npar, I32);
4508 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4509 /* assume we don't need to swap parens around before we match */
4510
be8e71aa
YO
4511 DEBUG_DUMP_r({
4512 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4513 regdump(r);
4514 });
8e9a8a48
YO
4515 DEBUG_OFFSETS_r(if (r->offsets) {
4516 const U32 len = r->offsets[0];
4517 U32 i;
4518 GET_RE_DEBUG_FLAGS_DECL;
4519 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4520 for (i = 1; i <= len; i++) {
4521 if (r->offsets[i*2-1] || r->offsets[i*2])
4522 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4523 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4524 }
4525 PerlIO_printf(Perl_debug_log, "\n");
4526 });
a0d0e21e 4527 return(r);
f9f4320a 4528 END_BLOCK
a687059c
LW
4529}
4530
f9f4320a
YO
4531#undef CORE_ONLY_BLOCK
4532#undef END_BLOCK
4533#undef RE_ENGINE_PTR
3dab1dad 4534
9af228c6 4535#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4536SV*
4537Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4538{
4539 I32 parno = 0; /* no match */
4540 if (PL_curpm) {
4541 const REGEXP * const rx = PM_GETRE(PL_curpm);
4542 if (rx && rx->paren_names) {
4543 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4544 if (he_str) {
4545 IV i;
4546 SV* sv_dat=HeVAL(he_str);
4547 I32 *nums=(I32*)SvPVX(sv_dat);
4548 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4549 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4550 rx->endp[nums[i]] != -1)
4551 {
4552 parno = nums[i];
4553 break;
4554 }
4555 }
4556 }
4557 }
4558 }
4559 if ( !parno ) {
4560 return 0;
4561 } else {
4562 GV *gv_paren;
4563 SV *sv= sv_newmortal();
4564 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4565 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4566 return GvSVn(gv_paren);
4567 }
4568}
9af228c6 4569#endif
0a4db386 4570
894be9b7 4571/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4572 * If flags is REG_RSN_RETURN_NULL returns null.
4573 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4574 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4575 * to the parsed name as looked up in the RExC_paren_names hash.
4576 * If there is an error throws a vFAIL().. type exception.
894be9b7 4577 */
0a4db386
YO
4578
4579#define REG_RSN_RETURN_NULL 0
4580#define REG_RSN_RETURN_NAME 1
4581#define REG_RSN_RETURN_DATA 2
4582
894be9b7
YO
4583STATIC SV*
4584S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4585 char *name_start = RExC_parse;
0a4db386 4586 if ( UTF ) {
894be9b7 4587 STRLEN numlen;
0a4db386
YO
4588 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4589 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4590 {
4591 RExC_parse += numlen;
4592 }
4593 } else {
4594 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4595 RExC_parse++;
4596 }
0a4db386
YO
4597 if ( flags ) {
4598 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4599 (int)(RExC_parse - name_start)));
894be9b7 4600 if (UTF)
0a4db386
YO
4601 SvUTF8_on(sv_name);
4602 if ( flags == REG_RSN_RETURN_NAME)
4603 return sv_name;
4604 else if (flags==REG_RSN_RETURN_DATA) {
4605 HE *he_str = NULL;
4606 SV *sv_dat = NULL;
4607 if ( ! sv_name ) /* should not happen*/
4608 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4609 if (RExC_paren_names)
4610 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4611 if ( he_str )
4612 sv_dat = HeVAL(he_str);
4613 if ( ! sv_dat )
4614 vFAIL("Reference to nonexistent named group");
4615 return sv_dat;
4616 }
4617 else {
4618 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4619 }
4620 /* NOT REACHED */
894be9b7 4621 }
0a4db386 4622 return NULL;
894be9b7
YO
4623}
4624
3dab1dad
YO
4625#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4626 int rem=(int)(RExC_end - RExC_parse); \
4627 int cut; \
4628 int num; \
4629 int iscut=0; \
4630 if (rem>10) { \
4631 rem=10; \
4632 iscut=1; \
4633 } \
4634 cut=10-rem; \
4635 if (RExC_lastparse!=RExC_parse) \
4636 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4637 rem, RExC_parse, \
4638 cut + 4, \
4639 iscut ? "..." : "<" \
4640 ); \
4641 else \
4642 PerlIO_printf(Perl_debug_log,"%16s",""); \
4643 \
4644 if (SIZE_ONLY) \
4645 num=RExC_size; \
4646 else \
4647 num=REG_NODE_NUM(RExC_emit); \
4648 if (RExC_lastnum!=num) \
0a4db386 4649 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4650 else \
0a4db386 4651 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4652 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4653 (int)((depth*2)), "", \
3dab1dad
YO
4654 (funcname) \
4655 ); \
4656 RExC_lastnum=num; \
4657 RExC_lastparse=RExC_parse; \
4658})
4659
07be1b83
YO
4660
4661
3dab1dad
YO
4662#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4663 DEBUG_PARSE_MSG((funcname)); \
4664 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4665})
6bda09f9
YO
4666#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4667 DEBUG_PARSE_MSG((funcname)); \
4668 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4669})
a687059c
LW
4670/*
4671 - reg - regular expression, i.e. main body or parenthesized thing
4672 *
4673 * Caller must absorb opening parenthesis.
4674 *
4675 * Combining parenthesis handling with the base level of regular expression
4676 * is a trifle forced, but the need to tie the tails of the branches to what
4677 * follows makes it hard to avoid.
4678 */
07be1b83
YO
4679#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4680#ifdef DEBUGGING
4681#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4682#else
4683#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4684#endif
3dab1dad 4685
e2e6a0f1
YO
4686/* this idea is borrowed from STR_WITH_LEN in handy.h */
4687#define CHECK_WORD(s,v,l) \
4688 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4689
76e3520e 4690STATIC regnode *
3dab1dad 4691S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4692 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4693{
27da23d5 4694 dVAR;
c277df42
IZ
4695 register regnode *ret; /* Will be the head of the group. */
4696 register regnode *br;
4697 register regnode *lastbr;
cbbf8932 4698 register regnode *ender = NULL;
a0d0e21e 4699 register I32 parno = 0;
cbbf8932
AL
4700 I32 flags;
4701 const I32 oregflags = RExC_flags;
6136c704
AL
4702 bool have_branch = 0;
4703 bool is_open = 0;
9d1d55b5
JP
4704
4705 /* for (?g), (?gc), and (?o) warnings; warning
4706 about (?c) will warn about (?g) -- japhy */
4707
6136c704
AL
4708#define WASTED_O 0x01
4709#define WASTED_G 0x02
4710#define WASTED_C 0x04
4711#define WASTED_GC (0x02|0x04)
cbbf8932 4712 I32 wastedflags = 0x00;
9d1d55b5 4713
fac92740 4714 char * parse_start = RExC_parse; /* MJD */
a28509cc 4715 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4716
3dab1dad
YO
4717 GET_RE_DEBUG_FLAGS_DECL;
4718 DEBUG_PARSE("reg ");
4719
4720
821b33a5 4721 *flagp = 0; /* Tentatively. */
a0d0e21e 4722
9d1d55b5 4723
a0d0e21e
LW
4724 /* Make an OPEN node, if parenthesized. */
4725 if (paren) {
e2e6a0f1
YO
4726 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4727 char *start_verb = RExC_parse;
4728 STRLEN verb_len = 0;
4729 char *start_arg = NULL;
4730 unsigned char op = 0;
4731 int argok = 1;
4732 int internal_argval = 0; /* internal_argval is only useful if !argok */
4733 while ( *RExC_parse && *RExC_parse != ')' ) {
4734 if ( *RExC_parse == ':' ) {
4735 start_arg = RExC_parse + 1;
4736 break;
4737 }
4738 RExC_parse++;
4739 }
4740 ++start_verb;
4741 verb_len = RExC_parse - start_verb;
4742 if ( start_arg ) {
4743 RExC_parse++;
4744 while ( *RExC_parse && *RExC_parse != ')' )
4745 RExC_parse++;
4746 if ( *RExC_parse != ')' )
4747 vFAIL("Unterminated verb pattern argument");
4748 if ( RExC_parse == start_arg )
4749 start_arg = NULL;
4750 } else {
4751 if ( *RExC_parse != ')' )
4752 vFAIL("Unterminated verb pattern");
4753 }
5d458dd8 4754
e2e6a0f1
YO
4755 switch ( *start_verb ) {
4756 case 'A': /* (*ACCEPT) */
4757 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4758 op = ACCEPT;
4759 internal_argval = RExC_nestroot;
4760 }
4761 break;
4762 case 'C': /* (*COMMIT) */
4763 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4764 op = COMMIT;
e2e6a0f1
YO
4765 break;
4766 case 'F': /* (*FAIL) */
4767 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4768 op = OPFAIL;
4769 argok = 0;
4770 }
4771 break;
5d458dd8
YO
4772 case ':': /* (*:NAME) */
4773 case 'M': /* (*MARK:NAME) */
4774 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4775 op = MARKPOINT;
5d458dd8
YO
4776 argok = -1;
4777 }
4778 break;
4779 case 'P': /* (*PRUNE) */
4780 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4781 op = PRUNE;
e2e6a0f1 4782 break;
5d458dd8
YO
4783 case 'S': /* (*SKIP) */
4784 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4785 op = SKIP;
4786 break;
4787 case 'T': /* (*THEN) */
4788 /* [19:06] <TimToady> :: is then */
4789 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4790 op = CUTGROUP;
4791 RExC_seen |= REG_SEEN_CUTGROUP;
4792 }
e2e6a0f1
YO
4793 break;
4794 }
4795 if ( ! op ) {
4796 RExC_parse++;
4797 vFAIL3("Unknown verb pattern '%.*s'",
4798 verb_len, start_verb);
4799 }
4800 if ( argok ) {
4801 if ( start_arg && internal_argval ) {
4802 vFAIL3("Verb pattern '%.*s' may not have an argument",
4803 verb_len, start_verb);
4804 } else if ( argok < 0 && !start_arg ) {
4805 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4806 verb_len, start_verb);
4807 } else {
4808 ret = reganode(pRExC_state, op, internal_argval);
4809 if ( ! internal_argval && ! SIZE_ONLY ) {
4810 if (start_arg) {
4811 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4812 ARG(ret) = add_data( pRExC_state, 1, "S" );
4813 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4814 ret->flags = 0;
4815 } else {
4816 ret->flags = 1;
4817 }
4818 }
4819 }
4820 if (!internal_argval)
4821 RExC_seen |= REG_SEEN_VERBARG;
4822 } else if ( start_arg ) {
4823 vFAIL3("Verb pattern '%.*s' may not have an argument",
4824 verb_len, start_verb);
4825 } else {
4826 ret = reg_node(pRExC_state, op);
4827 }
4828 nextchar(pRExC_state);
4829 return ret;
4830 } else
fac92740 4831 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4832 U32 posflags = 0, negflags = 0;
4833 U32 *flagsp = &posflags;
6136c704 4834 bool is_logical = 0;
a28509cc 4835 const char * const seqstart = RExC_parse;
ca9dfc88 4836
830247a4
IZ
4837 RExC_parse++;
4838 paren = *RExC_parse++;
c277df42 4839 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4840 switch (paren) {
894be9b7 4841
fac92740 4842 case '<': /* (?<...) */
b81d288d 4843 if (*RExC_parse == '!')
c277df42 4844 paren = ',';
0a4db386
YO
4845 else if (*RExC_parse != '=')
4846 { /* (?<...>) */
81714fb9 4847 char *name_start;
894be9b7 4848 SV *svname;
81714fb9
YO
4849 paren= '>';
4850 case '\'': /* (?'...') */
4851 name_start= RExC_parse;
0a4db386
YO
4852 svname = reg_scan_name(pRExC_state,
4853 SIZE_ONLY ? /* reverse test from the others */
4854 REG_RSN_RETURN_NAME :
4855 REG_RSN_RETURN_NULL);
81714fb9
YO
4856 if (RExC_parse == name_start)
4857 goto unknown;
4858 if (*RExC_parse != paren)
4859 vFAIL2("Sequence (?%c... not terminated",
4860 paren=='>' ? '<' : paren);
4861 if (SIZE_ONLY) {
e62cc96a
YO
4862 HE *he_str;
4863 SV *sv_dat = NULL;
894be9b7
YO
4864 if (!svname) /* shouldnt happen */
4865 Perl_croak(aTHX_
4866 "panic: reg_scan_name returned NULL");
81714fb9
YO
4867 if (!RExC_paren_names) {
4868 RExC_paren_names= newHV();
4869 sv_2mortal((SV*)RExC_paren_names);
4870 }
4871 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4872 if ( he_str )
81714fb9 4873 sv_dat = HeVAL(he_str);
e62cc96a 4874 if ( ! sv_dat ) {
81714fb9 4875 /* croak baby croak */
e62cc96a
YO
4876 Perl_croak(aTHX_
4877 "panic: paren_name hash element allocation failed");
4878 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4879 IV count=SvIV(sv_dat);
4880 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4881 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4882 pv[count]=RExC_npar;
4883 SvIVX(sv_dat)++;
4884 } else {
4885 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4886 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4887 SvIOK_on(sv_dat);
4888 SvIVX(sv_dat)= 1;
e62cc96a
YO
4889 }
4890
81714fb9
YO
4891 /*sv_dump(sv_dat);*/
4892 }
4893 nextchar(pRExC_state);
4894 paren = 1;
4895 goto capturing_parens;
4896 }
4897 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4898 RExC_parse++;
fac92740
MJD
4899 case '=': /* (?=...) */
4900 case '!': /* (?!...) */
830247a4 4901 RExC_seen_zerolen++;
e2e6a0f1
YO
4902 if (*RExC_parse == ')') {
4903 ret=reg_node(pRExC_state, OPFAIL);
4904 nextchar(pRExC_state);
4905 return ret;
4906 }
fac92740
MJD
4907 case ':': /* (?:...) */
4908 case '>': /* (?>...) */
a0d0e21e 4909 break;
fac92740
MJD
4910 case '$': /* (?$...) */
4911 case '@': /* (?@...) */
8615cb43 4912 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4913 break;
fac92740 4914 case '#': /* (?#...) */
830247a4
IZ
4915 while (*RExC_parse && *RExC_parse != ')')
4916 RExC_parse++;
4917 if (*RExC_parse != ')')
c277df42 4918 FAIL("Sequence (?#... not terminated");
830247a4 4919 nextchar(pRExC_state);
a0d0e21e
LW
4920 *flagp = TRYAGAIN;
4921 return NULL;
894be9b7
YO
4922 case '0' : /* (?0) */
4923 case 'R' : /* (?R) */
4924 if (*RExC_parse != ')')
6bda09f9 4925 FAIL("Sequence (?R) not terminated");
1a147d38 4926 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4927 nextchar(pRExC_state);
4928 return ret;
4929 /*notreached*/
894be9b7
YO
4930 { /* named and numeric backreferences */
4931 I32 num;
4932 char * parse_start;
4933 case '&': /* (?&NAME) */
4934 parse_start = RExC_parse - 1;
4935 {
0a4db386
YO
4936 SV *sv_dat = reg_scan_name(pRExC_state,
4937 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4938 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4939 }
4940 goto gen_recurse_regop;
4941 /* NOT REACHED */
542fa716
YO
4942 case '+':
4943 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4944 RExC_parse++;
4945 vFAIL("Illegal pattern");
4946 }
4947 goto parse_recursion;
4948 /* NOT REACHED*/
4949 case '-': /* (?-1) */
4950 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4951 RExC_parse--; /* rewind to let it be handled later */
4952 goto parse_flags;
4953 }
4954 /*FALLTHROUGH */
6bda09f9
YO
4955 case '1': case '2': case '3': case '4': /* (?1) */
4956 case '5': case '6': case '7': case '8': case '9':
4957 RExC_parse--;
542fa716 4958 parse_recursion:
894be9b7
YO
4959 num = atoi(RExC_parse);
4960 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
4961 if (*RExC_parse == '-')
4962 RExC_parse++;
6bda09f9
YO
4963 while (isDIGIT(*RExC_parse))
4964 RExC_parse++;
4965 if (*RExC_parse!=')')
4966 vFAIL("Expecting close bracket");
894be9b7
YO
4967
4968 gen_recurse_regop:
542fa716
YO
4969 if ( paren == '-' ) {
4970 /*
4971 Diagram of capture buffer numbering.
4972 Top line is the normal capture buffer numbers
4973 Botton line is the negative indexing as from
4974 the X (the (?-2))
4975
4976 + 1 2 3 4 5 X 6 7
4977 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4978 - 5 4 3 2 1 X x x
4979
4980 */
4981 num = RExC_npar + num;
4982 if (num < 1) {
4983 RExC_parse++;
4984 vFAIL("Reference to nonexistent group");
4985 }
4986 } else if ( paren == '+' ) {
4987 num = RExC_npar + num - 1;
4988 }
4989
1a147d38 4990 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4991 if (!SIZE_ONLY) {
4992 if (num > (I32)RExC_rx->nparens) {
4993 RExC_parse++;
4994 vFAIL("Reference to nonexistent group");
4995 }
40d049e4 4996 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4997 RExC_emit++;
226de585 4998 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4999 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5000 } else {
6bda09f9 5001 RExC_size++;
6bda09f9 5002 }
0a4db386 5003 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5004 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5005 Set_Node_Offset(ret, parse_start); /* MJD */
5006
6bda09f9
YO
5007 nextchar(pRExC_state);
5008 return ret;
894be9b7
YO
5009 } /* named and numeric backreferences */
5010 /* NOT REACHED */
5011
fac92740 5012 case 'p': /* (?p...) */
9014280d 5013 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5014 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5015 /* FALL THROUGH*/
fac92740 5016 case '?': /* (??...) */
6136c704 5017 is_logical = 1;
438a3801
YST
5018 if (*RExC_parse != '{')
5019 goto unknown;
830247a4 5020 paren = *RExC_parse++;
0f5d15d6 5021 /* FALL THROUGH */
fac92740 5022 case '{': /* (?{...}) */
c277df42 5023 {
c277df42
IZ
5024 I32 count = 1, n = 0;
5025 char c;
830247a4 5026 char *s = RExC_parse;
c277df42 5027
830247a4
IZ
5028 RExC_seen_zerolen++;
5029 RExC_seen |= REG_SEEN_EVAL;
5030 while (count && (c = *RExC_parse)) {
6136c704
AL
5031 if (c == '\\') {
5032 if (RExC_parse[1])
5033 RExC_parse++;
5034 }
b81d288d 5035 else if (c == '{')
c277df42 5036 count++;
b81d288d 5037 else if (c == '}')
c277df42 5038 count--;
830247a4 5039 RExC_parse++;
c277df42 5040 }
6136c704 5041 if (*RExC_parse != ')') {
b81d288d 5042 RExC_parse = s;
b45f050a
JF
5043 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5044 }
c277df42 5045 if (!SIZE_ONLY) {
f3548bdc 5046 PAD *pad;
6136c704
AL
5047 OP_4tree *sop, *rop;
5048 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5049
569233ed
SB
5050 ENTER;
5051 Perl_save_re_context(aTHX);
f3548bdc 5052 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5053 sop->op_private |= OPpREFCOUNTED;
5054 /* re_dup will OpREFCNT_inc */
5055 OpREFCNT_set(sop, 1);
569233ed 5056 LEAVE;
c277df42 5057
830247a4
IZ
5058 n = add_data(pRExC_state, 3, "nop");
5059 RExC_rx->data->data[n] = (void*)rop;
5060 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 5061 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 5062 SvREFCNT_dec(sv);
a0ed51b3 5063 }
e24b16f9 5064 else { /* First pass */
830247a4 5065 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5066 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5067 /* No compiled RE interpolated, has runtime
5068 components ===> unsafe. */
5069 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5070 if (PL_tainting && PL_tainted)
cc6b7395 5071 FAIL("Eval-group in insecure regular expression");
54df2634 5072#if PERL_VERSION > 8
923e4eb5 5073 if (IN_PERL_COMPILETIME)
b5c19bd7 5074 PL_cv_has_eval = 1;
54df2634 5075#endif
c277df42 5076 }
b5c19bd7 5077
830247a4 5078 nextchar(pRExC_state);
6136c704 5079 if (is_logical) {
830247a4 5080 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5081 if (!SIZE_ONLY)
5082 ret->flags = 2;
3dab1dad 5083 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5084 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5085 return ret;
5086 }
ccb2c380
MP
5087 ret = reganode(pRExC_state, EVAL, n);
5088 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5089 Set_Node_Offset(ret, parse_start);
5090 return ret;
c277df42 5091 }
fac92740 5092 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5093 {
0a4db386 5094 int is_define= 0;
fac92740 5095 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5096 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5097 || RExC_parse[1] == '<'
830247a4 5098 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5099 I32 flag;
5100
830247a4 5101 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5102 if (!SIZE_ONLY)
5103 ret->flags = 1;
3dab1dad 5104 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5105 goto insert_if;
b81d288d 5106 }
a0ed51b3 5107 }
0a4db386
YO
5108 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5109 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5110 {
5111 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5112 char *name_start= RExC_parse++;
5113 I32 num = 0;
5114 SV *sv_dat=reg_scan_name(pRExC_state,
5115 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5116 if (RExC_parse == name_start || *RExC_parse != ch)
5117 vFAIL2("Sequence (?(%c... not terminated",
5118 (ch == '>' ? '<' : ch));
5119 RExC_parse++;
5120 if (!SIZE_ONLY) {
5121 num = add_data( pRExC_state, 1, "S" );
5122 RExC_rx->data->data[num]=(void*)sv_dat;
5123 SvREFCNT_inc(sv_dat);
5124 }
5125 ret = reganode(pRExC_state,NGROUPP,num);
5126 goto insert_if_check_paren;
5127 }
5128 else if (RExC_parse[0] == 'D' &&
5129 RExC_parse[1] == 'E' &&
5130 RExC_parse[2] == 'F' &&
5131 RExC_parse[3] == 'I' &&
5132 RExC_parse[4] == 'N' &&
5133 RExC_parse[5] == 'E')
5134 {
5135 ret = reganode(pRExC_state,DEFINEP,0);
5136 RExC_parse +=6 ;
5137 is_define = 1;
5138 goto insert_if_check_paren;
5139 }
5140 else if (RExC_parse[0] == 'R') {
5141 RExC_parse++;
5142 parno = 0;
5143 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5144 parno = atoi(RExC_parse++);
5145 while (isDIGIT(*RExC_parse))
5146 RExC_parse++;
5147 } else if (RExC_parse[0] == '&') {
5148 SV *sv_dat;
5149 RExC_parse++;
5150 sv_dat = reg_scan_name(pRExC_state,
5151 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5152 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5153 }
1a147d38 5154 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5155 goto insert_if_check_paren;
5156 }
830247a4 5157 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5158 /* (?(1)...) */
6136c704 5159 char c;
830247a4 5160 parno = atoi(RExC_parse++);
c277df42 5161
830247a4
IZ
5162 while (isDIGIT(*RExC_parse))
5163 RExC_parse++;
fac92740 5164 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5165
0a4db386 5166 insert_if_check_paren:
830247a4 5167 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5168 vFAIL("Switch condition not recognized");
c277df42 5169 insert_if:
3dab1dad
YO
5170 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5171 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5172 if (br == NULL)
830247a4 5173 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5174 else
3dab1dad 5175 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5176 c = *nextchar(pRExC_state);
d1b80229
IZ
5177 if (flags&HASWIDTH)
5178 *flagp |= HASWIDTH;
c277df42 5179 if (c == '|') {
0a4db386
YO
5180 if (is_define)
5181 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5182 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5183 regbranch(pRExC_state, &flags, 1,depth+1);
5184 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5185 if (flags&HASWIDTH)
5186 *flagp |= HASWIDTH;
830247a4 5187 c = *nextchar(pRExC_state);
a0ed51b3
LW
5188 }
5189 else
c277df42
IZ
5190 lastbr = NULL;
5191 if (c != ')')
8615cb43 5192 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5193 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5194 REGTAIL(pRExC_state, br, ender);
c277df42 5195 if (lastbr) {
3dab1dad
YO
5196 REGTAIL(pRExC_state, lastbr, ender);
5197 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5198 }
5199 else
3dab1dad 5200 REGTAIL(pRExC_state, ret, ender);
c277df42 5201 return ret;
a0ed51b3
LW
5202 }
5203 else {
830247a4 5204 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5205 }
5206 }
1b1626e4 5207 case 0:
830247a4 5208 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5209 vFAIL("Sequence (? incomplete");
1b1626e4 5210 break;
a0d0e21e 5211 default:
830247a4 5212 --RExC_parse;
fac92740 5213 parse_flags: /* (?i) */
830247a4 5214 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5215 /* (?g), (?gc) and (?o) are useless here
5216 and must be globally applied -- japhy */
5217
5218 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5219 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5220 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5221 if (! (wastedflags & wflagbit) ) {
5222 wastedflags |= wflagbit;
5223 vWARN5(
5224 RExC_parse + 1,
5225 "Useless (%s%c) - %suse /%c modifier",
5226 flagsp == &negflags ? "?-" : "?",
5227 *RExC_parse,
5228 flagsp == &negflags ? "don't " : "",
5229 *RExC_parse
5230 );
5231 }
5232 }
5233 }
5234 else if (*RExC_parse == 'c') {
5235 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5236 if (! (wastedflags & WASTED_C) ) {
5237 wastedflags |= WASTED_GC;
9d1d55b5
JP
5238 vWARN3(
5239 RExC_parse + 1,
5240 "Useless (%sc) - %suse /gc modifier",
5241 flagsp == &negflags ? "?-" : "?",
5242 flagsp == &negflags ? "don't " : ""
5243 );
5244 }
5245 }
5246 }
5247 else { pmflag(flagsp, *RExC_parse); }
5248
830247a4 5249 ++RExC_parse;
ca9dfc88 5250 }
830247a4 5251 if (*RExC_parse == '-') {
ca9dfc88 5252 flagsp = &negflags;
9d1d55b5 5253 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5254 ++RExC_parse;
ca9dfc88 5255 goto parse_flags;
48c036b1 5256 }
e2509266
JH
5257 RExC_flags |= posflags;
5258 RExC_flags &= ~negflags;
830247a4
IZ
5259 if (*RExC_parse == ':') {
5260 RExC_parse++;
ca9dfc88
IZ
5261 paren = ':';
5262 break;
5263 }
c277df42 5264 unknown:
830247a4
IZ
5265 if (*RExC_parse != ')') {
5266 RExC_parse++;
5267 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5268 }
830247a4 5269 nextchar(pRExC_state);
a0d0e21e
LW
5270 *flagp = TRYAGAIN;
5271 return NULL;
5272 }
5273 }
fac92740 5274 else { /* (...) */
81714fb9 5275 capturing_parens:
830247a4
IZ
5276 parno = RExC_npar;
5277 RExC_npar++;
e2e6a0f1 5278
830247a4 5279 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5280 if (!SIZE_ONLY ){
5281 if (!RExC_nestroot)
5282 RExC_nestroot = parno;
5283 if (RExC_seen & REG_SEEN_RECURSE) {
5284 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5285 "Setting open paren #%"IVdf" to %d\n",
5286 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5287 RExC_open_parens[parno-1]= ret;
5288 }
6bda09f9 5289 }
fac92740
MJD
5290 Set_Node_Length(ret, 1); /* MJD */
5291 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5292 is_open = 1;
a0d0e21e 5293 }
a0ed51b3 5294 }
fac92740 5295 else /* ! paren */
a0d0e21e
LW
5296 ret = NULL;
5297
5298 /* Pick up the branches, linking them together. */
fac92740 5299 parse_start = RExC_parse; /* MJD */
3dab1dad 5300 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5301 /* branch_len = (paren != 0); */
2af232bd 5302
a0d0e21e
LW
5303 if (br == NULL)
5304 return(NULL);
830247a4
IZ
5305 if (*RExC_parse == '|') {
5306 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5307 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5308 }
fac92740 5309 else { /* MJD */
6bda09f9 5310 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5311 Set_Node_Length(br, paren != 0);
5312 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5313 }
c277df42
IZ
5314 have_branch = 1;
5315 if (SIZE_ONLY)
830247a4 5316 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5317 }
5318 else if (paren == ':') {
c277df42
IZ
5319 *flagp |= flags&SIMPLE;
5320 }
6136c704 5321 if (is_open) { /* Starts with OPEN. */
3dab1dad 5322 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5323 }
5324 else if (paren != '?') /* Not Conditional */
a0d0e21e 5325 ret = br;
32a0ca98 5326 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5327 lastbr = br;
830247a4
IZ
5328 while (*RExC_parse == '|') {
5329 if (!SIZE_ONLY && RExC_extralen) {
5330 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5331 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5332 }
5333 if (SIZE_ONLY)
830247a4
IZ
5334 RExC_extralen += 2; /* Account for LONGJMP. */
5335 nextchar(pRExC_state);
3dab1dad 5336 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5337
a687059c 5338 if (br == NULL)
a0d0e21e 5339 return(NULL);
3dab1dad 5340 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5341 lastbr = br;
821b33a5
IZ
5342 if (flags&HASWIDTH)
5343 *flagp |= HASWIDTH;
a687059c 5344 *flagp |= flags&SPSTART;
a0d0e21e
LW
5345 }
5346
c277df42
IZ
5347 if (have_branch || paren != ':') {
5348 /* Make a closing node, and hook it on the end. */
5349 switch (paren) {
5350 case ':':
830247a4 5351 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5352 break;
5353 case 1:
c74340f9 5354 RExC_cpar++;
830247a4 5355 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5356 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5357 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5358 "Setting close paren #%"IVdf" to %d\n",
5359 (IV)parno, REG_NODE_NUM(ender)));
5360 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5361 if (RExC_nestroot == parno)
5362 RExC_nestroot = 0;
40d049e4 5363 }
fac92740
MJD
5364 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5365 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5366 break;
5367 case '<':
c277df42
IZ
5368 case ',':
5369 case '=':
5370 case '!':
c277df42 5371 *flagp &= ~HASWIDTH;
821b33a5
IZ
5372 /* FALL THROUGH */
5373 case '>':
830247a4 5374 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5375 break;
5376 case 0:
830247a4 5377 ender = reg_node(pRExC_state, END);
40d049e4
YO
5378 if (!SIZE_ONLY) {
5379 assert(!RExC_opend); /* there can only be one! */
5380 RExC_opend = ender;
5381 }
c277df42
IZ
5382 break;
5383 }
eaf3ca90 5384 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5385
9674d46a 5386 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5387 if (depth==1)
5388 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5389
c277df42 5390 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5391 for (br = ret; br; br = regnext(br)) {
5392 const U8 op = PL_regkind[OP(br)];
5393 if (op == BRANCH) {
07be1b83 5394 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5395 }
5396 else if (op == BRANCHJ) {
07be1b83 5397 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5398 }
c277df42
IZ
5399 }
5400 }
a0d0e21e 5401 }
c277df42
IZ
5402
5403 {
e1ec3a88
AL
5404 const char *p;
5405 static const char parens[] = "=!<,>";
c277df42
IZ
5406
5407 if (paren && (p = strchr(parens, paren))) {
eb160463 5408 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5409 int flag = (p - parens) > 1;
5410
5411 if (paren == '>')
5412 node = SUSPEND, flag = 0;
6bda09f9 5413 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5414 Set_Node_Cur_Length(ret);
5415 Set_Node_Offset(ret, parse_start + 1);
c277df42 5416 ret->flags = flag;
07be1b83 5417 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5418 }
a0d0e21e
LW
5419 }
5420
5421 /* Check for proper termination. */
ce3e6498 5422 if (paren) {
e2509266 5423 RExC_flags = oregflags;
830247a4
IZ
5424 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5425 RExC_parse = oregcomp_parse;
380a0633 5426 vFAIL("Unmatched (");
ce3e6498 5427 }
a0ed51b3 5428 }
830247a4
IZ
5429 else if (!paren && RExC_parse < RExC_end) {
5430 if (*RExC_parse == ')') {
5431 RExC_parse++;
380a0633 5432 vFAIL("Unmatched )");
a0ed51b3
LW
5433 }
5434 else
b45f050a 5435 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5436 /* NOTREACHED */
5437 }
a687059c 5438
a0d0e21e 5439 return(ret);
a687059c
LW
5440}
5441
5442/*
5443 - regbranch - one alternative of an | operator
5444 *
5445 * Implements the concatenation operator.
5446 */
76e3520e 5447STATIC regnode *
3dab1dad 5448S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5449{
97aff369 5450 dVAR;
c277df42
IZ
5451 register regnode *ret;
5452 register regnode *chain = NULL;
5453 register regnode *latest;
5454 I32 flags = 0, c = 0;
3dab1dad
YO
5455 GET_RE_DEBUG_FLAGS_DECL;
5456 DEBUG_PARSE("brnc");
b81d288d 5457 if (first)
c277df42
IZ
5458 ret = NULL;
5459 else {
b81d288d 5460 if (!SIZE_ONLY && RExC_extralen)
830247a4 5461 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5462 else {
830247a4 5463 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5464 Set_Node_Length(ret, 1);
5465 }
c277df42
IZ
5466 }
5467
b81d288d 5468 if (!first && SIZE_ONLY)
830247a4 5469 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5470
c277df42 5471 *flagp = WORST; /* Tentatively. */
a0d0e21e 5472
830247a4
IZ
5473 RExC_parse--;
5474 nextchar(pRExC_state);
5475 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5476 flags &= ~TRYAGAIN;
3dab1dad 5477 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5478 if (latest == NULL) {
5479 if (flags & TRYAGAIN)
5480 continue;
5481 return(NULL);
a0ed51b3
LW
5482 }
5483 else if (ret == NULL)
c277df42 5484 ret = latest;
a0d0e21e 5485 *flagp |= flags&HASWIDTH;
c277df42 5486 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5487 *flagp |= flags&SPSTART;
5488 else {
830247a4 5489 RExC_naughty++;
3dab1dad 5490 REGTAIL(pRExC_state, chain, latest);
a687059c 5491 }
a0d0e21e 5492 chain = latest;
c277df42
IZ
5493 c++;
5494 }
5495 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5496 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5497 if (ret == NULL)
5498 ret = chain;
5499 }
5500 if (c == 1) {
5501 *flagp |= flags&SIMPLE;
a0d0e21e 5502 }
a687059c 5503
d4c19fe8 5504 return ret;
a687059c
LW
5505}
5506
5507/*
5508 - regpiece - something followed by possible [*+?]
5509 *
5510 * Note that the branching code sequences used for ? and the general cases
5511 * of * and + are somewhat optimized: they use the same NOTHING node as
5512 * both the endmarker for their branch list and the body of the last branch.
5513 * It might seem that this node could be dispensed with entirely, but the
5514 * endmarker role is not redundant.
5515 */
76e3520e 5516STATIC regnode *
3dab1dad 5517S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5518{
97aff369 5519 dVAR;
c277df42 5520 register regnode *ret;
a0d0e21e
LW
5521 register char op;
5522 register char *next;
5523 I32 flags;
1df70142 5524 const char * const origparse = RExC_parse;
a0d0e21e 5525 I32 min;
c277df42 5526 I32 max = REG_INFTY;
fac92740 5527 char *parse_start;
10edeb5d 5528 const char *maxpos = NULL;
3dab1dad
YO
5529 GET_RE_DEBUG_FLAGS_DECL;
5530 DEBUG_PARSE("piec");
a0d0e21e 5531
3dab1dad 5532 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5533 if (ret == NULL) {
5534 if (flags & TRYAGAIN)
5535 *flagp |= TRYAGAIN;
5536 return(NULL);
5537 }
5538
830247a4 5539 op = *RExC_parse;
a0d0e21e 5540
830247a4 5541 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5542 maxpos = NULL;
fac92740 5543 parse_start = RExC_parse; /* MJD */
830247a4 5544 next = RExC_parse + 1;
a0d0e21e
LW
5545 while (isDIGIT(*next) || *next == ',') {
5546 if (*next == ',') {
5547 if (maxpos)
5548 break;
5549 else
5550 maxpos = next;
a687059c 5551 }
a0d0e21e
LW
5552 next++;
5553 }
5554 if (*next == '}') { /* got one */
5555 if (!maxpos)
5556 maxpos = next;
830247a4
IZ
5557 RExC_parse++;
5558 min = atoi(RExC_parse);
a0d0e21e
LW
5559 if (*maxpos == ',')
5560 maxpos++;
5561 else
830247a4 5562 maxpos = RExC_parse;
a0d0e21e
LW
5563 max = atoi(maxpos);
5564 if (!max && *maxpos != '0')
c277df42
IZ
5565 max = REG_INFTY; /* meaning "infinity" */
5566 else if (max >= REG_INFTY)
8615cb43 5567 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5568 RExC_parse = next;
5569 nextchar(pRExC_state);
a0d0e21e
LW
5570
5571 do_curly:
5572 if ((flags&SIMPLE)) {
830247a4 5573 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5574 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5575 Set_Node_Offset(ret, parse_start+1); /* MJD */
5576 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5577 }
5578 else {
3dab1dad 5579 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5580
5581 w->flags = 0;
3dab1dad 5582 REGTAIL(pRExC_state, ret, w);
830247a4 5583 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5584 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5585 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5586 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5587 }
6bda09f9 5588 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5589 /* MJD hk */
5590 Set_Node_Offset(ret, parse_start+1);
2af232bd 5591 Set_Node_Length(ret,
fac92740 5592 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5593
830247a4 5594 if (!SIZE_ONLY && RExC_extralen)
c277df42 5595 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5596 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5597 if (SIZE_ONLY)
830247a4
IZ
5598 RExC_whilem_seen++, RExC_extralen += 3;
5599 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5600 }
c277df42 5601 ret->flags = 0;
a0d0e21e
LW
5602
5603 if (min > 0)
821b33a5
IZ
5604 *flagp = WORST;
5605 if (max > 0)
5606 *flagp |= HASWIDTH;
a0d0e21e 5607 if (max && max < min)
8615cb43 5608 vFAIL("Can't do {n,m} with n > m");
c277df42 5609 if (!SIZE_ONLY) {
eb160463
GS
5610 ARG1_SET(ret, (U16)min);
5611 ARG2_SET(ret, (U16)max);
a687059c 5612 }
a687059c 5613
a0d0e21e 5614 goto nest_check;
a687059c 5615 }
a0d0e21e 5616 }
a687059c 5617
a0d0e21e
LW
5618 if (!ISMULT1(op)) {
5619 *flagp = flags;
a687059c 5620 return(ret);
a0d0e21e 5621 }
bb20fd44 5622
c277df42 5623#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5624
5625 /* if this is reinstated, don't forget to put this back into perldiag:
5626
5627 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5628
5629 (F) The part of the regexp subject to either the * or + quantifier
5630 could match an empty string. The {#} shows in the regular
5631 expression about where the problem was discovered.
5632
5633 */
5634
bb20fd44 5635 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5636 vFAIL("Regexp *+ operand could be empty");
b81d288d 5637#endif
bb20fd44 5638
fac92740 5639 parse_start = RExC_parse;
830247a4 5640 nextchar(pRExC_state);
a0d0e21e 5641
821b33a5 5642 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5643
5644 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5645 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5646 ret->flags = 0;
830247a4 5647 RExC_naughty += 4;
a0d0e21e
LW
5648 }
5649 else if (op == '*') {
5650 min = 0;
5651 goto do_curly;
a0ed51b3
LW
5652 }
5653 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5654 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5655 ret->flags = 0;
830247a4 5656 RExC_naughty += 3;
a0d0e21e
LW
5657 }
5658 else if (op == '+') {
5659 min = 1;
5660 goto do_curly;
a0ed51b3
LW
5661 }
5662 else if (op == '?') {
a0d0e21e
LW
5663 min = 0; max = 1;
5664 goto do_curly;
5665 }
5666 nest_check:
041457d9 5667 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5668 vWARN3(RExC_parse,
b45f050a 5669 "%.*s matches null string many times",
afd78fd5 5670 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5671 origparse);
a0d0e21e
LW
5672 }
5673
b9b4dddf 5674 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5675 nextchar(pRExC_state);
6bda09f9 5676 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5677 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5678 }
b9b4dddf
YO
5679#ifndef REG_ALLOW_MINMOD_SUSPEND
5680 else
5681#endif
5682 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5683 regnode *ender;
5684 nextchar(pRExC_state);
5685 ender = reg_node(pRExC_state, SUCCEED);
5686 REGTAIL(pRExC_state, ret, ender);
5687 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5688 ret->flags = 0;
5689 ender = reg_node(pRExC_state, TAIL);
5690 REGTAIL(pRExC_state, ret, ender);
5691 /*ret= ender;*/
5692 }
5693
5694 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5695 RExC_parse++;
b45f050a
JF
5696 vFAIL("Nested quantifiers");
5697 }
a0d0e21e
LW
5698
5699 return(ret);
a687059c
LW
5700}
5701
fc8cd66c
YO
5702
5703/* reg_namedseq(pRExC_state,UVp)
5704
5705 This is expected to be called by a parser routine that has
5706 recognized'\N' and needs to handle the rest. RExC_parse is
5707 expected to point at the first char following the N at the time
5708 of the call.
5709
5710 If valuep is non-null then it is assumed that we are parsing inside
5711 of a charclass definition and the first codepoint in the resolved
5712 string is returned via *valuep and the routine will return NULL.
5713 In this mode if a multichar string is returned from the charnames
5714 handler a warning will be issued, and only the first char in the
5715 sequence will be examined. If the string returned is zero length
5716 then the value of *valuep is undefined and NON-NULL will
5717 be returned to indicate failure. (This will NOT be a valid pointer
5718 to a regnode.)
5719
5720 If value is null then it is assumed that we are parsing normal text
5721 and inserts a new EXACT node into the program containing the resolved
5722 string and returns a pointer to the new node. If the string is
5723 zerolength a NOTHING node is emitted.
5724
5725 On success RExC_parse is set to the char following the endbrace.
5726 Parsing failures will generate a fatal errorvia vFAIL(...)
5727
5728 NOTE: We cache all results from the charnames handler locally in
5729 the RExC_charnames hash (created on first use) to prevent a charnames
5730 handler from playing silly-buggers and returning a short string and
5731 then a long string for a given pattern. Since the regexp program
5732 size is calculated during an initial parse this would result
5733 in a buffer overrun so we cache to prevent the charname result from
5734 changing during the course of the parse.
5735
5736 */
5737STATIC regnode *
5738S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5739{
5740 char * name; /* start of the content of the name */
5741 char * endbrace; /* endbrace following the name */
5742 SV *sv_str = NULL;
5743 SV *sv_name = NULL;
5744 STRLEN len; /* this has various purposes throughout the code */
5745 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5746 regnode *ret = NULL;
5747
5748 if (*RExC_parse != '{') {
5749 vFAIL("Missing braces on \\N{}");
5750 }
5751 name = RExC_parse+1;
5752 endbrace = strchr(RExC_parse, '}');
5753 if ( ! endbrace ) {
5754 RExC_parse++;
5755 vFAIL("Missing right brace on \\N{}");
5756 }
5757 RExC_parse = endbrace + 1;
5758
5759
5760 /* RExC_parse points at the beginning brace,
5761 endbrace points at the last */
5762 if ( name[0]=='U' && name[1]=='+' ) {
5763 /* its a "unicode hex" notation {U+89AB} */
5764 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5765 | PERL_SCAN_DISALLOW_PREFIX
5766 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5767 UV cp;
196f1508 5768 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5769 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5770 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5771 cp = 0xFFFD;
5772 }
5773 if (cp > 0xff)
5774 RExC_utf8 = 1;
5775 if ( valuep ) {
5776 *valuep = cp;
5777 return NULL;
5778 }
5779 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5780 } else {
5781 /* fetch the charnames handler for this scope */
5782 HV * const table = GvHV(PL_hintgv);
5783 SV **cvp= table ?
5784 hv_fetchs(table, "charnames", FALSE) :
5785 NULL;
5786 SV *cv= cvp ? *cvp : NULL;
5787 HE *he_str;
5788 int count;
5789 /* create an SV with the name as argument */
5790 sv_name = newSVpvn(name, endbrace - name);
5791
5792 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5793 vFAIL2("Constant(\\N{%s}) unknown: "
5794 "(possibly a missing \"use charnames ...\")",
5795 SvPVX(sv_name));
5796 }
5797 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5798 vFAIL2("Constant(\\N{%s}): "
5799 "$^H{charnames} is not defined",SvPVX(sv_name));
5800 }
5801
5802
5803
5804 if (!RExC_charnames) {
5805 /* make sure our cache is allocated */
5806 RExC_charnames = newHV();
6bda09f9 5807 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5808 }
5809 /* see if we have looked this one up before */
5810 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5811 if ( he_str ) {
5812 sv_str = HeVAL(he_str);
5813 cached = 1;
5814 } else {
5815 dSP ;
5816
5817 ENTER ;
5818 SAVETMPS ;
5819 PUSHMARK(SP) ;
5820
5821 XPUSHs(sv_name);
5822
5823 PUTBACK ;
5824
5825 count= call_sv(cv, G_SCALAR);
5826
5827 if (count == 1) { /* XXXX is this right? dmq */
5828 sv_str = POPs;
5829 SvREFCNT_inc_simple_void(sv_str);
5830 }
5831
5832 SPAGAIN ;
5833 PUTBACK ;
5834 FREETMPS ;
5835 LEAVE ;
5836
5837 if ( !sv_str || !SvOK(sv_str) ) {
5838 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5839 "did not return a defined value",SvPVX(sv_name));
5840 }
5841 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5842 cached = 1;
5843 }
5844 }
5845 if (valuep) {
5846 char *p = SvPV(sv_str, len);
5847 if (len) {
5848 STRLEN numlen = 1;
5849 if ( SvUTF8(sv_str) ) {
196f1508 5850 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5851 if (*valuep > 0x7F)
5852 RExC_utf8 = 1;
5853 /* XXXX
5854 We have to turn on utf8 for high bit chars otherwise
5855 we get failures with
5856
5857 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5858 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5859
5860 This is different from what \x{} would do with the same
5861 codepoint, where the condition is > 0xFF.
5862 - dmq
5863 */
5864
5865
5866 } else {
5867 *valuep = (UV)*p;
5868 /* warn if we havent used the whole string? */
5869 }
5870 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5871 vWARN2(RExC_parse,
5872 "Ignoring excess chars from \\N{%s} in character class",
5873 SvPVX(sv_name)
5874 );
5875 }
5876 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5877 vWARN2(RExC_parse,
5878 "Ignoring zero length \\N{%s} in character class",
5879 SvPVX(sv_name)
5880 );
5881 }
5882 if (sv_name)
5883 SvREFCNT_dec(sv_name);
5884 if (!cached)
5885 SvREFCNT_dec(sv_str);
5886 return len ? NULL : (regnode *)&len;
5887 } else if(SvCUR(sv_str)) {
5888
5889 char *s;
5890 char *p, *pend;
5891 STRLEN charlen = 1;
5892 char * parse_start = name-3; /* needed for the offsets */
5893 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5894
5895 ret = reg_node(pRExC_state,
5896 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5897 s= STRING(ret);
5898
5899 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5900 sv_utf8_upgrade(sv_str);
5901 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5902 RExC_utf8= 1;
5903 }
5904
5905 p = SvPV(sv_str, len);
5906 pend = p + len;
5907 /* len is the length written, charlen is the size the char read */
5908 for ( len = 0; p < pend; p += charlen ) {
5909 if (UTF) {
196f1508 5910 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5911 if (FOLD) {
5912 STRLEN foldlen,numlen;
5913 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5914 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5915 /* Emit all the Unicode characters. */
5916
5917 for (foldbuf = tmpbuf;
5918 foldlen;
5919 foldlen -= numlen)
5920 {
5921 uvc = utf8_to_uvchr(foldbuf, &numlen);
5922 if (numlen > 0) {
5923 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5924 s += unilen;
5925 len += unilen;
5926 /* In EBCDIC the numlen
5927 * and unilen can differ. */
5928 foldbuf += numlen;
5929 if (numlen >= foldlen)
5930 break;
5931 }
5932 else
5933 break; /* "Can't happen." */
5934 }
5935 } else {
5936 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5937 if (unilen > 0) {
5938 s += unilen;
5939 len += unilen;
5940 }
5941 }
5942 } else {
5943 len++;
5944 REGC(*p, s++);
5945 }
5946 }
5947 if (SIZE_ONLY) {
5948 RExC_size += STR_SZ(len);
5949 } else {
5950 STR_LEN(ret) = len;
5951 RExC_emit += STR_SZ(len);
5952 }
5953 Set_Node_Cur_Length(ret); /* MJD */
5954 RExC_parse--;
5955 nextchar(pRExC_state);
5956 } else {
5957 ret = reg_node(pRExC_state,NOTHING);
5958 }
5959 if (!cached) {
5960 SvREFCNT_dec(sv_str);
5961 }
5962 if (sv_name) {
5963 SvREFCNT_dec(sv_name);
5964 }
5965 return ret;
5966
5967}
5968
5969
9e08bc66
TS
5970/*
5971 * reg_recode
5972 *
5973 * It returns the code point in utf8 for the value in *encp.
5974 * value: a code value in the source encoding
5975 * encp: a pointer to an Encode object
5976 *
5977 * If the result from Encode is not a single character,
5978 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5979 */
5980STATIC UV
5981S_reg_recode(pTHX_ const char value, SV **encp)
5982{
5983 STRLEN numlen = 1;
5984 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5985 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5986 : SvPVX(sv);
5987 const STRLEN newlen = SvCUR(sv);
5988 UV uv = UNICODE_REPLACEMENT;
5989
5990 if (newlen)
5991 uv = SvUTF8(sv)
5992 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5993 : *(U8*)s;
5994
5995 if (!newlen || numlen != newlen) {
5996 uv = UNICODE_REPLACEMENT;
5997 if (encp)
5998 *encp = NULL;
5999 }
6000 return uv;
6001}
6002
fc8cd66c 6003
a687059c
LW
6004/*
6005 - regatom - the lowest level
6006 *
6007 * Optimization: gobbles an entire sequence of ordinary characters so that
6008 * it can turn them into a single node, which is smaller to store and
6009 * faster to run. Backslashed characters are exceptions, each becoming a
6010 * separate node; the code is simpler that way and it's not worth fixing.
6011 *
7f6f358c
YO
6012 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6013 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6014 */
76e3520e 6015STATIC regnode *
3dab1dad 6016S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6017{
97aff369 6018 dVAR;
cbbf8932 6019 register regnode *ret = NULL;
a0d0e21e 6020 I32 flags;
45948336 6021 char *parse_start = RExC_parse;
3dab1dad
YO
6022 GET_RE_DEBUG_FLAGS_DECL;
6023 DEBUG_PARSE("atom");
a0d0e21e
LW
6024 *flagp = WORST; /* Tentatively. */
6025
6026tryagain:
830247a4 6027 switch (*RExC_parse) {
a0d0e21e 6028 case '^':
830247a4
IZ
6029 RExC_seen_zerolen++;
6030 nextchar(pRExC_state);
e2509266 6031 if (RExC_flags & PMf_MULTILINE)
830247a4 6032 ret = reg_node(pRExC_state, MBOL);
e2509266 6033 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6034 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6035 else
830247a4 6036 ret = reg_node(pRExC_state, BOL);
fac92740 6037 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6038 break;
6039 case '$':
830247a4 6040 nextchar(pRExC_state);
b81d288d 6041 if (*RExC_parse)
830247a4 6042 RExC_seen_zerolen++;
e2509266 6043 if (RExC_flags & PMf_MULTILINE)
830247a4 6044 ret = reg_node(pRExC_state, MEOL);
e2509266 6045 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6046 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6047 else
830247a4 6048 ret = reg_node(pRExC_state, EOL);
fac92740 6049 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6050 break;
6051 case '.':
830247a4 6052 nextchar(pRExC_state);
e2509266 6053 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
6054 ret = reg_node(pRExC_state, SANY);
6055 else
6056 ret = reg_node(pRExC_state, REG_ANY);
6057 *flagp |= HASWIDTH|SIMPLE;
830247a4 6058 RExC_naughty++;
fac92740 6059 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6060 break;
6061 case '[':
b45f050a 6062 {
3dab1dad
YO
6063 char * const oregcomp_parse = ++RExC_parse;
6064 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6065 if (*RExC_parse != ']') {
6066 RExC_parse = oregcomp_parse;
b45f050a
JF
6067 vFAIL("Unmatched [");
6068 }
830247a4 6069 nextchar(pRExC_state);
a0d0e21e 6070 *flagp |= HASWIDTH|SIMPLE;
fac92740 6071 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6072 break;
b45f050a 6073 }
a0d0e21e 6074 case '(':
830247a4 6075 nextchar(pRExC_state);
3dab1dad 6076 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6077 if (ret == NULL) {
bf93d4cc 6078 if (flags & TRYAGAIN) {
830247a4 6079 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6080 /* Make parent create an empty node if needed. */
6081 *flagp |= TRYAGAIN;
6082 return(NULL);
6083 }
a0d0e21e 6084 goto tryagain;
bf93d4cc 6085 }
a0d0e21e
LW
6086 return(NULL);
6087 }
c277df42 6088 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6089 break;
6090 case '|':
6091 case ')':
6092 if (flags & TRYAGAIN) {
6093 *flagp |= TRYAGAIN;
6094 return NULL;
6095 }
b45f050a 6096 vFAIL("Internal urp");
a0d0e21e
LW
6097 /* Supposed to be caught earlier. */
6098 break;
85afd4ae 6099 case '{':
830247a4
IZ
6100 if (!regcurly(RExC_parse)) {
6101 RExC_parse++;
85afd4ae
CS
6102 goto defchar;
6103 }
6104 /* FALL THROUGH */
a0d0e21e
LW
6105 case '?':
6106 case '+':
6107 case '*':
830247a4 6108 RExC_parse++;
b45f050a 6109 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6110 break;
6111 case '\\':
830247a4 6112 switch (*++RExC_parse) {
a0d0e21e 6113 case 'A':
830247a4
IZ
6114 RExC_seen_zerolen++;
6115 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6116 *flagp |= SIMPLE;
830247a4 6117 nextchar(pRExC_state);
fac92740 6118 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6119 break;
6120 case 'G':
830247a4
IZ
6121 ret = reg_node(pRExC_state, GPOS);
6122 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6123 *flagp |= SIMPLE;
830247a4 6124 nextchar(pRExC_state);
fac92740 6125 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6126 break;
6127 case 'Z':
830247a4 6128 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6129 *flagp |= SIMPLE;
a1917ab9 6130 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6131 nextchar(pRExC_state);
a0d0e21e 6132 break;
b85d18e9 6133 case 'z':
830247a4 6134 ret = reg_node(pRExC_state, EOS);
b85d18e9 6135 *flagp |= SIMPLE;
830247a4
IZ
6136 RExC_seen_zerolen++; /* Do not optimize RE away */
6137 nextchar(pRExC_state);
fac92740 6138 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6139 break;
4a2d328f 6140 case 'C':
f33976b4
DB
6141 ret = reg_node(pRExC_state, CANY);
6142 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6143 *flagp |= HASWIDTH|SIMPLE;
830247a4 6144 nextchar(pRExC_state);
fac92740 6145 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6146 break;
6147 case 'X':
830247a4 6148 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6149 *flagp |= HASWIDTH;
830247a4 6150 nextchar(pRExC_state);
fac92740 6151 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6152 break;
a0d0e21e 6153 case 'w':
eb160463 6154 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6155 *flagp |= HASWIDTH|SIMPLE;
830247a4 6156 nextchar(pRExC_state);
fac92740 6157 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6158 break;
6159 case 'W':
eb160463 6160 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6161 *flagp |= HASWIDTH|SIMPLE;
830247a4 6162 nextchar(pRExC_state);
fac92740 6163 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6164 break;
6165 case 'b':
830247a4
IZ
6166 RExC_seen_zerolen++;
6167 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6168 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6169 *flagp |= SIMPLE;
830247a4 6170 nextchar(pRExC_state);
fac92740 6171 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6172 break;
6173 case 'B':
830247a4
IZ
6174 RExC_seen_zerolen++;
6175 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6176 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6177 *flagp |= SIMPLE;
830247a4 6178 nextchar(pRExC_state);
fac92740 6179 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6180 break;
6181 case 's':
eb160463 6182 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6183 *flagp |= HASWIDTH|SIMPLE;
830247a4 6184 nextchar(pRExC_state);
fac92740 6185 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6186 break;
6187 case 'S':
eb160463 6188 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6189 *flagp |= HASWIDTH|SIMPLE;
830247a4 6190 nextchar(pRExC_state);
fac92740 6191 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6192 break;
6193 case 'd':
ffc61ed2 6194 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6195 *flagp |= HASWIDTH|SIMPLE;
830247a4 6196 nextchar(pRExC_state);
fac92740 6197 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6198 break;
6199 case 'D':
ffc61ed2 6200 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6201 *flagp |= HASWIDTH|SIMPLE;
830247a4 6202 nextchar(pRExC_state);
fac92740 6203 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6204 break;
a14b48bc
LW
6205 case 'p':
6206 case 'P':
3568d838 6207 {
3dab1dad 6208 char* const oldregxend = RExC_end;
ccb2c380 6209 char* parse_start = RExC_parse - 2;
a14b48bc 6210
830247a4 6211 if (RExC_parse[1] == '{') {
3568d838 6212 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6213 RExC_end = strchr(RExC_parse, '}');
6214 if (!RExC_end) {
3dab1dad 6215 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6216 RExC_parse += 2;
6217 RExC_end = oldregxend;
0da60cf5 6218 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6219 }
830247a4 6220 RExC_end++;
a14b48bc 6221 }
af6f566e 6222 else {
830247a4 6223 RExC_end = RExC_parse + 2;
af6f566e
HS
6224 if (RExC_end > oldregxend)
6225 RExC_end = oldregxend;
6226 }
830247a4 6227 RExC_parse--;
a14b48bc 6228
3dab1dad 6229 ret = regclass(pRExC_state,depth+1);
a14b48bc 6230
830247a4
IZ
6231 RExC_end = oldregxend;
6232 RExC_parse--;
ccb2c380
MP
6233
6234 Set_Node_Offset(ret, parse_start + 2);
6235 Set_Node_Cur_Length(ret);
830247a4 6236 nextchar(pRExC_state);
a14b48bc
LW
6237 *flagp |= HASWIDTH|SIMPLE;
6238 }
6239 break;
fc8cd66c
YO
6240 case 'N':
6241 /* Handle \N{NAME} here and not below because it can be
6242 multicharacter. join_exact() will join them up later on.
6243 Also this makes sure that things like /\N{BLAH}+/ and
6244 \N{BLAH} being multi char Just Happen. dmq*/
6245 ++RExC_parse;
6246 ret= reg_namedseq(pRExC_state, NULL);
6247 break;
0a4db386 6248 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6249 {
6250 char ch= RExC_parse[1];
6251 if (ch != '<' && ch != '\'') {
6252 if (SIZE_ONLY)
6253 vWARN( RExC_parse + 1,
6254 "Possible broken named back reference treated as literal k");
6255 parse_start--;
6256 goto defchar;
6257 } else {
6258 char* name_start = (RExC_parse += 2);
6259 I32 num = 0;
0a4db386
YO
6260 SV *sv_dat = reg_scan_name(pRExC_state,
6261 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6262 ch= (ch == '<') ? '>' : '\'';
6263
81714fb9
YO
6264 if (RExC_parse == name_start || *RExC_parse != ch)
6265 vFAIL2("Sequence \\k%c... not terminated",
6266 (ch == '>' ? '<' : ch));
6267
6268 RExC_sawback = 1;
6269 ret = reganode(pRExC_state,
6270 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6271 num);
6272 *flagp |= HASWIDTH;
6273
6274
6275 if (!SIZE_ONLY) {
81714fb9
YO
6276 num = add_data( pRExC_state, 1, "S" );
6277 ARG_SET(ret,num);
6278 RExC_rx->data->data[num]=(void*)sv_dat;
6279 SvREFCNT_inc(sv_dat);
6280 }
6281 /* override incorrect value set in reganode MJD */
6282 Set_Node_Offset(ret, parse_start+1);
6283 Set_Node_Cur_Length(ret); /* MJD */
6284 nextchar(pRExC_state);
6285
6286 }
6287 break;
6288 }
a0d0e21e
LW
6289 case 'n':
6290 case 'r':
6291 case 't':
6292 case 'f':
6293 case 'e':
6294 case 'a':
6295 case 'x':
6296 case 'c':
6297 case '0':
6298 goto defchar;
c74340f9 6299 case 'R':
a0d0e21e
LW
6300 case '1': case '2': case '3': case '4':
6301 case '5': case '6': case '7': case '8': case '9':
6302 {
c74340f9
YO
6303 I32 num;
6304 bool isrel=(*RExC_parse=='R');
6305 if (isrel)
6306 RExC_parse++;
6307 num = atoi(RExC_parse);
6308 if (isrel) {
6309 num = RExC_cpar - num;
6310 if (num < 1)
6311 vFAIL("Reference to nonexistent or unclosed group");
6312 }
830247a4 6313 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6314 goto defchar;
6315 else {
3dab1dad 6316 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6317 while (isDIGIT(*RExC_parse))
6318 RExC_parse++;
b45f050a 6319
c74340f9
YO
6320 if (!SIZE_ONLY) {
6321 if (num > (I32)RExC_rx->nparens)
6322 vFAIL("Reference to nonexistent group");
6323 /* People make this error all the time apparently.
6324 So we cant fail on it, even though we should
6325
6326 else if (num >= RExC_cpar)
6327 vFAIL("Reference to unclosed group will always match");
6328 */
6329 }
830247a4 6330 RExC_sawback = 1;
eb160463
GS
6331 ret = reganode(pRExC_state,
6332 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6333 num);
a0d0e21e 6334 *flagp |= HASWIDTH;
2af232bd 6335
fac92740 6336 /* override incorrect value set in reganode MJD */
2af232bd 6337 Set_Node_Offset(ret, parse_start+1);
fac92740 6338 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6339 RExC_parse--;
6340 nextchar(pRExC_state);
a0d0e21e
LW
6341 }
6342 }
6343 break;
6344 case '\0':
830247a4 6345 if (RExC_parse >= RExC_end)
b45f050a 6346 FAIL("Trailing \\");
a0d0e21e
LW
6347 /* FALL THROUGH */
6348 default:
a0288114 6349 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6350 back into the quick-grab loop below */
45948336 6351 parse_start--;
a0d0e21e
LW
6352 goto defchar;
6353 }
6354 break;
4633a7c4
LW
6355
6356 case '#':
e2509266 6357 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6358 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6359 RExC_parse++;
830247a4 6360 if (RExC_parse < RExC_end)
4633a7c4
LW
6361 goto tryagain;
6362 }
6363 /* FALL THROUGH */
6364
a0d0e21e 6365 default: {
ba210ebe 6366 register STRLEN len;
58ae7d3f 6367 register UV ender;
a0d0e21e 6368 register char *p;
3dab1dad 6369 char *s;
80aecb99 6370 STRLEN foldlen;
89ebb4a3 6371 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6372
6373 parse_start = RExC_parse - 1;
a0d0e21e 6374
830247a4 6375 RExC_parse++;
a0d0e21e
LW
6376
6377 defchar:
58ae7d3f 6378 ender = 0;
eb160463
GS
6379 ret = reg_node(pRExC_state,
6380 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6381 s = STRING(ret);
830247a4
IZ
6382 for (len = 0, p = RExC_parse - 1;
6383 len < 127 && p < RExC_end;
a0d0e21e
LW
6384 len++)
6385 {
3dab1dad 6386 char * const oldp = p;
5b5a24f7 6387
e2509266 6388 if (RExC_flags & PMf_EXTENDED)
830247a4 6389 p = regwhite(p, RExC_end);
a0d0e21e
LW
6390 switch (*p) {
6391 case '^':
6392 case '$':
6393 case '.':
6394 case '[':
6395 case '(':
6396 case ')':
6397 case '|':
6398 goto loopdone;
6399 case '\\':
6400 switch (*++p) {
6401 case 'A':
1ed8eac0
JF
6402 case 'C':
6403 case 'X':
a0d0e21e
LW
6404 case 'G':
6405 case 'Z':
b85d18e9 6406 case 'z':
a0d0e21e
LW
6407 case 'w':
6408 case 'W':
6409 case 'b':
6410 case 'B':
6411 case 's':
6412 case 'S':
6413 case 'd':
6414 case 'D':
a14b48bc
LW
6415 case 'p':
6416 case 'P':
fc8cd66c 6417 case 'N':
c74340f9 6418 case 'R':
a0d0e21e
LW
6419 --p;
6420 goto loopdone;
6421 case 'n':
6422 ender = '\n';
6423 p++;
a687059c 6424 break;
a0d0e21e
LW
6425 case 'r':
6426 ender = '\r';
6427 p++;
a687059c 6428 break;
a0d0e21e
LW
6429 case 't':
6430 ender = '\t';
6431 p++;
a687059c 6432 break;
a0d0e21e
LW
6433 case 'f':
6434 ender = '\f';
6435 p++;
a687059c 6436 break;
a0d0e21e 6437 case 'e':
c7f1f016 6438 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6439 p++;
a687059c 6440 break;
a0d0e21e 6441 case 'a':
c7f1f016 6442 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6443 p++;
a687059c 6444 break;
a0d0e21e 6445 case 'x':
a0ed51b3 6446 if (*++p == '{') {
1df70142 6447 char* const e = strchr(p, '}');
b81d288d 6448
b45f050a 6449 if (!e) {
830247a4 6450 RExC_parse = p + 1;
b45f050a
JF
6451 vFAIL("Missing right brace on \\x{}");
6452 }
de5f0749 6453 else {
a4c04bdc
NC
6454 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6455 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6456 STRLEN numlen = e - p - 1;
53305cf1 6457 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6458 if (ender > 0xff)
6459 RExC_utf8 = 1;
a0ed51b3
LW
6460 p = e + 1;
6461 }
a0ed51b3
LW
6462 }
6463 else {
a4c04bdc 6464 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6465 STRLEN numlen = 2;
53305cf1 6466 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6467 p += numlen;
6468 }
9e08bc66
TS
6469 if (PL_encoding && ender < 0x100)
6470 goto recode_encoding;
a687059c 6471 break;
a0d0e21e
LW
6472 case 'c':
6473 p++;
bbce6d69 6474 ender = UCHARAT(p++);
6475 ender = toCTRL(ender);
a687059c 6476 break;
a0d0e21e
LW
6477 case '0': case '1': case '2': case '3':case '4':
6478 case '5': case '6': case '7': case '8':case '9':
6479 if (*p == '0' ||
830247a4 6480 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6481 I32 flags = 0;
1df70142 6482 STRLEN numlen = 3;
53305cf1 6483 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6484 p += numlen;
6485 }
6486 else {
6487 --p;
6488 goto loopdone;
a687059c 6489 }
9e08bc66
TS
6490 if (PL_encoding && ender < 0x100)
6491 goto recode_encoding;
6492 break;
6493 recode_encoding:
6494 {
6495 SV* enc = PL_encoding;
6496 ender = reg_recode((const char)(U8)ender, &enc);
6497 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6498 vWARN(p, "Invalid escape in the specified encoding");
6499 RExC_utf8 = 1;
6500 }
a687059c 6501 break;
a0d0e21e 6502 case '\0':
830247a4 6503 if (p >= RExC_end)
b45f050a 6504 FAIL("Trailing \\");
a687059c 6505 /* FALL THROUGH */
a0d0e21e 6506 default:
041457d9 6507 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6508 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6509 goto normal_default;
a0d0e21e
LW
6510 }
6511 break;
a687059c 6512 default:
a0ed51b3 6513 normal_default:
fd400ab9 6514 if (UTF8_IS_START(*p) && UTF) {
1df70142 6515 STRLEN numlen;
5e12f4fb 6516 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6517 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6518 p += numlen;
6519 }
6520 else
6521 ender = *p++;
a0d0e21e 6522 break;
a687059c 6523 }
e2509266 6524 if (RExC_flags & PMf_EXTENDED)
830247a4 6525 p = regwhite(p, RExC_end);
60a8b682
JH
6526 if (UTF && FOLD) {
6527 /* Prime the casefolded buffer. */
ac7e0132 6528 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6529 }
a0d0e21e
LW
6530 if (ISMULT2(p)) { /* Back off on ?+*. */
6531 if (len)
6532 p = oldp;
16ea2a2e 6533 else if (UTF) {
80aecb99 6534 if (FOLD) {
60a8b682 6535 /* Emit all the Unicode characters. */
1df70142 6536 STRLEN numlen;
80aecb99
JH
6537 for (foldbuf = tmpbuf;
6538 foldlen;
6539 foldlen -= numlen) {
6540 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6541 if (numlen > 0) {
71207a34 6542 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6543 s += unilen;
6544 len += unilen;
6545 /* In EBCDIC the numlen
6546 * and unilen can differ. */
9dc45d57 6547 foldbuf += numlen;
47654450
JH
6548 if (numlen >= foldlen)
6549 break;
9dc45d57
JH
6550 }
6551 else
6552 break; /* "Can't happen." */
80aecb99
JH
6553 }
6554 }
6555 else {
71207a34 6556 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6557 if (unilen > 0) {
0ebc6274
JH
6558 s += unilen;
6559 len += unilen;
9dc45d57 6560 }
80aecb99 6561 }
a0ed51b3 6562 }
a0d0e21e
LW
6563 else {
6564 len++;
eb160463 6565 REGC((char)ender, s++);
a0d0e21e
LW
6566 }
6567 break;
a687059c 6568 }
16ea2a2e 6569 if (UTF) {
80aecb99 6570 if (FOLD) {
60a8b682 6571 /* Emit all the Unicode characters. */
1df70142 6572 STRLEN numlen;
80aecb99
JH
6573 for (foldbuf = tmpbuf;
6574 foldlen;
6575 foldlen -= numlen) {
6576 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6577 if (numlen > 0) {
71207a34 6578 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6579 len += unilen;
6580 s += unilen;
6581 /* In EBCDIC the numlen
6582 * and unilen can differ. */
9dc45d57 6583 foldbuf += numlen;
47654450
JH
6584 if (numlen >= foldlen)
6585 break;
9dc45d57
JH
6586 }
6587 else
6588 break;
80aecb99
JH
6589 }
6590 }
6591 else {
71207a34 6592 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6593 if (unilen > 0) {
0ebc6274
JH
6594 s += unilen;
6595 len += unilen;
9dc45d57 6596 }
80aecb99
JH
6597 }
6598 len--;
a0ed51b3
LW
6599 }
6600 else
eb160463 6601 REGC((char)ender, s++);
a0d0e21e
LW
6602 }
6603 loopdone:
830247a4 6604 RExC_parse = p - 1;
fac92740 6605 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6606 nextchar(pRExC_state);
793db0cb
JH
6607 {
6608 /* len is STRLEN which is unsigned, need to copy to signed */
6609 IV iv = len;
6610 if (iv < 0)
6611 vFAIL("Internal disaster");
6612 }
a0d0e21e
LW
6613 if (len > 0)
6614 *flagp |= HASWIDTH;
090f7165 6615 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6616 *flagp |= SIMPLE;
3dab1dad 6617
cd439c50 6618 if (SIZE_ONLY)
830247a4 6619 RExC_size += STR_SZ(len);
3dab1dad
YO
6620 else {
6621 STR_LEN(ret) = len;
830247a4 6622 RExC_emit += STR_SZ(len);
07be1b83 6623 }
3dab1dad 6624 }
a0d0e21e
LW
6625 break;
6626 }
a687059c 6627
a0d0e21e 6628 return(ret);
a687059c
LW
6629}
6630
873ef191 6631STATIC char *
5f66b61c 6632S_regwhite(char *p, const char *e)
5b5a24f7
CS
6633{
6634 while (p < e) {
6635 if (isSPACE(*p))
6636 ++p;
6637 else if (*p == '#') {
6638 do {
6639 p++;
6640 } while (p < e && *p != '\n');
6641 }
6642 else
6643 break;
6644 }
6645 return p;
6646}
6647
b8c5462f
JH
6648/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6649 Character classes ([:foo:]) can also be negated ([:^foo:]).
6650 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6651 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6652 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6653
6654#define POSIXCC_DONE(c) ((c) == ':')
6655#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6656#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6657
b8c5462f 6658STATIC I32
830247a4 6659S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6660{
97aff369 6661 dVAR;
936ed897 6662 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6663
830247a4 6664 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6665 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6666 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6667 const char c = UCHARAT(RExC_parse);
097eb12c 6668 char* const s = RExC_parse++;
b81d288d 6669
9a86a77b 6670 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6671 RExC_parse++;
6672 if (RExC_parse == RExC_end)
620e46c5 6673 /* Grandfather lone [:, [=, [. */
830247a4 6674 RExC_parse = s;
620e46c5 6675 else {
3dab1dad 6676 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6677 assert(*t == c);
6678
9a86a77b 6679 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6680 const char *posixcc = s + 1;
830247a4 6681 RExC_parse++; /* skip over the ending ] */
3dab1dad 6682
b8c5462f 6683 if (*s == ':') {
1df70142
AL
6684 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6685 const I32 skip = t - posixcc;
80916619
NC
6686
6687 /* Initially switch on the length of the name. */
6688 switch (skip) {
6689 case 4:
3dab1dad
YO
6690 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6691 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6692 break;
80916619
NC
6693 case 5:
6694 /* Names all of length 5. */
6695 /* alnum alpha ascii blank cntrl digit graph lower
6696 print punct space upper */
6697 /* Offset 4 gives the best switch position. */
6698 switch (posixcc[4]) {
6699 case 'a':
3dab1dad
YO
6700 if (memEQ(posixcc, "alph", 4)) /* alpha */
6701 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6702 break;
6703 case 'e':
3dab1dad
YO
6704 if (memEQ(posixcc, "spac", 4)) /* space */
6705 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6706 break;
6707 case 'h':
3dab1dad
YO
6708 if (memEQ(posixcc, "grap", 4)) /* graph */
6709 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6710 break;
6711 case 'i':
3dab1dad
YO
6712 if (memEQ(posixcc, "asci", 4)) /* ascii */
6713 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6714 break;
6715 case 'k':
3dab1dad
YO
6716 if (memEQ(posixcc, "blan", 4)) /* blank */
6717 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6718 break;
6719 case 'l':
3dab1dad
YO
6720 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6721 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6722 break;
6723 case 'm':
3dab1dad
YO
6724 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6725 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6726 break;
6727 case 'r':
3dab1dad
YO
6728 if (memEQ(posixcc, "lowe", 4)) /* lower */
6729 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6730 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6731 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6732 break;
6733 case 't':
3dab1dad
YO
6734 if (memEQ(posixcc, "digi", 4)) /* digit */
6735 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6736 else if (memEQ(posixcc, "prin", 4)) /* print */
6737 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6738 else if (memEQ(posixcc, "punc", 4)) /* punct */
6739 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6740 break;
b8c5462f
JH
6741 }
6742 break;
80916619 6743 case 6:
3dab1dad
YO
6744 if (memEQ(posixcc, "xdigit", 6))
6745 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6746 break;
6747 }
80916619
NC
6748
6749 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6750 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6751 t - s - 1, s + 1);
80916619
NC
6752 assert (posixcc[skip] == ':');
6753 assert (posixcc[skip+1] == ']');
b45f050a 6754 } else if (!SIZE_ONLY) {
b8c5462f 6755 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6756
830247a4 6757 /* adjust RExC_parse so the warning shows after
b45f050a 6758 the class closes */
9a86a77b 6759 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6760 RExC_parse++;
b45f050a
JF
6761 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6762 }
b8c5462f
JH
6763 } else {
6764 /* Maternal grandfather:
6765 * "[:" ending in ":" but not in ":]" */
830247a4 6766 RExC_parse = s;
767d463e 6767 }
620e46c5
JH
6768 }
6769 }
6770
b8c5462f
JH
6771 return namedclass;
6772}
6773
6774STATIC void
830247a4 6775S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6776{
97aff369 6777 dVAR;
3dab1dad 6778 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6779 const char *s = RExC_parse;
6780 const char c = *s++;
b8c5462f 6781
3dab1dad 6782 while (isALNUM(*s))
b8c5462f
JH
6783 s++;
6784 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6785 if (ckWARN(WARN_REGEXP))
6786 vWARN3(s+2,
6787 "POSIX syntax [%c %c] belongs inside character classes",
6788 c, c);
b45f050a
JF
6789
6790 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6791 if (POSIXCC_NOTYET(c)) {
830247a4 6792 /* adjust RExC_parse so the error shows after
b45f050a 6793 the class closes */
9a86a77b 6794 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6795 NOOP;
b45f050a
JF
6796 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6797 }
b8c5462f
JH
6798 }
6799 }
620e46c5
JH
6800}
6801
7f6f358c
YO
6802
6803/*
6804 parse a class specification and produce either an ANYOF node that
6805 matches the pattern. If the pattern matches a single char only and
6806 that char is < 256 then we produce an EXACT node instead.
6807*/
76e3520e 6808STATIC regnode *
3dab1dad 6809S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6810{
97aff369 6811 dVAR;
9ef43ace 6812 register UV value = 0;
9a86a77b 6813 register UV nextvalue;
3568d838 6814 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6815 register IV range = 0;
c277df42 6816 register regnode *ret;
ba210ebe 6817 STRLEN numlen;
ffc61ed2 6818 IV namedclass;
cbbf8932 6819 char *rangebegin = NULL;
936ed897 6820 bool need_class = 0;
c445ea15 6821 SV *listsv = NULL;
ffc61ed2 6822 UV n;
9e55ce06 6823 bool optimize_invert = TRUE;
cbbf8932 6824 AV* unicode_alternate = NULL;
1b2d223b
JH
6825#ifdef EBCDIC
6826 UV literal_endpoint = 0;
6827#endif
7f6f358c 6828 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6829
3dab1dad 6830 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6831 case we need to change the emitted regop to an EXACT. */
07be1b83 6832 const char * orig_parse = RExC_parse;
72f13be8 6833 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6834#ifndef DEBUGGING
6835 PERL_UNUSED_ARG(depth);
6836#endif
72f13be8 6837
3dab1dad 6838 DEBUG_PARSE("clas");
7f6f358c
YO
6839
6840 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6841 ret = reganode(pRExC_state, ANYOF, 0);
6842
6843 if (!SIZE_ONLY)
6844 ANYOF_FLAGS(ret) = 0;
6845
9a86a77b 6846 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6847 RExC_naughty++;
6848 RExC_parse++;
6849 if (!SIZE_ONLY)
6850 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6851 }
a0d0e21e 6852
73060fc4 6853 if (SIZE_ONLY) {
830247a4 6854 RExC_size += ANYOF_SKIP;
73060fc4
JH
6855 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6856 }
936ed897 6857 else {
830247a4 6858 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6859 if (FOLD)
6860 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6861 if (LOC)
6862 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6863 ANYOF_BITMAP_ZERO(ret);
396482e1 6864 listsv = newSVpvs("# comment\n");
a0d0e21e 6865 }
b8c5462f 6866
9a86a77b
JH
6867 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6868
b938889d 6869 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6870 checkposixcc(pRExC_state);
b8c5462f 6871
f064b6ad
HS
6872 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6873 if (UCHARAT(RExC_parse) == ']')
6874 goto charclassloop;
ffc61ed2 6875
fc8cd66c 6876parseit:
9a86a77b 6877 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6878
6879 charclassloop:
6880
6881 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6882
73b437c8 6883 if (!range)
830247a4 6884 rangebegin = RExC_parse;
ffc61ed2 6885 if (UTF) {
5e12f4fb 6886 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6887 RExC_end - RExC_parse,
9f7f3913 6888 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6889 RExC_parse += numlen;
6890 }
6891 else
6892 value = UCHARAT(RExC_parse++);
7f6f358c 6893
9a86a77b
JH
6894 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6895 if (value == '[' && POSIXCC(nextvalue))
830247a4 6896 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6897 else if (value == '\\') {
ffc61ed2 6898 if (UTF) {
5e12f4fb 6899 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6900 RExC_end - RExC_parse,
9f7f3913 6901 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6902 RExC_parse += numlen;
6903 }
6904 else
6905 value = UCHARAT(RExC_parse++);
470c3474 6906 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6907 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6908 * be a problem later if we want switch on Unicode.
6909 * A similar issue a little bit later when switching on
6910 * namedclass. --jhi */
ffc61ed2 6911 switch ((I32)value) {
b8c5462f
JH
6912 case 'w': namedclass = ANYOF_ALNUM; break;
6913 case 'W': namedclass = ANYOF_NALNUM; break;
6914 case 's': namedclass = ANYOF_SPACE; break;
6915 case 'S': namedclass = ANYOF_NSPACE; break;
6916 case 'd': namedclass = ANYOF_DIGIT; break;
6917 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6918 case 'N': /* Handle \N{NAME} in class */
6919 {
6920 /* We only pay attention to the first char of
6921 multichar strings being returned. I kinda wonder
6922 if this makes sense as it does change the behaviour
6923 from earlier versions, OTOH that behaviour was broken
6924 as well. */
6925 UV v; /* value is register so we cant & it /grrr */
6926 if (reg_namedseq(pRExC_state, &v)) {
6927 goto parseit;
6928 }
6929 value= v;
6930 }
6931 break;
ffc61ed2
JH
6932 case 'p':
6933 case 'P':
3dab1dad
YO
6934 {
6935 char *e;
af6f566e 6936 if (RExC_parse >= RExC_end)
2a4859cd 6937 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6938 if (*RExC_parse == '{') {
1df70142 6939 const U8 c = (U8)value;
ffc61ed2
JH
6940 e = strchr(RExC_parse++, '}');
6941 if (!e)
0da60cf5 6942 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6943 while (isSPACE(UCHARAT(RExC_parse)))
6944 RExC_parse++;
6945 if (e == RExC_parse)
0da60cf5 6946 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6947 n = e - RExC_parse;
ab13f0c7
JH
6948 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6949 n--;
ffc61ed2
JH
6950 }
6951 else {
6952 e = RExC_parse;
6953 n = 1;
6954 }
6955 if (!SIZE_ONLY) {
ab13f0c7
JH
6956 if (UCHARAT(RExC_parse) == '^') {
6957 RExC_parse++;
6958 n--;
6959 value = value == 'p' ? 'P' : 'p'; /* toggle */
6960 while (isSPACE(UCHARAT(RExC_parse))) {
6961 RExC_parse++;
6962 n--;
6963 }
6964 }
097eb12c
AL
6965 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6966 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6967 }
6968 RExC_parse = e + 1;
6969 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6970 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6971 }
f81125e2 6972 break;
b8c5462f
JH
6973 case 'n': value = '\n'; break;
6974 case 'r': value = '\r'; break;
6975 case 't': value = '\t'; break;
6976 case 'f': value = '\f'; break;
6977 case 'b': value = '\b'; break;
c7f1f016
NIS
6978 case 'e': value = ASCII_TO_NATIVE('\033');break;
6979 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6980 case 'x':
ffc61ed2 6981 if (*RExC_parse == '{') {
a4c04bdc
NC
6982 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6983 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6984 char * const e = strchr(RExC_parse++, '}');
b81d288d 6985 if (!e)
ffc61ed2 6986 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6987
6988 numlen = e - RExC_parse;
6989 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6990 RExC_parse = e + 1;
6991 }
6992 else {
a4c04bdc 6993 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6994 numlen = 2;
6995 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6996 RExC_parse += numlen;
6997 }
9e08bc66
TS
6998 if (PL_encoding && value < 0x100)
6999 goto recode_encoding;
b8c5462f
JH
7000 break;
7001 case 'c':
830247a4 7002 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7003 value = toCTRL(value);
7004 break;
7005 case '0': case '1': case '2': case '3': case '4':
7006 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7007 {
7008 I32 flags = 0;
7009 numlen = 3;
7010 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7011 RExC_parse += numlen;
7012 if (PL_encoding && value < 0x100)
7013 goto recode_encoding;
7014 break;
7015 }
7016 recode_encoding:
7017 {
7018 SV* enc = PL_encoding;
7019 value = reg_recode((const char)(U8)value, &enc);
7020 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7021 vWARN(RExC_parse,
7022 "Invalid escape in the specified encoding");
7023 break;
7024 }
1028017a 7025 default:
041457d9 7026 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7027 vWARN2(RExC_parse,
7028 "Unrecognized escape \\%c in character class passed through",
7029 (int)value);
1028017a 7030 break;
b8c5462f 7031 }
ffc61ed2 7032 } /* end of \blah */
1b2d223b
JH
7033#ifdef EBCDIC
7034 else
7035 literal_endpoint++;
7036#endif
ffc61ed2
JH
7037
7038 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7039
7040 if (!SIZE_ONLY && !need_class)
936ed897 7041 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7042
936ed897 7043 need_class = 1;
ffc61ed2
JH
7044
7045 /* a bad range like a-\d, a-[:digit:] ? */
7046 if (range) {
73b437c8 7047 if (!SIZE_ONLY) {
afd78fd5 7048 if (ckWARN(WARN_REGEXP)) {
097eb12c 7049 const int w =
afd78fd5
JH
7050 RExC_parse >= rangebegin ?
7051 RExC_parse - rangebegin : 0;
830247a4 7052 vWARN4(RExC_parse,
b45f050a 7053 "False [] range \"%*.*s\"",
097eb12c 7054 w, w, rangebegin);
afd78fd5 7055 }
3568d838
JH
7056 if (prevvalue < 256) {
7057 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7058 ANYOF_BITMAP_SET(ret, '-');
7059 }
7060 else {
7061 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7062 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7063 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7064 }
b8c5462f 7065 }
ffc61ed2
JH
7066
7067 range = 0; /* this was not a true range */
73b437c8 7068 }
ffc61ed2 7069
73b437c8 7070 if (!SIZE_ONLY) {
c49a72a9
NC
7071 const char *what = NULL;
7072 char yesno = 0;
7073
3568d838
JH
7074 if (namedclass > OOB_NAMEDCLASS)
7075 optimize_invert = FALSE;
e2962f66
JH
7076 /* Possible truncation here but in some 64-bit environments
7077 * the compiler gets heartburn about switch on 64-bit values.
7078 * A similar issue a little earlier when switching on value.
98f323fa 7079 * --jhi */
e2962f66 7080 switch ((I32)namedclass) {
73b437c8
JH
7081 case ANYOF_ALNUM:
7082 if (LOC)
936ed897 7083 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7084 else {
7085 for (value = 0; value < 256; value++)
7086 if (isALNUM(value))
936ed897 7087 ANYOF_BITMAP_SET(ret, value);
73b437c8 7088 }
c49a72a9
NC
7089 yesno = '+';
7090 what = "Word";
73b437c8
JH
7091 break;
7092 case ANYOF_NALNUM:
7093 if (LOC)
936ed897 7094 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7095 else {
7096 for (value = 0; value < 256; value++)
7097 if (!isALNUM(value))
936ed897 7098 ANYOF_BITMAP_SET(ret, value);
73b437c8 7099 }
c49a72a9
NC
7100 yesno = '!';
7101 what = "Word";
73b437c8 7102 break;
ffc61ed2 7103 case ANYOF_ALNUMC:
73b437c8 7104 if (LOC)
ffc61ed2 7105 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7106 else {
7107 for (value = 0; value < 256; value++)
ffc61ed2 7108 if (isALNUMC(value))
936ed897 7109 ANYOF_BITMAP_SET(ret, value);
73b437c8 7110 }
c49a72a9
NC
7111 yesno = '+';
7112 what = "Alnum";
73b437c8
JH
7113 break;
7114 case ANYOF_NALNUMC:
7115 if (LOC)
936ed897 7116 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7117 else {
7118 for (value = 0; value < 256; value++)
7119 if (!isALNUMC(value))
936ed897 7120 ANYOF_BITMAP_SET(ret, value);
73b437c8 7121 }
c49a72a9
NC
7122 yesno = '!';
7123 what = "Alnum";
73b437c8
JH
7124 break;
7125 case ANYOF_ALPHA:
7126 if (LOC)
936ed897 7127 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7128 else {
7129 for (value = 0; value < 256; value++)
7130 if (isALPHA(value))
936ed897 7131 ANYOF_BITMAP_SET(ret, value);
73b437c8 7132 }
c49a72a9
NC
7133 yesno = '+';
7134 what = "Alpha";
73b437c8
JH
7135 break;
7136 case ANYOF_NALPHA:
7137 if (LOC)
936ed897 7138 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7139 else {
7140 for (value = 0; value < 256; value++)
7141 if (!isALPHA(value))
936ed897 7142 ANYOF_BITMAP_SET(ret, value);
73b437c8 7143 }
c49a72a9
NC
7144 yesno = '!';
7145 what = "Alpha";
73b437c8
JH
7146 break;
7147 case ANYOF_ASCII:
7148 if (LOC)
936ed897 7149 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7150 else {
c7f1f016 7151#ifndef EBCDIC
1ba5c669
JH
7152 for (value = 0; value < 128; value++)
7153 ANYOF_BITMAP_SET(ret, value);
7154#else /* EBCDIC */
ffbc6a93 7155 for (value = 0; value < 256; value++) {
3a3c4447
JH
7156 if (isASCII(value))
7157 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7158 }
1ba5c669 7159#endif /* EBCDIC */
73b437c8 7160 }
c49a72a9
NC
7161 yesno = '+';
7162 what = "ASCII";
73b437c8
JH
7163 break;
7164 case ANYOF_NASCII:
7165 if (LOC)
936ed897 7166 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7167 else {
c7f1f016 7168#ifndef EBCDIC
1ba5c669
JH
7169 for (value = 128; value < 256; value++)
7170 ANYOF_BITMAP_SET(ret, value);
7171#else /* EBCDIC */
ffbc6a93 7172 for (value = 0; value < 256; value++) {
3a3c4447
JH
7173 if (!isASCII(value))
7174 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7175 }
1ba5c669 7176#endif /* EBCDIC */
73b437c8 7177 }
c49a72a9
NC
7178 yesno = '!';
7179 what = "ASCII";
73b437c8 7180 break;
aaa51d5e
JF
7181 case ANYOF_BLANK:
7182 if (LOC)
7183 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7184 else {
7185 for (value = 0; value < 256; value++)
7186 if (isBLANK(value))
7187 ANYOF_BITMAP_SET(ret, value);
7188 }
c49a72a9
NC
7189 yesno = '+';
7190 what = "Blank";
aaa51d5e
JF
7191 break;
7192 case ANYOF_NBLANK:
7193 if (LOC)
7194 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7195 else {
7196 for (value = 0; value < 256; value++)
7197 if (!isBLANK(value))
7198 ANYOF_BITMAP_SET(ret, value);
7199 }
c49a72a9
NC
7200 yesno = '!';
7201 what = "Blank";
aaa51d5e 7202 break;
73b437c8
JH
7203 case ANYOF_CNTRL:
7204 if (LOC)
936ed897 7205 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7206 else {
7207 for (value = 0; value < 256; value++)
7208 if (isCNTRL(value))
936ed897 7209 ANYOF_BITMAP_SET(ret, value);
73b437c8 7210 }
c49a72a9
NC
7211 yesno = '+';
7212 what = "Cntrl";
73b437c8
JH
7213 break;
7214 case ANYOF_NCNTRL:
7215 if (LOC)
936ed897 7216 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7217 else {
7218 for (value = 0; value < 256; value++)
7219 if (!isCNTRL(value))
936ed897 7220 ANYOF_BITMAP_SET(ret, value);
73b437c8 7221 }
c49a72a9
NC
7222 yesno = '!';
7223 what = "Cntrl";
ffc61ed2
JH
7224 break;
7225 case ANYOF_DIGIT:
7226 if (LOC)
7227 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7228 else {
7229 /* consecutive digits assumed */
7230 for (value = '0'; value <= '9'; value++)
7231 ANYOF_BITMAP_SET(ret, value);
7232 }
c49a72a9
NC
7233 yesno = '+';
7234 what = "Digit";
ffc61ed2
JH
7235 break;
7236 case ANYOF_NDIGIT:
7237 if (LOC)
7238 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7239 else {
7240 /* consecutive digits assumed */
7241 for (value = 0; value < '0'; value++)
7242 ANYOF_BITMAP_SET(ret, value);
7243 for (value = '9' + 1; value < 256; value++)
7244 ANYOF_BITMAP_SET(ret, value);
7245 }
c49a72a9
NC
7246 yesno = '!';
7247 what = "Digit";
73b437c8
JH
7248 break;
7249 case ANYOF_GRAPH:
7250 if (LOC)
936ed897 7251 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7252 else {
7253 for (value = 0; value < 256; value++)
7254 if (isGRAPH(value))
936ed897 7255 ANYOF_BITMAP_SET(ret, value);
73b437c8 7256 }
c49a72a9
NC
7257 yesno = '+';
7258 what = "Graph";
73b437c8
JH
7259 break;
7260 case ANYOF_NGRAPH:
7261 if (LOC)
936ed897 7262 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7263 else {
7264 for (value = 0; value < 256; value++)
7265 if (!isGRAPH(value))
936ed897 7266 ANYOF_BITMAP_SET(ret, value);
73b437c8 7267 }
c49a72a9
NC
7268 yesno = '!';
7269 what = "Graph";
73b437c8
JH
7270 break;
7271 case ANYOF_LOWER:
7272 if (LOC)
936ed897 7273 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7274 else {
7275 for (value = 0; value < 256; value++)
7276 if (isLOWER(value))
936ed897 7277 ANYOF_BITMAP_SET(ret, value);
73b437c8 7278 }
c49a72a9
NC
7279 yesno = '+';
7280 what = "Lower";
73b437c8
JH
7281 break;
7282 case ANYOF_NLOWER:
7283 if (LOC)
936ed897 7284 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7285 else {
7286 for (value = 0; value < 256; value++)
7287 if (!isLOWER(value))
936ed897 7288 ANYOF_BITMAP_SET(ret, value);
73b437c8 7289 }
c49a72a9
NC
7290 yesno = '!';
7291 what = "Lower";
73b437c8
JH
7292 break;
7293 case ANYOF_PRINT:
7294 if (LOC)
936ed897 7295 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7296 else {
7297 for (value = 0; value < 256; value++)
7298 if (isPRINT(value))
936ed897 7299 ANYOF_BITMAP_SET(ret, value);
73b437c8 7300 }
c49a72a9
NC
7301 yesno = '+';
7302 what = "Print";
73b437c8
JH
7303 break;
7304 case ANYOF_NPRINT:
7305 if (LOC)
936ed897 7306 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7307 else {
7308 for (value = 0; value < 256; value++)
7309 if (!isPRINT(value))
936ed897 7310 ANYOF_BITMAP_SET(ret, value);
73b437c8 7311 }
c49a72a9
NC
7312 yesno = '!';
7313 what = "Print";
73b437c8 7314 break;
aaa51d5e
JF
7315 case ANYOF_PSXSPC:
7316 if (LOC)
7317 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7318 else {
7319 for (value = 0; value < 256; value++)
7320 if (isPSXSPC(value))
7321 ANYOF_BITMAP_SET(ret, value);
7322 }
c49a72a9
NC
7323 yesno = '+';
7324 what = "Space";
aaa51d5e
JF
7325 break;
7326 case ANYOF_NPSXSPC:
7327 if (LOC)
7328 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7329 else {
7330 for (value = 0; value < 256; value++)
7331 if (!isPSXSPC(value))
7332 ANYOF_BITMAP_SET(ret, value);
7333 }
c49a72a9
NC
7334 yesno = '!';
7335 what = "Space";
aaa51d5e 7336 break;
73b437c8
JH
7337 case ANYOF_PUNCT:
7338 if (LOC)
936ed897 7339 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7340 else {
7341 for (value = 0; value < 256; value++)
7342 if (isPUNCT(value))
936ed897 7343 ANYOF_BITMAP_SET(ret, value);
73b437c8 7344 }
c49a72a9
NC
7345 yesno = '+';
7346 what = "Punct";
73b437c8
JH
7347 break;
7348 case ANYOF_NPUNCT:
7349 if (LOC)
936ed897 7350 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7351 else {
7352 for (value = 0; value < 256; value++)
7353 if (!isPUNCT(value))
936ed897 7354 ANYOF_BITMAP_SET(ret, value);
73b437c8 7355 }
c49a72a9
NC
7356 yesno = '!';
7357 what = "Punct";
ffc61ed2
JH
7358 break;
7359 case ANYOF_SPACE:
7360 if (LOC)
7361 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7362 else {
7363 for (value = 0; value < 256; value++)
7364 if (isSPACE(value))
7365 ANYOF_BITMAP_SET(ret, value);
7366 }
c49a72a9
NC
7367 yesno = '+';
7368 what = "SpacePerl";
ffc61ed2
JH
7369 break;
7370 case ANYOF_NSPACE:
7371 if (LOC)
7372 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7373 else {
7374 for (value = 0; value < 256; value++)
7375 if (!isSPACE(value))
7376 ANYOF_BITMAP_SET(ret, value);
7377 }
c49a72a9
NC
7378 yesno = '!';
7379 what = "SpacePerl";
73b437c8
JH
7380 break;
7381 case ANYOF_UPPER:
7382 if (LOC)
936ed897 7383 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7384 else {
7385 for (value = 0; value < 256; value++)
7386 if (isUPPER(value))
936ed897 7387 ANYOF_BITMAP_SET(ret, value);
73b437c8 7388 }
c49a72a9
NC
7389 yesno = '+';
7390 what = "Upper";
73b437c8
JH
7391 break;
7392 case ANYOF_NUPPER:
7393 if (LOC)
936ed897 7394 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7395 else {
7396 for (value = 0; value < 256; value++)
7397 if (!isUPPER(value))
936ed897 7398 ANYOF_BITMAP_SET(ret, value);
73b437c8 7399 }
c49a72a9
NC
7400 yesno = '!';
7401 what = "Upper";
73b437c8
JH
7402 break;
7403 case ANYOF_XDIGIT:
7404 if (LOC)
936ed897 7405 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7406 else {
7407 for (value = 0; value < 256; value++)
7408 if (isXDIGIT(value))
936ed897 7409 ANYOF_BITMAP_SET(ret, value);
73b437c8 7410 }
c49a72a9
NC
7411 yesno = '+';
7412 what = "XDigit";
73b437c8
JH
7413 break;
7414 case ANYOF_NXDIGIT:
7415 if (LOC)
936ed897 7416 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7417 else {
7418 for (value = 0; value < 256; value++)
7419 if (!isXDIGIT(value))
936ed897 7420 ANYOF_BITMAP_SET(ret, value);
73b437c8 7421 }
c49a72a9
NC
7422 yesno = '!';
7423 what = "XDigit";
73b437c8 7424 break;
f81125e2
JP
7425 case ANYOF_MAX:
7426 /* this is to handle \p and \P */
7427 break;
73b437c8 7428 default:
b45f050a 7429 vFAIL("Invalid [::] class");
73b437c8 7430 break;
b8c5462f 7431 }
c49a72a9
NC
7432 if (what) {
7433 /* Strings such as "+utf8::isWord\n" */
7434 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7435 }
b8c5462f 7436 if (LOC)
936ed897 7437 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7438 continue;
a0d0e21e 7439 }
ffc61ed2
JH
7440 } /* end of namedclass \blah */
7441
a0d0e21e 7442 if (range) {
eb160463 7443 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7444 const int w = RExC_parse - rangebegin;
7445 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7446 range = 0; /* not a valid range */
73b437c8 7447 }
a0d0e21e
LW
7448 }
7449 else {
3568d838 7450 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7451 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7452 RExC_parse[1] != ']') {
7453 RExC_parse++;
ffc61ed2
JH
7454
7455 /* a bad range like \w-, [:word:]- ? */
7456 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7457 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7458 const int w =
afd78fd5
JH
7459 RExC_parse >= rangebegin ?
7460 RExC_parse - rangebegin : 0;
830247a4 7461 vWARN4(RExC_parse,
b45f050a 7462 "False [] range \"%*.*s\"",
097eb12c 7463 w, w, rangebegin);
afd78fd5 7464 }
73b437c8 7465 if (!SIZE_ONLY)
936ed897 7466 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7467 } else
ffc61ed2
JH
7468 range = 1; /* yeah, it's a range! */
7469 continue; /* but do it the next time */
a0d0e21e 7470 }
a687059c 7471 }
ffc61ed2 7472
93733859 7473 /* now is the next time */
07be1b83 7474 /*stored += (value - prevvalue + 1);*/
ae5c130c 7475 if (!SIZE_ONLY) {
3568d838 7476 if (prevvalue < 256) {
1df70142 7477 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7478 IV i;
3568d838 7479#ifdef EBCDIC
1b2d223b
JH
7480 /* In EBCDIC [\x89-\x91] should include
7481 * the \x8e but [i-j] should not. */
7482 if (literal_endpoint == 2 &&
7483 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7484 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7485 {
3568d838
JH
7486 if (isLOWER(prevvalue)) {
7487 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7488 if (isLOWER(i))
7489 ANYOF_BITMAP_SET(ret, i);
7490 } else {
3568d838 7491 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7492 if (isUPPER(i))
7493 ANYOF_BITMAP_SET(ret, i);
7494 }
8ada0baa 7495 }
ffc61ed2 7496 else
8ada0baa 7497#endif
07be1b83
YO
7498 for (i = prevvalue; i <= ceilvalue; i++) {
7499 if (!ANYOF_BITMAP_TEST(ret,i)) {
7500 stored++;
7501 ANYOF_BITMAP_SET(ret, i);
7502 }
7503 }
3568d838 7504 }
a5961de5 7505 if (value > 255 || UTF) {
1df70142
AL
7506 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7507 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7508 stored+=2; /* can't optimize this class */
ffc61ed2 7509 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7510 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7511 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7512 prevnatvalue, natvalue);
7513 }
7514 else if (prevnatvalue == natvalue) {
7515 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7516 if (FOLD) {
89ebb4a3 7517 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7518 STRLEN foldlen;
1df70142 7519 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7520
e294cc5d
JH
7521#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7522 if (RExC_precomp[0] == ':' &&
7523 RExC_precomp[1] == '[' &&
7524 (f == 0xDF || f == 0x92)) {
7525 f = NATIVE_TO_UNI(f);
7526 }
7527#endif
c840d2a2
JH
7528 /* If folding and foldable and a single
7529 * character, insert also the folded version
7530 * to the charclass. */
9e55ce06 7531 if (f != value) {
e294cc5d
JH
7532#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7533 if ((RExC_precomp[0] == ':' &&
7534 RExC_precomp[1] == '[' &&
7535 (f == 0xA2 &&
7536 (value == 0xFB05 || value == 0xFB06))) ?
7537 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7538 foldlen == (STRLEN)UNISKIP(f) )
7539#else
eb160463 7540 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7541#endif
9e55ce06
JH
7542 Perl_sv_catpvf(aTHX_ listsv,
7543 "%04"UVxf"\n", f);
7544 else {
7545 /* Any multicharacter foldings
7546 * require the following transform:
7547 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7548 * where E folds into "pq" and F folds
7549 * into "rst", all other characters
7550 * fold to single characters. We save
7551 * away these multicharacter foldings,
7552 * to be later saved as part of the
7553 * additional "s" data. */
7554 SV *sv;
7555
7556 if (!unicode_alternate)
7557 unicode_alternate = newAV();
7558 sv = newSVpvn((char*)foldbuf, foldlen);
7559 SvUTF8_on(sv);
7560 av_push(unicode_alternate, sv);
7561 }
7562 }
254ba52a 7563
60a8b682
JH
7564 /* If folding and the value is one of the Greek
7565 * sigmas insert a few more sigmas to make the
7566 * folding rules of the sigmas to work right.
7567 * Note that not all the possible combinations
7568 * are handled here: some of them are handled
9e55ce06
JH
7569 * by the standard folding rules, and some of
7570 * them (literal or EXACTF cases) are handled
7571 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7572 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7573 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7574 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7575 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7576 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7577 }
7578 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7579 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7580 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7581 }
7582 }
ffc61ed2 7583 }
1b2d223b
JH
7584#ifdef EBCDIC
7585 literal_endpoint = 0;
7586#endif
8ada0baa 7587 }
ffc61ed2
JH
7588
7589 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7590 }
ffc61ed2 7591
936ed897 7592 if (need_class) {
4f66b38d 7593 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7594 if (SIZE_ONLY)
830247a4 7595 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7596 else
830247a4 7597 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7598 }
ffc61ed2 7599
7f6f358c
YO
7600
7601 if (SIZE_ONLY)
7602 return ret;
7603 /****** !SIZE_ONLY AFTER HERE *********/
7604
7605 if( stored == 1 && value < 256
7606 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7607 ) {
7608 /* optimize single char class to an EXACT node
7609 but *only* when its not a UTF/high char */
07be1b83
YO
7610 const char * cur_parse= RExC_parse;
7611 RExC_emit = (regnode *)orig_emit;
7612 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7613 ret = reg_node(pRExC_state,
7614 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7615 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7616 *STRING(ret)= (char)value;
7617 STR_LEN(ret)= 1;
7618 RExC_emit += STR_SZ(1);
7619 return ret;
7620 }
ae5c130c 7621 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7622 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7623 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7624 ) {
a0ed51b3 7625 for (value = 0; value < 256; ++value) {
936ed897 7626 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7627 UV fold = PL_fold[value];
ffc61ed2
JH
7628
7629 if (fold != value)
7630 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7631 }
7632 }
936ed897 7633 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7634 }
ffc61ed2 7635
ae5c130c 7636 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7637 if (optimize_invert &&
ffc61ed2
JH
7638 /* If the only flag is inversion. */
7639 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7640 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7641 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7642 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7643 }
7f6f358c 7644 {
097eb12c 7645 AV * const av = newAV();
ffc61ed2 7646 SV *rv;
9e55ce06 7647 /* The 0th element stores the character class description
6a0407ee 7648 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7649 * to initialize the appropriate swash (which gets stored in
7650 * the 1st element), and also useful for dumping the regnode.
7651 * The 2nd element stores the multicharacter foldings,
6a0407ee 7652 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7653 av_store(av, 0, listsv);
7654 av_store(av, 1, NULL);
9e55ce06 7655 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7656 rv = newRV_noinc((SV*)av);
19860706 7657 n = add_data(pRExC_state, 1, "s");
830247a4 7658 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7659 ARG_SET(ret, n);
a0ed51b3 7660 }
a0ed51b3
LW
7661 return ret;
7662}
7663
76e3520e 7664STATIC char*
830247a4 7665S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7666{
097eb12c 7667 char* const retval = RExC_parse++;
a0d0e21e 7668
4633a7c4 7669 for (;;) {
830247a4
IZ
7670 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7671 RExC_parse[2] == '#') {
e994fd66
AE
7672 while (*RExC_parse != ')') {
7673 if (RExC_parse == RExC_end)
7674 FAIL("Sequence (?#... not terminated");
830247a4 7675 RExC_parse++;
e994fd66 7676 }
830247a4 7677 RExC_parse++;
4633a7c4
LW
7678 continue;
7679 }
e2509266 7680 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7681 if (isSPACE(*RExC_parse)) {
7682 RExC_parse++;
748a9306
LW
7683 continue;
7684 }
830247a4 7685 else if (*RExC_parse == '#') {
e994fd66
AE
7686 while (RExC_parse < RExC_end)
7687 if (*RExC_parse++ == '\n') break;
748a9306
LW
7688 continue;
7689 }
748a9306 7690 }
4633a7c4 7691 return retval;
a0d0e21e 7692 }
a687059c
LW
7693}
7694
7695/*
c277df42 7696- reg_node - emit a node
a0d0e21e 7697*/
76e3520e 7698STATIC regnode * /* Location. */
830247a4 7699S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7700{
97aff369 7701 dVAR;
c277df42 7702 register regnode *ptr;
504618e9 7703 regnode * const ret = RExC_emit;
07be1b83 7704 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7705
c277df42 7706 if (SIZE_ONLY) {
830247a4
IZ
7707 SIZE_ALIGN(RExC_size);
7708 RExC_size += 1;
a0d0e21e
LW
7709 return(ret);
7710 }
e2e6a0f1
YO
7711#ifdef DEBUGGING
7712 if (OP(RExC_emit) == 255)
7713 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7714 reg_name[op], OP(RExC_emit));
7715#endif
c277df42 7716 NODE_ALIGN_FILL(ret);
a0d0e21e 7717 ptr = ret;
c277df42 7718 FILL_ADVANCE_NODE(ptr, op);
fac92740 7719 if (RExC_offsets) { /* MJD */
07be1b83 7720 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7721 "reg_node", __LINE__,
7722 reg_name[op],
07be1b83
YO
7723 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7724 ? "Overwriting end of array!\n" : "OK",
7725 (UV)(RExC_emit - RExC_emit_start),
7726 (UV)(RExC_parse - RExC_start),
7727 (UV)RExC_offsets[0]));
ccb2c380 7728 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7729 }
07be1b83 7730
830247a4 7731 RExC_emit = ptr;
a0d0e21e 7732 return(ret);
a687059c
LW
7733}
7734
7735/*
a0d0e21e
LW
7736- reganode - emit a node with an argument
7737*/
76e3520e 7738STATIC regnode * /* Location. */
830247a4 7739S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7740{
97aff369 7741 dVAR;
c277df42 7742 register regnode *ptr;
504618e9 7743 regnode * const ret = RExC_emit;
07be1b83 7744 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7745
c277df42 7746 if (SIZE_ONLY) {
830247a4
IZ
7747 SIZE_ALIGN(RExC_size);
7748 RExC_size += 2;
6bda09f9
YO
7749 /*
7750 We can't do this:
7751
7752 assert(2==regarglen[op]+1);
7753
7754 Anything larger than this has to allocate the extra amount.
7755 If we changed this to be:
7756
7757 RExC_size += (1 + regarglen[op]);
7758
7759 then it wouldn't matter. Its not clear what side effect
7760 might come from that so its not done so far.
7761 -- dmq
7762 */
a0d0e21e
LW
7763 return(ret);
7764 }
e2e6a0f1
YO
7765#ifdef DEBUGGING
7766 if (OP(RExC_emit) == 255)
7767 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7768#endif
c277df42 7769 NODE_ALIGN_FILL(ret);
a0d0e21e 7770 ptr = ret;
c277df42 7771 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7772 if (RExC_offsets) { /* MJD */
07be1b83 7773 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7774 "reganode",
ccb2c380
MP
7775 __LINE__,
7776 reg_name[op],
07be1b83 7777 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7778 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7779 (UV)(RExC_emit - RExC_emit_start),
7780 (UV)(RExC_parse - RExC_start),
7781 (UV)RExC_offsets[0]));
ccb2c380 7782 Set_Cur_Node_Offset;
fac92740
MJD
7783 }
7784
830247a4 7785 RExC_emit = ptr;
a0d0e21e 7786 return(ret);
fe14fcc3
LW
7787}
7788
7789/*
cd439c50 7790- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7791*/
71207a34
AL
7792STATIC STRLEN
7793S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7794{
97aff369 7795 dVAR;
71207a34 7796 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7797}
7798
7799/*
a0d0e21e
LW
7800- reginsert - insert an operator in front of already-emitted operand
7801*
7802* Means relocating the operand.
7803*/
76e3520e 7804STATIC void
6bda09f9 7805S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7806{
97aff369 7807 dVAR;
c277df42
IZ
7808 register regnode *src;
7809 register regnode *dst;
7810 register regnode *place;
504618e9 7811 const int offset = regarglen[(U8)op];
6bda09f9 7812 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7813 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7814/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7815 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7816 if (SIZE_ONLY) {
6bda09f9 7817 RExC_size += size;
a0d0e21e
LW
7818 return;
7819 }
a687059c 7820
830247a4 7821 src = RExC_emit;
6bda09f9 7822 RExC_emit += size;
830247a4 7823 dst = RExC_emit;
40d049e4 7824 if (RExC_open_parens) {
6bda09f9 7825 int paren;
6d99fb9b 7826 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7827 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7828 if ( RExC_open_parens[paren] >= opnd ) {
7829 DEBUG_PARSE_FMT("open"," - %d",size);
7830 RExC_open_parens[paren] += size;
7831 } else {
7832 DEBUG_PARSE_FMT("open"," - %s","ok");
7833 }
7834 if ( RExC_close_parens[paren] >= opnd ) {
7835 DEBUG_PARSE_FMT("close"," - %d",size);
7836 RExC_close_parens[paren] += size;
7837 } else {
7838 DEBUG_PARSE_FMT("close"," - %s","ok");
7839 }
7840 }
6bda09f9 7841 }
40d049e4 7842
fac92740 7843 while (src > opnd) {
c277df42 7844 StructCopy(--src, --dst, regnode);
fac92740 7845 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7846 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7847 "reg_insert",
ccb2c380
MP
7848 __LINE__,
7849 reg_name[op],
07be1b83
YO
7850 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7851 ? "Overwriting end of array!\n" : "OK",
7852 (UV)(src - RExC_emit_start),
7853 (UV)(dst - RExC_emit_start),
7854 (UV)RExC_offsets[0]));
ccb2c380
MP
7855 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7856 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7857 }
7858 }
7859
a0d0e21e
LW
7860
7861 place = opnd; /* Op node, where operand used to be. */
fac92740 7862 if (RExC_offsets) { /* MJD */
07be1b83 7863 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7864 "reginsert",
ccb2c380
MP
7865 __LINE__,
7866 reg_name[op],
07be1b83 7867 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7868 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7869 (UV)(place - RExC_emit_start),
7870 (UV)(RExC_parse - RExC_start),
786e8c11 7871 (UV)RExC_offsets[0]));
ccb2c380 7872 Set_Node_Offset(place, RExC_parse);
45948336 7873 Set_Node_Length(place, 1);
fac92740 7874 }
c277df42
IZ
7875 src = NEXTOPER(place);
7876 FILL_ADVANCE_NODE(place, op);
7877 Zero(src, offset, regnode);
a687059c
LW
7878}
7879
7880/*
c277df42 7881- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7882- SEE ALSO: regtail_study
a0d0e21e 7883*/
097eb12c 7884/* TODO: All three parms should be const */
76e3520e 7885STATIC void
3dab1dad 7886S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7887{
97aff369 7888 dVAR;
c277df42 7889 register regnode *scan;
72f13be8 7890 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7891#ifndef DEBUGGING
7892 PERL_UNUSED_ARG(depth);
7893#endif
a0d0e21e 7894
c277df42 7895 if (SIZE_ONLY)
a0d0e21e
LW
7896 return;
7897
7898 /* Find last node. */
7899 scan = p;
7900 for (;;) {
504618e9 7901 regnode * const temp = regnext(scan);
3dab1dad
YO
7902 DEBUG_PARSE_r({
7903 SV * const mysv=sv_newmortal();
7904 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7905 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7906 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7907 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7908 (temp == NULL ? "->" : ""),
7909 (temp == NULL ? reg_name[OP(val)] : "")
7910 );
3dab1dad
YO
7911 });
7912 if (temp == NULL)
7913 break;
7914 scan = temp;
7915 }
7916
7917 if (reg_off_by_arg[OP(scan)]) {
7918 ARG_SET(scan, val - scan);
7919 }
7920 else {
7921 NEXT_OFF(scan) = val - scan;
7922 }
7923}
7924
07be1b83 7925#ifdef DEBUGGING
3dab1dad
YO
7926/*
7927- regtail_study - set the next-pointer at the end of a node chain of p to val.
7928- Look for optimizable sequences at the same time.
7929- currently only looks for EXACT chains.
07be1b83
YO
7930
7931This is expermental code. The idea is to use this routine to perform
7932in place optimizations on branches and groups as they are constructed,
7933with the long term intention of removing optimization from study_chunk so
7934that it is purely analytical.
7935
7936Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7937to control which is which.
7938
3dab1dad
YO
7939*/
7940/* TODO: All four parms should be const */
07be1b83 7941
3dab1dad
YO
7942STATIC U8
7943S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7944{
7945 dVAR;
7946 register regnode *scan;
07be1b83
YO
7947 U8 exact = PSEUDO;
7948#ifdef EXPERIMENTAL_INPLACESCAN
7949 I32 min = 0;
7950#endif
7951
3dab1dad
YO
7952 GET_RE_DEBUG_FLAGS_DECL;
7953
07be1b83 7954
3dab1dad
YO
7955 if (SIZE_ONLY)
7956 return exact;
7957
7958 /* Find last node. */
7959
7960 scan = p;
7961 for (;;) {
7962 regnode * const temp = regnext(scan);
07be1b83
YO
7963#ifdef EXPERIMENTAL_INPLACESCAN
7964 if (PL_regkind[OP(scan)] == EXACT)
7965 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7966 return EXACT;
7967#endif
3dab1dad
YO
7968 if ( exact ) {
7969 switch (OP(scan)) {
7970 case EXACT:
7971 case EXACTF:
7972 case EXACTFL:
7973 if( exact == PSEUDO )
7974 exact= OP(scan);
07be1b83
YO
7975 else if ( exact != OP(scan) )
7976 exact= 0;
3dab1dad
YO
7977 case NOTHING:
7978 break;
7979 default:
7980 exact= 0;
7981 }
7982 }
7983 DEBUG_PARSE_r({
7984 SV * const mysv=sv_newmortal();
7985 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7986 regprop(RExC_rx, mysv, scan);
eaf3ca90 7987 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7988 SvPV_nolen_const(mysv),
eaf3ca90
YO
7989 REG_NODE_NUM(scan),
7990 reg_name[exact]);
3dab1dad 7991 });
a0d0e21e
LW
7992 if (temp == NULL)
7993 break;
7994 scan = temp;
7995 }
07be1b83
YO
7996 DEBUG_PARSE_r({
7997 SV * const mysv_val=sv_newmortal();
7998 DEBUG_PARSE_MSG("");
7999 regprop(RExC_rx, mysv_val, val);
8000 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8001 SvPV_nolen_const(mysv_val),
8002 REG_NODE_NUM(val),
8003 val - scan
8004 );
8005 });
c277df42
IZ
8006 if (reg_off_by_arg[OP(scan)]) {
8007 ARG_SET(scan, val - scan);
a0ed51b3
LW
8008 }
8009 else {
c277df42
IZ
8010 NEXT_OFF(scan) = val - scan;
8011 }
3dab1dad
YO
8012
8013 return exact;
a687059c 8014}
07be1b83 8015#endif
a687059c
LW
8016
8017/*
a687059c
LW
8018 - regcurly - a little FSA that accepts {\d+,?\d*}
8019 */
79072805 8020STATIC I32
5f66b61c 8021S_regcurly(register const char *s)
a687059c
LW
8022{
8023 if (*s++ != '{')
8024 return FALSE;
f0fcb552 8025 if (!isDIGIT(*s))
a687059c 8026 return FALSE;
f0fcb552 8027 while (isDIGIT(*s))
a687059c
LW
8028 s++;
8029 if (*s == ',')
8030 s++;
f0fcb552 8031 while (isDIGIT(*s))
a687059c
LW
8032 s++;
8033 if (*s != '}')
8034 return FALSE;
8035 return TRUE;
8036}
8037
a687059c
LW
8038
8039/*
fd181c75 8040 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8041 */
8042void
097eb12c 8043Perl_regdump(pTHX_ const regexp *r)
a687059c 8044{
35ff7856 8045#ifdef DEBUGGING
97aff369 8046 dVAR;
c445ea15 8047 SV * const sv = sv_newmortal();
ab3bbdeb 8048 SV *dsv= sv_newmortal();
a687059c 8049
786e8c11 8050 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8051
8052 /* Header fields of interest. */
ab3bbdeb
YO
8053 if (r->anchored_substr) {
8054 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8055 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8056 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8057 "anchored %s%s at %"IVdf" ",
8058 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8059 (IV)r->anchored_offset);
ab3bbdeb
YO
8060 } else if (r->anchored_utf8) {
8061 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8062 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8063 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8064 "anchored utf8 %s%s at %"IVdf" ",
8065 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8066 (IV)r->anchored_offset);
ab3bbdeb
YO
8067 }
8068 if (r->float_substr) {
8069 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8070 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8071 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8072 "floating %s%s at %"IVdf"..%"UVuf" ",
8073 s, RE_SV_TAIL(r->float_substr),
7b0972df 8074 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8075 } else if (r->float_utf8) {
8076 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8077 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8078 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8079 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8080 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8081 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8082 }
33b8afdf 8083 if (r->check_substr || r->check_utf8)
b81d288d 8084 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8085 (const char *)
8086 (r->check_substr == r->float_substr
8087 && r->check_utf8 == r->float_utf8
8088 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
8089 if (r->reganch & ROPT_NOSCAN)
8090 PerlIO_printf(Perl_debug_log, " noscan");
8091 if (r->reganch & ROPT_CHECK_ALL)
8092 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8093 if (r->check_substr || r->check_utf8)
c277df42
IZ
8094 PerlIO_printf(Perl_debug_log, ") ");
8095
46fc3d4c 8096 if (r->regstclass) {
32fc9b6a 8097 regprop(r, sv, r->regstclass);
1de06328 8098 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8099 }
774d564b 8100 if (r->reganch & ROPT_ANCH) {
8101 PerlIO_printf(Perl_debug_log, "anchored");
8102 if (r->reganch & ROPT_ANCH_BOL)
8103 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
8104 if (r->reganch & ROPT_ANCH_MBOL)
8105 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
8106 if (r->reganch & ROPT_ANCH_SBOL)
8107 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 8108 if (r->reganch & ROPT_ANCH_GPOS)
8109 PerlIO_printf(Perl_debug_log, "(GPOS)");
8110 PerlIO_putc(Perl_debug_log, ' ');
8111 }
c277df42
IZ
8112 if (r->reganch & ROPT_GPOS_SEEN)
8113 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 8114 if (r->reganch & ROPT_SKIP)
760ac839 8115 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8116 if (r->reganch & ROPT_IMPLICIT)
760ac839 8117 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8118 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8119 if (r->reganch & ROPT_EVAL_SEEN)
8120 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8121 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8122#else
96a5add6 8123 PERL_UNUSED_CONTEXT;
65e66c80 8124 PERL_UNUSED_ARG(r);
17c3b450 8125#endif /* DEBUGGING */
a687059c
LW
8126}
8127
8128/*
a0d0e21e
LW
8129- regprop - printable representation of opcode
8130*/
46fc3d4c 8131void
32fc9b6a 8132Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8133{
35ff7856 8134#ifdef DEBUGGING
97aff369 8135 dVAR;
9b155405 8136 register int k;
1de06328 8137 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8138
54dc92de 8139 sv_setpvn(sv, "", 0);
b515a41d 8140
03363afd 8141 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8142 /* It would be nice to FAIL() here, but this may be called from
8143 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8144 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8145 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8146
3dab1dad 8147 k = PL_regkind[OP(o)];
9b155405 8148
2a782b5b 8149 if (k == EXACT) {
396482e1 8150 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8151 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8152 * is a crude hack but it may be the best for now since
8153 * we have no flag "this EXACTish node was UTF-8"
8154 * --jhi */
8155 const char * const s =
ddc5bc0f 8156 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8157 PL_colors[0], PL_colors[1],
8158 PERL_PV_ESCAPE_UNI_DETECT |
8159 PERL_PV_PRETTY_ELIPSES |
8160 PERL_PV_PRETTY_LTGT
8161 );
8162 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8163 } else if (k == TRIE) {
3dab1dad 8164 /* print the details of the trie in dumpuntil instead, as
4f639d21 8165 * prog->data isn't available here */
1de06328
YO
8166 const char op = OP(o);
8167 const I32 n = ARG(o);
8168 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8169 (reg_ac_data *)prog->data->data[n] :
8170 NULL;
8171 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8172 (reg_trie_data*)prog->data->data[n] :
8173 ac->trie;
8174
8175 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8176 DEBUG_TRIE_COMPILE_r(
8177 Perl_sv_catpvf(aTHX_ sv,
8178 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8179 (UV)trie->startstate,
1e2e3d02 8180 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8181 (UV)trie->wordcount,
8182 (UV)trie->minlen,
8183 (UV)trie->maxlen,
8184 (UV)TRIE_CHARCOUNT(trie),
8185 (UV)trie->uniquecharcount
8186 )
8187 );
8188 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8189 int i;
8190 int rangestart = -1;
f46cb337 8191 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8192 Perl_sv_catpvf(aTHX_ sv, "[");
8193 for (i = 0; i <= 256; i++) {
8194 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8195 if (rangestart == -1)
8196 rangestart = i;
8197 } else if (rangestart != -1) {
8198 if (i <= rangestart + 3)
8199 for (; rangestart < i; rangestart++)
8200 put_byte(sv, rangestart);
8201 else {
8202 put_byte(sv, rangestart);
8203 sv_catpvs(sv, "-");
8204 put_byte(sv, i - 1);
8205 }
8206 rangestart = -1;
8207 }
8208 }
8209 Perl_sv_catpvf(aTHX_ sv, "]");
8210 }
8211
a3621e74 8212 } else if (k == CURLY) {
cb434fcc 8213 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8214 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8215 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8216 }
2c2d71f5
JH
8217 else if (k == WHILEM && o->flags) /* Ordinal/of */
8218 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8219 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8220 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8221 else if (k == GOSUB)
6bda09f9 8222 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8223 else if (k == VERB) {
8224 if (!o->flags)
8225 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8226 (SV*)prog->data->data[ ARG( o ) ]);
8227 } else if (k == LOGICAL)
04ebc1ab 8228 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8229 else if (k == ANYOF) {
8230 int i, rangestart = -1;
2d03de9c 8231 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8232
8233 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8234 static const char * const anyofs[] = {
653099ff
GS
8235 "\\w",
8236 "\\W",
8237 "\\s",
8238 "\\S",
8239 "\\d",
8240 "\\D",
8241 "[:alnum:]",
8242 "[:^alnum:]",
8243 "[:alpha:]",
8244 "[:^alpha:]",
8245 "[:ascii:]",
8246 "[:^ascii:]",
8247 "[:ctrl:]",
8248 "[:^ctrl:]",
8249 "[:graph:]",
8250 "[:^graph:]",
8251 "[:lower:]",
8252 "[:^lower:]",
8253 "[:print:]",
8254 "[:^print:]",
8255 "[:punct:]",
8256 "[:^punct:]",
8257 "[:upper:]",
aaa51d5e 8258 "[:^upper:]",
653099ff 8259 "[:xdigit:]",
aaa51d5e
JF
8260 "[:^xdigit:]",
8261 "[:space:]",
8262 "[:^space:]",
8263 "[:blank:]",
8264 "[:^blank:]"
653099ff
GS
8265 };
8266
19860706 8267 if (flags & ANYOF_LOCALE)
396482e1 8268 sv_catpvs(sv, "{loc}");
19860706 8269 if (flags & ANYOF_FOLD)
396482e1 8270 sv_catpvs(sv, "{i}");
653099ff 8271 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8272 if (flags & ANYOF_INVERT)
396482e1 8273 sv_catpvs(sv, "^");
ffc61ed2
JH
8274 for (i = 0; i <= 256; i++) {
8275 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8276 if (rangestart == -1)
8277 rangestart = i;
8278 } else if (rangestart != -1) {
8279 if (i <= rangestart + 3)
8280 for (; rangestart < i; rangestart++)
653099ff 8281 put_byte(sv, rangestart);
ffc61ed2
JH
8282 else {
8283 put_byte(sv, rangestart);
396482e1 8284 sv_catpvs(sv, "-");
ffc61ed2 8285 put_byte(sv, i - 1);
653099ff 8286 }
ffc61ed2 8287 rangestart = -1;
653099ff 8288 }
847a199f 8289 }
ffc61ed2
JH
8290
8291 if (o->flags & ANYOF_CLASS)
bb7a0f54 8292 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8293 if (ANYOF_CLASS_TEST(o,i))
8294 sv_catpv(sv, anyofs[i]);
8295
8296 if (flags & ANYOF_UNICODE)
396482e1 8297 sv_catpvs(sv, "{unicode}");
1aa99e6b 8298 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8299 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8300
8301 {
8302 SV *lv;
32fc9b6a 8303 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8304
ffc61ed2
JH
8305 if (lv) {
8306 if (sw) {
89ebb4a3 8307 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8308
ffc61ed2 8309 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8310 uvchr_to_utf8(s, i);
ffc61ed2 8311
3568d838 8312 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8313 if (rangestart == -1)
8314 rangestart = i;
8315 } else if (rangestart != -1) {
ffc61ed2
JH
8316 if (i <= rangestart + 3)
8317 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8318 const U8 * const e = uvchr_to_utf8(s,rangestart);
8319 U8 *p;
8320 for(p = s; p < e; p++)
ffc61ed2
JH
8321 put_byte(sv, *p);
8322 }
8323 else {
2d03de9c
AL
8324 const U8 *e = uvchr_to_utf8(s,rangestart);
8325 U8 *p;
8326 for (p = s; p < e; p++)
ffc61ed2 8327 put_byte(sv, *p);
396482e1 8328 sv_catpvs(sv, "-");
2d03de9c
AL
8329 e = uvchr_to_utf8(s, i-1);
8330 for (p = s; p < e; p++)
1df70142 8331 put_byte(sv, *p);
ffc61ed2
JH
8332 }
8333 rangestart = -1;
8334 }
19860706 8335 }
ffc61ed2 8336
396482e1 8337 sv_catpvs(sv, "..."); /* et cetera */
19860706 8338 }
fde631ed 8339
ffc61ed2 8340 {
2e0de35c 8341 char *s = savesvpv(lv);
c445ea15 8342 char * const origs = s;
b81d288d 8343
3dab1dad
YO
8344 while (*s && *s != '\n')
8345 s++;
b81d288d 8346
ffc61ed2 8347 if (*s == '\n') {
2d03de9c 8348 const char * const t = ++s;
ffc61ed2
JH
8349
8350 while (*s) {
8351 if (*s == '\n')
8352 *s = ' ';
8353 s++;
8354 }
8355 if (s[-1] == ' ')
8356 s[-1] = 0;
8357
8358 sv_catpv(sv, t);
fde631ed 8359 }
b81d288d 8360
ffc61ed2 8361 Safefree(origs);
fde631ed
JH
8362 }
8363 }
653099ff 8364 }
ffc61ed2 8365
653099ff
GS
8366 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8367 }
9b155405 8368 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8369 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8370#else
96a5add6 8371 PERL_UNUSED_CONTEXT;
65e66c80
SP
8372 PERL_UNUSED_ARG(sv);
8373 PERL_UNUSED_ARG(o);
f9049ba1 8374 PERL_UNUSED_ARG(prog);
17c3b450 8375#endif /* DEBUGGING */
35ff7856 8376}
a687059c 8377
cad2e5aa
JH
8378SV *
8379Perl_re_intuit_string(pTHX_ regexp *prog)
8380{ /* Assume that RE_INTUIT is set */
97aff369 8381 dVAR;
a3621e74 8382 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8383 PERL_UNUSED_CONTEXT;
8384
a3621e74 8385 DEBUG_COMPILE_r(
cfd0369c 8386 {
2d03de9c 8387 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8388 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8389
8390 if (!PL_colorset) reginitcolors();
8391 PerlIO_printf(Perl_debug_log,
a0288114 8392 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8393 PL_colors[4],
8394 prog->check_substr ? "" : "utf8 ",
8395 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8396 s,
8397 PL_colors[1],
8398 (strlen(s) > 60 ? "..." : ""));
8399 } );
8400
33b8afdf 8401 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8402}
8403
84da74a7
YO
8404/*
8405 pregfree - free a regexp
8406
8407 See regdupe below if you change anything here.
8408*/
8409
2b69d0c2 8410void
864dbfa3 8411Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8412{
27da23d5 8413 dVAR;
0df25f3d 8414
fc32ee4a 8415 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8416
7821416a
IZ
8417 if (!r || (--r->refcnt > 0))
8418 return;
ab3bbdeb 8419 DEBUG_COMPILE_r({
0df25f3d
YO
8420 if (!PL_colorset)
8421 reginitcolors();
5c9f2f80
YO
8422 {
8423 SV *dsv= sv_newmortal();
ab3bbdeb
YO
8424 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8425 dsv, r->precomp, r->prelen, 60);
8426 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8427 PL_colors[4],PL_colors[5],s);
8428 }
9e55ce06 8429 });
cad2e5aa 8430
43c5f42d
NC
8431 /* gcov results gave these as non-null 100% of the time, so there's no
8432 optimisation in checking them before calling Safefree */
8433 Safefree(r->precomp);
8434 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8435 RX_MATCH_COPY_FREE(r);
f8c7b90f 8436#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8437 if (r->saved_copy)
8438 SvREFCNT_dec(r->saved_copy);
8439#endif
a193d654
GS
8440 if (r->substrs) {
8441 if (r->anchored_substr)
8442 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8443 if (r->anchored_utf8)
8444 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8445 if (r->float_substr)
8446 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8447 if (r->float_utf8)
8448 SvREFCNT_dec(r->float_utf8);
2779dcf1 8449 Safefree(r->substrs);
a193d654 8450 }
81714fb9
YO
8451 if (r->paren_names)
8452 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8453 if (r->data) {
8454 int n = r->data->count;
f3548bdc
DM
8455 PAD* new_comppad = NULL;
8456 PAD* old_comppad;
4026c95a 8457 PADOFFSET refcnt;
dfad63ad 8458
c277df42 8459 while (--n >= 0) {
261faec3 8460 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8461 switch (r->data->what[n]) {
8462 case 's':
81714fb9 8463 case 'S':
c277df42
IZ
8464 SvREFCNT_dec((SV*)r->data->data[n]);
8465 break;
653099ff
GS
8466 case 'f':
8467 Safefree(r->data->data[n]);
8468 break;
dfad63ad
HS
8469 case 'p':
8470 new_comppad = (AV*)r->data->data[n];
8471 break;
c277df42 8472 case 'o':
dfad63ad 8473 if (new_comppad == NULL)
cea2e8a9 8474 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8475 PAD_SAVE_LOCAL(old_comppad,
8476 /* Watch out for global destruction's random ordering. */
c445ea15 8477 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8478 );
b34c0dd4 8479 OP_REFCNT_LOCK;
4026c95a
SH
8480 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8481 OP_REFCNT_UNLOCK;
8482 if (!refcnt)
9b978d73 8483 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8484
f3548bdc 8485 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8486 SvREFCNT_dec((SV*)new_comppad);
8487 new_comppad = NULL;
c277df42
IZ
8488 break;
8489 case 'n':
9e55ce06 8490 break;
07be1b83 8491 case 'T':
be8e71aa
YO
8492 { /* Aho Corasick add-on structure for a trie node.
8493 Used in stclass optimization only */
07be1b83
YO
8494 U32 refcount;
8495 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8496 OP_REFCNT_LOCK;
8497 refcount = --aho->refcount;
8498 OP_REFCNT_UNLOCK;
8499 if ( !refcount ) {
8500 Safefree(aho->states);
8501 Safefree(aho->fail);
8502 aho->trie=NULL; /* not necessary to free this as it is
8503 handled by the 't' case */
8504 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8505 Safefree(r->regstclass);
07be1b83
YO
8506 }
8507 }
8508 break;
a3621e74 8509 case 't':
07be1b83 8510 {
be8e71aa 8511 /* trie structure. */
07be1b83
YO
8512 U32 refcount;
8513 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8514 OP_REFCNT_LOCK;
8515 refcount = --trie->refcount;
8516 OP_REFCNT_UNLOCK;
8517 if ( !refcount ) {
8518 Safefree(trie->charmap);
8519 if (trie->widecharmap)
8520 SvREFCNT_dec((SV*)trie->widecharmap);
8521 Safefree(trie->states);
8522 Safefree(trie->trans);
8523 if (trie->bitmap)
8524 Safefree(trie->bitmap);
8525 if (trie->wordlen)
8526 Safefree(trie->wordlen);
786e8c11
YO
8527 if (trie->jump)
8528 Safefree(trie->jump);
8529 if (trie->nextword)
8530 Safefree(trie->nextword);
a3621e74 8531#ifdef DEBUGGING
5c9f2f80
YO
8532 if (trie->words)
8533 SvREFCNT_dec((SV*)trie->words);
8534 if (trie->revcharmap)
8535 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8536#endif
07be1b83 8537 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8538 }
07be1b83
YO
8539 }
8540 break;
c277df42 8541 default:
830247a4 8542 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8543 }
8544 }
8545 Safefree(r->data->what);
8546 Safefree(r->data);
a0d0e21e
LW
8547 }
8548 Safefree(r->startp);
8549 Safefree(r->endp);
c74340f9
YO
8550 if (r->swap) {
8551 Safefree(r->swap->startp);
8552 Safefree(r->swap->endp);
8553 Safefree(r->swap);
8554 }
a0d0e21e 8555 Safefree(r);
a687059c 8556}
c277df42 8557
84da74a7
YO
8558#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8559#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8560#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8561#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8562
8563/*
8564 regdupe - duplicate a regexp.
8565
8566 This routine is called by sv.c's re_dup and is expected to clone a
8567 given regexp structure. It is a no-op when not under USE_ITHREADS.
8568 (Originally this *was* re_dup() for change history see sv.c)
8569
8570 See pregfree() above if you change anything here.
8571*/
a3c0e9ca 8572#if defined(USE_ITHREADS)
84da74a7
YO
8573regexp *
8574Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8575{
84da74a7
YO
8576 dVAR;
8577 REGEXP *ret;
8578 int i, len, npar;
8579 struct reg_substr_datum *s;
8580
8581 if (!r)
8582 return (REGEXP *)NULL;
8583
8584 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8585 return ret;
8586
8587 len = r->offsets[0];
8588 npar = r->nparens+1;
8589
8590 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8591 Copy(r->program, ret->program, len+1, regnode);
8592
8593 Newx(ret->startp, npar, I32);
8594 Copy(r->startp, ret->startp, npar, I32);
8595 Newx(ret->endp, npar, I32);
8596 Copy(r->startp, ret->startp, npar, I32);
c74340f9
YO
8597 if(r->swap) {
8598 Newx(ret->swap, 1, regexp_paren_ofs);
8599 /* no need to copy these */
8600 Newx(ret->swap->startp, npar, I32);
8601 Newx(ret->swap->endp, npar, I32);
8602 } else {
8603 ret->swap = NULL;
8604 }
84da74a7
YO
8605
8606 Newx(ret->substrs, 1, struct reg_substr_data);
8607 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8608 s->min_offset = r->substrs->data[i].min_offset;
8609 s->max_offset = r->substrs->data[i].max_offset;
8610 s->end_shift = r->substrs->data[i].end_shift;
8611 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8612 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8613 }
8614
8615 ret->regstclass = NULL;
8616 if (r->data) {
8617 struct reg_data *d;
8618 const int count = r->data->count;
8619 int i;
8620
8621 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8622 char, struct reg_data);
8623 Newx(d->what, count, U8);
8624
8625 d->count = count;
8626 for (i = 0; i < count; i++) {
8627 d->what[i] = r->data->what[i];
8628 switch (d->what[i]) {
e2e6a0f1 8629 /* legal options are one of: sSfpont
84da74a7
YO
8630 see also regcomp.h and pregfree() */
8631 case 's':
81714fb9 8632 case 'S':
84da74a7
YO
8633 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8634 break;
8635 case 'p':
8636 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8637 break;
8638 case 'f':
8639 /* This is cheating. */
8640 Newx(d->data[i], 1, struct regnode_charclass_class);
8641 StructCopy(r->data->data[i], d->data[i],
8642 struct regnode_charclass_class);
8643 ret->regstclass = (regnode*)d->data[i];
8644 break;
8645 case 'o':
8646 /* Compiled op trees are readonly, and can thus be
8647 shared without duplication. */
8648 OP_REFCNT_LOCK;
8649 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8650 OP_REFCNT_UNLOCK;
8651 break;
8652 case 'n':
8653 d->data[i] = r->data->data[i];
8654 break;
8655 case 't':
8656 d->data[i] = r->data->data[i];
8657 OP_REFCNT_LOCK;
8658 ((reg_trie_data*)d->data[i])->refcount++;
8659 OP_REFCNT_UNLOCK;
8660 break;
8661 case 'T':
8662 d->data[i] = r->data->data[i];
8663 OP_REFCNT_LOCK;
8664 ((reg_ac_data*)d->data[i])->refcount++;
8665 OP_REFCNT_UNLOCK;
8666 /* Trie stclasses are readonly and can thus be shared
8667 * without duplication. We free the stclass in pregfree
8668 * when the corresponding reg_ac_data struct is freed.
8669 */
8670 ret->regstclass= r->regstclass;
8671 break;
8672 default:
8673 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8674 }
8675 }
8676
8677 ret->data = d;
8678 }
8679 else
8680 ret->data = NULL;
8681
8682 Newx(ret->offsets, 2*len+1, U32);
8683 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8684
8685 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8686 ret->refcnt = r->refcnt;
8687 ret->minlen = r->minlen;
de8c5301 8688 ret->minlenret = r->minlenret;
84da74a7
YO
8689 ret->prelen = r->prelen;
8690 ret->nparens = r->nparens;
8691 ret->lastparen = r->lastparen;
8692 ret->lastcloseparen = r->lastcloseparen;
8693 ret->reganch = r->reganch;
8694
8695 ret->sublen = r->sublen;
8696
f9f4320a 8697 ret->engine = r->engine;
81714fb9
YO
8698
8699 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8700
84da74a7
YO
8701 if (RX_MATCH_COPIED(ret))
8702 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8703 else
8704 ret->subbeg = NULL;
8705#ifdef PERL_OLD_COPY_ON_WRITE
8706 ret->saved_copy = NULL;
8707#endif
8708
8709 ptr_table_store(PL_ptr_table, r, ret);
8710 return ret;
84da74a7 8711}
a3c0e9ca 8712#endif
84da74a7 8713
de8c5301
YO
8714/*
8715 reg_stringify()
8716
8717 converts a regexp embedded in a MAGIC struct to its stringified form,
8718 caching the converted form in the struct and returns the cached
8719 string.
8720
8721 If lp is nonnull then it is used to return the length of the
8722 resulting string
8723
8724 If flags is nonnull and the returned string contains UTF8 then
8725 (flags & 1) will be true.
8726
8727 If haseval is nonnull then it is used to return whether the pattern
8728 contains evals.
8729
8730 Normally called via macro:
8731
8732 CALLREG_STRINGIFY(mg,0,0);
8733
8734 And internally with
8735
8736 CALLREG_AS_STR(mg,lp,flags,haseval)
8737
8738 See sv_2pv_flags() in sv.c for an example of internal usage.
8739
8740 */
8741
8742char *
8743Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8744 dVAR;
8745 const regexp * const re = (regexp *)mg->mg_obj;
8746
8747 if (!mg->mg_ptr) {
8748 const char *fptr = "msix";
8749 char reflags[6];
8750 char ch;
8751 int left = 0;
8752 int right = 4;
8753 bool need_newline = 0;
8754 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8755
8756 while((ch = *fptr++)) {
8757 if(reganch & 1) {
8758 reflags[left++] = ch;
8759 }
8760 else {
8761 reflags[right--] = ch;
8762 }
8763 reganch >>= 1;
8764 }
8765 if(left != 4) {
8766 reflags[left] = '-';
8767 left = 5;
8768 }
8769
8770 mg->mg_len = re->prelen + 4 + left;
8771 /*
8772 * If /x was used, we have to worry about a regex ending with a
8773 * comment later being embedded within another regex. If so, we don't
8774 * want this regex's "commentization" to leak out to the right part of
8775 * the enclosing regex, we must cap it with a newline.
8776 *
8777 * So, if /x was used, we scan backwards from the end of the regex. If
8778 * we find a '#' before we find a newline, we need to add a newline
8779 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8780 * we don't need to add anything. -jfriedl
8781 */
8782 if (PMf_EXTENDED & re->reganch) {
8783 const char *endptr = re->precomp + re->prelen;
8784 while (endptr >= re->precomp) {
8785 const char c = *(endptr--);
8786 if (c == '\n')
8787 break; /* don't need another */
8788 if (c == '#') {
8789 /* we end while in a comment, so we need a newline */
8790 mg->mg_len++; /* save space for it */
8791 need_newline = 1; /* note to add it */
8792 break;
8793 }
8794 }
8795 }
8796
8797 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8798 mg->mg_ptr[0] = '(';
8799 mg->mg_ptr[1] = '?';
8800 Copy(reflags, mg->mg_ptr+2, left, char);
8801 *(mg->mg_ptr+left+2) = ':';
8802 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8803 if (need_newline)
8804 mg->mg_ptr[mg->mg_len - 2] = '\n';
8805 mg->mg_ptr[mg->mg_len - 1] = ')';
8806 mg->mg_ptr[mg->mg_len] = 0;
8807 }
8808 if (haseval)
8809 *haseval = re->program[0].next_off;
8810 if (flags)
8811 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8812
8813 if (lp)
8814 *lp = mg->mg_len;
8815 return mg->mg_ptr;
8816}
8817
8818
76234dfb 8819#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8820/*
8821 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8822 */
8823regnode *
864dbfa3 8824Perl_regnext(pTHX_ register regnode *p)
c277df42 8825{
97aff369 8826 dVAR;
c277df42
IZ
8827 register I32 offset;
8828
3280af22 8829 if (p == &PL_regdummy)
c277df42
IZ
8830 return(NULL);
8831
8832 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8833 if (offset == 0)
8834 return(NULL);
8835
c277df42 8836 return(p+offset);
c277df42 8837}
76234dfb 8838#endif
c277df42 8839
01f988be 8840STATIC void
cea2e8a9 8841S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8842{
8843 va_list args;
8844 STRLEN l1 = strlen(pat1);
8845 STRLEN l2 = strlen(pat2);
8846 char buf[512];
06bf62c7 8847 SV *msv;
73d840c0 8848 const char *message;
c277df42
IZ
8849
8850 if (l1 > 510)
8851 l1 = 510;
8852 if (l1 + l2 > 510)
8853 l2 = 510 - l1;
8854 Copy(pat1, buf, l1 , char);
8855 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8856 buf[l1 + l2] = '\n';
8857 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8858#ifdef I_STDARG
8859 /* ANSI variant takes additional second argument */
c277df42 8860 va_start(args, pat2);
8736538c
AS
8861#else
8862 va_start(args);
8863#endif
5a844595 8864 msv = vmess(buf, &args);
c277df42 8865 va_end(args);
cfd0369c 8866 message = SvPV_const(msv,l1);
c277df42
IZ
8867 if (l1 > 512)
8868 l1 = 512;
8869 Copy(message, buf, l1 , char);
197cf9b9 8870 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8871 Perl_croak(aTHX_ "%s", buf);
c277df42 8872}
a0ed51b3
LW
8873
8874/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8875
76234dfb 8876#ifndef PERL_IN_XSUB_RE
a0ed51b3 8877void
864dbfa3 8878Perl_save_re_context(pTHX)
b81d288d 8879{
97aff369 8880 dVAR;
1ade1aa1
NC
8881
8882 struct re_save_state *state;
8883
8884 SAVEVPTR(PL_curcop);
8885 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8886
8887 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8888 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8889 SSPUSHINT(SAVEt_RE_STATE);
8890
46ab3289 8891 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8892
a0ed51b3 8893 PL_reg_start_tmp = 0;
a0ed51b3 8894 PL_reg_start_tmpl = 0;
c445ea15 8895 PL_reg_oldsaved = NULL;
a5db57d6 8896 PL_reg_oldsavedlen = 0;
a5db57d6 8897 PL_reg_maxiter = 0;
a5db57d6 8898 PL_reg_leftiter = 0;
c445ea15 8899 PL_reg_poscache = NULL;
a5db57d6 8900 PL_reg_poscache_size = 0;
1ade1aa1
NC
8901#ifdef PERL_OLD_COPY_ON_WRITE
8902 PL_nrs = NULL;
8903#endif
ada6e8a9 8904
c445ea15
AL
8905 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8906 if (PL_curpm) {
8907 const REGEXP * const rx = PM_GETRE(PL_curpm);
8908 if (rx) {
1df70142 8909 U32 i;
ada6e8a9 8910 for (i = 1; i <= rx->nparens; i++) {
1df70142 8911 char digits[TYPE_CHARS(long)];
d9fad198 8912 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8913 GV *const *const gvp
8914 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8915
b37c2d43
AL
8916 if (gvp) {
8917 GV * const gv = *gvp;
8918 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8919 save_scalar(gv);
49f27e4b 8920 }
ada6e8a9
AMS
8921 }
8922 }
8923 }
a0ed51b3 8924}
76234dfb 8925#endif
51371543 8926
51371543 8927static void
acfe0abc 8928clear_re(pTHX_ void *r)
51371543 8929{
97aff369 8930 dVAR;
51371543
GS
8931 ReREFCNT_dec((regexp *)r);
8932}
ffbc6a93 8933
a28509cc
AL
8934#ifdef DEBUGGING
8935
8936STATIC void
8937S_put_byte(pTHX_ SV *sv, int c)
8938{
8939 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8940 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8941 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8942 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8943 else
8944 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8945}
8946
786e8c11 8947
3dab1dad
YO
8948#define CLEAR_OPTSTART \
8949 if (optstart) STMT_START { \
07be1b83 8950 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8951 optstart=NULL; \
8952 } STMT_END
8953
786e8c11 8954#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8955
b5a2f8d8
NC
8956STATIC const regnode *
8957S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8958 const regnode *last, const regnode *plast,
8959 SV* sv, I32 indent, U32 depth)
a28509cc 8960{
97aff369 8961 dVAR;
786e8c11 8962 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8963 register const regnode *next;
3dab1dad
YO
8964 const regnode *optstart= NULL;
8965 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8966
786e8c11
YO
8967#ifdef DEBUG_DUMPUNTIL
8968 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8969 last ? last-start : 0,plast ? plast-start : 0);
8970#endif
8971
8972 if (plast && plast < last)
8973 last= plast;
8974
8975 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8976 /* While that wasn't END last time... */
8977
8978 NODE_ALIGN(node);
8979 op = OP(node);
8980 if (op == CLOSE)
786e8c11 8981 indent--;
b5a2f8d8 8982 next = regnext((regnode *)node);
07be1b83 8983
a28509cc 8984 /* Where, what. */
8e11feef 8985 if (OP(node) == OPTIMIZED) {
e68ec53f 8986 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8987 optstart = node;
3dab1dad 8988 else
8e11feef 8989 goto after_print;
3dab1dad
YO
8990 } else
8991 CLEAR_OPTSTART;
07be1b83 8992
32fc9b6a 8993 regprop(r, sv, node);
a28509cc 8994 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8995 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8996
8997 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8998 if (next == NULL) /* Next ptr. */
8999 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9000 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9001 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9002 else
9003 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9004
1de06328 9005 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9006 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9007 }
9008
a28509cc
AL
9009 after_print:
9010 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9011 assert(next);
9012 {
9013 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9014 ? regnext((regnode *)next)
9015 : next);
be8e71aa
YO
9016 if (last && nnode > last)
9017 nnode = last;
786e8c11 9018 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9019 }
a28509cc
AL
9020 }
9021 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9022 assert(next);
786e8c11 9023 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9024 }
9025 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9026 const regnode *this_trie = node;
1de06328 9027 const char op = OP(node);
a28509cc 9028 const I32 n = ARG(node);
1de06328
YO
9029 const reg_ac_data * const ac = op>=AHOCORASICK ?
9030 (reg_ac_data *)r->data->data[n] :
9031 NULL;
9032 const reg_trie_data * const trie = op<AHOCORASICK ?
9033 (reg_trie_data*)r->data->data[n] :
9034 ac->trie;
786e8c11 9035 const regnode *nextbranch= NULL;
a28509cc 9036 I32 word_idx;
1de06328 9037 sv_setpvn(sv, "", 0);
786e8c11 9038 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9039 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9040
9041 PerlIO_printf(Perl_debug_log, "%*s%s ",
9042 (int)(2*(indent+3)), "",
9043 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9044 PL_colors[0], PL_colors[1],
9045 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9046 PERL_PV_PRETTY_ELIPSES |
7f69552c 9047 PERL_PV_PRETTY_LTGT
786e8c11
YO
9048 )
9049 : "???"
9050 );
9051 if (trie->jump) {
40d049e4 9052 U16 dist= trie->jump[word_idx+1];
7f69552c 9053 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 9054 (dist ? this_trie + dist : next) - start);
786e8c11
YO
9055 if (dist) {
9056 if (!nextbranch)
24b23f37 9057 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9058 DUMPUNTIL(this_trie + dist, nextbranch);
9059 }
786e8c11
YO
9060 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9061 nextbranch= regnext((regnode *)nextbranch);
9062 } else {
9063 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9064 }
786e8c11
YO
9065 }
9066 if (last && next > last)
9067 node= last;
9068 else
9069 node= next;
a28509cc 9070 }
786e8c11
YO
9071 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9072 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9073 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9074 }
9075 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9076 assert(next);
786e8c11 9077 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9078 }
9079 else if ( op == PLUS || op == STAR) {
786e8c11 9080 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9081 }
9082 else if (op == ANYOF) {
9083 /* arglen 1 + class block */
9084 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9085 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9086 node = NEXTOPER(node);
9087 }
9088 else if (PL_regkind[(U8)op] == EXACT) {
9089 /* Literal string, where present. */
9090 node += NODE_SZ_STR(node) - 1;
9091 node = NEXTOPER(node);
9092 }
9093 else {
9094 node = NEXTOPER(node);
9095 node += regarglen[(U8)op];
9096 }
9097 if (op == CURLYX || op == OPEN)
786e8c11 9098 indent++;
a28509cc 9099 else if (op == WHILEM)
786e8c11 9100 indent--;
a28509cc 9101 }
3dab1dad 9102 CLEAR_OPTSTART;
786e8c11
YO
9103#ifdef DEBUG_DUMPUNTIL
9104 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9105#endif
1de06328 9106 return node;
a28509cc
AL
9107}
9108
9109#endif /* DEBUGGING */
9110
241d1a3b
NC
9111/*
9112 * Local variables:
9113 * c-indentation-style: bsd
9114 * c-basic-offset: 4
9115 * indent-tabs-mode: t
9116 * End:
9117 *
37442d52
RGS
9118 * ex: set ts=8 sts=4 sw=4 noet:
9119 */