This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #17542] regex confusion
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 553 PerlIO_printf(Perl_debug_log, \
b515a41d 554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1897 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1898 DEBUG_OPTIMISE_r(
8e11feef
RGS
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
786e8c11 1902 (UV)state, (UV)idx, ch)
3dab1dad
YO
1903 );
1904 if ( state==1 ) {
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1907 STR_LEN(convert)=0;
1908 }
1909 *str++=*ch;
1910 STR_LEN(convert)++;
a3621e74 1911
8e11feef 1912 } else {
f9049ba1 1913#ifdef DEBUGGING
8e11feef
RGS
1914 if (state>1)
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1916#endif
8e11feef
RGS
1917 break;
1918 }
1919 }
3dab1dad 1920 if (str) {
8e11feef 1921 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1923 trie->startstate = state;
07be1b83
YO
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1926 DEBUG_r({
1927 regnode *fix = convert;
1928 mjd_nodelen++;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1932 }
1933 });
8e11feef
RGS
1934 if (trie->maxlen) {
1935 convert = n;
1936 } else {
3dab1dad 1937 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1938 DEBUG_r(optimize= n);
3dab1dad
YO
1939 }
1940 }
1941 }
a5ca303d
YO
1942 if (!jumper)
1943 jumper = last;
3dab1dad 1944 if ( trie->maxlen ) {
8e11feef
RGS
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
786e8c11
YO
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1950 if (trie->jump)
7f69552c 1951 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1952
786e8c11
YO
1953 /* XXXX */
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1956 {
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1960 trie->bitmap= NULL;
1961 } else
1962 OP( convert ) = TRIE;
a3621e74 1963
3dab1dad
YO
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
a5ca303d
YO
1966 DEBUG_r({
1967 optimize = convert
1968 + NODE_STEP_REGNODE
1969 + regarglen[ OP( convert ) ];
1970 });
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
3dab1dad 1973 }
a3621e74 1974 /* needed for dumping*/
e62cc96a 1975 DEBUG_r(if (optimize) {
07be1b83 1976 regnode *opt = convert;
e62cc96a 1977 while ( ++opt < optimize) {
07be1b83
YO
1978 Set_Node_Offset_Length(opt,0,0);
1979 }
786e8c11
YO
1980 /*
1981 Try to clean up some of the debris left after the
1982 optimisation.
a3621e74 1983 */
786e8c11 1984 while( optimize < jumper ) {
07be1b83 1985 mjd_nodelen += Node_Length((optimize));
a3621e74 1986 OP( optimize ) = OPTIMIZED;
07be1b83 1987 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1988 optimize++;
1989 }
07be1b83 1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1991 });
1992 } /* end node insert */
07be1b83 1993#ifndef DEBUGGING
6e8b4190 1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1995#endif
786e8c11
YO
1996 return trie->jump
1997 ? MADE_JUMP_TRIE
1998 : trie->startstate>1
1999 ? MADE_EXACT_TRIE
2000 : MADE_TRIE;
2001}
2002
2003STATIC void
2004S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2005{
2006/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2007
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 ISBN 0-201-10088-6
2011
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2017 Consider
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2022 */
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2025 U32 *q;
2026 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2027 const U32 numstates = trie->statecount;
786e8c11
YO
2028 const U32 ubound = trie->lasttrans + ucharcount;
2029 U32 q_read = 0;
2030 U32 q_write = 0;
2031 U32 charid;
2032 U32 base = trie->states[ 1 ].trans.base;
2033 U32 *fail;
2034 reg_ac_data *aho;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2037#ifndef DEBUGGING
2038 PERL_UNUSED_ARG(depth);
2039#endif
2040
2041
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2045 aho->trie=trie;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2047 numstates * sizeof(reg_trie_state));
786e8c11
YO
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2050 aho->refcount = 1;
2051 fail = aho->fail;
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2055
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2058 if ( newstate ) {
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2062 }
2063 }
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2067
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2070 if (ch_state) {
2071 U32 fail_state = cur;
2072 U32 fail_base;
2073 do {
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2077
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2081 {
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2083 }
2084 q[ q_write++ % numstates] = ch_state;
2085 }
2086 }
2087 }
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2093 */
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2099 );
786e8c11
YO
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2102 }
2103 PerlIO_printf(Perl_debug_log, "\n");
2104 });
2105 Safefree(q);
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2107}
2108
786e8c11 2109
a3621e74 2110/*
5d1c421c
JH
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2113 */
2114#if defined(__sparc64__) && defined(__GNUC__)
2115# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116# undef SPARC64_GCC_WORKAROUND
2117# define SPARC64_GCC_WORKAROUND 1
2118# endif
2119#endif
2120
07be1b83 2121#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2122 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
7f69552c 2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2129 }});
07be1b83 2130
1de06328
YO
2131
2132
2133
2134
07be1b83
YO
2135#define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138
be8e71aa 2139STATIC U32
07be1b83
YO
2140S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2143 U32 stringok = 1;
2144 regnode *next = scan + NODE_SZ_STR(scan);
2145 U32 merged = 0;
2146 U32 stopnow = 0;
2147#ifdef DEBUGGING
2148 regnode *stop = scan;
72f13be8 2149 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2150#else
d47053eb
RGS
2151 PERL_UNUSED_ARG(depth);
2152#endif
2153#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
07be1b83 2156#endif
07be1b83
YO
2157 DEBUG_PEEP("join",scan,depth);
2158
2159 /* Skip NOTHING, merge EXACT*. */
2160 while (n &&
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2163 && NEXT_OFF(n)
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2165
2166 if (OP(n) == TAIL || n > next)
2167 stringok = 0;
2168 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2172#ifdef DEBUGGING
2173 if (stringok)
2174 stop = n;
2175#endif
2176 n = regnext(n);
2177 }
2178 else if (stringok) {
786e8c11 2179 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2180 regnode * const nnext = regnext(n);
2181
2182 DEBUG_PEEP("merg",n,depth);
2183
2184 merged++;
2185 if (oldl + STR_LEN(n) > U8_MAX)
2186 break;
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2192#ifdef DEBUGGING
2193 stop = next - 1;
2194#endif
2195 n = nnext;
2196 if (stopnow) break;
2197 }
2198
d47053eb
RGS
2199#ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2204 }
2205 else {
2206 NEXT_OFF(n) = val - n;
2207 }
2208 stopnow = 1;
2209 }
07be1b83
YO
2210#endif
2211 }
2212
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2214 /*
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2216
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2219
2220 which casefold to
2221
2222 Unicode UTF-8
2223
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2226
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2232
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2236
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2240
2241 */
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
e294cc5d
JH
2245#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2247#else
07be1b83 2248 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2249#endif
07be1b83
YO
2250 const char * const t1 = t0 + 3;
2251
2252 for (s = s0 + 2;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 s = t + 4) {
e294cc5d
JH
2255#ifdef EBCDIC
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2258#else
07be1b83
YO
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2261#endif
07be1b83
YO
2262 *min -= 4;
2263 }
2264 }
2265
2266#ifdef DEBUGGING
2267 /* Allow dumping */
2268 n = scan + NODE_SZ_STR(scan);
2269 while (n <= stop) {
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2271 OP(n) = OPTIMIZED;
2272 NEXT_OFF(n) = 0;
2273 }
2274 n++;
2275 }
2276#endif
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 return stopnow;
2279}
2280
653099ff
GS
2281/* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2283
a0288114 2284/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2285 to the position after last scanned or to NULL. */
2286
40d049e4
YO
2287#define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
07be1b83 2291
b515a41d
YO
2292/* this is a chain of data about sub patterns we are processing that
2293 need to be handled seperately/specially in study_chunk. Its so
2294 we can simulate recursion without losing state. */
2295struct scan_frame;
2296typedef struct scan_frame {
2297 regnode *last; /* last node to process in this frame */
2298 regnode *next; /* next node to process when last is reached */
2299 struct scan_frame *prev; /*previous frame*/
2300 I32 stop; /* what stopparen do we use */
2301} scan_frame;
2302
76e3520e 2303STATIC I32
40d049e4 2304S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2305 I32 *minlenp, I32 *deltap,
40d049e4
YO
2306 regnode *last,
2307 scan_data_t *data,
2308 I32 stopparen,
2309 U8* recursed,
2310 struct regnode_charclass_class *and_withp,
2311 U32 flags, U32 depth)
c277df42
IZ
2312 /* scanp: Start here (read-write). */
2313 /* deltap: Write maxlen-minlen here. */
2314 /* last: Stop before this one. */
40d049e4
YO
2315 /* data: string data about the pattern */
2316 /* stopparen: treat close N as END */
2317 /* recursed: which subroutines have we recursed into */
2318 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2319{
97aff369 2320 dVAR;
c277df42
IZ
2321 I32 min = 0, pars = 0, code;
2322 regnode *scan = *scanp, *next;
2323 I32 delta = 0;
2324 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2325 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2326 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2327 scan_data_t data_fake;
a3621e74 2328 SV *re_trie_maxbuff = NULL;
786e8c11 2329 regnode *first_non_open = scan;
e2e6a0f1 2330 I32 stopmin = I32_MAX;
b515a41d
YO
2331 scan_frame last_frame= { last, NULL, NULL, stopparen };
2332 scan_frame *frame=&last_frame;
2333
a3621e74 2334 GET_RE_DEBUG_FLAGS_DECL;
b515a41d 2335
13a24bad 2336#ifdef DEBUGGING
40d049e4 2337 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2338#endif
40d049e4 2339
786e8c11 2340 if ( depth == 0 ) {
40d049e4 2341 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2342 first_non_open=regnext(first_non_open);
2343 }
2344
b515a41d
YO
2345 while (frame) {
2346
2347 DEBUG_PEEP("FBEG",scan,depth);
2348 while ( scan && OP(scan) != END && scan < frame->last ) {
2349 /* Peephole optimizer: */
2350 DEBUG_STUDYDATA(data,depth);
2351 DEBUG_PEEP("Peep",scan,depth);
2352 JOIN_EXACT(scan,&min,0);
2353
2354 /* Follow the next-chain of the current node and optimize
2355 away all the NOTHINGs from it. */
2356 if (OP(scan) != CURLYX) {
2357 const int max = (reg_off_by_arg[OP(scan)]
2358 ? I32_MAX
2359 /* I32 may be smaller than U16 on CRAYs! */
2360 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2361 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2362 int noff;
2363 regnode *n = scan;
2364
2365 /* Skip NOTHING and LONGJMP. */
2366 while ((n = regnext(n))
2367 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2368 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2369 && off + noff < max)
2370 off += noff;
2371 if (reg_off_by_arg[OP(scan)])
2372 ARG(scan) = off;
2373 else
2374 NEXT_OFF(scan) = off;
2375 }
b81d288d 2376
b515a41d
YO
2377 /* The principal pseudo-switch. Cannot be a switch, since we
2378 look into several different things. */
2379 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2380 || OP(scan) == IFTHEN) {
2381 next = regnext(scan);
2382 code = OP(scan);
2383 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2384
2385 if (OP(next) == code || code == IFTHEN) {
2386 /* NOTE - There is similar code to this block below for handling
2387 TRIE nodes on a re-study. If you change stuff here check there
2388 too. */
2389 I32 max1 = 0, min1 = I32_MAX, num = 0;
2390 struct regnode_charclass_class accum;
2391 regnode * const startbranch=scan;
2392
2393 if (flags & SCF_DO_SUBSTR)
2394 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2395 if (flags & SCF_DO_STCLASS)
2396 cl_init_zero(pRExC_state, &accum);
a3621e74 2397
b515a41d
YO
2398 while (OP(scan) == code) {
2399 I32 deltanext, minnext, f = 0, fake;
2400 struct regnode_charclass_class this_class;
c277df42 2401
b515a41d
YO
2402 num++;
2403 data_fake.flags = 0;
2404 if (data) {
2405 data_fake.whilem_c = data->whilem_c;
2406 data_fake.last_closep = data->last_closep;
2407 }
2408 else
2409 data_fake.last_closep = &fake;
2410 next = regnext(scan);
c277df42 2411 scan = NEXTOPER(scan);
b515a41d
YO
2412 if (code != BRANCH)
2413 scan = NEXTOPER(scan);
2414 if (flags & SCF_DO_STCLASS) {
2415 cl_init(pRExC_state, &this_class);
2416 data_fake.start_class = &this_class;
2417 f = SCF_DO_STCLASS_AND;
2418 }
2419 if (flags & SCF_WHILEM_VISITED_POS)
2420 f |= SCF_WHILEM_VISITED_POS;
2421
2422 /* we suppose the run is continuous, last=next...*/
2423 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2424 next, &data_fake,
2425 stopparen, recursed, NULL, f,depth+1);
2426 if (min1 > minnext)
2427 min1 = minnext;
2428 if (max1 < minnext + deltanext)
2429 max1 = minnext + deltanext;
2430 if (deltanext == I32_MAX)
2431 is_inf = is_inf_internal = 1;
2432 scan = next;
2433 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2434 pars++;
2435 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2436 if ( stopmin > minnext)
2437 stopmin = min + min1;
2438 flags &= ~SCF_DO_SUBSTR;
2439 if (data)
2440 data->flags |= SCF_SEEN_ACCEPT;
2441 }
2442 if (data) {
2443 if (data_fake.flags & SF_HAS_EVAL)
2444 data->flags |= SF_HAS_EVAL;
2445 data->whilem_c = data_fake.whilem_c;
2446 }
2447 if (flags & SCF_DO_STCLASS)
2448 cl_or(pRExC_state, &accum, &this_class);
3dab1dad 2449 }
b515a41d
YO
2450 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2451 min1 = 0;
2452 if (flags & SCF_DO_SUBSTR) {
2453 data->pos_min += min1;
2454 data->pos_delta += max1 - min1;
2455 if (max1 != min1 || is_inf)
2456 data->longest = &(data->longest_float);
653099ff 2457 }
b515a41d
YO
2458 min += min1;
2459 delta += max1 - min1;
2460 if (flags & SCF_DO_STCLASS_OR) {
2461 cl_or(pRExC_state, data->start_class, &accum);
2462 if (min1) {
2463 cl_and(data->start_class, and_withp);
2464 flags &= ~SCF_DO_STCLASS;
2465 }
de0c8cb8 2466 }
b515a41d
YO
2467 else if (flags & SCF_DO_STCLASS_AND) {
2468 if (min1) {
2469 cl_and(data->start_class, &accum);
2470 flags &= ~SCF_DO_STCLASS;
2471 }
2472 else {
2473 /* Switch to OR mode: cache the old value of
2474 * data->start_class */
2475 INIT_AND_WITHP;
2476 StructCopy(data->start_class, and_withp,
2477 struct regnode_charclass_class);
2478 flags &= ~SCF_DO_STCLASS_AND;
2479 StructCopy(&accum, data->start_class,
2480 struct regnode_charclass_class);
2481 flags |= SCF_DO_STCLASS_OR;
2482 data->start_class->flags |= ANYOF_EOS;
2483 }
de0c8cb8 2484 }
a3621e74 2485
b515a41d
YO
2486 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2487 /* demq.
a3621e74 2488
b515a41d
YO
2489 Assuming this was/is a branch we are dealing with: 'scan' now
2490 points at the item that follows the branch sequence, whatever
2491 it is. We now start at the beginning of the sequence and look
2492 for subsequences of
a3621e74 2493
b515a41d
YO
2494 BRANCH->EXACT=>x1
2495 BRANCH->EXACT=>x2
2496 tail
a3621e74 2497
b515a41d 2498 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2499
b515a41d
YO
2500 If we can find such a subseqence we need to turn the first
2501 element into a trie and then add the subsequent branch exact
2502 strings to the trie.
a3621e74 2503
b515a41d 2504 We have two cases
a3621e74 2505
b515a41d 2506 1. patterns where the whole set of branch can be converted.
a3621e74 2507
b515a41d 2508 2. patterns where only a subset can be converted.
a3621e74 2509
b515a41d
YO
2510 In case 1 we can replace the whole set with a single regop
2511 for the trie. In case 2 we need to keep the start and end
2512 branchs so
a3621e74 2513
b515a41d
YO
2514 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2515 becomes BRANCH TRIE; BRANCH X;
a3621e74 2516
b515a41d
YO
2517 There is an additional case, that being where there is a
2518 common prefix, which gets split out into an EXACT like node
2519 preceding the TRIE node.
786e8c11 2520
b515a41d
YO
2521 If x(1..n)==tail then we can do a simple trie, if not we make
2522 a "jump" trie, such that when we match the appropriate word
2523 we "jump" to the appopriate tail node. Essentailly we turn
2524 a nested if into a case structure of sorts.
a3621e74 2525
b515a41d
YO
2526 */
2527
2528 int made=0;
2529 if (!re_trie_maxbuff) {
2530 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2531 if (!SvIOK(re_trie_maxbuff))
2532 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2533 }
2534 if ( SvIV(re_trie_maxbuff)>=0 ) {
2535 regnode *cur;
2536 regnode *first = (regnode *)NULL;
2537 regnode *last = (regnode *)NULL;
2538 regnode *tail = scan;
2539 U8 optype = 0;
2540 U32 count=0;
a3621e74
YO
2541
2542#ifdef DEBUGGING
b515a41d 2543 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2544#endif
b515a41d
YO
2545 /* var tail is used because there may be a TAIL
2546 regop in the way. Ie, the exacts will point to the
2547 thing following the TAIL, but the last branch will
2548 point at the TAIL. So we advance tail. If we
2549 have nested (?:) we may have to move through several
2550 tails.
2551 */
2552
2553 while ( OP( tail ) == TAIL ) {
2554 /* this is the TAIL generated by (?:) */
2555 tail = regnext( tail );
2556 }
a3621e74 2557
a3621e74 2558
b515a41d
YO
2559 DEBUG_OPTIMISE_r({
2560 regprop(RExC_rx, mysv, tail );
2561 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2562 (int)depth * 2 + 2, "",
2563 "Looking for TRIE'able sequences. Tail node is: ",
2564 SvPV_nolen_const( mysv )
2565 );
2566 });
2567
2568 /*
2569
2570 step through the branches, cur represents each
2571 branch, noper is the first thing to be matched
2572 as part of that branch and noper_next is the
2573 regnext() of that node. if noper is an EXACT
2574 and noper_next is the same as scan (our current
2575 position in the regex) then the EXACT branch is
2576 a possible optimization target. Once we have
2577 two or more consequetive such branches we can
2578 create a trie of the EXACT's contents and stich
2579 it in place. If the sequence represents all of
2580 the branches we eliminate the whole thing and
2581 replace it with a single TRIE. If it is a
2582 subsequence then we need to stitch it in. This
2583 means the first branch has to remain, and needs
2584 to be repointed at the item on the branch chain
2585 following the last branch optimized. This could
2586 be either a BRANCH, in which case the
2587 subsequence is internal, or it could be the
2588 item following the branch sequence in which
2589 case the subsequence is at the end.
a3621e74 2590
b515a41d 2591*/
a3621e74 2592
b515a41d
YO
2593 /* dont use tail as the end marker for this traverse */
2594 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2595 regnode * const noper = NEXTOPER( cur );
2596#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2597 regnode * const noper_next = regnext( noper );
2598#endif
2599
2600 DEBUG_OPTIMISE_r({
2601 regprop(RExC_rx, mysv, cur);
2602 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2603 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2604
2605 regprop(RExC_rx, mysv, noper);
2606 PerlIO_printf( Perl_debug_log, " -> %s",
2607 SvPV_nolen_const(mysv));
2608
2609 if ( noper_next ) {
2610 regprop(RExC_rx, mysv, noper_next );
2611 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2612 SvPV_nolen_const(mysv));
2613 }
2614 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2615 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2616 });
2617 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2618 : PL_regkind[ OP( noper ) ] == EXACT )
2619 || OP(noper) == NOTHING )
786e8c11 2620#ifdef NOJUMPTRIE
b515a41d 2621 && noper_next == tail
786e8c11 2622#endif
b515a41d
YO
2623 && count < U16_MAX)
2624 {
2625 count++;
2626 if ( !first || optype == NOTHING ) {
2627 if (!first) first = cur;
2628 optype = OP( noper );
2629 } else {
2630 last = cur;
2631 }
2632 } else {
2633 if ( last ) {
2634 make_trie( pRExC_state,
2635 startbranch, first, cur, tail, count,
2636 optype, depth+1 );
2637 }
2638 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2639#ifdef NOJUMPTRIE
b515a41d 2640 && noper_next == tail
786e8c11 2641#endif
b515a41d
YO
2642 ) {
2643 count = 1;
2644 first = cur;
2645 optype = OP( noper );
2646 } else {
2647 count = 0;
2648 first = NULL;
2649 optype = 0;
2650 }
2651 last = NULL;
2652 }
2653 }
2654 DEBUG_OPTIMISE_r({
2655 regprop(RExC_rx, mysv, cur);
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2658 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2659
2660 });
2661 if ( last ) {
2662 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2663#ifdef TRIE_STUDY_OPT
b515a41d
YO
2664 if ( ((made == MADE_EXACT_TRIE &&
2665 startbranch == first)
2666 || ( first_non_open == first )) &&
2667 depth==0 ) {
2668 flags |= SCF_TRIE_RESTUDY;
2669 if ( startbranch == first
2670 && scan == tail )
2671 {
2672 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2673 }
2674 }
3dab1dad 2675#endif
b515a41d
YO
2676 }
2677 }
2678
2679 } /* do trie */
2680
c277df42 2681 }
b515a41d
YO
2682 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2683 scan = NEXTOPER(NEXTOPER(scan));
2684 } else /* single branch is optimized. */
2685 scan = NEXTOPER(scan);
2686 continue;
2687 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2688 scan_frame *newframe = NULL;
2689 I32 paren;
2690 regnode *start;
2691 regnode *end;
2692
2693 if (OP(scan) != SUSPEND) {
2694 /* set the pointer */
2695 if (OP(scan) == GOSUB) {
2696 paren = ARG(scan);
2697 RExC_recurse[ARG2L(scan)] = scan;
2698 start = RExC_open_parens[paren-1];
2699 end = RExC_close_parens[paren-1];
2700 } else {
2701 paren = 0;
2702 start = RExC_rx->program + 1;
2703 end = RExC_opend;
2704 }
2705 if (!recursed) {
2706 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2707 SAVEFREEPV(recursed);
2708 }
2709 if (!PAREN_TEST(recursed,paren+1)) {
2710 PAREN_SET(recursed,paren+1);
2711 Newx(newframe,1,scan_frame);
2712 } else {
2713 if (flags & SCF_DO_SUBSTR) {
2714 scan_commit(pRExC_state,data,minlenp);
2715 data->longest = &(data->longest_float);
2716 }
2717 is_inf = is_inf_internal = 1;
2718 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2719 cl_anything(pRExC_state, data->start_class);
2720 flags &= ~SCF_DO_STCLASS;
2721 }
2722 } else {
2723 Newx(newframe,1,scan_frame);
2724 paren = stopparen;
2725 start = scan+2;
2726 end = regnext(scan);
0eda9292 2727 }
b515a41d
YO
2728 if (newframe) {
2729 assert(start);
2730 assert(end);
2731 SAVEFREEPV(newframe);
2732 newframe->next = regnext(scan);
2733 newframe->last = end;
2734 newframe->stop = stopparen;
2735 newframe->prev = frame;
2736 frame = newframe;
2737 scan = start;
2738 stopparen = paren;
2739 continue;
2740 }
653099ff 2741 }
b515a41d
YO
2742 else if (OP(scan) == EXACT) {
2743 I32 l = STR_LEN(scan);
2744 UV uc;
2745 if (UTF) {
2746 const U8 * const s = (U8*)STRING(scan);
2747 l = utf8_length(s, s + l);
2748 uc = utf8_to_uvchr(s, NULL);
2749 } else {
2750 uc = *((U8*)STRING(scan));
2751 }
2752 min += l;
2753 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2754 /* The code below prefers earlier match for fixed
2755 offset, later match for variable offset. */
2756 if (data->last_end == -1) { /* Update the start info. */
2757 data->last_start_min = data->pos_min;
2758 data->last_start_max = is_inf
2759 ? I32_MAX : data->pos_min + data->pos_delta;
2760 }
2761 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2762 if (UTF)
2763 SvUTF8_on(data->last_found);
2764 {
2765 SV * const sv = data->last_found;
2766 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2767 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2768 if (mg && mg->mg_len >= 0)
2769 mg->mg_len += utf8_length((U8*)STRING(scan),
2770 (U8*)STRING(scan)+STR_LEN(scan));
2771 }
2772 data->last_end = data->pos_min + l;
2773 data->pos_min += l; /* As in the first entry. */
2774 data->flags &= ~SF_BEFORE_EOL;
2775 }
2776 if (flags & SCF_DO_STCLASS_AND) {
2777 /* Check whether it is compatible with what we know already! */
2778 int compat = 1;
2779
2780 if (uc >= 0x100 ||
2781 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2782 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2783 && (!(data->start_class->flags & ANYOF_FOLD)
2784 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2785 )
2786 compat = 0;
2787 ANYOF_CLASS_ZERO(data->start_class);
2788 ANYOF_BITMAP_ZERO(data->start_class);
2789 if (compat)
2790 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2791 data->start_class->flags &= ~ANYOF_EOS;
b515a41d
YO
2792 if (uc < 0x100)
2793 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff 2794 }
b515a41d
YO
2795 else if (flags & SCF_DO_STCLASS_OR) {
2796 /* false positive possible if the class is case-folded */
1aa99e6b
IH
2797 if (uc < 0x100)
2798 ANYOF_BITMAP_SET(data->start_class, uc);
b515a41d
YO
2799 else
2800 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2801 data->start_class->flags &= ~ANYOF_EOS;
b515a41d 2802 cl_and(data->start_class, and_withp);
653099ff 2803 }
b515a41d 2804 flags &= ~SCF_DO_STCLASS;
653099ff 2805 }
b515a41d
YO
2806 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2807 I32 l = STR_LEN(scan);
2808 UV uc = *((U8*)STRING(scan));
2809
2810 /* Search for fixed substrings supports EXACT only. */
c277df42 2811 if (flags & SCF_DO_SUBSTR) {
b515a41d
YO
2812 assert(data);
2813 scan_commit(pRExC_state, data, minlenp);
c277df42 2814 }
b515a41d
YO
2815 if (UTF) {
2816 const U8 * const s = (U8 *)STRING(scan);
2817 l = utf8_length(s, s + l);
2818 uc = utf8_to_uvchr(s, NULL);
653099ff 2819 }
b515a41d
YO
2820 min += l;
2821 if (flags & SCF_DO_SUBSTR)
2822 data->pos_min += l;
2823 if (flags & SCF_DO_STCLASS_AND) {
2824 /* Check whether it is compatible with what we know already! */
2825 int compat = 1;
2826
2827 if (uc >= 0x100 ||
2828 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2829 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2830 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2831 compat = 0;
2832 ANYOF_CLASS_ZERO(data->start_class);
2833 ANYOF_BITMAP_ZERO(data->start_class);
2834 if (compat) {
2835 ANYOF_BITMAP_SET(data->start_class, uc);
2836 data->start_class->flags &= ~ANYOF_EOS;
2837 data->start_class->flags |= ANYOF_FOLD;
2838 if (OP(scan) == EXACTFL)
2839 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2840 }
653099ff 2841 }
b515a41d
YO
2842 else if (flags & SCF_DO_STCLASS_OR) {
2843 if (data->start_class->flags & ANYOF_FOLD) {
2844 /* false positive possible if the class is case-folded.
2845 Assume that the locale settings are the same... */
2846 if (uc < 0x100)
2847 ANYOF_BITMAP_SET(data->start_class, uc);
2848 data->start_class->flags &= ~ANYOF_EOS;
2849 }
2850 cl_and(data->start_class, and_withp);
b45f050a 2851 }
b515a41d
YO
2852 flags &= ~SCF_DO_STCLASS;
2853 }
2854 else if (strchr((const char*)PL_varies,OP(scan))) {
2855 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2856 I32 f = flags, pos_before = 0;
2857 regnode * const oscan = scan;
2858 struct regnode_charclass_class this_class;
2859 struct regnode_charclass_class *oclass = NULL;
2860 I32 next_is_eval = 0;
b45f050a 2861
b515a41d
YO
2862 switch (PL_regkind[OP(scan)]) {
2863 case WHILEM: /* End of (?:...)* . */
2864 scan = NEXTOPER(scan);
2865 goto finish;
2866 case PLUS:
2867 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2868 next = NEXTOPER(scan);
2869 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2870 mincount = 1;
2871 maxcount = REG_INFTY;
2872 next = regnext(scan);
2873 scan = NEXTOPER(scan);
2874 goto do_curly;
2875 }
2876 }
2877 if (flags & SCF_DO_SUBSTR)
2878 data->pos_min++;
2879 min++;
2880 /* Fall through. */
2881 case STAR:
2882 if (flags & SCF_DO_STCLASS) {
2883 mincount = 0;
2884 maxcount = REG_INFTY;
2885 next = regnext(scan);
2886 scan = NEXTOPER(scan);
2887 goto do_curly;
2888 }
2889 is_inf = is_inf_internal = 1;
2890 scan = regnext(scan);
2891 if (flags & SCF_DO_SUBSTR) {
2892 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2893 data->longest = &(data->longest_float);
2894 }
2895 goto optimize_curly_tail;
2896 case CURLY:
2897 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2898 && (scan->flags == stopparen))
2899 {
2900 mincount = 1;
2901 maxcount = 1;
2902 } else {
2903 mincount = ARG1(scan);
2904 maxcount = ARG2(scan);
2905 }
2906 next = regnext(scan);
2907 if (OP(scan) == CURLYX) {
2908 I32 lp = (data ? *(data->last_closep) : 0);
2909 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2910 }
2911 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2912 next_is_eval = (OP(scan) == EVAL);
2913do_curly:
2914 if (flags & SCF_DO_SUBSTR) {
2915 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2916 pos_before = data->pos_min;
2917 }
2918 if (data) {
2919 fl = data->flags;
2920 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2921 if (is_inf)
2922 data->flags |= SF_IS_INF;
2923 }
2924 if (flags & SCF_DO_STCLASS) {
2925 cl_init(pRExC_state, &this_class);
2926 oclass = data->start_class;
2927 data->start_class = &this_class;
2928 f |= SCF_DO_STCLASS_AND;
2929 f &= ~SCF_DO_STCLASS_OR;
2930 }
2931 /* These are the cases when once a subexpression
2932 fails at a particular position, it cannot succeed
2933 even after backtracking at the enclosing scope.
2934
2935 XXXX what if minimal match and we are at the
2936 initial run of {n,m}? */
2937 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2938 f &= ~SCF_WHILEM_VISITED_POS;
2939
2940 /* This will finish on WHILEM, setting scan, or on NULL: */
2941 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2942 last, data, stopparen, recursed, NULL,
2943 (mincount == 0
2944 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2945
2946 if (flags & SCF_DO_STCLASS)
2947 data->start_class = oclass;
2948 if (mincount == 0 || minnext == 0) {
2949 if (flags & SCF_DO_STCLASS_OR) {
2950 cl_or(pRExC_state, data->start_class, &this_class);
2951 }
2952 else if (flags & SCF_DO_STCLASS_AND) {
2953 /* Switch to OR mode: cache the old value of
2954 * data->start_class */
2955 INIT_AND_WITHP;
2956 StructCopy(data->start_class, and_withp,
2957 struct regnode_charclass_class);
2958 flags &= ~SCF_DO_STCLASS_AND;
2959 StructCopy(&this_class, data->start_class,
2960 struct regnode_charclass_class);
2961 flags |= SCF_DO_STCLASS_OR;
2962 data->start_class->flags |= ANYOF_EOS;
2963 }
2964 } else { /* Non-zero len */
2965 if (flags & SCF_DO_STCLASS_OR) {
2966 cl_or(pRExC_state, data->start_class, &this_class);
2967 cl_and(data->start_class, and_withp);
2968 }
2969 else if (flags & SCF_DO_STCLASS_AND)
2970 cl_and(data->start_class, &this_class);
2971 flags &= ~SCF_DO_STCLASS;
2972 }
2973 if (!scan) /* It was not CURLYX, but CURLY. */
2974 scan = next;
2975 if ( /* ? quantifier ok, except for (?{ ... }) */
2976 (next_is_eval || !(mincount == 0 && maxcount == 1))
2977 && (minnext == 0) && (deltanext == 0)
2978 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2979 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2980 && ckWARN(WARN_REGEXP))
2981 {
2982 vWARN(RExC_parse,
2983 "Quantifier unexpected on zero-length expression");
2984 }
2985
2986 min += minnext * mincount;
2987 is_inf_internal |= ((maxcount == REG_INFTY
2988 && (minnext + deltanext) > 0)
2989 || deltanext == I32_MAX);
2990 is_inf |= is_inf_internal;
2991 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2992
2993 /* Try powerful optimization CURLYX => CURLYN. */
2994 if ( OP(oscan) == CURLYX && data
2995 && data->flags & SF_IN_PAR
2996 && !(data->flags & SF_HAS_EVAL)
2997 && !deltanext && minnext == 1 ) {
2998 /* Try to optimize to CURLYN. */
2999 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3000 regnode * const nxt1 = nxt;
497b47a8 3001#ifdef DEBUGGING
b515a41d 3002 regnode *nxt2;
497b47a8 3003#endif
c277df42 3004
b515a41d
YO
3005 /* Skip open. */
3006 nxt = regnext(nxt);
3007 if (!strchr((const char*)PL_simple,OP(nxt))
3008 && !(PL_regkind[OP(nxt)] == EXACT
3009 && STR_LEN(nxt) == 1))
3010 goto nogo;
497b47a8 3011#ifdef DEBUGGING
b515a41d 3012 nxt2 = nxt;
497b47a8 3013#endif
b515a41d
YO
3014 nxt = regnext(nxt);
3015 if (OP(nxt) != CLOSE)
3016 goto nogo;
3017 if (RExC_open_parens) {
3018 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3019 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3020 }
3021 /* Now we know that nxt2 is the only contents: */
3022 oscan->flags = (U8)ARG(nxt);
3023 OP(oscan) = CURLYN;
3024 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3025
c277df42 3026#ifdef DEBUGGING
b515a41d
YO
3027 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3028 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3029 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3030 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3031 OP(nxt + 1) = OPTIMIZED; /* was count. */
3032 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3033#endif
40d049e4 3034 }
b515a41d
YO
3035nogo:
3036
3037 /* Try optimization CURLYX => CURLYM. */
3038 if ( OP(oscan) == CURLYX && data
3039 && !(data->flags & SF_HAS_PAR)
3040 && !(data->flags & SF_HAS_EVAL)
3041 && !deltanext /* atom is fixed width */
3042 && minnext != 0 /* CURLYM can't handle zero width */
3043 ) {
3044 /* XXXX How to optimize if data == 0? */
3045 /* Optimize to a simpler form. */
3046 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3047 regnode *nxt2;
3048
3049 OP(oscan) = CURLYM;
3050 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3051 && (OP(nxt2) != WHILEM))
3052 nxt = nxt2;
3053 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3054 /* Need to optimize away parenths. */
3055 if (data->flags & SF_IN_PAR) {
3056 /* Set the parenth number. */
3057 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3058
3059 if (OP(nxt) != CLOSE)
3060 FAIL("Panic opt close");
3061 oscan->flags = (U8)ARG(nxt);
3062 if (RExC_open_parens) {
3063 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3064 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3065 }
3066 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3067 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3068
c277df42 3069#ifdef DEBUGGING
b515a41d
YO
3070 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3071 OP(nxt + 1) = OPTIMIZED; /* was count. */
3072 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3073 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3074#endif
c277df42 3075#if 0
b515a41d
YO
3076 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3077 regnode *nnxt = regnext(nxt1);
3078
3079 if (nnxt == nxt) {
3080 if (reg_off_by_arg[OP(nxt1)])
3081 ARG_SET(nxt1, nxt2 - nxt1);
3082 else if (nxt2 - nxt1 < U16_MAX)
3083 NEXT_OFF(nxt1) = nxt2 - nxt1;
3084 else
3085 OP(nxt) = NOTHING; /* Cannot beautify */
3086 }
3087 nxt1 = nnxt;
3088 }
3089#endif
3090 /* Optimize again: */
3091 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3092 NULL, stopparen, recursed, NULL, 0,depth+1);
c277df42 3093 }
b515a41d
YO
3094 else
3095 oscan->flags = 0;
c277df42 3096 }
b515a41d
YO
3097 else if ((OP(oscan) == CURLYX)
3098 && (flags & SCF_WHILEM_VISITED_POS)
3099 /* See the comment on a similar expression above.
3100 However, this time it not a subexpression
3101 we care about, but the expression itself. */
3102 && (maxcount == REG_INFTY)
3103 && data && ++data->whilem_c < 16) {
3104 /* This stays as CURLYX, we can put the count/of pair. */
3105 /* Find WHILEM (as in regexec.c) */
3106 regnode *nxt = oscan + NEXT_OFF(oscan);
3107
3108 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3109 nxt += ARG(nxt);
3110 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3111 | (RExC_whilem_seen << 4)); /* On WHILEM */
3112 }
3113 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3114 pars++;
3115 if (flags & SCF_DO_SUBSTR) {
3116 SV *last_str = NULL;
3117 int counted = mincount != 0;
c277df42 3118
b515a41d 3119 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c 3120#if defined(SPARC64_GCC_WORKAROUND)
b515a41d
YO
3121 I32 b = 0;
3122 STRLEN l = 0;
3123 const char *s = NULL;
3124 I32 old = 0;
5d1c421c 3125
b515a41d
YO
3126 if (pos_before >= data->last_start_min)
3127 b = pos_before;
3128 else
3129 b = data->last_start_min;
5d1c421c 3130
b515a41d
YO
3131 l = 0;
3132 s = SvPV_const(data->last_found, l);
3133 old = b - data->last_start_min;
5d1c421c
JH
3134
3135#else
b515a41d
YO
3136 I32 b = pos_before >= data->last_start_min
3137 ? pos_before : data->last_start_min;
3138 STRLEN l;
3139 const char * const s = SvPV_const(data->last_found, l);
3140 I32 old = b - data->last_start_min;
5d1c421c 3141#endif
a0ed51b3 3142
b515a41d
YO
3143 if (UTF)
3144 old = utf8_hop((U8*)s, old) - (U8*)s;
3145
3146 l -= old;
3147 /* Get the added string: */
3148 last_str = newSVpvn(s + old, l);
3149 if (UTF)
3150 SvUTF8_on(last_str);
3151 if (deltanext == 0 && pos_before == b) {
3152 /* What was added is a constant string */
3153 if (mincount > 1) {
3154 SvGROW(last_str, (mincount * l) + 1);
3155 repeatcpy(SvPVX(last_str) + l,
3156 SvPVX_const(last_str), l, mincount - 1);
3157 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3158 /* Add additional parts. */
3159 SvCUR_set(data->last_found,
3160 SvCUR(data->last_found) - l);
3161 sv_catsv(data->last_found, last_str);
3162 {
3163 SV * sv = data->last_found;
3164 MAGIC *mg =
3165 SvUTF8(sv) && SvMAGICAL(sv) ?
3166 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3167 if (mg && mg->mg_len >= 0)
3168 mg->mg_len += CHR_SVLEN(last_str);
3169 }
3170 data->last_end += l * (mincount - 1);
3171 }
3172 } else {
3173 /* start offset must point into the last copy */
3174 data->last_start_min += minnext * (mincount - 1);
3175 data->last_start_max += is_inf ? I32_MAX
3176 : (maxcount - 1) * (minnext + data->pos_delta);
3177 }
3178 }
3179 /* It is counted once already... */
3180 data->pos_min += minnext * (mincount - counted);
3181 data->pos_delta += - counted * deltanext +
3182 (minnext + deltanext) * maxcount - minnext * mincount;
3183 if (mincount != maxcount) {
3184 /* Cannot extend fixed substrings found inside
3185 the group. */
3186 scan_commit(pRExC_state,data,minlenp);
3187 if (mincount && last_str) {
3188 SV * const sv = data->last_found;
3189 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292 3190 mg_find(sv, PERL_MAGIC_utf8) : NULL;
b515a41d
YO
3191
3192 if (mg)
3193 mg->mg_len = -1;
3194 sv_setsv(sv, last_str);
3195 data->last_end = data->pos_min;
3196 data->last_start_min =
3197 data->pos_min - CHR_SVLEN(last_str);
3198 data->last_start_max = is_inf
3199 ? I32_MAX
3200 : data->pos_min + data->pos_delta
3201 - CHR_SVLEN(last_str);
0eda9292 3202 }
b515a41d 3203 data->longest = &(data->longest_float);
c277df42 3204 }
b515a41d 3205 SvREFCNT_dec(last_str);
c277df42 3206 }
b515a41d
YO
3207 if (data && (fl & SF_HAS_EVAL))
3208 data->flags |= SF_HAS_EVAL;
3209optimize_curly_tail:
3210 if (OP(oscan) != CURLYX) {
3211 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3212 && NEXT_OFF(next))
3213 NEXT_OFF(oscan) += NEXT_OFF(next);
c277df42 3214 }
b515a41d
YO
3215 continue;
3216 default: /* REF and CLUMP only? */
3217 if (flags & SCF_DO_SUBSTR) {
3218 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3219 data->longest = &(data->longest_float);
3220 }
3221 is_inf = is_inf_internal = 1;
3222 if (flags & SCF_DO_STCLASS_OR)
3223 cl_anything(pRExC_state, data->start_class);
3224 flags &= ~SCF_DO_STCLASS;
3225 break;
3226 }
3227 }
3228 else if (strchr((const char*)PL_simple,OP(scan))) {
3229 int value = 0;
3230
3231 if (flags & SCF_DO_SUBSTR) {
3232 scan_commit(pRExC_state,data,minlenp);
3233 data->pos_min++;
3234 }
3235 min++;
3236 if (flags & SCF_DO_STCLASS) {
3237 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3238
3239 /* Some of the logic below assumes that switching
3240 locale on will only add false positives. */
3241 switch (PL_regkind[OP(scan)]) {
3242 case SANY:
3243 default:
3244do_default:
3245 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3246 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3247 cl_anything(pRExC_state, data->start_class);
3248 break;
3249 case REG_ANY:
3250 if (OP(scan) == SANY)
3251 goto do_default;
3252 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3253 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3254 || (data->start_class->flags & ANYOF_CLASS));
3255 cl_anything(pRExC_state, data->start_class);
3256 }
3257 if (flags & SCF_DO_STCLASS_AND || !value)
3258 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3259 break;
3260 case ANYOF:
3261 if (flags & SCF_DO_STCLASS_AND)
3262 cl_and(data->start_class,
3263 (struct regnode_charclass_class*)scan);
3264 else
3265 cl_or(pRExC_state, data->start_class,
3266 (struct regnode_charclass_class*)scan);
3267 break;
3268 case ALNUM:
3269 if (flags & SCF_DO_STCLASS_AND) {
3270 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3271 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3272 for (value = 0; value < 256; value++)
3273 if (!isALNUM(value))
3274 ANYOF_BITMAP_CLEAR(data->start_class, value);
3275 }
3276 }
3277 else {
3278 if (data->start_class->flags & ANYOF_LOCALE)
3279 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3280 else {
3281 for (value = 0; value < 256; value++)
3282 if (isALNUM(value))
3283 ANYOF_BITMAP_SET(data->start_class, value);
3284 }
3285 }
3286 break;
3287 case ALNUML:
3288 if (flags & SCF_DO_STCLASS_AND) {
3289 if (data->start_class->flags & ANYOF_LOCALE)
3290 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3291 }
3292 else {
3293 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3294 data->start_class->flags |= ANYOF_LOCALE;
3295 }
3296 break;
3297 case NALNUM:
3298 if (flags & SCF_DO_STCLASS_AND) {
3299 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3300 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3301 for (value = 0; value < 256; value++)
3302 if (isALNUM(value))
3303 ANYOF_BITMAP_CLEAR(data->start_class, value);
3304 }
3305 }
3306 else {
3307 if (data->start_class->flags & ANYOF_LOCALE)
3308 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3309 else {
3310 for (value = 0; value < 256; value++)
3311 if (!isALNUM(value))
3312 ANYOF_BITMAP_SET(data->start_class, value);
3313 }
3314 }
3315 break;
3316 case NALNUML:
3317 if (flags & SCF_DO_STCLASS_AND) {
3318 if (data->start_class->flags & ANYOF_LOCALE)
3319 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3320 }
3321 else {
3322 data->start_class->flags |= ANYOF_LOCALE;
3323 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3324 }
3325 break;
3326 case SPACE:
3327 if (flags & SCF_DO_STCLASS_AND) {
3328 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3329 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3330 for (value = 0; value < 256; value++)
3331 if (!isSPACE(value))
3332 ANYOF_BITMAP_CLEAR(data->start_class, value);
3333 }
3334 }
3335 else {
3336 if (data->start_class->flags & ANYOF_LOCALE)
3337 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3338 else {
3339 for (value = 0; value < 256; value++)
3340 if (isSPACE(value))
3341 ANYOF_BITMAP_SET(data->start_class, value);
3342 }
3343 }
3344 break;
3345 case SPACEL:
3346 if (flags & SCF_DO_STCLASS_AND) {
3347 if (data->start_class->flags & ANYOF_LOCALE)
3348 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3349 }
3350 else {
3351 data->start_class->flags |= ANYOF_LOCALE;
3352 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3353 }
3354 break;
3355 case NSPACE:
3356 if (flags & SCF_DO_STCLASS_AND) {
3357 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3358 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3359 for (value = 0; value < 256; value++)
3360 if (isSPACE(value))
3361 ANYOF_BITMAP_CLEAR(data->start_class, value);
3362 }
3363 }
3364 else {
3365 if (data->start_class->flags & ANYOF_LOCALE)
3366 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3367 else {
3368 for (value = 0; value < 256; value++)
3369 if (!isSPACE(value))
3370 ANYOF_BITMAP_SET(data->start_class, value);
3371 }
3372 }
3373 break;
3374 case NSPACEL:
3375 if (flags & SCF_DO_STCLASS_AND) {
3376 if (data->start_class->flags & ANYOF_LOCALE) {
3377 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3378 for (value = 0; value < 256; value++)
3379 if (!isSPACE(value))
3380 ANYOF_BITMAP_CLEAR(data->start_class, value);
3381 }
3382 }
3383 else {
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3386 }
3387 break;
3388 case DIGIT:
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3391 for (value = 0; value < 256; value++)
3392 if (!isDIGIT(value))
3393 ANYOF_BITMAP_CLEAR(data->start_class, value);
3394 }
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (isDIGIT(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
3402 }
3403 }
3404 break;
3405 case NDIGIT:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3408 for (value = 0; value < 256; value++)
3409 if (isDIGIT(value))
3410 ANYOF_BITMAP_CLEAR(data->start_class, value);
3411 }
3412 else {
3413 if (data->start_class->flags & ANYOF_LOCALE)
3414 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3415 else {
3416 for (value = 0; value < 256; value++)
3417 if (!isDIGIT(value))
3418 ANYOF_BITMAP_SET(data->start_class, value);
3419 }
3420 }
3421 break;
c277df42 3422 }
b515a41d
YO
3423 if (flags & SCF_DO_STCLASS_OR)
3424 cl_and(data->start_class, and_withp);
3425 flags &= ~SCF_DO_STCLASS;
c277df42 3426 }
c277df42 3427 }
b515a41d
YO
3428 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3429 data->flags |= (OP(scan) == MEOL
3430 ? SF_BEFORE_MEOL
3431 : SF_BEFORE_SEOL);
c277df42 3432 }
b515a41d
YO
3433 else if ( PL_regkind[OP(scan)] == BRANCHJ
3434 /* Lookbehind, or need to calculate parens/evals/stclass: */
3435 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3436 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3437 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3438 || OP(scan) == UNLESSM )
3439 {
3440 /* Negative Lookahead/lookbehind
3441 In this case we can't do fixed string optimisation.
3442 */
653099ff 3443
b515a41d
YO
3444 I32 deltanext, minnext, fake = 0;
3445 regnode *nscan;
3446 struct regnode_charclass_class intrnl;
3447 int f = 0;
3448
3449 data_fake.flags = 0;
3450 if (data) {
3451 data_fake.whilem_c = data->whilem_c;
3452 data_fake.last_closep = data->last_closep;
653099ff 3453 }
653099ff 3454 else
b515a41d
YO
3455 data_fake.last_closep = &fake;
3456 if ( flags & SCF_DO_STCLASS && !scan->flags
3457 && OP(scan) == IFMATCH ) { /* Lookahead */
3458 cl_init(pRExC_state, &intrnl);
3459 data_fake.start_class = &intrnl;
3460 f |= SCF_DO_STCLASS_AND;
653099ff 3461 }
b515a41d
YO
3462 if (flags & SCF_WHILEM_VISITED_POS)
3463 f |= SCF_WHILEM_VISITED_POS;
3464 next = regnext(scan);
3465 nscan = NEXTOPER(NEXTOPER(scan));
3466 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3467 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3468 if (scan->flags) {
3469 if (deltanext) {
3470 vFAIL("Variable length lookbehind not implemented");
653099ff 3471 }
b515a41d
YO
3472 else if (minnext > (I32)U8_MAX) {
3473 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
653099ff 3474 }
b515a41d 3475 scan->flags = (U8)minnext;
653099ff 3476 }
b515a41d
YO
3477 if (data) {
3478 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3479 pars++;
3480 if (data_fake.flags & SF_HAS_EVAL)
3481 data->flags |= SF_HAS_EVAL;
3482 data->whilem_c = data_fake.whilem_c;
653099ff 3483 }
b515a41d
YO
3484 if (f & SCF_DO_STCLASS_AND) {
3485 const int was = (data->start_class->flags & ANYOF_EOS);
3486
3487 cl_and(data->start_class, &intrnl);
3488 if (was)
3489 data->start_class->flags |= ANYOF_EOS;
653099ff 3490 }
b515a41d
YO
3491 }
3492#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3493 else {
3494 /* Positive Lookahead/lookbehind
3495 In this case we can do fixed string optimisation,
3496 but we must be careful about it. Note in the case of
3497 lookbehind the positions will be offset by the minimum
3498 length of the pattern, something we won't know about
3499 until after the recurse.
3500 */
3501 I32 deltanext, fake = 0;
3502 regnode *nscan;
3503 struct regnode_charclass_class intrnl;
3504 int f = 0;
3505 /* We use SAVEFREEPV so that when the full compile
3506 is finished perl will clean up the allocated
3507 minlens when its all done. This was we don't
3508 have to worry about freeing them when we know
3509 they wont be used, which would be a pain.
3510 */
3511 I32 *minnextp;
3512 Newx( minnextp, 1, I32 );
3513 SAVEFREEPV(minnextp);
3514
3515 if (data) {
3516 StructCopy(data, &data_fake, scan_data_t);
3517 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3518 f |= SCF_DO_SUBSTR;
3519 if (scan->flags)
3520 scan_commit(pRExC_state, &data_fake,minlenp);
3521 data_fake.last_found=newSVsv(data->last_found);
653099ff
GS
3522 }
3523 }
b515a41d
YO
3524 else
3525 data_fake.last_closep = &fake;
3526 data_fake.flags = 0;
3527 if (is_inf)
3528 data_fake.flags |= SF_IS_INF;
3529 if ( flags & SCF_DO_STCLASS && !scan->flags
3530 && OP(scan) == IFMATCH ) { /* Lookahead */
3531 cl_init(pRExC_state, &intrnl);
3532 data_fake.start_class = &intrnl;
3533 f |= SCF_DO_STCLASS_AND;
653099ff 3534 }
b515a41d
YO
3535 if (flags & SCF_WHILEM_VISITED_POS)
3536 f |= SCF_WHILEM_VISITED_POS;
3537 next = regnext(scan);
3538 nscan = NEXTOPER(NEXTOPER(scan));
3539
3540 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3541 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3542 if (scan->flags) {
3543 if (deltanext) {
3544 vFAIL("Variable length lookbehind not implemented");
653099ff 3545 }
b515a41d
YO
3546 else if (*minnextp > (I32)U8_MAX) {
3547 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
653099ff 3548 }
b515a41d 3549 scan->flags = (U8)*minnextp;
653099ff 3550 }
b515a41d
YO
3551
3552 *minnextp += min;
3553
3554 if (f & SCF_DO_STCLASS_AND) {
3555 const int was = (data->start_class->flags & ANYOF_EOS);
3556
3557 cl_and(data->start_class, &intrnl);
3558 if (was)
3559 data->start_class->flags |= ANYOF_EOS;
653099ff 3560 }
b515a41d
YO
3561 if (data) {
3562 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3563 pars++;
3564 if (data_fake.flags & SF_HAS_EVAL)
3565 data->flags |= SF_HAS_EVAL;
3566 data->whilem_c = data_fake.whilem_c;
3567 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3568 if (RExC_rx->minlen<*minnextp)
3569 RExC_rx->minlen=*minnextp;
3570 scan_commit(pRExC_state, &data_fake, minnextp);
3571 SvREFCNT_dec(data_fake.last_found);
3572
3573 if ( data_fake.minlen_fixed != minlenp )
3574 {
3575 data->offset_fixed= data_fake.offset_fixed;
3576 data->minlen_fixed= data_fake.minlen_fixed;
3577 data->lookbehind_fixed+= scan->flags;
3578 }
3579 if ( data_fake.minlen_float != minlenp )
3580 {
3581 data->minlen_float= data_fake.minlen_float;
3582 data->offset_float_min=data_fake.offset_float_min;
3583 data->offset_float_max=data_fake.offset_float_max;
3584 data->lookbehind_float+= scan->flags;
3585 }
653099ff
GS
3586 }
3587 }
1de06328 3588
1de06328 3589
c277df42 3590 }
1de06328 3591#endif
cb434fcc 3592 }
b515a41d
YO
3593 else if (OP(scan) == OPEN) {
3594 if (stopparen != (I32)ARG(scan))
3595 pars++;
40d049e4 3596 }
b515a41d
YO
3597 else if (OP(scan) == CLOSE) {
3598 if (stopparen == (I32)ARG(scan)) {
3599 break;
40d049e4 3600 }
b515a41d
YO
3601 if ((I32)ARG(scan) == is_par) {
3602 next = regnext(scan);
3603
3604 if ( next && (OP(next) != WHILEM) && next < last)
3605 is_par = 0; /* Disable optimization */
40d049e4 3606 }
b515a41d
YO
3607 if (data)
3608 *(data->last_closep) = ARG(scan);
40d049e4 3609 }
b515a41d 3610 else if (OP(scan) == EVAL) {
c277df42
IZ
3611 if (data)
3612 data->flags |= SF_HAS_EVAL;
40d049e4 3613 }
b515a41d
YO
3614 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3615 if (flags & SCF_DO_SUBSTR) {
3616 scan_commit(pRExC_state,data,minlenp);
3617 flags &= ~SCF_DO_SUBSTR;
3618 }
3619 if (data && OP(scan)==ACCEPT) {
3620 data->flags |= SCF_SEEN_ACCEPT;
3621 if (stopmin > min)
3622 stopmin = min;
3623 }
e2e6a0f1 3624 }
b515a41d
YO
3625 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3626 {
0f5d15d6 3627 if (flags & SCF_DO_SUBSTR) {
1de06328 3628 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3629 data->longest = &(data->longest_float);
3630 }
3631 is_inf = is_inf_internal = 1;
653099ff 3632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3633 cl_anything(pRExC_state, data->start_class);
96776eda 3634 flags &= ~SCF_DO_STCLASS;
b515a41d 3635 }
786e8c11 3636#ifdef TRIE_STUDY_OPT
40d049e4 3637#ifdef FULL_TRIE_STUDY
b515a41d
YO
3638 else if (PL_regkind[OP(scan)] == TRIE) {
3639 /* NOTE - There is similar code to this block above for handling
3640 BRANCH nodes on the initial study. If you change stuff here
3641 check there too. */
3642 regnode *trie_node= scan;
3643 regnode *tail= regnext(scan);
3644 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3645 I32 max1 = 0, min1 = I32_MAX;
3646 struct regnode_charclass_class accum;
3647
3648 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3649 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3650 if (flags & SCF_DO_STCLASS)
3651 cl_init_zero(pRExC_state, &accum);
3652
3653 if (!trie->jump) {
3654 min1= trie->minlen;
3655 max1= trie->maxlen;
3656 } else {
3657 const regnode *nextbranch= NULL;
3658 U32 word;
3659
3660 for ( word=1 ; word <= trie->wordcount ; word++)
3661 {
3662 I32 deltanext=0, minnext=0, f = 0, fake;
3663 struct regnode_charclass_class this_class;
3664
3665 data_fake.flags = 0;
3666 if (data) {
3667 data_fake.whilem_c = data->whilem_c;
3668 data_fake.last_closep = data->last_closep;
3669 }
3670 else
3671 data_fake.last_closep = &fake;
3672
3673 if (flags & SCF_DO_STCLASS) {
3674 cl_init(pRExC_state, &this_class);
3675 data_fake.start_class = &this_class;
3676 f = SCF_DO_STCLASS_AND;
3677 }
3678 if (flags & SCF_WHILEM_VISITED_POS)
3679 f |= SCF_WHILEM_VISITED_POS;
3680
3681 if (trie->jump[word]) {
3682 if (!nextbranch)
3683 nextbranch = trie_node + trie->jump[0];
3684 scan= trie_node + trie->jump[word];
3685 /* We go from the jump point to the branch that follows
3686 it. Note this means we need the vestigal unused branches
3687 even though they arent otherwise used.
3688 */
3689 minnext = study_chunk(pRExC_state, &scan, minlenp,
3690 &deltanext, (regnode *)nextbranch, &data_fake,
3691 stopparen, recursed, NULL, f,depth+1);
3692 }
3693 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3694 nextbranch= regnext((regnode*)nextbranch);
3695
3696 if (min1 > (I32)(minnext + trie->minlen))
3697 min1 = minnext + trie->minlen;
3698 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3699 max1 = minnext + deltanext + trie->maxlen;
3700 if (deltanext == I32_MAX)
3701 is_inf = is_inf_internal = 1;
3702
3703 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3704 pars++;
3705 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3706 if ( stopmin > min + min1)
3707 stopmin = min + min1;
3708 flags &= ~SCF_DO_SUBSTR;
3709 if (data)
3710 data->flags |= SCF_SEEN_ACCEPT;
3711 }
3712 if (data) {
3713 if (data_fake.flags & SF_HAS_EVAL)
3714 data->flags |= SF_HAS_EVAL;
3715 data->whilem_c = data_fake.whilem_c;
3716 }
3717 if (flags & SCF_DO_STCLASS)
3718 cl_or(pRExC_state, &accum, &this_class);
3719 }
3720 }
3721 if (flags & SCF_DO_SUBSTR) {
3722 data->pos_min += min1;
3723 data->pos_delta += max1 - min1;
3724 if (max1 != min1 || is_inf)
3725 data->longest = &(data->longest_float);
3726 }
3727 min += min1;
3728 delta += max1 - min1;
3729 if (flags & SCF_DO_STCLASS_OR) {
3730 cl_or(pRExC_state, data->start_class, &accum);
3731 if (min1) {
3732 cl_and(data->start_class, and_withp);
3733 flags &= ~SCF_DO_STCLASS;
3734 }
3735 }
3736 else if (flags & SCF_DO_STCLASS_AND) {
3737 if (min1) {
3738 cl_and(data->start_class, &accum);
3739 flags &= ~SCF_DO_STCLASS;
3740 }
3741 else {
3742 /* Switch to OR mode: cache the old value of
3743 * data->start_class */
3744 INIT_AND_WITHP;
3745 StructCopy(data->start_class, and_withp,
3746 struct regnode_charclass_class);
3747 flags &= ~SCF_DO_STCLASS_AND;
3748 StructCopy(&accum, data->start_class,
3749 struct regnode_charclass_class);
3750 flags |= SCF_DO_STCLASS_OR;
3751 data->start_class->flags |= ANYOF_EOS;
3752 }
3753 }
3754 scan= tail;
3755 continue;
3756 }
786e8c11 3757#else
b515a41d
YO
3758 else if (PL_regkind[OP(scan)] == TRIE) {
3759 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3760 U8*bang=NULL;
3761
3762 min += trie->minlen;
3763 delta += (trie->maxlen - trie->minlen);
3764 flags &= ~SCF_DO_STCLASS; /* xxx */
3765 if (flags & SCF_DO_SUBSTR) {
3766 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3767 data->pos_min += trie->minlen;
3768 data->pos_delta += (trie->maxlen - trie->minlen);
3769 if (trie->maxlen != trie->minlen)
3770 data->longest = &(data->longest_float);
3771 }
3772 if (trie->jump) /* no more substrings -- for now /grr*/
3773 flags &= ~SCF_DO_SUBSTR;
3774 }
786e8c11 3775#endif /* old or new */
b515a41d
YO
3776#endif /* TRIE_STUDY_OPT */
3777 /* Else: zero-length, ignore. */
3778 scan = regnext(scan);
3779 }
3780 DEBUG_PEEP("FEND",scan,depth);
3781 scan = frame->next;
3782 stopparen = frame->stop;
3783 frame = frame->prev;
3784
c277df42
IZ
3785 }
3786
3787 finish:
3788 *scanp = scan;
aca2d497 3789 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3790 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3791 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3792 if (is_par > (I32)U8_MAX)
c277df42
IZ
3793 is_par = 0;
3794 if (is_par && pars==1 && data) {
3795 data->flags |= SF_IN_PAR;
3796 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3797 }
3798 else if (pars && data) {
c277df42
IZ
3799 data->flags |= SF_HAS_PAR;
3800 data->flags &= ~SF_IN_PAR;
3801 }
653099ff 3802 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3803 cl_and(data->start_class, and_withp);
786e8c11
YO
3804 if (flags & SCF_TRIE_RESTUDY)
3805 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3806
3807 DEBUG_STUDYDATA(data,depth);
3808
e2e6a0f1 3809 return min < stopmin ? min : stopmin;
c277df42
IZ
3810}
3811
76e3520e 3812STATIC I32
5f66b61c 3813S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3814{
830247a4 3815 if (RExC_rx->data) {
2eb97020 3816 const U32 count = RExC_rx->data->count;
b81d288d 3817 Renewc(RExC_rx->data,
2eb97020 3818 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3819 char, struct reg_data);
2eb97020 3820 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3821 RExC_rx->data->count += n;
a0ed51b3
LW
3822 }
3823 else {
a02a5408 3824 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3825 char, struct reg_data);
a02a5408 3826 Newx(RExC_rx->data->what, n, U8);
830247a4 3827 RExC_rx->data->count = n;
c277df42 3828 }
830247a4
IZ
3829 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3830 return RExC_rx->data->count - n;
c277df42
IZ
3831}
3832
76234dfb 3833#ifndef PERL_IN_XSUB_RE
d88dccdf 3834void
864dbfa3 3835Perl_reginitcolors(pTHX)
d88dccdf 3836{
97aff369 3837 dVAR;
1df70142 3838 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3839 if (s) {
1df70142
AL
3840 char *t = savepv(s);
3841 int i = 0;
3842 PL_colors[0] = t;
d88dccdf 3843 while (++i < 6) {
1df70142
AL
3844 t = strchr(t, '\t');
3845 if (t) {
3846 *t = '\0';
3847 PL_colors[i] = ++t;
d88dccdf
IZ
3848 }
3849 else
1df70142 3850 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3851 }
3852 } else {
1df70142 3853 int i = 0;
b81d288d 3854 while (i < 6)
06b5626a 3855 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3856 }
3857 PL_colorset = 1;
3858}
76234dfb 3859#endif
8615cb43 3860
07be1b83 3861
786e8c11
YO
3862#ifdef TRIE_STUDY_OPT
3863#define CHECK_RESTUDY_GOTO \
3864 if ( \
3865 (data.flags & SCF_TRIE_RESTUDY) \
3866 && ! restudied++ \
3867 ) goto reStudy
3868#else
3869#define CHECK_RESTUDY_GOTO
3870#endif
f9f4320a 3871
a687059c 3872/*
e50aee73 3873 - pregcomp - compile a regular expression into internal code
a687059c
LW
3874 *
3875 * We can't allocate space until we know how big the compiled form will be,
3876 * but we can't compile it (and thus know how big it is) until we've got a
3877 * place to put the code. So we cheat: we compile it twice, once with code
3878 * generation turned off and size counting turned on, and once "for real".
3879 * This also means that we don't allocate space until we are sure that the
3880 * thing really will compile successfully, and we never have to move the
3881 * code and thus invalidate pointers into it. (Note that it has to be in
3882 * one piece because free() must be able to free it all.) [NB: not true in perl]
3883 *
3884 * Beware that the optimization-preparation code in here knows about some
3885 * of the structure of the compiled regexp. [I'll say.]
3886 */
b9b4dddf
YO
3887
3888
3889
f9f4320a 3890#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3891#define RE_ENGINE_PTR &PL_core_reg_engine
3892#else
f9f4320a
YO
3893extern const struct regexp_engine my_reg_engine;
3894#define RE_ENGINE_PTR &my_reg_engine
3895#endif
b9b4dddf
YO
3896/* these make a few things look better, to avoid indentation */
3897#define BEGIN_BLOCK {
f9f4320a
YO
3898#define END_BLOCK }
3899
a687059c 3900regexp *
864dbfa3 3901Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3902{
97aff369 3903 dVAR;
f9f4320a
YO
3904 GET_RE_DEBUG_FLAGS_DECL;
3905 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3906#ifndef PERL_IN_XSUB_RE
3907 BEGIN_BLOCK
f9f4320a
YO
3908 /* Dispatch a request to compile a regexp to correct
3909 regexp engine. */
3910 HV * const table = GvHV(PL_hintgv);
3911 if (table) {
3912 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3913 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3914 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3915 DEBUG_COMPILE_r({
8d8756e7 3916 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3917 SvIV(*ptr));
3918 });
f2f78491 3919 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3920 }
b9b4dddf
YO
3921 }
3922 END_BLOCK
3923#endif
3924 BEGIN_BLOCK
a0d0e21e 3925 register regexp *r;
c277df42 3926 regnode *scan;
c277df42 3927 regnode *first;
a0d0e21e 3928 I32 flags;
a0d0e21e
LW
3929 I32 minlen = 0;
3930 I32 sawplus = 0;
3931 I32 sawopen = 0;
2c2d71f5 3932 scan_data_t data;
830247a4 3933 RExC_state_t RExC_state;
be8e71aa 3934 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3935#ifdef TRIE_STUDY_OPT
3936 int restudied= 0;
3937 RExC_state_t copyRExC_state;
3938#endif
a0d0e21e 3939 if (exp == NULL)
c277df42 3940 FAIL("NULL regexp argument");
a0d0e21e 3941
a5961de5 3942 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3943
5cfc7842 3944 RExC_precomp = exp;
a3621e74 3945 DEBUG_COMPILE_r({
ab3bbdeb
YO
3946 SV *dsv= sv_newmortal();
3947 RE_PV_QUOTED_DECL(s, RExC_utf8,
3948 dsv, RExC_precomp, (xend - exp), 60);
3949 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3950 PL_colors[4],PL_colors[5],s);
a5961de5 3951 });
e2509266 3952 RExC_flags = pm->op_pmflags;
830247a4 3953 RExC_sawback = 0;
bbce6d69 3954
830247a4
IZ
3955 RExC_seen = 0;
3956 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3957 RExC_seen_evals = 0;
3958 RExC_extralen = 0;
c277df42 3959
bbce6d69 3960 /* First pass: determine size, legality. */
830247a4 3961 RExC_parse = exp;
fac92740 3962 RExC_start = exp;
830247a4
IZ
3963 RExC_end = xend;
3964 RExC_naughty = 0;
3965 RExC_npar = 1;
c74340f9 3966 RExC_cpar = 1;
e2e6a0f1 3967 RExC_nestroot = 0;
830247a4
IZ
3968 RExC_size = 0L;
3969 RExC_emit = &PL_regdummy;
3970 RExC_whilem_seen = 0;
fc8cd66c 3971 RExC_charnames = NULL;
40d049e4
YO
3972 RExC_open_parens = NULL;
3973 RExC_close_parens = NULL;
3974 RExC_opend = NULL;
81714fb9 3975 RExC_paren_names = NULL;
40d049e4
YO
3976 RExC_recurse = NULL;
3977 RExC_recurse_count = 0;
81714fb9 3978
85ddcde9
JH
3979#if 0 /* REGC() is (currently) a NOP at the first pass.
3980 * Clever compilers notice this and complain. --jhi */
830247a4 3981 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3982#endif
3dab1dad
YO
3983 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3984 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3985 RExC_precomp = NULL;
a0d0e21e
LW
3986 return(NULL);
3987 }
07be1b83 3988 DEBUG_PARSE_r({
81714fb9
YO
3989 PerlIO_printf(Perl_debug_log,
3990 "Required size %"IVdf" nodes\n"
3991 "Starting second pass (creation)\n",
3992 (IV)RExC_size);
07be1b83
YO
3993 RExC_lastnum=0;
3994 RExC_lastparse=NULL;
3995 });
c277df42
IZ
3996 /* Small enough for pointer-storage convention?
3997 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3998 if (RExC_size >= 0x10000L && RExC_extralen)
3999 RExC_size += RExC_extralen;
c277df42 4000 else
830247a4
IZ
4001 RExC_extralen = 0;
4002 if (RExC_whilem_seen > 15)
4003 RExC_whilem_seen = 15;
a0d0e21e 4004
e2e6a0f1
YO
4005#ifdef DEBUGGING
4006 /* Make room for a sentinel value at the end of the program */
4007 RExC_size++;
4008#endif
4009
f9f4320a
YO
4010 /* Allocate space and zero-initialize. Note, the two step process
4011 of zeroing when in debug mode, thus anything assigned has to
4012 happen after that */
a02a5408 4013 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 4014 char, regexp);
a0d0e21e 4015 if (r == NULL)
b45f050a 4016 FAIL("Regexp out of space");
0f79a09d
GS
4017#ifdef DEBUGGING
4018 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 4019 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 4020#endif
f9f4320a
YO
4021 /* initialization begins here */
4022 r->engine= RE_ENGINE_PTR;
c277df42 4023 r->refcnt = 1;
bbce6d69 4024 r->prelen = xend - exp;
5cfc7842 4025 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 4026 r->subbeg = NULL;
f8c7b90f 4027#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 4028 r->saved_copy = NULL;
ed252734 4029#endif
cf93c79d 4030 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4031 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 4032 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
4033
4034 r->substrs = 0; /* Useful during FAIL. */
4035 r->startp = 0; /* Useful during FAIL. */
6bda09f9 4036 r->endp = 0;
c74340f9 4037 r->swap = NULL;
81714fb9
YO
4038 r->paren_names = 0;
4039
6bda09f9 4040 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4041 Newxz(RExC_open_parens, RExC_npar,regnode *);
4042 SAVEFREEPV(RExC_open_parens);
4043 Newxz(RExC_close_parens,RExC_npar,regnode *);
4044 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4045 }
4046
4047 /* Useful during FAIL. */
a02a5408 4048 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4049 if (r->offsets) {
2af232bd 4050 r->offsets[0] = RExC_size;
fac92740 4051 }
a3621e74 4052 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4053 "%s %"UVuf" bytes for offset annotations.\n",
4054 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4055 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4056
830247a4 4057 RExC_rx = r;
bbce6d69 4058
4059 /* Second pass: emit code. */
e2509266 4060 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4061 RExC_parse = exp;
4062 RExC_end = xend;
4063 RExC_naughty = 0;
4064 RExC_npar = 1;
c74340f9 4065 RExC_cpar = 1;
fac92740 4066 RExC_emit_start = r->program;
830247a4 4067 RExC_emit = r->program;
e2e6a0f1
YO
4068#ifdef DEBUGGING
4069 /* put a sentinal on the end of the program so we can check for
4070 overwrites */
4071 r->program[RExC_size].type = 255;
4072#endif
2cd61cdb 4073 /* Store the count of eval-groups for security checks: */
786e8c11 4074 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4075 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4076 r->data = 0;
3dab1dad 4077 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4078 return(NULL);
6bda09f9 4079
07be1b83
YO
4080 /* XXXX To minimize changes to RE engine we always allocate
4081 3-units-long substrs field. */
4082 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4083 if (RExC_recurse_count) {
4084 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4085 SAVEFREEPV(RExC_recurse);
4086 }
a0d0e21e 4087
07be1b83 4088reStudy:
1de06328 4089 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4090 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4091
07be1b83
YO
4092#ifdef TRIE_STUDY_OPT
4093 if ( restudied ) {
5d458dd8 4094 U32 seen=RExC_seen;
07be1b83 4095 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4096
4097 RExC_state = copyRExC_state;
4098 if (seen & REG_TOP_LEVEL_BRANCHES)
4099 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4100 else
4101 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4102 if (data.last_found) {
07be1b83 4103 SvREFCNT_dec(data.longest_fixed);
07be1b83 4104 SvREFCNT_dec(data.longest_float);
07be1b83 4105 SvREFCNT_dec(data.last_found);
1de06328 4106 }
40d049e4 4107 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4108 } else {
40d049e4 4109 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4110 copyRExC_state = RExC_state;
07be1b83 4111 }
40d049e4
YO
4112#else
4113 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4114#endif
fc8cd66c 4115
a0d0e21e 4116 /* Dig out information for optimizations. */
cf93c79d 4117 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4118 pm->op_pmflags = RExC_flags;
a0ed51b3 4119 if (UTF)
5ff6fc6d 4120 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4121 r->regstclass = NULL;
830247a4 4122 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4123 r->reganch |= ROPT_NAUGHTY;
c277df42 4124 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4125
1de06328
YO
4126 /* testing for BRANCH here tells us whether there is "must appear"
4127 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4128 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4129 I32 fake;
c5254dd6 4130 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4131 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4132 int stclass_flag;
07be1b83 4133 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4134
4135 first = scan;
c277df42 4136 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4137 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4138 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4139 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4140 /* for now we can't handle lookbehind IFMATCH*/
4141 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4142 (OP(first) == PLUS) ||
4143 (OP(first) == MINMOD) ||
653099ff 4144 /* An {n,m} with n>0 */
07be1b83
YO
4145 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4146 {
786e8c11 4147
a0d0e21e
LW
4148 if (OP(first) == PLUS)
4149 sawplus = 1;
4150 else
3dab1dad 4151 first += regarglen[OP(first)];
07be1b83
YO
4152 if (OP(first) == IFMATCH) {
4153 first = NEXTOPER(first);
4154 first += EXTRA_STEP_2ARGS;
7c167cea 4155 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4156 first = NEXTOPER(first);
a687059c
LW
4157 }
4158
a0d0e21e
LW
4159 /* Starting-point info. */
4160 again:
786e8c11 4161 DEBUG_PEEP("first:",first,0);
07be1b83 4162 /* Ignore EXACT as we deal with it later. */
3dab1dad 4163 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4164 if (OP(first) == EXACT)
6f207bd3 4165 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4166 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4167 r->regstclass = first;
4168 }
07be1b83 4169#ifdef TRIE_STCLASS
786e8c11 4170 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4171 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4172 {
786e8c11 4173 regnode *trie_op;
07be1b83 4174 /* this can happen only on restudy */
786e8c11
YO
4175 if ( OP(first) == TRIE ) {
4176 struct regnode_1 *trieop;
4177 Newxz(trieop,1,struct regnode_1);
4178 StructCopy(first,trieop,struct regnode_1);
4179 trie_op=(regnode *)trieop;
4180 } else {
4181 struct regnode_charclass *trieop;
4182 Newxz(trieop,1,struct regnode_charclass);
4183 StructCopy(first,trieop,struct regnode_charclass);
4184 trie_op=(regnode *)trieop;
4185 }
1de06328 4186 OP(trie_op)+=2;
786e8c11
YO
4187 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4188 r->regstclass = trie_op;
07be1b83
YO
4189 }
4190#endif
bfed75c6 4191 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4192 r->regstclass = first;
3dab1dad
YO
4193 else if (PL_regkind[OP(first)] == BOUND ||
4194 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4195 r->regstclass = first;
3dab1dad 4196 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4197 r->reganch |= (OP(first) == MBOL
4198 ? ROPT_ANCH_MBOL
4199 : (OP(first) == SBOL
4200 ? ROPT_ANCH_SBOL
4201 : ROPT_ANCH_BOL));
a0d0e21e 4202 first = NEXTOPER(first);
774d564b 4203 goto again;
4204 }
4205 else if (OP(first) == GPOS) {
4206 r->reganch |= ROPT_ANCH_GPOS;
4207 first = NEXTOPER(first);
4208 goto again;
a0d0e21e 4209 }
e09294f4 4210 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4211 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4212 !(r->reganch & ROPT_ANCH) )
4213 {
4214 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4215 const int type =
4216 (OP(NEXTOPER(first)) == REG_ANY)
4217 ? ROPT_ANCH_MBOL
4218 : ROPT_ANCH_SBOL;
cad2e5aa 4219 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4220 first = NEXTOPER(first);
774d564b 4221 goto again;
a0d0e21e 4222 }
b81d288d 4223 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4224 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4225 /* x+ must match at the 1st pos of run of x's */
4226 r->reganch |= ROPT_SKIP;
a0d0e21e 4227
c277df42 4228 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4229#ifdef TRIE_STUDY_OPT
81714fb9 4230 DEBUG_PARSE_r(
be8e71aa
YO
4231 if (!restudied)
4232 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4233 (IV)(first - scan + 1))
4234 );
4235#else
81714fb9 4236 DEBUG_PARSE_r(
be8e71aa
YO
4237 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4238 (IV)(first - scan + 1))
4239 );
4240#endif
4241
4242
a0d0e21e
LW
4243 /*
4244 * If there's something expensive in the r.e., find the
4245 * longest literal string that must appear and make it the
4246 * regmust. Resolve ties in favor of later strings, since
4247 * the regstart check works with the beginning of the r.e.
4248 * and avoiding duplication strengthens checking. Not a
4249 * strong reason, but sufficient in the absence of others.
4250 * [Now we resolve ties in favor of the earlier string if
c277df42 4251 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4252 * earlier string may buy us something the later one won't.]
4253 */
de8c5301 4254
396482e1
GA
4255 data.longest_fixed = newSVpvs("");
4256 data.longest_float = newSVpvs("");
4257 data.last_found = newSVpvs("");
c277df42
IZ
4258 data.longest = &(data.longest_fixed);
4259 first = scan;
653099ff 4260 if (!r->regstclass) {
830247a4 4261 cl_init(pRExC_state, &ch_class);
653099ff
GS
4262 data.start_class = &ch_class;
4263 stclass_flag = SCF_DO_STCLASS_AND;
4264 } else /* XXXX Check for BOUND? */
4265 stclass_flag = 0;
cb434fcc 4266 data.last_closep = &last_close;
de8c5301 4267
1de06328 4268 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4269 &data, -1, NULL, NULL,
4270 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4271
07be1b83 4272
786e8c11
YO
4273 CHECK_RESTUDY_GOTO;
4274
4275
830247a4 4276 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4277 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4278 && !RExC_seen_zerolen
4279 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4280 r->reganch |= ROPT_CHECK_ALL;
1de06328 4281 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4282 SvREFCNT_dec(data.last_found);
4283
1de06328
YO
4284 /* Note that code very similar to this but for anchored string
4285 follows immediately below, changes may need to be made to both.
4286 Be careful.
4287 */
a0ed51b3 4288 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4289 if (longest_float_length
c277df42
IZ
4290 || (data.flags & SF_FL_BEFORE_EOL
4291 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4292 || (RExC_flags & PMf_MULTILINE))))
4293 {
1182767e 4294 I32 t,ml;
cf93c79d 4295
1de06328 4296 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4297 && data.offset_fixed == data.offset_float_min
4298 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4299 goto remove_float; /* As in (a)+. */
4300
1de06328
YO
4301 /* copy the information about the longest float from the reg_scan_data
4302 over to the program. */
33b8afdf
JH
4303 if (SvUTF8(data.longest_float)) {
4304 r->float_utf8 = data.longest_float;
c445ea15 4305 r->float_substr = NULL;
33b8afdf
JH
4306 } else {
4307 r->float_substr = data.longest_float;
c445ea15 4308 r->float_utf8 = NULL;
33b8afdf 4309 }
1de06328
YO
4310 /* float_end_shift is how many chars that must be matched that
4311 follow this item. We calculate it ahead of time as once the
4312 lookbehind offset is added in we lose the ability to correctly
4313 calculate it.*/
4314 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4315 : (I32)longest_float_length;
1de06328
YO
4316 r->float_end_shift = ml - data.offset_float_min
4317 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4318 + data.lookbehind_float;
4319 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4320 r->float_max_offset = data.offset_float_max;
1182767e 4321 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4322 r->float_max_offset -= data.lookbehind_float;
4323
cf93c79d
IZ
4324 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4325 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4326 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4327 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4328 }
4329 else {
aca2d497 4330 remove_float:
c445ea15 4331 r->float_substr = r->float_utf8 = NULL;
c277df42 4332 SvREFCNT_dec(data.longest_float);
c5254dd6 4333 longest_float_length = 0;
a0d0e21e 4334 }
c277df42 4335
1de06328
YO
4336 /* Note that code very similar to this but for floating string
4337 is immediately above, changes may need to be made to both.
4338 Be careful.
4339 */
a0ed51b3 4340 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4341 if (longest_fixed_length
c277df42
IZ
4342 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4343 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4344 || (RExC_flags & PMf_MULTILINE))))
4345 {
1182767e 4346 I32 t,ml;
cf93c79d 4347
1de06328
YO
4348 /* copy the information about the longest fixed
4349 from the reg_scan_data over to the program. */
33b8afdf
JH
4350 if (SvUTF8(data.longest_fixed)) {
4351 r->anchored_utf8 = data.longest_fixed;
c445ea15 4352 r->anchored_substr = NULL;
33b8afdf
JH
4353 } else {
4354 r->anchored_substr = data.longest_fixed;
c445ea15 4355 r->anchored_utf8 = NULL;
33b8afdf 4356 }
1de06328
YO
4357 /* fixed_end_shift is how many chars that must be matched that
4358 follow this item. We calculate it ahead of time as once the
4359 lookbehind offset is added in we lose the ability to correctly
4360 calculate it.*/
4361 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4362 : (I32)longest_fixed_length;
1de06328
YO
4363 r->anchored_end_shift = ml - data.offset_fixed
4364 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4365 + data.lookbehind_fixed;
4366 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4367
cf93c79d
IZ
4368 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4369 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4370 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4371 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4372 }
4373 else {
c445ea15 4374 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4375 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4376 longest_fixed_length = 0;
a0d0e21e 4377 }
b81d288d 4378 if (r->regstclass
ffc61ed2 4379 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4380 r->regstclass = NULL;
33b8afdf
JH
4381 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4382 && stclass_flag
653099ff 4383 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4384 && !cl_is_anything(data.start_class))
4385 {
1df70142 4386 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4387
a02a5408 4388 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4389 struct regnode_charclass_class);
4390 StructCopy(data.start_class,
830247a4 4391 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4392 struct regnode_charclass_class);
830247a4 4393 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4394 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4395 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4396 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4397 PerlIO_printf(Perl_debug_log,
a0288114 4398 "synthetic stclass \"%s\".\n",
3f7c398e 4399 SvPVX_const(sv));});
653099ff 4400 }
c277df42
IZ
4401
4402 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4403 if (longest_fixed_length > longest_float_length) {
1de06328 4404 r->check_end_shift = r->anchored_end_shift;
c277df42 4405 r->check_substr = r->anchored_substr;
33b8afdf 4406 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4407 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4408 if (r->reganch & ROPT_ANCH_SINGLE)
4409 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4410 }
4411 else {
1de06328 4412 r->check_end_shift = r->float_end_shift;
c277df42 4413 r->check_substr = r->float_substr;
33b8afdf 4414 r->check_utf8 = r->float_utf8;
1de06328
YO
4415 r->check_offset_min = r->float_min_offset;
4416 r->check_offset_max = r->float_max_offset;
a0d0e21e 4417 }
30382c73
IZ
4418 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4419 This should be changed ASAP! */
33b8afdf 4420 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4421 r->reganch |= RE_USE_INTUIT;
33b8afdf 4422 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4423 r->reganch |= RE_INTUIT_TAIL;
4424 }
1de06328
YO
4425 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4426 if ( (STRLEN)minlen < longest_float_length )
4427 minlen= longest_float_length;
4428 if ( (STRLEN)minlen < longest_fixed_length )
4429 minlen= longest_fixed_length;
4430 */
a0ed51b3
LW
4431 }
4432 else {
c277df42
IZ
4433 /* Several toplevels. Best we can is to set minlen. */
4434 I32 fake;
653099ff 4435 struct regnode_charclass_class ch_class;
cb434fcc 4436 I32 last_close = 0;
c277df42 4437
5d458dd8 4438 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4439
c277df42 4440 scan = r->program + 1;
830247a4 4441 cl_init(pRExC_state, &ch_class);
653099ff 4442 data.start_class = &ch_class;
cb434fcc 4443 data.last_closep = &last_close;
07be1b83 4444
de8c5301 4445
1de06328 4446 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4447 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4448
786e8c11 4449 CHECK_RESTUDY_GOTO;
07be1b83 4450
33b8afdf 4451 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4452 = r->float_substr = r->float_utf8 = NULL;
653099ff 4453 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4454 && !cl_is_anything(data.start_class))
4455 {
1df70142 4456 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4457
a02a5408 4458 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4459 struct regnode_charclass_class);
4460 StructCopy(data.start_class,
830247a4 4461 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4462 struct regnode_charclass_class);
830247a4 4463 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4464 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4465 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4466 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4467 PerlIO_printf(Perl_debug_log,
a0288114 4468 "synthetic stclass \"%s\".\n",
3f7c398e 4469 SvPVX_const(sv));});
653099ff 4470 }
a0d0e21e
LW
4471 }
4472
1de06328
YO
4473 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4474 the "real" pattern. */
cf9788e3
RGS
4475 DEBUG_OPTIMISE_r({
4476 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4477 minlen, r->minlen);
4478 });
de8c5301 4479 r->minlenret = minlen;
1de06328
YO
4480 if (r->minlen < minlen)
4481 r->minlen = minlen;
4482
b81d288d 4483 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4484 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4485 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4486 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4487 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4488 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4489 if (RExC_seen & REG_SEEN_CANY)
4490 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4491 if (RExC_seen & REG_SEEN_VERBARG)
4492 r->reganch |= ROPT_VERBARG_SEEN;
5d458dd8
YO
4493 if (RExC_seen & REG_SEEN_CUTGROUP)
4494 r->reganch |= ROPT_CUTGROUP_SEEN;
81714fb9
YO
4495 if (RExC_paren_names)
4496 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4497 else
4498 r->paren_names = NULL;
4499
40d049e4
YO
4500 if (RExC_recurse_count) {
4501 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4502 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4503 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4504 }
4505 }
a02a5408
JC
4506 Newxz(r->startp, RExC_npar, I32);
4507 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4508 /* assume we don't need to swap parens around before we match */
4509
be8e71aa
YO
4510 DEBUG_DUMP_r({
4511 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4512 regdump(r);
4513 });
8e9a8a48
YO
4514 DEBUG_OFFSETS_r(if (r->offsets) {
4515 const U32 len = r->offsets[0];
4516 U32 i;
4517 GET_RE_DEBUG_FLAGS_DECL;
4518 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4519 for (i = 1; i <= len; i++) {
4520 if (r->offsets[i*2-1] || r->offsets[i*2])
4521 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4522 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4523 }
4524 PerlIO_printf(Perl_debug_log, "\n");
4525 });
a0d0e21e 4526 return(r);
f9f4320a 4527 END_BLOCK
a687059c
LW
4528}
4529
f9f4320a
YO
4530#undef CORE_ONLY_BLOCK
4531#undef END_BLOCK
4532#undef RE_ENGINE_PTR
3dab1dad 4533
9af228c6 4534#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4535SV*
4536Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4537{
4538 I32 parno = 0; /* no match */
4539 if (PL_curpm) {
4540 const REGEXP * const rx = PM_GETRE(PL_curpm);
4541 if (rx && rx->paren_names) {
4542 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4543 if (he_str) {
4544 IV i;
4545 SV* sv_dat=HeVAL(he_str);
4546 I32 *nums=(I32*)SvPVX(sv_dat);
4547 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4548 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4549 rx->endp[nums[i]] != -1)
4550 {
4551 parno = nums[i];
4552 break;
4553 }
4554 }
4555 }
4556 }
4557 }
4558 if ( !parno ) {
4559 return 0;
4560 } else {
4561 GV *gv_paren;
4562 SV *sv= sv_newmortal();
4563 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4564 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4565 return GvSVn(gv_paren);
4566 }
4567}
9af228c6 4568#endif
0a4db386 4569
894be9b7 4570/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4571 * If flags is REG_RSN_RETURN_NULL returns null.
4572 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4573 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4574 * to the parsed name as looked up in the RExC_paren_names hash.
4575 * If there is an error throws a vFAIL().. type exception.
894be9b7 4576 */
0a4db386
YO
4577
4578#define REG_RSN_RETURN_NULL 0
4579#define REG_RSN_RETURN_NAME 1
4580#define REG_RSN_RETURN_DATA 2
4581
894be9b7
YO
4582STATIC SV*
4583S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4584 char *name_start = RExC_parse;
0a4db386 4585 if ( UTF ) {
894be9b7 4586 STRLEN numlen;
0a4db386
YO
4587 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4588 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4589 {
4590 RExC_parse += numlen;
4591 }
4592 } else {
4593 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4594 RExC_parse++;
4595 }
0a4db386
YO
4596 if ( flags ) {
4597 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4598 (int)(RExC_parse - name_start)));
894be9b7 4599 if (UTF)
0a4db386
YO
4600 SvUTF8_on(sv_name);
4601 if ( flags == REG_RSN_RETURN_NAME)
4602 return sv_name;
4603 else if (flags==REG_RSN_RETURN_DATA) {
4604 HE *he_str = NULL;
4605 SV *sv_dat = NULL;
4606 if ( ! sv_name ) /* should not happen*/
4607 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4608 if (RExC_paren_names)
4609 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4610 if ( he_str )
4611 sv_dat = HeVAL(he_str);
4612 if ( ! sv_dat )
4613 vFAIL("Reference to nonexistent named group");
4614 return sv_dat;
4615 }
4616 else {
4617 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4618 }
4619 /* NOT REACHED */
894be9b7 4620 }
0a4db386 4621 return NULL;
894be9b7
YO
4622}
4623
3dab1dad
YO
4624#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4625 int rem=(int)(RExC_end - RExC_parse); \
4626 int cut; \
4627 int num; \
4628 int iscut=0; \
4629 if (rem>10) { \
4630 rem=10; \
4631 iscut=1; \
4632 } \
4633 cut=10-rem; \
4634 if (RExC_lastparse!=RExC_parse) \
4635 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4636 rem, RExC_parse, \
4637 cut + 4, \
4638 iscut ? "..." : "<" \
4639 ); \
4640 else \
4641 PerlIO_printf(Perl_debug_log,"%16s",""); \
4642 \
4643 if (SIZE_ONLY) \
4644 num=RExC_size; \
4645 else \
4646 num=REG_NODE_NUM(RExC_emit); \
4647 if (RExC_lastnum!=num) \
0a4db386 4648 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4649 else \
0a4db386 4650 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4651 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4652 (int)((depth*2)), "", \
3dab1dad
YO
4653 (funcname) \
4654 ); \
4655 RExC_lastnum=num; \
4656 RExC_lastparse=RExC_parse; \
4657})
4658
07be1b83
YO
4659
4660
3dab1dad
YO
4661#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4662 DEBUG_PARSE_MSG((funcname)); \
4663 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4664})
6bda09f9
YO
4665#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4666 DEBUG_PARSE_MSG((funcname)); \
4667 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4668})
a687059c
LW
4669/*
4670 - reg - regular expression, i.e. main body or parenthesized thing
4671 *
4672 * Caller must absorb opening parenthesis.
4673 *
4674 * Combining parenthesis handling with the base level of regular expression
4675 * is a trifle forced, but the need to tie the tails of the branches to what
4676 * follows makes it hard to avoid.
4677 */
07be1b83
YO
4678#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4679#ifdef DEBUGGING
4680#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4681#else
4682#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4683#endif
3dab1dad 4684
e2e6a0f1
YO
4685/* this idea is borrowed from STR_WITH_LEN in handy.h */
4686#define CHECK_WORD(s,v,l) \
4687 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4688
76e3520e 4689STATIC regnode *
3dab1dad 4690S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4691 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4692{
27da23d5 4693 dVAR;
c277df42
IZ
4694 register regnode *ret; /* Will be the head of the group. */
4695 register regnode *br;
4696 register regnode *lastbr;
cbbf8932 4697 register regnode *ender = NULL;
a0d0e21e 4698 register I32 parno = 0;
cbbf8932
AL
4699 I32 flags;
4700 const I32 oregflags = RExC_flags;
6136c704
AL
4701 bool have_branch = 0;
4702 bool is_open = 0;
9d1d55b5
JP
4703
4704 /* for (?g), (?gc), and (?o) warnings; warning
4705 about (?c) will warn about (?g) -- japhy */
4706
6136c704
AL
4707#define WASTED_O 0x01
4708#define WASTED_G 0x02
4709#define WASTED_C 0x04
4710#define WASTED_GC (0x02|0x04)
cbbf8932 4711 I32 wastedflags = 0x00;
9d1d55b5 4712
fac92740 4713 char * parse_start = RExC_parse; /* MJD */
a28509cc 4714 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4715
3dab1dad
YO
4716 GET_RE_DEBUG_FLAGS_DECL;
4717 DEBUG_PARSE("reg ");
4718
4719
821b33a5 4720 *flagp = 0; /* Tentatively. */
a0d0e21e 4721
9d1d55b5 4722
a0d0e21e
LW
4723 /* Make an OPEN node, if parenthesized. */
4724 if (paren) {
e2e6a0f1
YO
4725 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4726 char *start_verb = RExC_parse;
4727 STRLEN verb_len = 0;
4728 char *start_arg = NULL;
4729 unsigned char op = 0;
4730 int argok = 1;
4731 int internal_argval = 0; /* internal_argval is only useful if !argok */
4732 while ( *RExC_parse && *RExC_parse != ')' ) {
4733 if ( *RExC_parse == ':' ) {
4734 start_arg = RExC_parse + 1;
4735 break;
4736 }
4737 RExC_parse++;
4738 }
4739 ++start_verb;
4740 verb_len = RExC_parse - start_verb;
4741 if ( start_arg ) {
4742 RExC_parse++;
4743 while ( *RExC_parse && *RExC_parse != ')' )
4744 RExC_parse++;
4745 if ( *RExC_parse != ')' )
4746 vFAIL("Unterminated verb pattern argument");
4747 if ( RExC_parse == start_arg )
4748 start_arg = NULL;
4749 } else {
4750 if ( *RExC_parse != ')' )
4751 vFAIL("Unterminated verb pattern");
4752 }
5d458dd8 4753
e2e6a0f1
YO
4754 switch ( *start_verb ) {
4755 case 'A': /* (*ACCEPT) */
4756 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4757 op = ACCEPT;
4758 internal_argval = RExC_nestroot;
4759 }
4760 break;
4761 case 'C': /* (*COMMIT) */
4762 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4763 op = COMMIT;
e2e6a0f1
YO
4764 break;
4765 case 'F': /* (*FAIL) */
4766 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4767 op = OPFAIL;
4768 argok = 0;
4769 }
4770 break;
5d458dd8
YO
4771 case ':': /* (*:NAME) */
4772 case 'M': /* (*MARK:NAME) */
4773 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4774 op = MARKPOINT;
5d458dd8
YO
4775 argok = -1;
4776 }
4777 break;
4778 case 'P': /* (*PRUNE) */
4779 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4780 op = PRUNE;
e2e6a0f1 4781 break;
5d458dd8
YO
4782 case 'S': /* (*SKIP) */
4783 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4784 op = SKIP;
4785 break;
4786 case 'T': /* (*THEN) */
4787 /* [19:06] <TimToady> :: is then */
4788 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4789 op = CUTGROUP;
4790 RExC_seen |= REG_SEEN_CUTGROUP;
4791 }
e2e6a0f1
YO
4792 break;
4793 }
4794 if ( ! op ) {
4795 RExC_parse++;
4796 vFAIL3("Unknown verb pattern '%.*s'",
4797 verb_len, start_verb);
4798 }
4799 if ( argok ) {
4800 if ( start_arg && internal_argval ) {
4801 vFAIL3("Verb pattern '%.*s' may not have an argument",
4802 verb_len, start_verb);
4803 } else if ( argok < 0 && !start_arg ) {
4804 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4805 verb_len, start_verb);
4806 } else {
4807 ret = reganode(pRExC_state, op, internal_argval);
4808 if ( ! internal_argval && ! SIZE_ONLY ) {
4809 if (start_arg) {
4810 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4811 ARG(ret) = add_data( pRExC_state, 1, "S" );
4812 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4813 ret->flags = 0;
4814 } else {
4815 ret->flags = 1;
4816 }
4817 }
4818 }
4819 if (!internal_argval)
4820 RExC_seen |= REG_SEEN_VERBARG;
4821 } else if ( start_arg ) {
4822 vFAIL3("Verb pattern '%.*s' may not have an argument",
4823 verb_len, start_verb);
4824 } else {
4825 ret = reg_node(pRExC_state, op);
4826 }
4827 nextchar(pRExC_state);
4828 return ret;
4829 } else
fac92740 4830 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4831 U32 posflags = 0, negflags = 0;
4832 U32 *flagsp = &posflags;
6136c704 4833 bool is_logical = 0;
a28509cc 4834 const char * const seqstart = RExC_parse;
ca9dfc88 4835
830247a4
IZ
4836 RExC_parse++;
4837 paren = *RExC_parse++;
c277df42 4838 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4839 switch (paren) {
894be9b7 4840
fac92740 4841 case '<': /* (?<...) */
b81d288d 4842 if (*RExC_parse == '!')
c277df42 4843 paren = ',';
0a4db386
YO
4844 else if (*RExC_parse != '=')
4845 { /* (?<...>) */
81714fb9 4846 char *name_start;
894be9b7 4847 SV *svname;
81714fb9
YO
4848 paren= '>';
4849 case '\'': /* (?'...') */
4850 name_start= RExC_parse;
0a4db386
YO
4851 svname = reg_scan_name(pRExC_state,
4852 SIZE_ONLY ? /* reverse test from the others */
4853 REG_RSN_RETURN_NAME :
4854 REG_RSN_RETURN_NULL);
81714fb9
YO
4855 if (RExC_parse == name_start)
4856 goto unknown;
4857 if (*RExC_parse != paren)
4858 vFAIL2("Sequence (?%c... not terminated",
4859 paren=='>' ? '<' : paren);
4860 if (SIZE_ONLY) {
e62cc96a
YO
4861 HE *he_str;
4862 SV *sv_dat = NULL;
894be9b7
YO
4863 if (!svname) /* shouldnt happen */
4864 Perl_croak(aTHX_
4865 "panic: reg_scan_name returned NULL");
81714fb9
YO
4866 if (!RExC_paren_names) {
4867 RExC_paren_names= newHV();
4868 sv_2mortal((SV*)RExC_paren_names);
4869 }
4870 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4871 if ( he_str )
81714fb9 4872 sv_dat = HeVAL(he_str);
e62cc96a 4873 if ( ! sv_dat ) {
81714fb9 4874 /* croak baby croak */
e62cc96a
YO
4875 Perl_croak(aTHX_
4876 "panic: paren_name hash element allocation failed");
4877 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4878 IV count=SvIV(sv_dat);
4879 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4880 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4881 pv[count]=RExC_npar;
4882 SvIVX(sv_dat)++;
4883 } else {
4884 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4885 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4886 SvIOK_on(sv_dat);
4887 SvIVX(sv_dat)= 1;
e62cc96a
YO
4888 }
4889
81714fb9
YO
4890 /*sv_dump(sv_dat);*/
4891 }
4892 nextchar(pRExC_state);
4893 paren = 1;
4894 goto capturing_parens;
4895 }
4896 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4897 RExC_parse++;
fac92740
MJD
4898 case '=': /* (?=...) */
4899 case '!': /* (?!...) */
830247a4 4900 RExC_seen_zerolen++;
e2e6a0f1
YO
4901 if (*RExC_parse == ')') {
4902 ret=reg_node(pRExC_state, OPFAIL);
4903 nextchar(pRExC_state);
4904 return ret;
4905 }
fac92740
MJD
4906 case ':': /* (?:...) */
4907 case '>': /* (?>...) */
a0d0e21e 4908 break;
fac92740
MJD
4909 case '$': /* (?$...) */
4910 case '@': /* (?@...) */
8615cb43 4911 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4912 break;
fac92740 4913 case '#': /* (?#...) */
830247a4
IZ
4914 while (*RExC_parse && *RExC_parse != ')')
4915 RExC_parse++;
4916 if (*RExC_parse != ')')
c277df42 4917 FAIL("Sequence (?#... not terminated");
830247a4 4918 nextchar(pRExC_state);
a0d0e21e
LW
4919 *flagp = TRYAGAIN;
4920 return NULL;
894be9b7
YO
4921 case '0' : /* (?0) */
4922 case 'R' : /* (?R) */
4923 if (*RExC_parse != ')')
6bda09f9 4924 FAIL("Sequence (?R) not terminated");
1a147d38 4925 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4926 nextchar(pRExC_state);
4927 return ret;
4928 /*notreached*/
894be9b7
YO
4929 { /* named and numeric backreferences */
4930 I32 num;
4931 char * parse_start;
4932 case '&': /* (?&NAME) */
4933 parse_start = RExC_parse - 1;
4934 {
0a4db386
YO
4935 SV *sv_dat = reg_scan_name(pRExC_state,
4936 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4937 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4938 }
4939 goto gen_recurse_regop;
4940 /* NOT REACHED */
542fa716
YO
4941 case '+':
4942 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4943 RExC_parse++;
4944 vFAIL("Illegal pattern");
4945 }
4946 goto parse_recursion;
4947 /* NOT REACHED*/
4948 case '-': /* (?-1) */
4949 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4950 RExC_parse--; /* rewind to let it be handled later */
4951 goto parse_flags;
4952 }
4953 /*FALLTHROUGH */
6bda09f9
YO
4954 case '1': case '2': case '3': case '4': /* (?1) */
4955 case '5': case '6': case '7': case '8': case '9':
4956 RExC_parse--;
542fa716 4957 parse_recursion:
894be9b7
YO
4958 num = atoi(RExC_parse);
4959 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
4960 if (*RExC_parse == '-')
4961 RExC_parse++;
6bda09f9
YO
4962 while (isDIGIT(*RExC_parse))
4963 RExC_parse++;
4964 if (*RExC_parse!=')')
4965 vFAIL("Expecting close bracket");
894be9b7
YO
4966
4967 gen_recurse_regop:
542fa716
YO
4968 if ( paren == '-' ) {
4969 /*
4970 Diagram of capture buffer numbering.
4971 Top line is the normal capture buffer numbers
4972 Botton line is the negative indexing as from
4973 the X (the (?-2))
4974
4975 + 1 2 3 4 5 X 6 7
4976 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4977 - 5 4 3 2 1 X x x
4978
4979 */
4980 num = RExC_npar + num;
4981 if (num < 1) {
4982 RExC_parse++;
4983 vFAIL("Reference to nonexistent group");
4984 }
4985 } else if ( paren == '+' ) {
4986 num = RExC_npar + num - 1;
4987 }
4988
1a147d38 4989 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4990 if (!SIZE_ONLY) {
4991 if (num > (I32)RExC_rx->nparens) {
4992 RExC_parse++;
4993 vFAIL("Reference to nonexistent group");
4994 }
40d049e4 4995 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4996 RExC_emit++;
226de585 4997 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4998 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4999 } else {
6bda09f9 5000 RExC_size++;
6bda09f9 5001 }
0a4db386 5002 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5003 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5004 Set_Node_Offset(ret, parse_start); /* MJD */
5005
6bda09f9
YO
5006 nextchar(pRExC_state);
5007 return ret;
894be9b7
YO
5008 } /* named and numeric backreferences */
5009 /* NOT REACHED */
5010
fac92740 5011 case 'p': /* (?p...) */
9014280d 5012 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5013 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5014 /* FALL THROUGH*/
fac92740 5015 case '?': /* (??...) */
6136c704 5016 is_logical = 1;
438a3801
YST
5017 if (*RExC_parse != '{')
5018 goto unknown;
830247a4 5019 paren = *RExC_parse++;
0f5d15d6 5020 /* FALL THROUGH */
fac92740 5021 case '{': /* (?{...}) */
c277df42 5022 {
c277df42
IZ
5023 I32 count = 1, n = 0;
5024 char c;
830247a4 5025 char *s = RExC_parse;
c277df42 5026
830247a4
IZ
5027 RExC_seen_zerolen++;
5028 RExC_seen |= REG_SEEN_EVAL;
5029 while (count && (c = *RExC_parse)) {
6136c704
AL
5030 if (c == '\\') {
5031 if (RExC_parse[1])
5032 RExC_parse++;
5033 }
b81d288d 5034 else if (c == '{')
c277df42 5035 count++;
b81d288d 5036 else if (c == '}')
c277df42 5037 count--;
830247a4 5038 RExC_parse++;
c277df42 5039 }
6136c704 5040 if (*RExC_parse != ')') {
b81d288d 5041 RExC_parse = s;
b45f050a
JF
5042 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5043 }
c277df42 5044 if (!SIZE_ONLY) {
f3548bdc 5045 PAD *pad;
6136c704
AL
5046 OP_4tree *sop, *rop;
5047 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5048
569233ed
SB
5049 ENTER;
5050 Perl_save_re_context(aTHX);
f3548bdc 5051 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5052 sop->op_private |= OPpREFCOUNTED;
5053 /* re_dup will OpREFCNT_inc */
5054 OpREFCNT_set(sop, 1);
569233ed 5055 LEAVE;
c277df42 5056
830247a4
IZ
5057 n = add_data(pRExC_state, 3, "nop");
5058 RExC_rx->data->data[n] = (void*)rop;
5059 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 5060 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 5061 SvREFCNT_dec(sv);
a0ed51b3 5062 }
e24b16f9 5063 else { /* First pass */
830247a4 5064 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5065 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5066 /* No compiled RE interpolated, has runtime
5067 components ===> unsafe. */
5068 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5069 if (PL_tainting && PL_tainted)
cc6b7395 5070 FAIL("Eval-group in insecure regular expression");
54df2634 5071#if PERL_VERSION > 8
923e4eb5 5072 if (IN_PERL_COMPILETIME)
b5c19bd7 5073 PL_cv_has_eval = 1;
54df2634 5074#endif
c277df42 5075 }
b5c19bd7 5076
830247a4 5077 nextchar(pRExC_state);
6136c704 5078 if (is_logical) {
830247a4 5079 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5080 if (!SIZE_ONLY)
5081 ret->flags = 2;
3dab1dad 5082 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5083 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5084 return ret;
5085 }
ccb2c380
MP
5086 ret = reganode(pRExC_state, EVAL, n);
5087 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5088 Set_Node_Offset(ret, parse_start);
5089 return ret;
c277df42 5090 }
fac92740 5091 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5092 {
0a4db386 5093 int is_define= 0;
fac92740 5094 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5095 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5096 || RExC_parse[1] == '<'
830247a4 5097 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5098 I32 flag;
5099
830247a4 5100 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5101 if (!SIZE_ONLY)
5102 ret->flags = 1;
3dab1dad 5103 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5104 goto insert_if;
b81d288d 5105 }
a0ed51b3 5106 }
0a4db386
YO
5107 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5108 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5109 {
5110 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5111 char *name_start= RExC_parse++;
5112 I32 num = 0;
5113 SV *sv_dat=reg_scan_name(pRExC_state,
5114 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5115 if (RExC_parse == name_start || *RExC_parse != ch)
5116 vFAIL2("Sequence (?(%c... not terminated",
5117 (ch == '>' ? '<' : ch));
5118 RExC_parse++;
5119 if (!SIZE_ONLY) {
5120 num = add_data( pRExC_state, 1, "S" );
5121 RExC_rx->data->data[num]=(void*)sv_dat;
5122 SvREFCNT_inc(sv_dat);
5123 }
5124 ret = reganode(pRExC_state,NGROUPP,num);
5125 goto insert_if_check_paren;
5126 }
5127 else if (RExC_parse[0] == 'D' &&
5128 RExC_parse[1] == 'E' &&
5129 RExC_parse[2] == 'F' &&
5130 RExC_parse[3] == 'I' &&
5131 RExC_parse[4] == 'N' &&
5132 RExC_parse[5] == 'E')
5133 {
5134 ret = reganode(pRExC_state,DEFINEP,0);
5135 RExC_parse +=6 ;
5136 is_define = 1;
5137 goto insert_if_check_paren;
5138 }
5139 else if (RExC_parse[0] == 'R') {
5140 RExC_parse++;
5141 parno = 0;
5142 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5143 parno = atoi(RExC_parse++);
5144 while (isDIGIT(*RExC_parse))
5145 RExC_parse++;
5146 } else if (RExC_parse[0] == '&') {
5147 SV *sv_dat;
5148 RExC_parse++;
5149 sv_dat = reg_scan_name(pRExC_state,
5150 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5151 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5152 }
1a147d38 5153 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5154 goto insert_if_check_paren;
5155 }
830247a4 5156 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5157 /* (?(1)...) */
6136c704 5158 char c;
830247a4 5159 parno = atoi(RExC_parse++);
c277df42 5160
830247a4
IZ
5161 while (isDIGIT(*RExC_parse))
5162 RExC_parse++;
fac92740 5163 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5164
0a4db386 5165 insert_if_check_paren:
830247a4 5166 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5167 vFAIL("Switch condition not recognized");
c277df42 5168 insert_if:
3dab1dad
YO
5169 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5170 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5171 if (br == NULL)
830247a4 5172 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5173 else
3dab1dad 5174 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5175 c = *nextchar(pRExC_state);
d1b80229
IZ
5176 if (flags&HASWIDTH)
5177 *flagp |= HASWIDTH;
c277df42 5178 if (c == '|') {
0a4db386
YO
5179 if (is_define)
5180 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5181 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5182 regbranch(pRExC_state, &flags, 1,depth+1);
5183 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5184 if (flags&HASWIDTH)
5185 *flagp |= HASWIDTH;
830247a4 5186 c = *nextchar(pRExC_state);
a0ed51b3
LW
5187 }
5188 else
c277df42
IZ
5189 lastbr = NULL;
5190 if (c != ')')
8615cb43 5191 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5192 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5193 REGTAIL(pRExC_state, br, ender);
c277df42 5194 if (lastbr) {
3dab1dad
YO
5195 REGTAIL(pRExC_state, lastbr, ender);
5196 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5197 }
5198 else
3dab1dad 5199 REGTAIL(pRExC_state, ret, ender);
c277df42 5200 return ret;
a0ed51b3
LW
5201 }
5202 else {
830247a4 5203 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5204 }
5205 }
1b1626e4 5206 case 0:
830247a4 5207 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5208 vFAIL("Sequence (? incomplete");
1b1626e4 5209 break;
a0d0e21e 5210 default:
830247a4 5211 --RExC_parse;
fac92740 5212 parse_flags: /* (?i) */
830247a4 5213 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5214 /* (?g), (?gc) and (?o) are useless here
5215 and must be globally applied -- japhy */
5216
5217 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5218 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5219 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5220 if (! (wastedflags & wflagbit) ) {
5221 wastedflags |= wflagbit;
5222 vWARN5(
5223 RExC_parse + 1,
5224 "Useless (%s%c) - %suse /%c modifier",
5225 flagsp == &negflags ? "?-" : "?",
5226 *RExC_parse,
5227 flagsp == &negflags ? "don't " : "",
5228 *RExC_parse
5229 );
5230 }
5231 }
5232 }
5233 else if (*RExC_parse == 'c') {
5234 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5235 if (! (wastedflags & WASTED_C) ) {
5236 wastedflags |= WASTED_GC;
9d1d55b5
JP
5237 vWARN3(
5238 RExC_parse + 1,
5239 "Useless (%sc) - %suse /gc modifier",
5240 flagsp == &negflags ? "?-" : "?",
5241 flagsp == &negflags ? "don't " : ""
5242 );
5243 }
5244 }
5245 }
5246 else { pmflag(flagsp, *RExC_parse); }
5247
830247a4 5248 ++RExC_parse;
ca9dfc88 5249 }
830247a4 5250 if (*RExC_parse == '-') {
ca9dfc88 5251 flagsp = &negflags;
9d1d55b5 5252 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5253 ++RExC_parse;
ca9dfc88 5254 goto parse_flags;
48c036b1 5255 }
e2509266
JH
5256 RExC_flags |= posflags;
5257 RExC_flags &= ~negflags;
830247a4
IZ
5258 if (*RExC_parse == ':') {
5259 RExC_parse++;
ca9dfc88
IZ
5260 paren = ':';
5261 break;
5262 }
c277df42 5263 unknown:
830247a4
IZ
5264 if (*RExC_parse != ')') {
5265 RExC_parse++;
5266 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5267 }
830247a4 5268 nextchar(pRExC_state);
a0d0e21e
LW
5269 *flagp = TRYAGAIN;
5270 return NULL;
5271 }
5272 }
fac92740 5273 else { /* (...) */
81714fb9 5274 capturing_parens:
830247a4
IZ
5275 parno = RExC_npar;
5276 RExC_npar++;
e2e6a0f1 5277
830247a4 5278 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5279 if (!SIZE_ONLY ){
5280 if (!RExC_nestroot)
5281 RExC_nestroot = parno;
5282 if (RExC_seen & REG_SEEN_RECURSE) {
5283 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5284 "Setting open paren #%"IVdf" to %d\n",
5285 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5286 RExC_open_parens[parno-1]= ret;
5287 }
6bda09f9 5288 }
fac92740
MJD
5289 Set_Node_Length(ret, 1); /* MJD */
5290 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5291 is_open = 1;
a0d0e21e 5292 }
a0ed51b3 5293 }
fac92740 5294 else /* ! paren */
a0d0e21e
LW
5295 ret = NULL;
5296
5297 /* Pick up the branches, linking them together. */
fac92740 5298 parse_start = RExC_parse; /* MJD */
3dab1dad 5299 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5300 /* branch_len = (paren != 0); */
2af232bd 5301
a0d0e21e
LW
5302 if (br == NULL)
5303 return(NULL);
830247a4
IZ
5304 if (*RExC_parse == '|') {
5305 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5306 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5307 }
fac92740 5308 else { /* MJD */
6bda09f9 5309 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5310 Set_Node_Length(br, paren != 0);
5311 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5312 }
c277df42
IZ
5313 have_branch = 1;
5314 if (SIZE_ONLY)
830247a4 5315 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5316 }
5317 else if (paren == ':') {
c277df42
IZ
5318 *flagp |= flags&SIMPLE;
5319 }
6136c704 5320 if (is_open) { /* Starts with OPEN. */
3dab1dad 5321 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5322 }
5323 else if (paren != '?') /* Not Conditional */
a0d0e21e 5324 ret = br;
32a0ca98 5325 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5326 lastbr = br;
830247a4
IZ
5327 while (*RExC_parse == '|') {
5328 if (!SIZE_ONLY && RExC_extralen) {
5329 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5330 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5331 }
5332 if (SIZE_ONLY)
830247a4
IZ
5333 RExC_extralen += 2; /* Account for LONGJMP. */
5334 nextchar(pRExC_state);
3dab1dad 5335 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5336
a687059c 5337 if (br == NULL)
a0d0e21e 5338 return(NULL);
3dab1dad 5339 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5340 lastbr = br;
821b33a5
IZ
5341 if (flags&HASWIDTH)
5342 *flagp |= HASWIDTH;
a687059c 5343 *flagp |= flags&SPSTART;
a0d0e21e
LW
5344 }
5345
c277df42
IZ
5346 if (have_branch || paren != ':') {
5347 /* Make a closing node, and hook it on the end. */
5348 switch (paren) {
5349 case ':':
830247a4 5350 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5351 break;
5352 case 1:
c74340f9 5353 RExC_cpar++;
830247a4 5354 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5355 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5356 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5357 "Setting close paren #%"IVdf" to %d\n",
5358 (IV)parno, REG_NODE_NUM(ender)));
5359 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5360 if (RExC_nestroot == parno)
5361 RExC_nestroot = 0;
40d049e4 5362 }
fac92740
MJD
5363 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5364 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5365 break;
5366 case '<':
c277df42
IZ
5367 case ',':
5368 case '=':
5369 case '!':
c277df42 5370 *flagp &= ~HASWIDTH;
821b33a5
IZ
5371 /* FALL THROUGH */
5372 case '>':
830247a4 5373 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5374 break;
5375 case 0:
830247a4 5376 ender = reg_node(pRExC_state, END);
40d049e4
YO
5377 if (!SIZE_ONLY) {
5378 assert(!RExC_opend); /* there can only be one! */
5379 RExC_opend = ender;
5380 }
c277df42
IZ
5381 break;
5382 }
eaf3ca90 5383 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5384
9674d46a 5385 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5386 if (depth==1)
5387 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5388
c277df42 5389 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5390 for (br = ret; br; br = regnext(br)) {
5391 const U8 op = PL_regkind[OP(br)];
5392 if (op == BRANCH) {
07be1b83 5393 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5394 }
5395 else if (op == BRANCHJ) {
07be1b83 5396 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5397 }
c277df42
IZ
5398 }
5399 }
a0d0e21e 5400 }
c277df42
IZ
5401
5402 {
e1ec3a88
AL
5403 const char *p;
5404 static const char parens[] = "=!<,>";
c277df42
IZ
5405
5406 if (paren && (p = strchr(parens, paren))) {
eb160463 5407 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5408 int flag = (p - parens) > 1;
5409
5410 if (paren == '>')
5411 node = SUSPEND, flag = 0;
6bda09f9 5412 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5413 Set_Node_Cur_Length(ret);
5414 Set_Node_Offset(ret, parse_start + 1);
c277df42 5415 ret->flags = flag;
07be1b83 5416 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5417 }
a0d0e21e
LW
5418 }
5419
5420 /* Check for proper termination. */
ce3e6498 5421 if (paren) {
e2509266 5422 RExC_flags = oregflags;
830247a4
IZ
5423 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5424 RExC_parse = oregcomp_parse;
380a0633 5425 vFAIL("Unmatched (");
ce3e6498 5426 }
a0ed51b3 5427 }
830247a4
IZ
5428 else if (!paren && RExC_parse < RExC_end) {
5429 if (*RExC_parse == ')') {
5430 RExC_parse++;
380a0633 5431 vFAIL("Unmatched )");
a0ed51b3
LW
5432 }
5433 else
b45f050a 5434 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5435 /* NOTREACHED */
5436 }
a687059c 5437
a0d0e21e 5438 return(ret);
a687059c
LW
5439}
5440
5441/*
5442 - regbranch - one alternative of an | operator
5443 *
5444 * Implements the concatenation operator.
5445 */
76e3520e 5446STATIC regnode *
3dab1dad 5447S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5448{
97aff369 5449 dVAR;
c277df42
IZ
5450 register regnode *ret;
5451 register regnode *chain = NULL;
5452 register regnode *latest;
5453 I32 flags = 0, c = 0;
3dab1dad
YO
5454 GET_RE_DEBUG_FLAGS_DECL;
5455 DEBUG_PARSE("brnc");
b81d288d 5456 if (first)
c277df42
IZ
5457 ret = NULL;
5458 else {
b81d288d 5459 if (!SIZE_ONLY && RExC_extralen)
830247a4 5460 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5461 else {
830247a4 5462 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5463 Set_Node_Length(ret, 1);
5464 }
c277df42
IZ
5465 }
5466
b81d288d 5467 if (!first && SIZE_ONLY)
830247a4 5468 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5469
c277df42 5470 *flagp = WORST; /* Tentatively. */
a0d0e21e 5471
830247a4
IZ
5472 RExC_parse--;
5473 nextchar(pRExC_state);
5474 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5475 flags &= ~TRYAGAIN;
3dab1dad 5476 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5477 if (latest == NULL) {
5478 if (flags & TRYAGAIN)
5479 continue;
5480 return(NULL);
a0ed51b3
LW
5481 }
5482 else if (ret == NULL)
c277df42 5483 ret = latest;
a0d0e21e 5484 *flagp |= flags&HASWIDTH;
c277df42 5485 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5486 *flagp |= flags&SPSTART;
5487 else {
830247a4 5488 RExC_naughty++;
3dab1dad 5489 REGTAIL(pRExC_state, chain, latest);
a687059c 5490 }
a0d0e21e 5491 chain = latest;
c277df42
IZ
5492 c++;
5493 }
5494 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5495 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5496 if (ret == NULL)
5497 ret = chain;
5498 }
5499 if (c == 1) {
5500 *flagp |= flags&SIMPLE;
a0d0e21e 5501 }
a687059c 5502
d4c19fe8 5503 return ret;
a687059c
LW
5504}
5505
5506/*
5507 - regpiece - something followed by possible [*+?]
5508 *
5509 * Note that the branching code sequences used for ? and the general cases
5510 * of * and + are somewhat optimized: they use the same NOTHING node as
5511 * both the endmarker for their branch list and the body of the last branch.
5512 * It might seem that this node could be dispensed with entirely, but the
5513 * endmarker role is not redundant.
5514 */
76e3520e 5515STATIC regnode *
3dab1dad 5516S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5517{
97aff369 5518 dVAR;
c277df42 5519 register regnode *ret;
a0d0e21e
LW
5520 register char op;
5521 register char *next;
5522 I32 flags;
1df70142 5523 const char * const origparse = RExC_parse;
a0d0e21e 5524 I32 min;
c277df42 5525 I32 max = REG_INFTY;
fac92740 5526 char *parse_start;
10edeb5d 5527 const char *maxpos = NULL;
3dab1dad
YO
5528 GET_RE_DEBUG_FLAGS_DECL;
5529 DEBUG_PARSE("piec");
a0d0e21e 5530
3dab1dad 5531 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5532 if (ret == NULL) {
5533 if (flags & TRYAGAIN)
5534 *flagp |= TRYAGAIN;
5535 return(NULL);
5536 }
5537
830247a4 5538 op = *RExC_parse;
a0d0e21e 5539
830247a4 5540 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5541 maxpos = NULL;
fac92740 5542 parse_start = RExC_parse; /* MJD */
830247a4 5543 next = RExC_parse + 1;
a0d0e21e
LW
5544 while (isDIGIT(*next) || *next == ',') {
5545 if (*next == ',') {
5546 if (maxpos)
5547 break;
5548 else
5549 maxpos = next;
a687059c 5550 }
a0d0e21e
LW
5551 next++;
5552 }
5553 if (*next == '}') { /* got one */
5554 if (!maxpos)
5555 maxpos = next;
830247a4
IZ
5556 RExC_parse++;
5557 min = atoi(RExC_parse);
a0d0e21e
LW
5558 if (*maxpos == ',')
5559 maxpos++;
5560 else
830247a4 5561 maxpos = RExC_parse;
a0d0e21e
LW
5562 max = atoi(maxpos);
5563 if (!max && *maxpos != '0')
c277df42
IZ
5564 max = REG_INFTY; /* meaning "infinity" */
5565 else if (max >= REG_INFTY)
8615cb43 5566 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5567 RExC_parse = next;
5568 nextchar(pRExC_state);
a0d0e21e
LW
5569
5570 do_curly:
5571 if ((flags&SIMPLE)) {
830247a4 5572 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5573 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5574 Set_Node_Offset(ret, parse_start+1); /* MJD */
5575 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5576 }
5577 else {
3dab1dad 5578 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5579
5580 w->flags = 0;
3dab1dad 5581 REGTAIL(pRExC_state, ret, w);
830247a4 5582 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5583 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5584 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5585 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5586 }
6bda09f9 5587 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5588 /* MJD hk */
5589 Set_Node_Offset(ret, parse_start+1);
2af232bd 5590 Set_Node_Length(ret,
fac92740 5591 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5592
830247a4 5593 if (!SIZE_ONLY && RExC_extralen)
c277df42 5594 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5595 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5596 if (SIZE_ONLY)
830247a4
IZ
5597 RExC_whilem_seen++, RExC_extralen += 3;
5598 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5599 }
c277df42 5600 ret->flags = 0;
a0d0e21e
LW
5601
5602 if (min > 0)
821b33a5
IZ
5603 *flagp = WORST;
5604 if (max > 0)
5605 *flagp |= HASWIDTH;
a0d0e21e 5606 if (max && max < min)
8615cb43 5607 vFAIL("Can't do {n,m} with n > m");
c277df42 5608 if (!SIZE_ONLY) {
eb160463
GS
5609 ARG1_SET(ret, (U16)min);
5610 ARG2_SET(ret, (U16)max);
a687059c 5611 }
a687059c 5612
a0d0e21e 5613 goto nest_check;
a687059c 5614 }
a0d0e21e 5615 }
a687059c 5616
a0d0e21e
LW
5617 if (!ISMULT1(op)) {
5618 *flagp = flags;
a687059c 5619 return(ret);
a0d0e21e 5620 }
bb20fd44 5621
c277df42 5622#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5623
5624 /* if this is reinstated, don't forget to put this back into perldiag:
5625
5626 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5627
5628 (F) The part of the regexp subject to either the * or + quantifier
5629 could match an empty string. The {#} shows in the regular
5630 expression about where the problem was discovered.
5631
5632 */
5633
bb20fd44 5634 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5635 vFAIL("Regexp *+ operand could be empty");
b81d288d 5636#endif
bb20fd44 5637
fac92740 5638 parse_start = RExC_parse;
830247a4 5639 nextchar(pRExC_state);
a0d0e21e 5640
821b33a5 5641 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5642
5643 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5644 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5645 ret->flags = 0;
830247a4 5646 RExC_naughty += 4;
a0d0e21e
LW
5647 }
5648 else if (op == '*') {
5649 min = 0;
5650 goto do_curly;
a0ed51b3
LW
5651 }
5652 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5653 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5654 ret->flags = 0;
830247a4 5655 RExC_naughty += 3;
a0d0e21e
LW
5656 }
5657 else if (op == '+') {
5658 min = 1;
5659 goto do_curly;
a0ed51b3
LW
5660 }
5661 else if (op == '?') {
a0d0e21e
LW
5662 min = 0; max = 1;
5663 goto do_curly;
5664 }
5665 nest_check:
041457d9 5666 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5667 vWARN3(RExC_parse,
b45f050a 5668 "%.*s matches null string many times",
afd78fd5 5669 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5670 origparse);
a0d0e21e
LW
5671 }
5672
b9b4dddf 5673 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5674 nextchar(pRExC_state);
6bda09f9 5675 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5676 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5677 }
b9b4dddf
YO
5678#ifndef REG_ALLOW_MINMOD_SUSPEND
5679 else
5680#endif
5681 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5682 regnode *ender;
5683 nextchar(pRExC_state);
5684 ender = reg_node(pRExC_state, SUCCEED);
5685 REGTAIL(pRExC_state, ret, ender);
5686 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5687 ret->flags = 0;
5688 ender = reg_node(pRExC_state, TAIL);
5689 REGTAIL(pRExC_state, ret, ender);
5690 /*ret= ender;*/
5691 }
5692
5693 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5694 RExC_parse++;
b45f050a
JF
5695 vFAIL("Nested quantifiers");
5696 }
a0d0e21e
LW
5697
5698 return(ret);
a687059c
LW
5699}
5700
fc8cd66c
YO
5701
5702/* reg_namedseq(pRExC_state,UVp)
5703
5704 This is expected to be called by a parser routine that has
5705 recognized'\N' and needs to handle the rest. RExC_parse is
5706 expected to point at the first char following the N at the time
5707 of the call.
5708
5709 If valuep is non-null then it is assumed that we are parsing inside
5710 of a charclass definition and the first codepoint in the resolved
5711 string is returned via *valuep and the routine will return NULL.
5712 In this mode if a multichar string is returned from the charnames
5713 handler a warning will be issued, and only the first char in the
5714 sequence will be examined. If the string returned is zero length
5715 then the value of *valuep is undefined and NON-NULL will
5716 be returned to indicate failure. (This will NOT be a valid pointer
5717 to a regnode.)
5718
5719 If value is null then it is assumed that we are parsing normal text
5720 and inserts a new EXACT node into the program containing the resolved
5721 string and returns a pointer to the new node. If the string is
5722 zerolength a NOTHING node is emitted.
5723
5724 On success RExC_parse is set to the char following the endbrace.
5725 Parsing failures will generate a fatal errorvia vFAIL(...)
5726
5727 NOTE: We cache all results from the charnames handler locally in
5728 the RExC_charnames hash (created on first use) to prevent a charnames
5729 handler from playing silly-buggers and returning a short string and
5730 then a long string for a given pattern. Since the regexp program
5731 size is calculated during an initial parse this would result
5732 in a buffer overrun so we cache to prevent the charname result from
5733 changing during the course of the parse.
5734
5735 */
5736STATIC regnode *
5737S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5738{
5739 char * name; /* start of the content of the name */
5740 char * endbrace; /* endbrace following the name */
5741 SV *sv_str = NULL;
5742 SV *sv_name = NULL;
5743 STRLEN len; /* this has various purposes throughout the code */
5744 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5745 regnode *ret = NULL;
5746
5747 if (*RExC_parse != '{') {
5748 vFAIL("Missing braces on \\N{}");
5749 }
5750 name = RExC_parse+1;
5751 endbrace = strchr(RExC_parse, '}');
5752 if ( ! endbrace ) {
5753 RExC_parse++;
5754 vFAIL("Missing right brace on \\N{}");
5755 }
5756 RExC_parse = endbrace + 1;
5757
5758
5759 /* RExC_parse points at the beginning brace,
5760 endbrace points at the last */
5761 if ( name[0]=='U' && name[1]=='+' ) {
5762 /* its a "unicode hex" notation {U+89AB} */
5763 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5764 | PERL_SCAN_DISALLOW_PREFIX
5765 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5766 UV cp;
196f1508 5767 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5768 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5769 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5770 cp = 0xFFFD;
5771 }
5772 if (cp > 0xff)
5773 RExC_utf8 = 1;
5774 if ( valuep ) {
5775 *valuep = cp;
5776 return NULL;
5777 }
5778 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5779 } else {
5780 /* fetch the charnames handler for this scope */
5781 HV * const table = GvHV(PL_hintgv);
5782 SV **cvp= table ?
5783 hv_fetchs(table, "charnames", FALSE) :
5784 NULL;
5785 SV *cv= cvp ? *cvp : NULL;
5786 HE *he_str;
5787 int count;
5788 /* create an SV with the name as argument */
5789 sv_name = newSVpvn(name, endbrace - name);
5790
5791 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5792 vFAIL2("Constant(\\N{%s}) unknown: "
5793 "(possibly a missing \"use charnames ...\")",
5794 SvPVX(sv_name));
5795 }
5796 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5797 vFAIL2("Constant(\\N{%s}): "
5798 "$^H{charnames} is not defined",SvPVX(sv_name));
5799 }
5800
5801
5802
5803 if (!RExC_charnames) {
5804 /* make sure our cache is allocated */
5805 RExC_charnames = newHV();
6bda09f9 5806 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5807 }
5808 /* see if we have looked this one up before */
5809 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5810 if ( he_str ) {
5811 sv_str = HeVAL(he_str);
5812 cached = 1;
5813 } else {
5814 dSP ;
5815
5816 ENTER ;
5817 SAVETMPS ;
5818 PUSHMARK(SP) ;
5819
5820 XPUSHs(sv_name);
5821
5822 PUTBACK ;
5823
5824 count= call_sv(cv, G_SCALAR);
5825
5826 if (count == 1) { /* XXXX is this right? dmq */
5827 sv_str = POPs;
5828 SvREFCNT_inc_simple_void(sv_str);
5829 }
5830
5831 SPAGAIN ;
5832 PUTBACK ;
5833 FREETMPS ;
5834 LEAVE ;
5835
5836 if ( !sv_str || !SvOK(sv_str) ) {
5837 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5838 "did not return a defined value",SvPVX(sv_name));
5839 }
5840 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5841 cached = 1;
5842 }
5843 }
5844 if (valuep) {
5845 char *p = SvPV(sv_str, len);
5846 if (len) {
5847 STRLEN numlen = 1;
5848 if ( SvUTF8(sv_str) ) {
196f1508 5849 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5850 if (*valuep > 0x7F)
5851 RExC_utf8 = 1;
5852 /* XXXX
5853 We have to turn on utf8 for high bit chars otherwise
5854 we get failures with
5855
5856 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5857 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5858
5859 This is different from what \x{} would do with the same
5860 codepoint, where the condition is > 0xFF.
5861 - dmq
5862 */
5863
5864
5865 } else {
5866 *valuep = (UV)*p;
5867 /* warn if we havent used the whole string? */
5868 }
5869 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5870 vWARN2(RExC_parse,
5871 "Ignoring excess chars from \\N{%s} in character class",
5872 SvPVX(sv_name)
5873 );
5874 }
5875 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5876 vWARN2(RExC_parse,
5877 "Ignoring zero length \\N{%s} in character class",
5878 SvPVX(sv_name)
5879 );
5880 }
5881 if (sv_name)
5882 SvREFCNT_dec(sv_name);
5883 if (!cached)
5884 SvREFCNT_dec(sv_str);
5885 return len ? NULL : (regnode *)&len;
5886 } else if(SvCUR(sv_str)) {
5887
5888 char *s;
5889 char *p, *pend;
5890 STRLEN charlen = 1;
5891 char * parse_start = name-3; /* needed for the offsets */
5892 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5893
5894 ret = reg_node(pRExC_state,
5895 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5896 s= STRING(ret);
5897
5898 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5899 sv_utf8_upgrade(sv_str);
5900 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5901 RExC_utf8= 1;
5902 }
5903
5904 p = SvPV(sv_str, len);
5905 pend = p + len;
5906 /* len is the length written, charlen is the size the char read */
5907 for ( len = 0; p < pend; p += charlen ) {
5908 if (UTF) {
196f1508 5909 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5910 if (FOLD) {
5911 STRLEN foldlen,numlen;
5912 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5913 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5914 /* Emit all the Unicode characters. */
5915
5916 for (foldbuf = tmpbuf;
5917 foldlen;
5918 foldlen -= numlen)
5919 {
5920 uvc = utf8_to_uvchr(foldbuf, &numlen);
5921 if (numlen > 0) {
5922 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5923 s += unilen;
5924 len += unilen;
5925 /* In EBCDIC the numlen
5926 * and unilen can differ. */
5927 foldbuf += numlen;
5928 if (numlen >= foldlen)
5929 break;
5930 }
5931 else
5932 break; /* "Can't happen." */
5933 }
5934 } else {
5935 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5936 if (unilen > 0) {
5937 s += unilen;
5938 len += unilen;
5939 }
5940 }
5941 } else {
5942 len++;
5943 REGC(*p, s++);
5944 }
5945 }
5946 if (SIZE_ONLY) {
5947 RExC_size += STR_SZ(len);
5948 } else {
5949 STR_LEN(ret) = len;
5950 RExC_emit += STR_SZ(len);
5951 }
5952 Set_Node_Cur_Length(ret); /* MJD */
5953 RExC_parse--;
5954 nextchar(pRExC_state);
5955 } else {
5956 ret = reg_node(pRExC_state,NOTHING);
5957 }
5958 if (!cached) {
5959 SvREFCNT_dec(sv_str);
5960 }
5961 if (sv_name) {
5962 SvREFCNT_dec(sv_name);
5963 }
5964 return ret;
5965
5966}
5967
5968
9e08bc66
TS
5969/*
5970 * reg_recode
5971 *
5972 * It returns the code point in utf8 for the value in *encp.
5973 * value: a code value in the source encoding
5974 * encp: a pointer to an Encode object
5975 *
5976 * If the result from Encode is not a single character,
5977 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5978 */
5979STATIC UV
5980S_reg_recode(pTHX_ const char value, SV **encp)
5981{
5982 STRLEN numlen = 1;
5983 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5984 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5985 : SvPVX(sv);
5986 const STRLEN newlen = SvCUR(sv);
5987 UV uv = UNICODE_REPLACEMENT;
5988
5989 if (newlen)
5990 uv = SvUTF8(sv)
5991 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5992 : *(U8*)s;
5993
5994 if (!newlen || numlen != newlen) {
5995 uv = UNICODE_REPLACEMENT;
5996 if (encp)
5997 *encp = NULL;
5998 }
5999 return uv;
6000}
6001
fc8cd66c 6002
a687059c
LW
6003/*
6004 - regatom - the lowest level
6005 *
6006 * Optimization: gobbles an entire sequence of ordinary characters so that
6007 * it can turn them into a single node, which is smaller to store and
6008 * faster to run. Backslashed characters are exceptions, each becoming a
6009 * separate node; the code is simpler that way and it's not worth fixing.
6010 *
7f6f358c
YO
6011 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6012 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6013 */
76e3520e 6014STATIC regnode *
3dab1dad 6015S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6016{
97aff369 6017 dVAR;
cbbf8932 6018 register regnode *ret = NULL;
a0d0e21e 6019 I32 flags;
45948336 6020 char *parse_start = RExC_parse;
3dab1dad
YO
6021 GET_RE_DEBUG_FLAGS_DECL;
6022 DEBUG_PARSE("atom");
a0d0e21e
LW
6023 *flagp = WORST; /* Tentatively. */
6024
6025tryagain:
830247a4 6026 switch (*RExC_parse) {
a0d0e21e 6027 case '^':
830247a4
IZ
6028 RExC_seen_zerolen++;
6029 nextchar(pRExC_state);
e2509266 6030 if (RExC_flags & PMf_MULTILINE)
830247a4 6031 ret = reg_node(pRExC_state, MBOL);
e2509266 6032 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6033 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6034 else
830247a4 6035 ret = reg_node(pRExC_state, BOL);
fac92740 6036 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6037 break;
6038 case '$':
830247a4 6039 nextchar(pRExC_state);
b81d288d 6040 if (*RExC_parse)
830247a4 6041 RExC_seen_zerolen++;
e2509266 6042 if (RExC_flags & PMf_MULTILINE)
830247a4 6043 ret = reg_node(pRExC_state, MEOL);
e2509266 6044 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6045 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6046 else
830247a4 6047 ret = reg_node(pRExC_state, EOL);
fac92740 6048 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6049 break;
6050 case '.':
830247a4 6051 nextchar(pRExC_state);
e2509266 6052 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
6053 ret = reg_node(pRExC_state, SANY);
6054 else
6055 ret = reg_node(pRExC_state, REG_ANY);
6056 *flagp |= HASWIDTH|SIMPLE;
830247a4 6057 RExC_naughty++;
fac92740 6058 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6059 break;
6060 case '[':
b45f050a 6061 {
3dab1dad
YO
6062 char * const oregcomp_parse = ++RExC_parse;
6063 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6064 if (*RExC_parse != ']') {
6065 RExC_parse = oregcomp_parse;
b45f050a
JF
6066 vFAIL("Unmatched [");
6067 }
830247a4 6068 nextchar(pRExC_state);
a0d0e21e 6069 *flagp |= HASWIDTH|SIMPLE;
fac92740 6070 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6071 break;
b45f050a 6072 }
a0d0e21e 6073 case '(':
830247a4 6074 nextchar(pRExC_state);
3dab1dad 6075 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6076 if (ret == NULL) {
bf93d4cc 6077 if (flags & TRYAGAIN) {
830247a4 6078 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6079 /* Make parent create an empty node if needed. */
6080 *flagp |= TRYAGAIN;
6081 return(NULL);
6082 }
a0d0e21e 6083 goto tryagain;
bf93d4cc 6084 }
a0d0e21e
LW
6085 return(NULL);
6086 }
c277df42 6087 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6088 break;
6089 case '|':
6090 case ')':
6091 if (flags & TRYAGAIN) {
6092 *flagp |= TRYAGAIN;
6093 return NULL;
6094 }
b45f050a 6095 vFAIL("Internal urp");
a0d0e21e
LW
6096 /* Supposed to be caught earlier. */
6097 break;
85afd4ae 6098 case '{':
830247a4
IZ
6099 if (!regcurly(RExC_parse)) {
6100 RExC_parse++;
85afd4ae
CS
6101 goto defchar;
6102 }
6103 /* FALL THROUGH */
a0d0e21e
LW
6104 case '?':
6105 case '+':
6106 case '*':
830247a4 6107 RExC_parse++;
b45f050a 6108 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6109 break;
6110 case '\\':
830247a4 6111 switch (*++RExC_parse) {
a0d0e21e 6112 case 'A':
830247a4
IZ
6113 RExC_seen_zerolen++;
6114 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6115 *flagp |= SIMPLE;
830247a4 6116 nextchar(pRExC_state);
fac92740 6117 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6118 break;
6119 case 'G':
830247a4
IZ
6120 ret = reg_node(pRExC_state, GPOS);
6121 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6122 *flagp |= SIMPLE;
830247a4 6123 nextchar(pRExC_state);
fac92740 6124 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6125 break;
6126 case 'Z':
830247a4 6127 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6128 *flagp |= SIMPLE;
a1917ab9 6129 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6130 nextchar(pRExC_state);
a0d0e21e 6131 break;
b85d18e9 6132 case 'z':
830247a4 6133 ret = reg_node(pRExC_state, EOS);
b85d18e9 6134 *flagp |= SIMPLE;
830247a4
IZ
6135 RExC_seen_zerolen++; /* Do not optimize RE away */
6136 nextchar(pRExC_state);
fac92740 6137 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6138 break;
4a2d328f 6139 case 'C':
f33976b4
DB
6140 ret = reg_node(pRExC_state, CANY);
6141 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6142 *flagp |= HASWIDTH|SIMPLE;
830247a4 6143 nextchar(pRExC_state);
fac92740 6144 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6145 break;
6146 case 'X':
830247a4 6147 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6148 *flagp |= HASWIDTH;
830247a4 6149 nextchar(pRExC_state);
fac92740 6150 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6151 break;
a0d0e21e 6152 case 'w':
eb160463 6153 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6154 *flagp |= HASWIDTH|SIMPLE;
830247a4 6155 nextchar(pRExC_state);
fac92740 6156 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6157 break;
6158 case 'W':
eb160463 6159 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6160 *flagp |= HASWIDTH|SIMPLE;
830247a4 6161 nextchar(pRExC_state);
fac92740 6162 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6163 break;
6164 case 'b':
830247a4
IZ
6165 RExC_seen_zerolen++;
6166 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6167 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6168 *flagp |= SIMPLE;
830247a4 6169 nextchar(pRExC_state);
fac92740 6170 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6171 break;
6172 case 'B':
830247a4
IZ
6173 RExC_seen_zerolen++;
6174 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6175 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6176 *flagp |= SIMPLE;
830247a4 6177 nextchar(pRExC_state);
fac92740 6178 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6179 break;
6180 case 's':
eb160463 6181 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6182 *flagp |= HASWIDTH|SIMPLE;
830247a4 6183 nextchar(pRExC_state);
fac92740 6184 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6185 break;
6186 case 'S':
eb160463 6187 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6188 *flagp |= HASWIDTH|SIMPLE;
830247a4 6189 nextchar(pRExC_state);
fac92740 6190 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6191 break;
6192 case 'd':
ffc61ed2 6193 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6194 *flagp |= HASWIDTH|SIMPLE;
830247a4 6195 nextchar(pRExC_state);
fac92740 6196 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6197 break;
6198 case 'D':
ffc61ed2 6199 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6200 *flagp |= HASWIDTH|SIMPLE;
830247a4 6201 nextchar(pRExC_state);
fac92740 6202 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6203 break;
a14b48bc
LW
6204 case 'p':
6205 case 'P':
3568d838 6206 {
3dab1dad 6207 char* const oldregxend = RExC_end;
ccb2c380 6208 char* parse_start = RExC_parse - 2;
a14b48bc 6209
830247a4 6210 if (RExC_parse[1] == '{') {
3568d838 6211 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6212 RExC_end = strchr(RExC_parse, '}');
6213 if (!RExC_end) {
3dab1dad 6214 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6215 RExC_parse += 2;
6216 RExC_end = oldregxend;
0da60cf5 6217 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6218 }
830247a4 6219 RExC_end++;
a14b48bc 6220 }
af6f566e 6221 else {
830247a4 6222 RExC_end = RExC_parse + 2;
af6f566e
HS
6223 if (RExC_end > oldregxend)
6224 RExC_end = oldregxend;
6225 }
830247a4 6226 RExC_parse--;
a14b48bc 6227
3dab1dad 6228 ret = regclass(pRExC_state,depth+1);
a14b48bc 6229
830247a4
IZ
6230 RExC_end = oldregxend;
6231 RExC_parse--;
ccb2c380
MP
6232
6233 Set_Node_Offset(ret, parse_start + 2);
6234 Set_Node_Cur_Length(ret);
830247a4 6235 nextchar(pRExC_state);
a14b48bc
LW
6236 *flagp |= HASWIDTH|SIMPLE;
6237 }
6238 break;
fc8cd66c
YO
6239 case 'N':
6240 /* Handle \N{NAME} here and not below because it can be
6241 multicharacter. join_exact() will join them up later on.
6242 Also this makes sure that things like /\N{BLAH}+/ and
6243 \N{BLAH} being multi char Just Happen. dmq*/
6244 ++RExC_parse;
6245 ret= reg_namedseq(pRExC_state, NULL);
6246 break;
0a4db386 6247 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6248 {
6249 char ch= RExC_parse[1];
6250 if (ch != '<' && ch != '\'') {
6251 if (SIZE_ONLY)
6252 vWARN( RExC_parse + 1,
6253 "Possible broken named back reference treated as literal k");
6254 parse_start--;
6255 goto defchar;
6256 } else {
6257 char* name_start = (RExC_parse += 2);
6258 I32 num = 0;
0a4db386
YO
6259 SV *sv_dat = reg_scan_name(pRExC_state,
6260 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6261 ch= (ch == '<') ? '>' : '\'';
6262
81714fb9
YO
6263 if (RExC_parse == name_start || *RExC_parse != ch)
6264 vFAIL2("Sequence \\k%c... not terminated",
6265 (ch == '>' ? '<' : ch));
6266
6267 RExC_sawback = 1;
6268 ret = reganode(pRExC_state,
6269 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6270 num);
6271 *flagp |= HASWIDTH;
6272
6273
6274 if (!SIZE_ONLY) {
81714fb9
YO
6275 num = add_data( pRExC_state, 1, "S" );
6276 ARG_SET(ret,num);
6277 RExC_rx->data->data[num]=(void*)sv_dat;
6278 SvREFCNT_inc(sv_dat);
6279 }
6280 /* override incorrect value set in reganode MJD */
6281 Set_Node_Offset(ret, parse_start+1);
6282 Set_Node_Cur_Length(ret); /* MJD */
6283 nextchar(pRExC_state);
6284
6285 }
6286 break;
6287 }
a0d0e21e
LW
6288 case 'n':
6289 case 'r':
6290 case 't':
6291 case 'f':
6292 case 'e':
6293 case 'a':
6294 case 'x':
6295 case 'c':
6296 case '0':
6297 goto defchar;
c74340f9 6298 case 'R':
a0d0e21e
LW
6299 case '1': case '2': case '3': case '4':
6300 case '5': case '6': case '7': case '8': case '9':
6301 {
c74340f9
YO
6302 I32 num;
6303 bool isrel=(*RExC_parse=='R');
6304 if (isrel)
6305 RExC_parse++;
6306 num = atoi(RExC_parse);
6307 if (isrel) {
6308 num = RExC_cpar - num;
6309 if (num < 1)
6310 vFAIL("Reference to nonexistent or unclosed group");
6311 }
830247a4 6312 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6313 goto defchar;
6314 else {
3dab1dad 6315 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6316 while (isDIGIT(*RExC_parse))
6317 RExC_parse++;
b45f050a 6318
c74340f9
YO
6319 if (!SIZE_ONLY) {
6320 if (num > (I32)RExC_rx->nparens)
6321 vFAIL("Reference to nonexistent group");
6322 /* People make this error all the time apparently.
6323 So we cant fail on it, even though we should
6324
6325 else if (num >= RExC_cpar)
6326 vFAIL("Reference to unclosed group will always match");
6327 */
6328 }
830247a4 6329 RExC_sawback = 1;
eb160463
GS
6330 ret = reganode(pRExC_state,
6331 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6332 num);
a0d0e21e 6333 *flagp |= HASWIDTH;
2af232bd 6334
fac92740 6335 /* override incorrect value set in reganode MJD */
2af232bd 6336 Set_Node_Offset(ret, parse_start+1);
fac92740 6337 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6338 RExC_parse--;
6339 nextchar(pRExC_state);
a0d0e21e
LW
6340 }
6341 }
6342 break;
6343 case '\0':
830247a4 6344 if (RExC_parse >= RExC_end)
b45f050a 6345 FAIL("Trailing \\");
a0d0e21e
LW
6346 /* FALL THROUGH */
6347 default:
a0288114 6348 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6349 back into the quick-grab loop below */
45948336 6350 parse_start--;
a0d0e21e
LW
6351 goto defchar;
6352 }
6353 break;
4633a7c4
LW
6354
6355 case '#':
e2509266 6356 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6357 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6358 RExC_parse++;
830247a4 6359 if (RExC_parse < RExC_end)
4633a7c4
LW
6360 goto tryagain;
6361 }
6362 /* FALL THROUGH */
6363
a0d0e21e 6364 default: {
ba210ebe 6365 register STRLEN len;
58ae7d3f 6366 register UV ender;
a0d0e21e 6367 register char *p;
3dab1dad 6368 char *s;
80aecb99 6369 STRLEN foldlen;
89ebb4a3 6370 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6371
6372 parse_start = RExC_parse - 1;
a0d0e21e 6373
830247a4 6374 RExC_parse++;
a0d0e21e
LW
6375
6376 defchar:
58ae7d3f 6377 ender = 0;
eb160463
GS
6378 ret = reg_node(pRExC_state,
6379 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6380 s = STRING(ret);
830247a4
IZ
6381 for (len = 0, p = RExC_parse - 1;
6382 len < 127 && p < RExC_end;
a0d0e21e
LW
6383 len++)
6384 {
3dab1dad 6385 char * const oldp = p;
5b5a24f7 6386
e2509266 6387 if (RExC_flags & PMf_EXTENDED)
830247a4 6388 p = regwhite(p, RExC_end);
a0d0e21e
LW
6389 switch (*p) {
6390 case '^':
6391 case '$':
6392 case '.':
6393 case '[':
6394 case '(':
6395 case ')':
6396 case '|':
6397 goto loopdone;
6398 case '\\':
6399 switch (*++p) {
6400 case 'A':
1ed8eac0
JF
6401 case 'C':
6402 case 'X':
a0d0e21e
LW
6403 case 'G':
6404 case 'Z':
b85d18e9 6405 case 'z':
a0d0e21e
LW
6406 case 'w':
6407 case 'W':
6408 case 'b':
6409 case 'B':
6410 case 's':
6411 case 'S':
6412 case 'd':
6413 case 'D':
a14b48bc
LW
6414 case 'p':
6415 case 'P':
fc8cd66c 6416 case 'N':
c74340f9 6417 case 'R':
a0d0e21e
LW
6418 --p;
6419 goto loopdone;
6420 case 'n':
6421 ender = '\n';
6422 p++;
a687059c 6423 break;
a0d0e21e
LW
6424 case 'r':
6425 ender = '\r';
6426 p++;
a687059c 6427 break;
a0d0e21e
LW
6428 case 't':
6429 ender = '\t';
6430 p++;
a687059c 6431 break;
a0d0e21e
LW
6432 case 'f':
6433 ender = '\f';
6434 p++;
a687059c 6435 break;
a0d0e21e 6436 case 'e':
c7f1f016 6437 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6438 p++;
a687059c 6439 break;
a0d0e21e 6440 case 'a':
c7f1f016 6441 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6442 p++;
a687059c 6443 break;
a0d0e21e 6444 case 'x':
a0ed51b3 6445 if (*++p == '{') {
1df70142 6446 char* const e = strchr(p, '}');
b81d288d 6447
b45f050a 6448 if (!e) {
830247a4 6449 RExC_parse = p + 1;
b45f050a
JF
6450 vFAIL("Missing right brace on \\x{}");
6451 }
de5f0749 6452 else {
a4c04bdc
NC
6453 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6454 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6455 STRLEN numlen = e - p - 1;
53305cf1 6456 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6457 if (ender > 0xff)
6458 RExC_utf8 = 1;
a0ed51b3
LW
6459 p = e + 1;
6460 }
a0ed51b3
LW
6461 }
6462 else {
a4c04bdc 6463 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6464 STRLEN numlen = 2;
53305cf1 6465 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6466 p += numlen;
6467 }
9e08bc66
TS
6468 if (PL_encoding && ender < 0x100)
6469 goto recode_encoding;
a687059c 6470 break;
a0d0e21e
LW
6471 case 'c':
6472 p++;
bbce6d69 6473 ender = UCHARAT(p++);
6474 ender = toCTRL(ender);
a687059c 6475 break;
a0d0e21e
LW
6476 case '0': case '1': case '2': case '3':case '4':
6477 case '5': case '6': case '7': case '8':case '9':
6478 if (*p == '0' ||
830247a4 6479 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6480 I32 flags = 0;
1df70142 6481 STRLEN numlen = 3;
53305cf1 6482 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6483 p += numlen;
6484 }
6485 else {
6486 --p;
6487 goto loopdone;
a687059c 6488 }
9e08bc66
TS
6489 if (PL_encoding && ender < 0x100)
6490 goto recode_encoding;
6491 break;
6492 recode_encoding:
6493 {
6494 SV* enc = PL_encoding;
6495 ender = reg_recode((const char)(U8)ender, &enc);
6496 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6497 vWARN(p, "Invalid escape in the specified encoding");
6498 RExC_utf8 = 1;
6499 }
a687059c 6500 break;
a0d0e21e 6501 case '\0':
830247a4 6502 if (p >= RExC_end)
b45f050a 6503 FAIL("Trailing \\");
a687059c 6504 /* FALL THROUGH */
a0d0e21e 6505 default:
041457d9 6506 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6507 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6508 goto normal_default;
a0d0e21e
LW
6509 }
6510 break;
a687059c 6511 default:
a0ed51b3 6512 normal_default:
fd400ab9 6513 if (UTF8_IS_START(*p) && UTF) {
1df70142 6514 STRLEN numlen;
5e12f4fb 6515 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6516 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6517 p += numlen;
6518 }
6519 else
6520 ender = *p++;
a0d0e21e 6521 break;
a687059c 6522 }
e2509266 6523 if (RExC_flags & PMf_EXTENDED)
830247a4 6524 p = regwhite(p, RExC_end);
60a8b682
JH
6525 if (UTF && FOLD) {
6526 /* Prime the casefolded buffer. */
ac7e0132 6527 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6528 }
a0d0e21e
LW
6529 if (ISMULT2(p)) { /* Back off on ?+*. */
6530 if (len)
6531 p = oldp;
16ea2a2e 6532 else if (UTF) {
80aecb99 6533 if (FOLD) {
60a8b682 6534 /* Emit all the Unicode characters. */
1df70142 6535 STRLEN numlen;
80aecb99
JH
6536 for (foldbuf = tmpbuf;
6537 foldlen;
6538 foldlen -= numlen) {
6539 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6540 if (numlen > 0) {
71207a34 6541 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6542 s += unilen;
6543 len += unilen;
6544 /* In EBCDIC the numlen
6545 * and unilen can differ. */
9dc45d57 6546 foldbuf += numlen;
47654450
JH
6547 if (numlen >= foldlen)
6548 break;
9dc45d57
JH
6549 }
6550 else
6551 break; /* "Can't happen." */
80aecb99
JH
6552 }
6553 }
6554 else {
71207a34 6555 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6556 if (unilen > 0) {
0ebc6274
JH
6557 s += unilen;
6558 len += unilen;
9dc45d57 6559 }
80aecb99 6560 }
a0ed51b3 6561 }
a0d0e21e
LW
6562 else {
6563 len++;
eb160463 6564 REGC((char)ender, s++);
a0d0e21e
LW
6565 }
6566 break;
a687059c 6567 }
16ea2a2e 6568 if (UTF) {
80aecb99 6569 if (FOLD) {
60a8b682 6570 /* Emit all the Unicode characters. */
1df70142 6571 STRLEN numlen;
80aecb99
JH
6572 for (foldbuf = tmpbuf;
6573 foldlen;
6574 foldlen -= numlen) {
6575 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6576 if (numlen > 0) {
71207a34 6577 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6578 len += unilen;
6579 s += unilen;
6580 /* In EBCDIC the numlen
6581 * and unilen can differ. */
9dc45d57 6582 foldbuf += numlen;
47654450
JH
6583 if (numlen >= foldlen)
6584 break;
9dc45d57
JH
6585 }
6586 else
6587 break;
80aecb99
JH
6588 }
6589 }
6590 else {
71207a34 6591 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6592 if (unilen > 0) {
0ebc6274
JH
6593 s += unilen;
6594 len += unilen;
9dc45d57 6595 }
80aecb99
JH
6596 }
6597 len--;
a0ed51b3
LW
6598 }
6599 else
eb160463 6600 REGC((char)ender, s++);
a0d0e21e
LW
6601 }
6602 loopdone:
830247a4 6603 RExC_parse = p - 1;
fac92740 6604 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6605 nextchar(pRExC_state);
793db0cb
JH
6606 {
6607 /* len is STRLEN which is unsigned, need to copy to signed */
6608 IV iv = len;
6609 if (iv < 0)
6610 vFAIL("Internal disaster");
6611 }
a0d0e21e
LW
6612 if (len > 0)
6613 *flagp |= HASWIDTH;
090f7165 6614 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6615 *flagp |= SIMPLE;
3dab1dad 6616
cd439c50 6617 if (SIZE_ONLY)
830247a4 6618 RExC_size += STR_SZ(len);
3dab1dad
YO
6619 else {
6620 STR_LEN(ret) = len;
830247a4 6621 RExC_emit += STR_SZ(len);
07be1b83 6622 }
3dab1dad 6623 }
a0d0e21e
LW
6624 break;
6625 }
a687059c 6626
a0d0e21e 6627 return(ret);
a687059c
LW
6628}
6629
873ef191 6630STATIC char *
5f66b61c 6631S_regwhite(char *p, const char *e)
5b5a24f7
CS
6632{
6633 while (p < e) {
6634 if (isSPACE(*p))
6635 ++p;
6636 else if (*p == '#') {
6637 do {
6638 p++;
6639 } while (p < e && *p != '\n');
6640 }
6641 else
6642 break;
6643 }
6644 return p;
6645}
6646
b8c5462f
JH
6647/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6648 Character classes ([:foo:]) can also be negated ([:^foo:]).
6649 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6650 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6651 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6652
6653#define POSIXCC_DONE(c) ((c) == ':')
6654#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6655#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6656
b8c5462f 6657STATIC I32
830247a4 6658S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6659{
97aff369 6660 dVAR;
936ed897 6661 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6662
830247a4 6663 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6664 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6665 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6666 const char c = UCHARAT(RExC_parse);
097eb12c 6667 char* const s = RExC_parse++;
b81d288d 6668
9a86a77b 6669 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6670 RExC_parse++;
6671 if (RExC_parse == RExC_end)
620e46c5 6672 /* Grandfather lone [:, [=, [. */
830247a4 6673 RExC_parse = s;
620e46c5 6674 else {
3dab1dad 6675 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6676 assert(*t == c);
6677
9a86a77b 6678 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6679 const char *posixcc = s + 1;
830247a4 6680 RExC_parse++; /* skip over the ending ] */
3dab1dad 6681
b8c5462f 6682 if (*s == ':') {
1df70142
AL
6683 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6684 const I32 skip = t - posixcc;
80916619
NC
6685
6686 /* Initially switch on the length of the name. */
6687 switch (skip) {
6688 case 4:
3dab1dad
YO
6689 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6690 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6691 break;
80916619
NC
6692 case 5:
6693 /* Names all of length 5. */
6694 /* alnum alpha ascii blank cntrl digit graph lower
6695 print punct space upper */
6696 /* Offset 4 gives the best switch position. */
6697 switch (posixcc[4]) {
6698 case 'a':
3dab1dad
YO
6699 if (memEQ(posixcc, "alph", 4)) /* alpha */
6700 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6701 break;
6702 case 'e':
3dab1dad
YO
6703 if (memEQ(posixcc, "spac", 4)) /* space */
6704 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6705 break;
6706 case 'h':
3dab1dad
YO
6707 if (memEQ(posixcc, "grap", 4)) /* graph */
6708 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6709 break;
6710 case 'i':
3dab1dad
YO
6711 if (memEQ(posixcc, "asci", 4)) /* ascii */
6712 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6713 break;
6714 case 'k':
3dab1dad
YO
6715 if (memEQ(posixcc, "blan", 4)) /* blank */
6716 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6717 break;
6718 case 'l':
3dab1dad
YO
6719 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6720 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6721 break;
6722 case 'm':
3dab1dad
YO
6723 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6724 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6725 break;
6726 case 'r':
3dab1dad
YO
6727 if (memEQ(posixcc, "lowe", 4)) /* lower */
6728 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6729 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6730 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6731 break;
6732 case 't':
3dab1dad
YO
6733 if (memEQ(posixcc, "digi", 4)) /* digit */
6734 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6735 else if (memEQ(posixcc, "prin", 4)) /* print */
6736 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6737 else if (memEQ(posixcc, "punc", 4)) /* punct */
6738 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6739 break;
b8c5462f
JH
6740 }
6741 break;
80916619 6742 case 6:
3dab1dad
YO
6743 if (memEQ(posixcc, "xdigit", 6))
6744 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6745 break;
6746 }
80916619
NC
6747
6748 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6749 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6750 t - s - 1, s + 1);
80916619
NC
6751 assert (posixcc[skip] == ':');
6752 assert (posixcc[skip+1] == ']');
b45f050a 6753 } else if (!SIZE_ONLY) {
b8c5462f 6754 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6755
830247a4 6756 /* adjust RExC_parse so the warning shows after
b45f050a 6757 the class closes */
9a86a77b 6758 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6759 RExC_parse++;
b45f050a
JF
6760 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6761 }
b8c5462f
JH
6762 } else {
6763 /* Maternal grandfather:
6764 * "[:" ending in ":" but not in ":]" */
830247a4 6765 RExC_parse = s;
767d463e 6766 }
620e46c5
JH
6767 }
6768 }
6769
b8c5462f
JH
6770 return namedclass;
6771}
6772
6773STATIC void
830247a4 6774S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6775{
97aff369 6776 dVAR;
3dab1dad 6777 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6778 const char *s = RExC_parse;
6779 const char c = *s++;
b8c5462f 6780
3dab1dad 6781 while (isALNUM(*s))
b8c5462f
JH
6782 s++;
6783 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6784 if (ckWARN(WARN_REGEXP))
6785 vWARN3(s+2,
6786 "POSIX syntax [%c %c] belongs inside character classes",
6787 c, c);
b45f050a
JF
6788
6789 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6790 if (POSIXCC_NOTYET(c)) {
830247a4 6791 /* adjust RExC_parse so the error shows after
b45f050a 6792 the class closes */
9a86a77b 6793 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6794 NOOP;
b45f050a
JF
6795 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6796 }
b8c5462f
JH
6797 }
6798 }
620e46c5
JH
6799}
6800
7f6f358c
YO
6801
6802/*
6803 parse a class specification and produce either an ANYOF node that
6804 matches the pattern. If the pattern matches a single char only and
6805 that char is < 256 then we produce an EXACT node instead.
6806*/
76e3520e 6807STATIC regnode *
3dab1dad 6808S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6809{
97aff369 6810 dVAR;
9ef43ace 6811 register UV value = 0;
9a86a77b 6812 register UV nextvalue;
3568d838 6813 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6814 register IV range = 0;
c277df42 6815 register regnode *ret;
ba210ebe 6816 STRLEN numlen;
ffc61ed2 6817 IV namedclass;
cbbf8932 6818 char *rangebegin = NULL;
936ed897 6819 bool need_class = 0;
c445ea15 6820 SV *listsv = NULL;
ffc61ed2 6821 UV n;
9e55ce06 6822 bool optimize_invert = TRUE;
cbbf8932 6823 AV* unicode_alternate = NULL;
1b2d223b
JH
6824#ifdef EBCDIC
6825 UV literal_endpoint = 0;
6826#endif
7f6f358c 6827 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6828
3dab1dad 6829 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6830 case we need to change the emitted regop to an EXACT. */
07be1b83 6831 const char * orig_parse = RExC_parse;
72f13be8 6832 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6833#ifndef DEBUGGING
6834 PERL_UNUSED_ARG(depth);
6835#endif
72f13be8 6836
3dab1dad 6837 DEBUG_PARSE("clas");
7f6f358c
YO
6838
6839 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6840 ret = reganode(pRExC_state, ANYOF, 0);
6841
6842 if (!SIZE_ONLY)
6843 ANYOF_FLAGS(ret) = 0;
6844
9a86a77b 6845 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6846 RExC_naughty++;
6847 RExC_parse++;
6848 if (!SIZE_ONLY)
6849 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6850 }
a0d0e21e 6851
73060fc4 6852 if (SIZE_ONLY) {
830247a4 6853 RExC_size += ANYOF_SKIP;
73060fc4
JH
6854 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6855 }
936ed897 6856 else {
830247a4 6857 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6858 if (FOLD)
6859 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6860 if (LOC)
6861 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6862 ANYOF_BITMAP_ZERO(ret);
396482e1 6863 listsv = newSVpvs("# comment\n");
a0d0e21e 6864 }
b8c5462f 6865
9a86a77b
JH
6866 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6867
b938889d 6868 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6869 checkposixcc(pRExC_state);
b8c5462f 6870
f064b6ad
HS
6871 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6872 if (UCHARAT(RExC_parse) == ']')
6873 goto charclassloop;
ffc61ed2 6874
fc8cd66c 6875parseit:
9a86a77b 6876 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6877
6878 charclassloop:
6879
6880 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6881
73b437c8 6882 if (!range)
830247a4 6883 rangebegin = RExC_parse;
ffc61ed2 6884 if (UTF) {
5e12f4fb 6885 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6886 RExC_end - RExC_parse,
9f7f3913 6887 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6888 RExC_parse += numlen;
6889 }
6890 else
6891 value = UCHARAT(RExC_parse++);
7f6f358c 6892
9a86a77b
JH
6893 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6894 if (value == '[' && POSIXCC(nextvalue))
830247a4 6895 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6896 else if (value == '\\') {
ffc61ed2 6897 if (UTF) {
5e12f4fb 6898 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6899 RExC_end - RExC_parse,
9f7f3913 6900 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6901 RExC_parse += numlen;
6902 }
6903 else
6904 value = UCHARAT(RExC_parse++);
470c3474 6905 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6906 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6907 * be a problem later if we want switch on Unicode.
6908 * A similar issue a little bit later when switching on
6909 * namedclass. --jhi */
ffc61ed2 6910 switch ((I32)value) {
b8c5462f
JH
6911 case 'w': namedclass = ANYOF_ALNUM; break;
6912 case 'W': namedclass = ANYOF_NALNUM; break;
6913 case 's': namedclass = ANYOF_SPACE; break;
6914 case 'S': namedclass = ANYOF_NSPACE; break;
6915 case 'd': namedclass = ANYOF_DIGIT; break;
6916 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6917 case 'N': /* Handle \N{NAME} in class */
6918 {
6919 /* We only pay attention to the first char of
6920 multichar strings being returned. I kinda wonder
6921 if this makes sense as it does change the behaviour
6922 from earlier versions, OTOH that behaviour was broken
6923 as well. */
6924 UV v; /* value is register so we cant & it /grrr */
6925 if (reg_namedseq(pRExC_state, &v)) {
6926 goto parseit;
6927 }
6928 value= v;
6929 }
6930 break;
ffc61ed2
JH
6931 case 'p':
6932 case 'P':
3dab1dad
YO
6933 {
6934 char *e;
af6f566e 6935 if (RExC_parse >= RExC_end)
2a4859cd 6936 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6937 if (*RExC_parse == '{') {
1df70142 6938 const U8 c = (U8)value;
ffc61ed2
JH
6939 e = strchr(RExC_parse++, '}');
6940 if (!e)
0da60cf5 6941 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6942 while (isSPACE(UCHARAT(RExC_parse)))
6943 RExC_parse++;
6944 if (e == RExC_parse)
0da60cf5 6945 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6946 n = e - RExC_parse;
ab13f0c7
JH
6947 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6948 n--;
ffc61ed2
JH
6949 }
6950 else {
6951 e = RExC_parse;
6952 n = 1;
6953 }
6954 if (!SIZE_ONLY) {
ab13f0c7
JH
6955 if (UCHARAT(RExC_parse) == '^') {
6956 RExC_parse++;
6957 n--;
6958 value = value == 'p' ? 'P' : 'p'; /* toggle */
6959 while (isSPACE(UCHARAT(RExC_parse))) {
6960 RExC_parse++;
6961 n--;
6962 }
6963 }
097eb12c
AL
6964 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6965 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6966 }
6967 RExC_parse = e + 1;
6968 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6969 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6970 }
f81125e2 6971 break;
b8c5462f
JH
6972 case 'n': value = '\n'; break;
6973 case 'r': value = '\r'; break;
6974 case 't': value = '\t'; break;
6975 case 'f': value = '\f'; break;
6976 case 'b': value = '\b'; break;
c7f1f016
NIS
6977 case 'e': value = ASCII_TO_NATIVE('\033');break;
6978 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6979 case 'x':
ffc61ed2 6980 if (*RExC_parse == '{') {
a4c04bdc
NC
6981 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6982 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6983 char * const e = strchr(RExC_parse++, '}');
b81d288d 6984 if (!e)
ffc61ed2 6985 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6986
6987 numlen = e - RExC_parse;
6988 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6989 RExC_parse = e + 1;
6990 }
6991 else {
a4c04bdc 6992 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6993 numlen = 2;
6994 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6995 RExC_parse += numlen;
6996 }
9e08bc66
TS
6997 if (PL_encoding && value < 0x100)
6998 goto recode_encoding;
b8c5462f
JH
6999 break;
7000 case 'c':
830247a4 7001 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7002 value = toCTRL(value);
7003 break;
7004 case '0': case '1': case '2': case '3': case '4':
7005 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7006 {
7007 I32 flags = 0;
7008 numlen = 3;
7009 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7010 RExC_parse += numlen;
7011 if (PL_encoding && value < 0x100)
7012 goto recode_encoding;
7013 break;
7014 }
7015 recode_encoding:
7016 {
7017 SV* enc = PL_encoding;
7018 value = reg_recode((const char)(U8)value, &enc);
7019 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7020 vWARN(RExC_parse,
7021 "Invalid escape in the specified encoding");
7022 break;
7023 }
1028017a 7024 default:
041457d9 7025 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7026 vWARN2(RExC_parse,
7027 "Unrecognized escape \\%c in character class passed through",
7028 (int)value);
1028017a 7029 break;
b8c5462f 7030 }
ffc61ed2 7031 } /* end of \blah */
1b2d223b
JH
7032#ifdef EBCDIC
7033 else
7034 literal_endpoint++;
7035#endif
ffc61ed2
JH
7036
7037 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7038
7039 if (!SIZE_ONLY && !need_class)
936ed897 7040 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7041
936ed897 7042 need_class = 1;
ffc61ed2
JH
7043
7044 /* a bad range like a-\d, a-[:digit:] ? */
7045 if (range) {
73b437c8 7046 if (!SIZE_ONLY) {
afd78fd5 7047 if (ckWARN(WARN_REGEXP)) {
097eb12c 7048 const int w =
afd78fd5
JH
7049 RExC_parse >= rangebegin ?
7050 RExC_parse - rangebegin : 0;
830247a4 7051 vWARN4(RExC_parse,
b45f050a 7052 "False [] range \"%*.*s\"",
097eb12c 7053 w, w, rangebegin);
afd78fd5 7054 }
3568d838
JH
7055 if (prevvalue < 256) {
7056 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7057 ANYOF_BITMAP_SET(ret, '-');
7058 }
7059 else {
7060 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7061 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7062 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7063 }
b8c5462f 7064 }
ffc61ed2
JH
7065
7066 range = 0; /* this was not a true range */
73b437c8 7067 }
ffc61ed2 7068
73b437c8 7069 if (!SIZE_ONLY) {
c49a72a9
NC
7070 const char *what = NULL;
7071 char yesno = 0;
7072
3568d838
JH
7073 if (namedclass > OOB_NAMEDCLASS)
7074 optimize_invert = FALSE;
e2962f66
JH
7075 /* Possible truncation here but in some 64-bit environments
7076 * the compiler gets heartburn about switch on 64-bit values.
7077 * A similar issue a little earlier when switching on value.
98f323fa 7078 * --jhi */
e2962f66 7079 switch ((I32)namedclass) {
73b437c8
JH
7080 case ANYOF_ALNUM:
7081 if (LOC)
936ed897 7082 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7083 else {
7084 for (value = 0; value < 256; value++)
7085 if (isALNUM(value))
936ed897 7086 ANYOF_BITMAP_SET(ret, value);
73b437c8 7087 }
c49a72a9
NC
7088 yesno = '+';
7089 what = "Word";
73b437c8
JH
7090 break;
7091 case ANYOF_NALNUM:
7092 if (LOC)
936ed897 7093 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7094 else {
7095 for (value = 0; value < 256; value++)
7096 if (!isALNUM(value))
936ed897 7097 ANYOF_BITMAP_SET(ret, value);
73b437c8 7098 }
c49a72a9
NC
7099 yesno = '!';
7100 what = "Word";
73b437c8 7101 break;
ffc61ed2 7102 case ANYOF_ALNUMC:
73b437c8 7103 if (LOC)
ffc61ed2 7104 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7105 else {
7106 for (value = 0; value < 256; value++)
ffc61ed2 7107 if (isALNUMC(value))
936ed897 7108 ANYOF_BITMAP_SET(ret, value);
73b437c8 7109 }
c49a72a9
NC
7110 yesno = '+';
7111 what = "Alnum";
73b437c8
JH
7112 break;
7113 case ANYOF_NALNUMC:
7114 if (LOC)
936ed897 7115 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7116 else {
7117 for (value = 0; value < 256; value++)
7118 if (!isALNUMC(value))
936ed897 7119 ANYOF_BITMAP_SET(ret, value);
73b437c8 7120 }
c49a72a9
NC
7121 yesno = '!';
7122 what = "Alnum";
73b437c8
JH
7123 break;
7124 case ANYOF_ALPHA:
7125 if (LOC)
936ed897 7126 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7127 else {
7128 for (value = 0; value < 256; value++)
7129 if (isALPHA(value))
936ed897 7130 ANYOF_BITMAP_SET(ret, value);
73b437c8 7131 }
c49a72a9
NC
7132 yesno = '+';
7133 what = "Alpha";
73b437c8
JH
7134 break;
7135 case ANYOF_NALPHA:
7136 if (LOC)
936ed897 7137 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7138 else {
7139 for (value = 0; value < 256; value++)
7140 if (!isALPHA(value))
936ed897 7141 ANYOF_BITMAP_SET(ret, value);
73b437c8 7142 }
c49a72a9
NC
7143 yesno = '!';
7144 what = "Alpha";
73b437c8
JH
7145 break;
7146 case ANYOF_ASCII:
7147 if (LOC)
936ed897 7148 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7149 else {
c7f1f016 7150#ifndef EBCDIC
1ba5c669
JH
7151 for (value = 0; value < 128; value++)
7152 ANYOF_BITMAP_SET(ret, value);
7153#else /* EBCDIC */
ffbc6a93 7154 for (value = 0; value < 256; value++) {
3a3c4447
JH
7155 if (isASCII(value))
7156 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7157 }
1ba5c669 7158#endif /* EBCDIC */
73b437c8 7159 }
c49a72a9
NC
7160 yesno = '+';
7161 what = "ASCII";
73b437c8
JH
7162 break;
7163 case ANYOF_NASCII:
7164 if (LOC)
936ed897 7165 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7166 else {
c7f1f016 7167#ifndef EBCDIC
1ba5c669
JH
7168 for (value = 128; value < 256; value++)
7169 ANYOF_BITMAP_SET(ret, value);
7170#else /* EBCDIC */
ffbc6a93 7171 for (value = 0; value < 256; value++) {
3a3c4447
JH
7172 if (!isASCII(value))
7173 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7174 }
1ba5c669 7175#endif /* EBCDIC */
73b437c8 7176 }
c49a72a9
NC
7177 yesno = '!';
7178 what = "ASCII";
73b437c8 7179 break;
aaa51d5e
JF
7180 case ANYOF_BLANK:
7181 if (LOC)
7182 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7183 else {
7184 for (value = 0; value < 256; value++)
7185 if (isBLANK(value))
7186 ANYOF_BITMAP_SET(ret, value);
7187 }
c49a72a9
NC
7188 yesno = '+';
7189 what = "Blank";
aaa51d5e
JF
7190 break;
7191 case ANYOF_NBLANK:
7192 if (LOC)
7193 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7194 else {
7195 for (value = 0; value < 256; value++)
7196 if (!isBLANK(value))
7197 ANYOF_BITMAP_SET(ret, value);
7198 }
c49a72a9
NC
7199 yesno = '!';
7200 what = "Blank";
aaa51d5e 7201 break;
73b437c8
JH
7202 case ANYOF_CNTRL:
7203 if (LOC)
936ed897 7204 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7205 else {
7206 for (value = 0; value < 256; value++)
7207 if (isCNTRL(value))
936ed897 7208 ANYOF_BITMAP_SET(ret, value);
73b437c8 7209 }
c49a72a9
NC
7210 yesno = '+';
7211 what = "Cntrl";
73b437c8
JH
7212 break;
7213 case ANYOF_NCNTRL:
7214 if (LOC)
936ed897 7215 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7216 else {
7217 for (value = 0; value < 256; value++)
7218 if (!isCNTRL(value))
936ed897 7219 ANYOF_BITMAP_SET(ret, value);
73b437c8 7220 }
c49a72a9
NC
7221 yesno = '!';
7222 what = "Cntrl";
ffc61ed2
JH
7223 break;
7224 case ANYOF_DIGIT:
7225 if (LOC)
7226 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7227 else {
7228 /* consecutive digits assumed */
7229 for (value = '0'; value <= '9'; value++)
7230 ANYOF_BITMAP_SET(ret, value);
7231 }
c49a72a9
NC
7232 yesno = '+';
7233 what = "Digit";
ffc61ed2
JH
7234 break;
7235 case ANYOF_NDIGIT:
7236 if (LOC)
7237 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7238 else {
7239 /* consecutive digits assumed */
7240 for (value = 0; value < '0'; value++)
7241 ANYOF_BITMAP_SET(ret, value);
7242 for (value = '9' + 1; value < 256; value++)
7243 ANYOF_BITMAP_SET(ret, value);
7244 }
c49a72a9
NC
7245 yesno = '!';
7246 what = "Digit";
73b437c8
JH
7247 break;
7248 case ANYOF_GRAPH:
7249 if (LOC)
936ed897 7250 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7251 else {
7252 for (value = 0; value < 256; value++)
7253 if (isGRAPH(value))
936ed897 7254 ANYOF_BITMAP_SET(ret, value);
73b437c8 7255 }
c49a72a9
NC
7256 yesno = '+';
7257 what = "Graph";
73b437c8
JH
7258 break;
7259 case ANYOF_NGRAPH:
7260 if (LOC)
936ed897 7261 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7262 else {
7263 for (value = 0; value < 256; value++)
7264 if (!isGRAPH(value))
936ed897 7265 ANYOF_BITMAP_SET(ret, value);
73b437c8 7266 }
c49a72a9
NC
7267 yesno = '!';
7268 what = "Graph";
73b437c8
JH
7269 break;
7270 case ANYOF_LOWER:
7271 if (LOC)
936ed897 7272 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7273 else {
7274 for (value = 0; value < 256; value++)
7275 if (isLOWER(value))
936ed897 7276 ANYOF_BITMAP_SET(ret, value);
73b437c8 7277 }
c49a72a9
NC
7278 yesno = '+';
7279 what = "Lower";
73b437c8
JH
7280 break;
7281 case ANYOF_NLOWER:
7282 if (LOC)
936ed897 7283 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7284 else {
7285 for (value = 0; value < 256; value++)
7286 if (!isLOWER(value))
936ed897 7287 ANYOF_BITMAP_SET(ret, value);
73b437c8 7288 }
c49a72a9
NC
7289 yesno = '!';
7290 what = "Lower";
73b437c8
JH
7291 break;
7292 case ANYOF_PRINT:
7293 if (LOC)
936ed897 7294 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7295 else {
7296 for (value = 0; value < 256; value++)
7297 if (isPRINT(value))
936ed897 7298 ANYOF_BITMAP_SET(ret, value);
73b437c8 7299 }
c49a72a9
NC
7300 yesno = '+';
7301 what = "Print";
73b437c8
JH
7302 break;
7303 case ANYOF_NPRINT:
7304 if (LOC)
936ed897 7305 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7306 else {
7307 for (value = 0; value < 256; value++)
7308 if (!isPRINT(value))
936ed897 7309 ANYOF_BITMAP_SET(ret, value);
73b437c8 7310 }
c49a72a9
NC
7311 yesno = '!';
7312 what = "Print";
73b437c8 7313 break;
aaa51d5e
JF
7314 case ANYOF_PSXSPC:
7315 if (LOC)
7316 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7317 else {
7318 for (value = 0; value < 256; value++)
7319 if (isPSXSPC(value))
7320 ANYOF_BITMAP_SET(ret, value);
7321 }
c49a72a9
NC
7322 yesno = '+';
7323 what = "Space";
aaa51d5e
JF
7324 break;
7325 case ANYOF_NPSXSPC:
7326 if (LOC)
7327 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7328 else {
7329 for (value = 0; value < 256; value++)
7330 if (!isPSXSPC(value))
7331 ANYOF_BITMAP_SET(ret, value);
7332 }
c49a72a9
NC
7333 yesno = '!';
7334 what = "Space";
aaa51d5e 7335 break;
73b437c8
JH
7336 case ANYOF_PUNCT:
7337 if (LOC)
936ed897 7338 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7339 else {
7340 for (value = 0; value < 256; value++)
7341 if (isPUNCT(value))
936ed897 7342 ANYOF_BITMAP_SET(ret, value);
73b437c8 7343 }
c49a72a9
NC
7344 yesno = '+';
7345 what = "Punct";
73b437c8
JH
7346 break;
7347 case ANYOF_NPUNCT:
7348 if (LOC)
936ed897 7349 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7350 else {
7351 for (value = 0; value < 256; value++)
7352 if (!isPUNCT(value))
936ed897 7353 ANYOF_BITMAP_SET(ret, value);
73b437c8 7354 }
c49a72a9
NC
7355 yesno = '!';
7356 what = "Punct";
ffc61ed2
JH
7357 break;
7358 case ANYOF_SPACE:
7359 if (LOC)
7360 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7361 else {
7362 for (value = 0; value < 256; value++)
7363 if (isSPACE(value))
7364 ANYOF_BITMAP_SET(ret, value);
7365 }
c49a72a9
NC
7366 yesno = '+';
7367 what = "SpacePerl";
ffc61ed2
JH
7368 break;
7369 case ANYOF_NSPACE:
7370 if (LOC)
7371 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7372 else {
7373 for (value = 0; value < 256; value++)
7374 if (!isSPACE(value))
7375 ANYOF_BITMAP_SET(ret, value);
7376 }
c49a72a9
NC
7377 yesno = '!';
7378 what = "SpacePerl";
73b437c8
JH
7379 break;
7380 case ANYOF_UPPER:
7381 if (LOC)
936ed897 7382 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7383 else {
7384 for (value = 0; value < 256; value++)
7385 if (isUPPER(value))
936ed897 7386 ANYOF_BITMAP_SET(ret, value);
73b437c8 7387 }
c49a72a9
NC
7388 yesno = '+';
7389 what = "Upper";
73b437c8
JH
7390 break;
7391 case ANYOF_NUPPER:
7392 if (LOC)
936ed897 7393 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7394 else {
7395 for (value = 0; value < 256; value++)
7396 if (!isUPPER(value))
936ed897 7397 ANYOF_BITMAP_SET(ret, value);
73b437c8 7398 }
c49a72a9
NC
7399 yesno = '!';
7400 what = "Upper";
73b437c8
JH
7401 break;
7402 case ANYOF_XDIGIT:
7403 if (LOC)
936ed897 7404 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7405 else {
7406 for (value = 0; value < 256; value++)
7407 if (isXDIGIT(value))
936ed897 7408 ANYOF_BITMAP_SET(ret, value);
73b437c8 7409 }
c49a72a9
NC
7410 yesno = '+';
7411 what = "XDigit";
73b437c8
JH
7412 break;
7413 case ANYOF_NXDIGIT:
7414 if (LOC)
936ed897 7415 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7416 else {
7417 for (value = 0; value < 256; value++)
7418 if (!isXDIGIT(value))
936ed897 7419 ANYOF_BITMAP_SET(ret, value);
73b437c8 7420 }
c49a72a9
NC
7421 yesno = '!';
7422 what = "XDigit";
73b437c8 7423 break;
f81125e2
JP
7424 case ANYOF_MAX:
7425 /* this is to handle \p and \P */
7426 break;
73b437c8 7427 default:
b45f050a 7428 vFAIL("Invalid [::] class");
73b437c8 7429 break;
b8c5462f 7430 }
c49a72a9
NC
7431 if (what) {
7432 /* Strings such as "+utf8::isWord\n" */
7433 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7434 }
b8c5462f 7435 if (LOC)
936ed897 7436 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7437 continue;
a0d0e21e 7438 }
ffc61ed2
JH
7439 } /* end of namedclass \blah */
7440
a0d0e21e 7441 if (range) {
eb160463 7442 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7443 const int w = RExC_parse - rangebegin;
7444 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7445 range = 0; /* not a valid range */
73b437c8 7446 }
a0d0e21e
LW
7447 }
7448 else {
3568d838 7449 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7450 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7451 RExC_parse[1] != ']') {
7452 RExC_parse++;
ffc61ed2
JH
7453
7454 /* a bad range like \w-, [:word:]- ? */
7455 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7456 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7457 const int w =
afd78fd5
JH
7458 RExC_parse >= rangebegin ?
7459 RExC_parse - rangebegin : 0;
830247a4 7460 vWARN4(RExC_parse,
b45f050a 7461 "False [] range \"%*.*s\"",
097eb12c 7462 w, w, rangebegin);
afd78fd5 7463 }
73b437c8 7464 if (!SIZE_ONLY)
936ed897 7465 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7466 } else
ffc61ed2
JH
7467 range = 1; /* yeah, it's a range! */
7468 continue; /* but do it the next time */
a0d0e21e 7469 }
a687059c 7470 }
ffc61ed2 7471
93733859 7472 /* now is the next time */
07be1b83 7473 /*stored += (value - prevvalue + 1);*/
ae5c130c 7474 if (!SIZE_ONLY) {
3568d838 7475 if (prevvalue < 256) {
1df70142 7476 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7477 IV i;
3568d838 7478#ifdef EBCDIC
1b2d223b
JH
7479 /* In EBCDIC [\x89-\x91] should include
7480 * the \x8e but [i-j] should not. */
7481 if (literal_endpoint == 2 &&
7482 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7483 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7484 {
3568d838
JH
7485 if (isLOWER(prevvalue)) {
7486 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7487 if (isLOWER(i))
7488 ANYOF_BITMAP_SET(ret, i);
7489 } else {
3568d838 7490 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7491 if (isUPPER(i))
7492 ANYOF_BITMAP_SET(ret, i);
7493 }
8ada0baa 7494 }
ffc61ed2 7495 else
8ada0baa 7496#endif
07be1b83
YO
7497 for (i = prevvalue; i <= ceilvalue; i++) {
7498 if (!ANYOF_BITMAP_TEST(ret,i)) {
7499 stored++;
7500 ANYOF_BITMAP_SET(ret, i);
7501 }
7502 }
3568d838 7503 }
a5961de5 7504 if (value > 255 || UTF) {
1df70142
AL
7505 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7506 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7507 stored+=2; /* can't optimize this class */
ffc61ed2 7508 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7509 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7510 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7511 prevnatvalue, natvalue);
7512 }
7513 else if (prevnatvalue == natvalue) {
7514 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7515 if (FOLD) {
89ebb4a3 7516 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7517 STRLEN foldlen;
1df70142 7518 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7519
e294cc5d
JH
7520#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7521 if (RExC_precomp[0] == ':' &&
7522 RExC_precomp[1] == '[' &&
7523 (f == 0xDF || f == 0x92)) {
7524 f = NATIVE_TO_UNI(f);
7525 }
7526#endif
c840d2a2
JH
7527 /* If folding and foldable and a single
7528 * character, insert also the folded version
7529 * to the charclass. */
9e55ce06 7530 if (f != value) {
e294cc5d
JH
7531#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7532 if ((RExC_precomp[0] == ':' &&
7533 RExC_precomp[1] == '[' &&
7534 (f == 0xA2 &&
7535 (value == 0xFB05 || value == 0xFB06))) ?
7536 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7537 foldlen == (STRLEN)UNISKIP(f) )
7538#else
eb160463 7539 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7540#endif
9e55ce06
JH
7541 Perl_sv_catpvf(aTHX_ listsv,
7542 "%04"UVxf"\n", f);
7543 else {
7544 /* Any multicharacter foldings
7545 * require the following transform:
7546 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7547 * where E folds into "pq" and F folds
7548 * into "rst", all other characters
7549 * fold to single characters. We save
7550 * away these multicharacter foldings,
7551 * to be later saved as part of the
7552 * additional "s" data. */
7553 SV *sv;
7554
7555 if (!unicode_alternate)
7556 unicode_alternate = newAV();
7557 sv = newSVpvn((char*)foldbuf, foldlen);
7558 SvUTF8_on(sv);
7559 av_push(unicode_alternate, sv);
7560 }
7561 }
254ba52a 7562
60a8b682
JH
7563 /* If folding and the value is one of the Greek
7564 * sigmas insert a few more sigmas to make the
7565 * folding rules of the sigmas to work right.
7566 * Note that not all the possible combinations
7567 * are handled here: some of them are handled
9e55ce06
JH
7568 * by the standard folding rules, and some of
7569 * them (literal or EXACTF cases) are handled
7570 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7571 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7572 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7573 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7574 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7575 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7576 }
7577 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7578 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7579 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7580 }
7581 }
ffc61ed2 7582 }
1b2d223b
JH
7583#ifdef EBCDIC
7584 literal_endpoint = 0;
7585#endif
8ada0baa 7586 }
ffc61ed2
JH
7587
7588 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7589 }
ffc61ed2 7590
936ed897 7591 if (need_class) {
4f66b38d 7592 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7593 if (SIZE_ONLY)
830247a4 7594 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7595 else
830247a4 7596 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7597 }
ffc61ed2 7598
7f6f358c
YO
7599
7600 if (SIZE_ONLY)
7601 return ret;
7602 /****** !SIZE_ONLY AFTER HERE *********/
7603
7604 if( stored == 1 && value < 256
7605 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7606 ) {
7607 /* optimize single char class to an EXACT node
7608 but *only* when its not a UTF/high char */
07be1b83
YO
7609 const char * cur_parse= RExC_parse;
7610 RExC_emit = (regnode *)orig_emit;
7611 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7612 ret = reg_node(pRExC_state,
7613 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7614 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7615 *STRING(ret)= (char)value;
7616 STR_LEN(ret)= 1;
7617 RExC_emit += STR_SZ(1);
7618 return ret;
7619 }
ae5c130c 7620 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7621 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7622 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7623 ) {
a0ed51b3 7624 for (value = 0; value < 256; ++value) {
936ed897 7625 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7626 UV fold = PL_fold[value];
ffc61ed2
JH
7627
7628 if (fold != value)
7629 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7630 }
7631 }
936ed897 7632 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7633 }
ffc61ed2 7634
ae5c130c 7635 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7636 if (optimize_invert &&
ffc61ed2
JH
7637 /* If the only flag is inversion. */
7638 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7639 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7640 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7641 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7642 }
7f6f358c 7643 {
097eb12c 7644 AV * const av = newAV();
ffc61ed2 7645 SV *rv;
9e55ce06 7646 /* The 0th element stores the character class description
6a0407ee 7647 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7648 * to initialize the appropriate swash (which gets stored in
7649 * the 1st element), and also useful for dumping the regnode.
7650 * The 2nd element stores the multicharacter foldings,
6a0407ee 7651 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7652 av_store(av, 0, listsv);
7653 av_store(av, 1, NULL);
9e55ce06 7654 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7655 rv = newRV_noinc((SV*)av);
19860706 7656 n = add_data(pRExC_state, 1, "s");
830247a4 7657 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7658 ARG_SET(ret, n);
a0ed51b3 7659 }
a0ed51b3
LW
7660 return ret;
7661}
7662
76e3520e 7663STATIC char*
830247a4 7664S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7665{
097eb12c 7666 char* const retval = RExC_parse++;
a0d0e21e 7667
4633a7c4 7668 for (;;) {
830247a4
IZ
7669 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7670 RExC_parse[2] == '#') {
e994fd66
AE
7671 while (*RExC_parse != ')') {
7672 if (RExC_parse == RExC_end)
7673 FAIL("Sequence (?#... not terminated");
830247a4 7674 RExC_parse++;
e994fd66 7675 }
830247a4 7676 RExC_parse++;
4633a7c4
LW
7677 continue;
7678 }
e2509266 7679 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7680 if (isSPACE(*RExC_parse)) {
7681 RExC_parse++;
748a9306
LW
7682 continue;
7683 }
830247a4 7684 else if (*RExC_parse == '#') {
e994fd66
AE
7685 while (RExC_parse < RExC_end)
7686 if (*RExC_parse++ == '\n') break;
748a9306
LW
7687 continue;
7688 }
748a9306 7689 }
4633a7c4 7690 return retval;
a0d0e21e 7691 }
a687059c
LW
7692}
7693
7694/*
c277df42 7695- reg_node - emit a node
a0d0e21e 7696*/
76e3520e 7697STATIC regnode * /* Location. */
830247a4 7698S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7699{
97aff369 7700 dVAR;
c277df42 7701 register regnode *ptr;
504618e9 7702 regnode * const ret = RExC_emit;
07be1b83 7703 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7704
c277df42 7705 if (SIZE_ONLY) {
830247a4
IZ
7706 SIZE_ALIGN(RExC_size);
7707 RExC_size += 1;
a0d0e21e
LW
7708 return(ret);
7709 }
e2e6a0f1
YO
7710#ifdef DEBUGGING
7711 if (OP(RExC_emit) == 255)
7712 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7713 reg_name[op], OP(RExC_emit));
7714#endif
c277df42 7715 NODE_ALIGN_FILL(ret);
a0d0e21e 7716 ptr = ret;
c277df42 7717 FILL_ADVANCE_NODE(ptr, op);
fac92740 7718 if (RExC_offsets) { /* MJD */
07be1b83 7719 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7720 "reg_node", __LINE__,
7721 reg_name[op],
07be1b83
YO
7722 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7723 ? "Overwriting end of array!\n" : "OK",
7724 (UV)(RExC_emit - RExC_emit_start),
7725 (UV)(RExC_parse - RExC_start),
7726 (UV)RExC_offsets[0]));
ccb2c380 7727 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7728 }
07be1b83 7729
830247a4 7730 RExC_emit = ptr;
a0d0e21e 7731 return(ret);
a687059c
LW
7732}
7733
7734/*
a0d0e21e
LW
7735- reganode - emit a node with an argument
7736*/
76e3520e 7737STATIC regnode * /* Location. */
830247a4 7738S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7739{
97aff369 7740 dVAR;
c277df42 7741 register regnode *ptr;
504618e9 7742 regnode * const ret = RExC_emit;
07be1b83 7743 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7744
c277df42 7745 if (SIZE_ONLY) {
830247a4
IZ
7746 SIZE_ALIGN(RExC_size);
7747 RExC_size += 2;
6bda09f9
YO
7748 /*
7749 We can't do this:
7750
7751 assert(2==regarglen[op]+1);
7752
7753 Anything larger than this has to allocate the extra amount.
7754 If we changed this to be:
7755
7756 RExC_size += (1 + regarglen[op]);
7757
7758 then it wouldn't matter. Its not clear what side effect
7759 might come from that so its not done so far.
7760 -- dmq
7761 */
a0d0e21e
LW
7762 return(ret);
7763 }
e2e6a0f1
YO
7764#ifdef DEBUGGING
7765 if (OP(RExC_emit) == 255)
7766 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7767#endif
c277df42 7768 NODE_ALIGN_FILL(ret);
a0d0e21e 7769 ptr = ret;
c277df42 7770 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7771 if (RExC_offsets) { /* MJD */
07be1b83 7772 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7773 "reganode",
ccb2c380
MP
7774 __LINE__,
7775 reg_name[op],
07be1b83 7776 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7777 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7778 (UV)(RExC_emit - RExC_emit_start),
7779 (UV)(RExC_parse - RExC_start),
7780 (UV)RExC_offsets[0]));
ccb2c380 7781 Set_Cur_Node_Offset;
fac92740
MJD
7782 }
7783
830247a4 7784 RExC_emit = ptr;
a0d0e21e 7785 return(ret);
fe14fcc3
LW
7786}
7787
7788/*
cd439c50 7789- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7790*/
71207a34
AL
7791STATIC STRLEN
7792S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7793{
97aff369 7794 dVAR;
71207a34 7795 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7796}
7797
7798/*
a0d0e21e
LW
7799- reginsert - insert an operator in front of already-emitted operand
7800*
7801* Means relocating the operand.
7802*/
76e3520e 7803STATIC void
6bda09f9 7804S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7805{
97aff369 7806 dVAR;
c277df42
IZ
7807 register regnode *src;
7808 register regnode *dst;
7809 register regnode *place;
504618e9 7810 const int offset = regarglen[(U8)op];
6bda09f9 7811 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7812 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7813/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7814 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7815 if (SIZE_ONLY) {
6bda09f9 7816 RExC_size += size;
a0d0e21e
LW
7817 return;
7818 }
a687059c 7819
830247a4 7820 src = RExC_emit;
6bda09f9 7821 RExC_emit += size;
830247a4 7822 dst = RExC_emit;
40d049e4 7823 if (RExC_open_parens) {
6bda09f9 7824 int paren;
6d99fb9b 7825 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7826 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7827 if ( RExC_open_parens[paren] >= opnd ) {
7828 DEBUG_PARSE_FMT("open"," - %d",size);
7829 RExC_open_parens[paren] += size;
7830 } else {
7831 DEBUG_PARSE_FMT("open"," - %s","ok");
7832 }
7833 if ( RExC_close_parens[paren] >= opnd ) {
7834 DEBUG_PARSE_FMT("close"," - %d",size);
7835 RExC_close_parens[paren] += size;
7836 } else {
7837 DEBUG_PARSE_FMT("close"," - %s","ok");
7838 }
7839 }
6bda09f9 7840 }
40d049e4 7841
fac92740 7842 while (src > opnd) {
c277df42 7843 StructCopy(--src, --dst, regnode);
fac92740 7844 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7845 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7846 "reg_insert",
ccb2c380
MP
7847 __LINE__,
7848 reg_name[op],
07be1b83
YO
7849 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7850 ? "Overwriting end of array!\n" : "OK",
7851 (UV)(src - RExC_emit_start),
7852 (UV)(dst - RExC_emit_start),
7853 (UV)RExC_offsets[0]));
ccb2c380
MP
7854 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7855 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7856 }
7857 }
7858
a0d0e21e
LW
7859
7860 place = opnd; /* Op node, where operand used to be. */
fac92740 7861 if (RExC_offsets) { /* MJD */
07be1b83 7862 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7863 "reginsert",
ccb2c380
MP
7864 __LINE__,
7865 reg_name[op],
07be1b83 7866 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7867 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7868 (UV)(place - RExC_emit_start),
7869 (UV)(RExC_parse - RExC_start),
786e8c11 7870 (UV)RExC_offsets[0]));
ccb2c380 7871 Set_Node_Offset(place, RExC_parse);
45948336 7872 Set_Node_Length(place, 1);
fac92740 7873 }
c277df42
IZ
7874 src = NEXTOPER(place);
7875 FILL_ADVANCE_NODE(place, op);
7876 Zero(src, offset, regnode);
a687059c
LW
7877}
7878
7879/*
c277df42 7880- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7881- SEE ALSO: regtail_study
a0d0e21e 7882*/
097eb12c 7883/* TODO: All three parms should be const */
76e3520e 7884STATIC void
3dab1dad 7885S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7886{
97aff369 7887 dVAR;
c277df42 7888 register regnode *scan;
72f13be8 7889 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7890#ifndef DEBUGGING
7891 PERL_UNUSED_ARG(depth);
7892#endif
a0d0e21e 7893
c277df42 7894 if (SIZE_ONLY)
a0d0e21e
LW
7895 return;
7896
7897 /* Find last node. */
7898 scan = p;
7899 for (;;) {
504618e9 7900 regnode * const temp = regnext(scan);
3dab1dad
YO
7901 DEBUG_PARSE_r({
7902 SV * const mysv=sv_newmortal();
7903 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7904 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7905 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7906 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7907 (temp == NULL ? "->" : ""),
7908 (temp == NULL ? reg_name[OP(val)] : "")
7909 );
3dab1dad
YO
7910 });
7911 if (temp == NULL)
7912 break;
7913 scan = temp;
7914 }
7915
7916 if (reg_off_by_arg[OP(scan)]) {
7917 ARG_SET(scan, val - scan);
7918 }
7919 else {
7920 NEXT_OFF(scan) = val - scan;
7921 }
7922}
7923
07be1b83 7924#ifdef DEBUGGING
3dab1dad
YO
7925/*
7926- regtail_study - set the next-pointer at the end of a node chain of p to val.
7927- Look for optimizable sequences at the same time.
7928- currently only looks for EXACT chains.
07be1b83
YO
7929
7930This is expermental code. The idea is to use this routine to perform
7931in place optimizations on branches and groups as they are constructed,
7932with the long term intention of removing optimization from study_chunk so
7933that it is purely analytical.
7934
7935Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7936to control which is which.
7937
3dab1dad
YO
7938*/
7939/* TODO: All four parms should be const */
07be1b83 7940
3dab1dad
YO
7941STATIC U8
7942S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7943{
7944 dVAR;
7945 register regnode *scan;
07be1b83
YO
7946 U8 exact = PSEUDO;
7947#ifdef EXPERIMENTAL_INPLACESCAN
7948 I32 min = 0;
7949#endif
7950
3dab1dad
YO
7951 GET_RE_DEBUG_FLAGS_DECL;
7952
07be1b83 7953
3dab1dad
YO
7954 if (SIZE_ONLY)
7955 return exact;
7956
7957 /* Find last node. */
7958
7959 scan = p;
7960 for (;;) {
7961 regnode * const temp = regnext(scan);
07be1b83
YO
7962#ifdef EXPERIMENTAL_INPLACESCAN
7963 if (PL_regkind[OP(scan)] == EXACT)
7964 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7965 return EXACT;
7966#endif
3dab1dad
YO
7967 if ( exact ) {
7968 switch (OP(scan)) {
7969 case EXACT:
7970 case EXACTF:
7971 case EXACTFL:
7972 if( exact == PSEUDO )
7973 exact= OP(scan);
07be1b83
YO
7974 else if ( exact != OP(scan) )
7975 exact= 0;
3dab1dad
YO
7976 case NOTHING:
7977 break;
7978 default:
7979 exact= 0;
7980 }
7981 }
7982 DEBUG_PARSE_r({
7983 SV * const mysv=sv_newmortal();
7984 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7985 regprop(RExC_rx, mysv, scan);
eaf3ca90 7986 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7987 SvPV_nolen_const(mysv),
eaf3ca90
YO
7988 REG_NODE_NUM(scan),
7989 reg_name[exact]);
3dab1dad 7990 });
a0d0e21e
LW
7991 if (temp == NULL)
7992 break;
7993 scan = temp;
7994 }
07be1b83
YO
7995 DEBUG_PARSE_r({
7996 SV * const mysv_val=sv_newmortal();
7997 DEBUG_PARSE_MSG("");
7998 regprop(RExC_rx, mysv_val, val);
7999 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8000 SvPV_nolen_const(mysv_val),
8001 REG_NODE_NUM(val),
8002 val - scan
8003 );
8004 });
c277df42
IZ
8005 if (reg_off_by_arg[OP(scan)]) {
8006 ARG_SET(scan, val - scan);
a0ed51b3
LW
8007 }
8008 else {
c277df42
IZ
8009 NEXT_OFF(scan) = val - scan;
8010 }
3dab1dad
YO
8011
8012 return exact;
a687059c 8013}
07be1b83 8014#endif
a687059c
LW
8015
8016/*
a687059c
LW
8017 - regcurly - a little FSA that accepts {\d+,?\d*}
8018 */
79072805 8019STATIC I32
5f66b61c 8020S_regcurly(register const char *s)
a687059c
LW
8021{
8022 if (*s++ != '{')
8023 return FALSE;
f0fcb552 8024 if (!isDIGIT(*s))
a687059c 8025 return FALSE;
f0fcb552 8026 while (isDIGIT(*s))
a687059c
LW
8027 s++;
8028 if (*s == ',')
8029 s++;
f0fcb552 8030 while (isDIGIT(*s))
a687059c
LW
8031 s++;
8032 if (*s != '}')
8033 return FALSE;
8034 return TRUE;
8035}
8036
a687059c
LW
8037
8038/*
fd181c75 8039 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8040 */
8041void
097eb12c 8042Perl_regdump(pTHX_ const regexp *r)
a687059c 8043{
35ff7856 8044#ifdef DEBUGGING
97aff369 8045 dVAR;
c445ea15 8046 SV * const sv = sv_newmortal();
ab3bbdeb 8047 SV *dsv= sv_newmortal();
a687059c 8048
786e8c11 8049 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8050
8051 /* Header fields of interest. */
ab3bbdeb
YO
8052 if (r->anchored_substr) {
8053 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8054 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8055 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8056 "anchored %s%s at %"IVdf" ",
8057 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8058 (IV)r->anchored_offset);
ab3bbdeb
YO
8059 } else if (r->anchored_utf8) {
8060 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8061 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8062 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8063 "anchored utf8 %s%s at %"IVdf" ",
8064 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8065 (IV)r->anchored_offset);
ab3bbdeb
YO
8066 }
8067 if (r->float_substr) {
8068 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8069 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8070 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8071 "floating %s%s at %"IVdf"..%"UVuf" ",
8072 s, RE_SV_TAIL(r->float_substr),
7b0972df 8073 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8074 } else if (r->float_utf8) {
8075 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8076 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8077 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8078 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8079 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8080 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8081 }
33b8afdf 8082 if (r->check_substr || r->check_utf8)
b81d288d 8083 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8084 (const char *)
8085 (r->check_substr == r->float_substr
8086 && r->check_utf8 == r->float_utf8
8087 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
8088 if (r->reganch & ROPT_NOSCAN)
8089 PerlIO_printf(Perl_debug_log, " noscan");
8090 if (r->reganch & ROPT_CHECK_ALL)
8091 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8092 if (r->check_substr || r->check_utf8)
c277df42
IZ
8093 PerlIO_printf(Perl_debug_log, ") ");
8094
46fc3d4c 8095 if (r->regstclass) {
32fc9b6a 8096 regprop(r, sv, r->regstclass);
1de06328 8097 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8098 }
774d564b 8099 if (r->reganch & ROPT_ANCH) {
8100 PerlIO_printf(Perl_debug_log, "anchored");
8101 if (r->reganch & ROPT_ANCH_BOL)
8102 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
8103 if (r->reganch & ROPT_ANCH_MBOL)
8104 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
8105 if (r->reganch & ROPT_ANCH_SBOL)
8106 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 8107 if (r->reganch & ROPT_ANCH_GPOS)
8108 PerlIO_printf(Perl_debug_log, "(GPOS)");
8109 PerlIO_putc(Perl_debug_log, ' ');
8110 }
c277df42
IZ
8111 if (r->reganch & ROPT_GPOS_SEEN)
8112 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 8113 if (r->reganch & ROPT_SKIP)
760ac839 8114 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8115 if (r->reganch & ROPT_IMPLICIT)
760ac839 8116 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8117 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8118 if (r->reganch & ROPT_EVAL_SEEN)
8119 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8120 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8121#else
96a5add6 8122 PERL_UNUSED_CONTEXT;
65e66c80 8123 PERL_UNUSED_ARG(r);
17c3b450 8124#endif /* DEBUGGING */
a687059c
LW
8125}
8126
8127/*
a0d0e21e
LW
8128- regprop - printable representation of opcode
8129*/
46fc3d4c 8130void
32fc9b6a 8131Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8132{
35ff7856 8133#ifdef DEBUGGING
97aff369 8134 dVAR;
9b155405 8135 register int k;
1de06328 8136 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8137
54dc92de 8138 sv_setpvn(sv, "", 0);
b515a41d 8139
03363afd 8140 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8141 /* It would be nice to FAIL() here, but this may be called from
8142 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8143 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8144 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8145
3dab1dad 8146 k = PL_regkind[OP(o)];
9b155405 8147
2a782b5b 8148 if (k == EXACT) {
396482e1 8149 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8150 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8151 * is a crude hack but it may be the best for now since
8152 * we have no flag "this EXACTish node was UTF-8"
8153 * --jhi */
8154 const char * const s =
ddc5bc0f 8155 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8156 PL_colors[0], PL_colors[1],
8157 PERL_PV_ESCAPE_UNI_DETECT |
8158 PERL_PV_PRETTY_ELIPSES |
8159 PERL_PV_PRETTY_LTGT
8160 );
8161 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8162 } else if (k == TRIE) {
3dab1dad 8163 /* print the details of the trie in dumpuntil instead, as
4f639d21 8164 * prog->data isn't available here */
1de06328
YO
8165 const char op = OP(o);
8166 const I32 n = ARG(o);
8167 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8168 (reg_ac_data *)prog->data->data[n] :
8169 NULL;
8170 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8171 (reg_trie_data*)prog->data->data[n] :
8172 ac->trie;
8173
8174 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8175 DEBUG_TRIE_COMPILE_r(
8176 Perl_sv_catpvf(aTHX_ sv,
8177 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8178 (UV)trie->startstate,
1e2e3d02 8179 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8180 (UV)trie->wordcount,
8181 (UV)trie->minlen,
8182 (UV)trie->maxlen,
8183 (UV)TRIE_CHARCOUNT(trie),
8184 (UV)trie->uniquecharcount
8185 )
8186 );
8187 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8188 int i;
8189 int rangestart = -1;
f46cb337 8190 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8191 Perl_sv_catpvf(aTHX_ sv, "[");
8192 for (i = 0; i <= 256; i++) {
8193 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8194 if (rangestart == -1)
8195 rangestart = i;
8196 } else if (rangestart != -1) {
8197 if (i <= rangestart + 3)
8198 for (; rangestart < i; rangestart++)
8199 put_byte(sv, rangestart);
8200 else {
8201 put_byte(sv, rangestart);
8202 sv_catpvs(sv, "-");
8203 put_byte(sv, i - 1);
8204 }
8205 rangestart = -1;
8206 }
8207 }
8208 Perl_sv_catpvf(aTHX_ sv, "]");
8209 }
8210
a3621e74 8211 } else if (k == CURLY) {
cb434fcc 8212 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8213 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8214 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8215 }
2c2d71f5
JH
8216 else if (k == WHILEM && o->flags) /* Ordinal/of */
8217 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8218 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8219 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8220 else if (k == GOSUB)
6bda09f9 8221 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8222 else if (k == VERB) {
8223 if (!o->flags)
8224 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8225 (SV*)prog->data->data[ ARG( o ) ]);
8226 } else if (k == LOGICAL)
04ebc1ab 8227 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8228 else if (k == ANYOF) {
8229 int i, rangestart = -1;
2d03de9c 8230 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8231
8232 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8233 static const char * const anyofs[] = {
653099ff
GS
8234 "\\w",
8235 "\\W",
8236 "\\s",
8237 "\\S",
8238 "\\d",
8239 "\\D",
8240 "[:alnum:]",
8241 "[:^alnum:]",
8242 "[:alpha:]",
8243 "[:^alpha:]",
8244 "[:ascii:]",
8245 "[:^ascii:]",
8246 "[:ctrl:]",
8247 "[:^ctrl:]",
8248 "[:graph:]",
8249 "[:^graph:]",
8250 "[:lower:]",
8251 "[:^lower:]",
8252 "[:print:]",
8253 "[:^print:]",
8254 "[:punct:]",
8255 "[:^punct:]",
8256 "[:upper:]",
aaa51d5e 8257 "[:^upper:]",
653099ff 8258 "[:xdigit:]",
aaa51d5e
JF
8259 "[:^xdigit:]",
8260 "[:space:]",
8261 "[:^space:]",
8262 "[:blank:]",
8263 "[:^blank:]"
653099ff
GS
8264 };
8265
19860706 8266 if (flags & ANYOF_LOCALE)
396482e1 8267 sv_catpvs(sv, "{loc}");
19860706 8268 if (flags & ANYOF_FOLD)
396482e1 8269 sv_catpvs(sv, "{i}");
653099ff 8270 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8271 if (flags & ANYOF_INVERT)
396482e1 8272 sv_catpvs(sv, "^");
ffc61ed2
JH
8273 for (i = 0; i <= 256; i++) {
8274 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8275 if (rangestart == -1)
8276 rangestart = i;
8277 } else if (rangestart != -1) {
8278 if (i <= rangestart + 3)
8279 for (; rangestart < i; rangestart++)
653099ff 8280 put_byte(sv, rangestart);
ffc61ed2
JH
8281 else {
8282 put_byte(sv, rangestart);
396482e1 8283 sv_catpvs(sv, "-");
ffc61ed2 8284 put_byte(sv, i - 1);
653099ff 8285 }
ffc61ed2 8286 rangestart = -1;
653099ff 8287 }
847a199f 8288 }
ffc61ed2
JH
8289
8290 if (o->flags & ANYOF_CLASS)
bb7a0f54 8291 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8292 if (ANYOF_CLASS_TEST(o,i))
8293 sv_catpv(sv, anyofs[i]);
8294
8295 if (flags & ANYOF_UNICODE)
396482e1 8296 sv_catpvs(sv, "{unicode}");
1aa99e6b 8297 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8298 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8299
8300 {
8301 SV *lv;
32fc9b6a 8302 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8303
ffc61ed2
JH
8304 if (lv) {
8305 if (sw) {
89ebb4a3 8306 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8307
ffc61ed2 8308 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8309 uvchr_to_utf8(s, i);
ffc61ed2 8310
3568d838 8311 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8312 if (rangestart == -1)
8313 rangestart = i;
8314 } else if (rangestart != -1) {
ffc61ed2
JH
8315 if (i <= rangestart + 3)
8316 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8317 const U8 * const e = uvchr_to_utf8(s,rangestart);
8318 U8 *p;
8319 for(p = s; p < e; p++)
ffc61ed2
JH
8320 put_byte(sv, *p);
8321 }
8322 else {
2d03de9c
AL
8323 const U8 *e = uvchr_to_utf8(s,rangestart);
8324 U8 *p;
8325 for (p = s; p < e; p++)
ffc61ed2 8326 put_byte(sv, *p);
396482e1 8327 sv_catpvs(sv, "-");
2d03de9c
AL
8328 e = uvchr_to_utf8(s, i-1);
8329 for (p = s; p < e; p++)
1df70142 8330 put_byte(sv, *p);
ffc61ed2
JH
8331 }
8332 rangestart = -1;
8333 }
19860706 8334 }
ffc61ed2 8335
396482e1 8336 sv_catpvs(sv, "..."); /* et cetera */
19860706 8337 }
fde631ed 8338
ffc61ed2 8339 {
2e0de35c 8340 char *s = savesvpv(lv);
c445ea15 8341 char * const origs = s;
b81d288d 8342
3dab1dad
YO
8343 while (*s && *s != '\n')
8344 s++;
b81d288d 8345
ffc61ed2 8346 if (*s == '\n') {
2d03de9c 8347 const char * const t = ++s;
ffc61ed2
JH
8348
8349 while (*s) {
8350 if (*s == '\n')
8351 *s = ' ';
8352 s++;
8353 }
8354 if (s[-1] == ' ')
8355 s[-1] = 0;
8356
8357 sv_catpv(sv, t);
fde631ed 8358 }
b81d288d 8359
ffc61ed2 8360 Safefree(origs);
fde631ed
JH
8361 }
8362 }
653099ff 8363 }
ffc61ed2 8364
653099ff
GS
8365 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8366 }
9b155405 8367 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8368 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8369#else
96a5add6 8370 PERL_UNUSED_CONTEXT;
65e66c80
SP
8371 PERL_UNUSED_ARG(sv);
8372 PERL_UNUSED_ARG(o);
f9049ba1 8373 PERL_UNUSED_ARG(prog);
17c3b450 8374#endif /* DEBUGGING */
35ff7856 8375}
a687059c 8376
cad2e5aa
JH
8377SV *
8378Perl_re_intuit_string(pTHX_ regexp *prog)
8379{ /* Assume that RE_INTUIT is set */
97aff369 8380 dVAR;
a3621e74 8381 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8382 PERL_UNUSED_CONTEXT;
8383
a3621e74 8384 DEBUG_COMPILE_r(
cfd0369c 8385 {
2d03de9c 8386 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8387 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8388
8389 if (!PL_colorset) reginitcolors();
8390 PerlIO_printf(Perl_debug_log,
a0288114 8391 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8392 PL_colors[4],
8393 prog->check_substr ? "" : "utf8 ",
8394 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8395 s,
8396 PL_colors[1],
8397 (strlen(s) > 60 ? "..." : ""));
8398 } );
8399
33b8afdf 8400 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8401}
8402
84da74a7
YO
8403/*
8404 pregfree - free a regexp
8405
8406 See regdupe below if you change anything here.
8407*/
8408
2b69d0c2 8409void
864dbfa3 8410Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8411{
27da23d5 8412 dVAR;
0df25f3d 8413
fc32ee4a 8414 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8415
7821416a
IZ
8416 if (!r || (--r->refcnt > 0))
8417 return;
ab3bbdeb 8418 DEBUG_COMPILE_r({
0df25f3d
YO
8419 if (!PL_colorset)
8420 reginitcolors();
5c9f2f80
YO
8421 {
8422 SV *dsv= sv_newmortal();
ab3bbdeb
YO
8423 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8424 dsv, r->precomp, r->prelen, 60);
8425 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8426 PL_colors[4],PL_colors[5],s);
8427 }
9e55ce06 8428 });
cad2e5aa 8429
43c5f42d
NC
8430 /* gcov results gave these as non-null 100% of the time, so there's no
8431 optimisation in checking them before calling Safefree */
8432 Safefree(r->precomp);
8433 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8434 RX_MATCH_COPY_FREE(r);
f8c7b90f 8435#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8436 if (r->saved_copy)
8437 SvREFCNT_dec(r->saved_copy);
8438#endif
a193d654
GS
8439 if (r->substrs) {
8440 if (r->anchored_substr)
8441 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8442 if (r->anchored_utf8)
8443 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8444 if (r->float_substr)
8445 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8446 if (r->float_utf8)
8447 SvREFCNT_dec(r->float_utf8);
2779dcf1 8448 Safefree(r->substrs);
a193d654 8449 }
81714fb9
YO
8450 if (r->paren_names)
8451 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8452 if (r->data) {
8453 int n = r->data->count;
f3548bdc
DM
8454 PAD* new_comppad = NULL;
8455 PAD* old_comppad;
4026c95a 8456 PADOFFSET refcnt;
dfad63ad 8457
c277df42 8458 while (--n >= 0) {
261faec3 8459 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8460 switch (r->data->what[n]) {
8461 case 's':
81714fb9 8462 case 'S':
c277df42
IZ
8463 SvREFCNT_dec((SV*)r->data->data[n]);
8464 break;
653099ff
GS
8465 case 'f':
8466 Safefree(r->data->data[n]);
8467 break;
dfad63ad
HS
8468 case 'p':
8469 new_comppad = (AV*)r->data->data[n];
8470 break;
c277df42 8471 case 'o':
dfad63ad 8472 if (new_comppad == NULL)
cea2e8a9 8473 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8474 PAD_SAVE_LOCAL(old_comppad,
8475 /* Watch out for global destruction's random ordering. */
c445ea15 8476 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8477 );
b34c0dd4 8478 OP_REFCNT_LOCK;
4026c95a
SH
8479 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8480 OP_REFCNT_UNLOCK;
8481 if (!refcnt)
9b978d73 8482 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8483
f3548bdc 8484 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8485 SvREFCNT_dec((SV*)new_comppad);
8486 new_comppad = NULL;
c277df42
IZ
8487 break;
8488 case 'n':
9e55ce06 8489 break;
07be1b83 8490 case 'T':
be8e71aa
YO
8491 { /* Aho Corasick add-on structure for a trie node.
8492 Used in stclass optimization only */
07be1b83
YO
8493 U32 refcount;
8494 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8495 OP_REFCNT_LOCK;
8496 refcount = --aho->refcount;
8497 OP_REFCNT_UNLOCK;
8498 if ( !refcount ) {
8499 Safefree(aho->states);
8500 Safefree(aho->fail);
8501 aho->trie=NULL; /* not necessary to free this as it is
8502 handled by the 't' case */
8503 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8504 Safefree(r->regstclass);
07be1b83
YO
8505 }
8506 }
8507 break;
a3621e74 8508 case 't':
07be1b83 8509 {
be8e71aa 8510 /* trie structure. */
07be1b83
YO
8511 U32 refcount;
8512 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8513 OP_REFCNT_LOCK;
8514 refcount = --trie->refcount;
8515 OP_REFCNT_UNLOCK;
8516 if ( !refcount ) {
8517 Safefree(trie->charmap);
8518 if (trie->widecharmap)
8519 SvREFCNT_dec((SV*)trie->widecharmap);
8520 Safefree(trie->states);
8521 Safefree(trie->trans);
8522 if (trie->bitmap)
8523 Safefree(trie->bitmap);
8524 if (trie->wordlen)
8525 Safefree(trie->wordlen);
786e8c11
YO
8526 if (trie->jump)
8527 Safefree(trie->jump);
8528 if (trie->nextword)
8529 Safefree(trie->nextword);
a3621e74 8530#ifdef DEBUGGING
5c9f2f80
YO
8531 if (trie->words)
8532 SvREFCNT_dec((SV*)trie->words);
8533 if (trie->revcharmap)
8534 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8535#endif
07be1b83 8536 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8537 }
07be1b83
YO
8538 }
8539 break;
c277df42 8540 default:
830247a4 8541 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8542 }
8543 }
8544 Safefree(r->data->what);
8545 Safefree(r->data);
a0d0e21e
LW
8546 }
8547 Safefree(r->startp);
8548 Safefree(r->endp);
c74340f9
YO
8549 if (r->swap) {
8550 Safefree(r->swap->startp);
8551 Safefree(r->swap->endp);
8552 Safefree(r->swap);
8553 }
a0d0e21e 8554 Safefree(r);
a687059c 8555}
c277df42 8556
84da74a7
YO
8557#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8558#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8559#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8560#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8561
8562/*
8563 regdupe - duplicate a regexp.
8564
8565 This routine is called by sv.c's re_dup and is expected to clone a
8566 given regexp structure. It is a no-op when not under USE_ITHREADS.
8567 (Originally this *was* re_dup() for change history see sv.c)
8568
8569 See pregfree() above if you change anything here.
8570*/
a3c0e9ca 8571#if defined(USE_ITHREADS)
84da74a7
YO
8572regexp *
8573Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8574{
84da74a7
YO
8575 dVAR;
8576 REGEXP *ret;
8577 int i, len, npar;
8578 struct reg_substr_datum *s;
8579
8580 if (!r)
8581 return (REGEXP *)NULL;
8582
8583 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8584 return ret;
8585
8586 len = r->offsets[0];
8587 npar = r->nparens+1;
8588
8589 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8590 Copy(r->program, ret->program, len+1, regnode);
8591
8592 Newx(ret->startp, npar, I32);
8593 Copy(r->startp, ret->startp, npar, I32);
8594 Newx(ret->endp, npar, I32);
8595 Copy(r->startp, ret->startp, npar, I32);
c74340f9
YO
8596 if(r->swap) {
8597 Newx(ret->swap, 1, regexp_paren_ofs);
8598 /* no need to copy these */
8599 Newx(ret->swap->startp, npar, I32);
8600 Newx(ret->swap->endp, npar, I32);
8601 } else {
8602 ret->swap = NULL;
8603 }
84da74a7
YO
8604
8605 Newx(ret->substrs, 1, struct reg_substr_data);
8606 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8607 s->min_offset = r->substrs->data[i].min_offset;
8608 s->max_offset = r->substrs->data[i].max_offset;
8609 s->end_shift = r->substrs->data[i].end_shift;
8610 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8611 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8612 }
8613
8614 ret->regstclass = NULL;
8615 if (r->data) {
8616 struct reg_data *d;
8617 const int count = r->data->count;
8618 int i;
8619
8620 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8621 char, struct reg_data);
8622 Newx(d->what, count, U8);
8623
8624 d->count = count;
8625 for (i = 0; i < count; i++) {
8626 d->what[i] = r->data->what[i];
8627 switch (d->what[i]) {
e2e6a0f1 8628 /* legal options are one of: sSfpont
84da74a7
YO
8629 see also regcomp.h and pregfree() */
8630 case 's':
81714fb9 8631 case 'S':
84da74a7
YO
8632 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8633 break;
8634 case 'p':
8635 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8636 break;
8637 case 'f':
8638 /* This is cheating. */
8639 Newx(d->data[i], 1, struct regnode_charclass_class);
8640 StructCopy(r->data->data[i], d->data[i],
8641 struct regnode_charclass_class);
8642 ret->regstclass = (regnode*)d->data[i];
8643 break;
8644 case 'o':
8645 /* Compiled op trees are readonly, and can thus be
8646 shared without duplication. */
8647 OP_REFCNT_LOCK;
8648 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8649 OP_REFCNT_UNLOCK;
8650 break;
8651 case 'n':
8652 d->data[i] = r->data->data[i];
8653 break;
8654 case 't':
8655 d->data[i] = r->data->data[i];
8656 OP_REFCNT_LOCK;
8657 ((reg_trie_data*)d->data[i])->refcount++;
8658 OP_REFCNT_UNLOCK;
8659 break;
8660 case 'T':
8661 d->data[i] = r->data->data[i];
8662 OP_REFCNT_LOCK;
8663 ((reg_ac_data*)d->data[i])->refcount++;
8664 OP_REFCNT_UNLOCK;
8665 /* Trie stclasses are readonly and can thus be shared
8666 * without duplication. We free the stclass in pregfree
8667 * when the corresponding reg_ac_data struct is freed.
8668 */
8669 ret->regstclass= r->regstclass;
8670 break;
8671 default:
8672 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8673 }
8674 }
8675
8676 ret->data = d;
8677 }
8678 else
8679 ret->data = NULL;
8680
8681 Newx(ret->offsets, 2*len+1, U32);
8682 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8683
8684 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8685 ret->refcnt = r->refcnt;
8686 ret->minlen = r->minlen;
de8c5301 8687 ret->minlenret = r->minlenret;
84da74a7
YO
8688 ret->prelen = r->prelen;
8689 ret->nparens = r->nparens;
8690 ret->lastparen = r->lastparen;
8691 ret->lastcloseparen = r->lastcloseparen;
8692 ret->reganch = r->reganch;
8693
8694 ret->sublen = r->sublen;
8695
f9f4320a 8696 ret->engine = r->engine;
81714fb9
YO
8697
8698 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8699
84da74a7
YO
8700 if (RX_MATCH_COPIED(ret))
8701 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8702 else
8703 ret->subbeg = NULL;
8704#ifdef PERL_OLD_COPY_ON_WRITE
8705 ret->saved_copy = NULL;
8706#endif
8707
8708 ptr_table_store(PL_ptr_table, r, ret);
8709 return ret;
84da74a7 8710}
a3c0e9ca 8711#endif
84da74a7 8712
de8c5301
YO
8713/*
8714 reg_stringify()
8715
8716 converts a regexp embedded in a MAGIC struct to its stringified form,
8717 caching the converted form in the struct and returns the cached
8718 string.
8719
8720 If lp is nonnull then it is used to return the length of the
8721 resulting string
8722
8723 If flags is nonnull and the returned string contains UTF8 then
8724 (flags & 1) will be true.
8725
8726 If haseval is nonnull then it is used to return whether the pattern
8727 contains evals.
8728
8729 Normally called via macro:
8730
8731 CALLREG_STRINGIFY(mg,0,0);
8732
8733 And internally with
8734
8735 CALLREG_AS_STR(mg,lp,flags,haseval)
8736
8737 See sv_2pv_flags() in sv.c for an example of internal usage.
8738
8739 */
8740
8741char *
8742Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8743 dVAR;
8744 const regexp * const re = (regexp *)mg->mg_obj;
8745
8746 if (!mg->mg_ptr) {
8747 const char *fptr = "msix";
8748 char reflags[6];
8749 char ch;
8750 int left = 0;
8751 int right = 4;
8752 bool need_newline = 0;
8753 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8754
8755 while((ch = *fptr++)) {
8756 if(reganch & 1) {
8757 reflags[left++] = ch;
8758 }
8759 else {
8760 reflags[right--] = ch;
8761 }
8762 reganch >>= 1;
8763 }
8764 if(left != 4) {
8765 reflags[left] = '-';
8766 left = 5;
8767 }
8768
8769 mg->mg_len = re->prelen + 4 + left;
8770 /*
8771 * If /x was used, we have to worry about a regex ending with a
8772 * comment later being embedded within another regex. If so, we don't
8773 * want this regex's "commentization" to leak out to the right part of
8774 * the enclosing regex, we must cap it with a newline.
8775 *
8776 * So, if /x was used, we scan backwards from the end of the regex. If
8777 * we find a '#' before we find a newline, we need to add a newline
8778 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8779 * we don't need to add anything. -jfriedl
8780 */
8781 if (PMf_EXTENDED & re->reganch) {
8782 const char *endptr = re->precomp + re->prelen;
8783 while (endptr >= re->precomp) {
8784 const char c = *(endptr--);
8785 if (c == '\n')
8786 break; /* don't need another */
8787 if (c == '#') {
8788 /* we end while in a comment, so we need a newline */
8789 mg->mg_len++; /* save space for it */
8790 need_newline = 1; /* note to add it */
8791 break;
8792 }
8793 }
8794 }
8795
8796 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8797 mg->mg_ptr[0] = '(';
8798 mg->mg_ptr[1] = '?';
8799 Copy(reflags, mg->mg_ptr+2, left, char);
8800 *(mg->mg_ptr+left+2) = ':';
8801 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8802 if (need_newline)
8803 mg->mg_ptr[mg->mg_len - 2] = '\n';
8804 mg->mg_ptr[mg->mg_len - 1] = ')';
8805 mg->mg_ptr[mg->mg_len] = 0;
8806 }
8807 if (haseval)
8808 *haseval = re->program[0].next_off;
8809 if (flags)
8810 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8811
8812 if (lp)
8813 *lp = mg->mg_len;
8814 return mg->mg_ptr;
8815}
8816
8817
76234dfb 8818#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8819/*
8820 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8821 */
8822regnode *
864dbfa3 8823Perl_regnext(pTHX_ register regnode *p)
c277df42 8824{
97aff369 8825 dVAR;
c277df42
IZ
8826 register I32 offset;
8827
3280af22 8828 if (p == &PL_regdummy)
c277df42
IZ
8829 return(NULL);
8830
8831 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8832 if (offset == 0)
8833 return(NULL);
8834
c277df42 8835 return(p+offset);
c277df42 8836}
76234dfb 8837#endif
c277df42 8838
01f988be 8839STATIC void
cea2e8a9 8840S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8841{
8842 va_list args;
8843 STRLEN l1 = strlen(pat1);
8844 STRLEN l2 = strlen(pat2);
8845 char buf[512];
06bf62c7 8846 SV *msv;
73d840c0 8847 const char *message;
c277df42
IZ
8848
8849 if (l1 > 510)
8850 l1 = 510;
8851 if (l1 + l2 > 510)
8852 l2 = 510 - l1;
8853 Copy(pat1, buf, l1 , char);
8854 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8855 buf[l1 + l2] = '\n';
8856 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8857#ifdef I_STDARG
8858 /* ANSI variant takes additional second argument */
c277df42 8859 va_start(args, pat2);
8736538c
AS
8860#else
8861 va_start(args);
8862#endif
5a844595 8863 msv = vmess(buf, &args);
c277df42 8864 va_end(args);
cfd0369c 8865 message = SvPV_const(msv,l1);
c277df42
IZ
8866 if (l1 > 512)
8867 l1 = 512;
8868 Copy(message, buf, l1 , char);
197cf9b9 8869 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8870 Perl_croak(aTHX_ "%s", buf);
c277df42 8871}
a0ed51b3
LW
8872
8873/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8874
76234dfb 8875#ifndef PERL_IN_XSUB_RE
a0ed51b3 8876void
864dbfa3 8877Perl_save_re_context(pTHX)
b81d288d 8878{
97aff369 8879 dVAR;
1ade1aa1
NC
8880
8881 struct re_save_state *state;
8882
8883 SAVEVPTR(PL_curcop);
8884 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8885
8886 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8887 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8888 SSPUSHINT(SAVEt_RE_STATE);
8889
46ab3289 8890 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8891
a0ed51b3 8892 PL_reg_start_tmp = 0;
a0ed51b3 8893 PL_reg_start_tmpl = 0;
c445ea15 8894 PL_reg_oldsaved = NULL;
a5db57d6 8895 PL_reg_oldsavedlen = 0;
a5db57d6 8896 PL_reg_maxiter = 0;
a5db57d6 8897 PL_reg_leftiter = 0;
c445ea15 8898 PL_reg_poscache = NULL;
a5db57d6 8899 PL_reg_poscache_size = 0;
1ade1aa1
NC
8900#ifdef PERL_OLD_COPY_ON_WRITE
8901 PL_nrs = NULL;
8902#endif
ada6e8a9 8903
c445ea15
AL
8904 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8905 if (PL_curpm) {
8906 const REGEXP * const rx = PM_GETRE(PL_curpm);
8907 if (rx) {
1df70142 8908 U32 i;
ada6e8a9 8909 for (i = 1; i <= rx->nparens; i++) {
1df70142 8910 char digits[TYPE_CHARS(long)];
d9fad198 8911 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8912 GV *const *const gvp
8913 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8914
b37c2d43
AL
8915 if (gvp) {
8916 GV * const gv = *gvp;
8917 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8918 save_scalar(gv);
49f27e4b 8919 }
ada6e8a9
AMS
8920 }
8921 }
8922 }
a0ed51b3 8923}
76234dfb 8924#endif
51371543 8925
51371543 8926static void
acfe0abc 8927clear_re(pTHX_ void *r)
51371543 8928{
97aff369 8929 dVAR;
51371543
GS
8930 ReREFCNT_dec((regexp *)r);
8931}
ffbc6a93 8932
a28509cc
AL
8933#ifdef DEBUGGING
8934
8935STATIC void
8936S_put_byte(pTHX_ SV *sv, int c)
8937{
8938 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8939 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8940 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8941 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8942 else
8943 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8944}
8945
786e8c11 8946
3dab1dad
YO
8947#define CLEAR_OPTSTART \
8948 if (optstart) STMT_START { \
07be1b83 8949 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8950 optstart=NULL; \
8951 } STMT_END
8952
786e8c11 8953#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8954
b5a2f8d8
NC
8955STATIC const regnode *
8956S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8957 const regnode *last, const regnode *plast,
8958 SV* sv, I32 indent, U32 depth)
a28509cc 8959{
97aff369 8960 dVAR;
786e8c11 8961 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8962 register const regnode *next;
3dab1dad
YO
8963 const regnode *optstart= NULL;
8964 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8965
786e8c11
YO
8966#ifdef DEBUG_DUMPUNTIL
8967 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8968 last ? last-start : 0,plast ? plast-start : 0);
8969#endif
8970
8971 if (plast && plast < last)
8972 last= plast;
8973
8974 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8975 /* While that wasn't END last time... */
8976
8977 NODE_ALIGN(node);
8978 op = OP(node);
8979 if (op == CLOSE)
786e8c11 8980 indent--;
b5a2f8d8 8981 next = regnext((regnode *)node);
07be1b83 8982
a28509cc 8983 /* Where, what. */
8e11feef 8984 if (OP(node) == OPTIMIZED) {
e68ec53f 8985 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8986 optstart = node;
3dab1dad 8987 else
8e11feef 8988 goto after_print;
3dab1dad
YO
8989 } else
8990 CLEAR_OPTSTART;
07be1b83 8991
32fc9b6a 8992 regprop(r, sv, node);
a28509cc 8993 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8994 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8995
8996 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8997 if (next == NULL) /* Next ptr. */
8998 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8999 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9000 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9001 else
9002 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9003
1de06328 9004 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9005 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9006 }
9007
a28509cc
AL
9008 after_print:
9009 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9010 assert(next);
9011 {
9012 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9013 ? regnext((regnode *)next)
9014 : next);
be8e71aa
YO
9015 if (last && nnode > last)
9016 nnode = last;
786e8c11 9017 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9018 }
a28509cc
AL
9019 }
9020 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9021 assert(next);
786e8c11 9022 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9023 }
9024 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9025 const regnode *this_trie = node;
1de06328 9026 const char op = OP(node);
a28509cc 9027 const I32 n = ARG(node);
1de06328
YO
9028 const reg_ac_data * const ac = op>=AHOCORASICK ?
9029 (reg_ac_data *)r->data->data[n] :
9030 NULL;
9031 const reg_trie_data * const trie = op<AHOCORASICK ?
9032 (reg_trie_data*)r->data->data[n] :
9033 ac->trie;
786e8c11 9034 const regnode *nextbranch= NULL;
a28509cc 9035 I32 word_idx;
1de06328 9036 sv_setpvn(sv, "", 0);
786e8c11 9037 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9038 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9039
9040 PerlIO_printf(Perl_debug_log, "%*s%s ",
9041 (int)(2*(indent+3)), "",
9042 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9043 PL_colors[0], PL_colors[1],
9044 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9045 PERL_PV_PRETTY_ELIPSES |
7f69552c 9046 PERL_PV_PRETTY_LTGT
786e8c11
YO
9047 )
9048 : "???"
9049 );
9050 if (trie->jump) {
40d049e4 9051 U16 dist= trie->jump[word_idx+1];
7f69552c 9052 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 9053 (dist ? this_trie + dist : next) - start);
786e8c11
YO
9054 if (dist) {
9055 if (!nextbranch)
24b23f37 9056 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9057 DUMPUNTIL(this_trie + dist, nextbranch);
9058 }
786e8c11
YO
9059 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9060 nextbranch= regnext((regnode *)nextbranch);
9061 } else {
9062 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9063 }
786e8c11
YO
9064 }
9065 if (last && next > last)
9066 node= last;
9067 else
9068 node= next;
a28509cc 9069 }
786e8c11
YO
9070 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9071 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9072 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9073 }
9074 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9075 assert(next);
786e8c11 9076 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9077 }
9078 else if ( op == PLUS || op == STAR) {
786e8c11 9079 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9080 }
9081 else if (op == ANYOF) {
9082 /* arglen 1 + class block */
9083 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9084 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9085 node = NEXTOPER(node);
9086 }
9087 else if (PL_regkind[(U8)op] == EXACT) {
9088 /* Literal string, where present. */
9089 node += NODE_SZ_STR(node) - 1;
9090 node = NEXTOPER(node);
9091 }
9092 else {
9093 node = NEXTOPER(node);
9094 node += regarglen[(U8)op];
9095 }
9096 if (op == CURLYX || op == OPEN)
786e8c11 9097 indent++;
a28509cc 9098 else if (op == WHILEM)
786e8c11 9099 indent--;
a28509cc 9100 }
3dab1dad 9101 CLEAR_OPTSTART;
786e8c11
YO
9102#ifdef DEBUG_DUMPUNTIL
9103 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9104#endif
1de06328 9105 return node;
a28509cc
AL
9106}
9107
9108#endif /* DEBUGGING */
9109
241d1a3b
NC
9110/*
9111 * Local variables:
9112 * c-indentation-style: bsd
9113 * c-basic-offset: 4
9114 * indent-tabs-mode: t
9115 * End:
9116 *
37442d52
RGS
9117 * ex: set ts=8 sts=4 sw=4 noet:
9118 */