This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Off by one in the trie code?
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
6bda09f9 120 regnode **parens; /* offsets of each paren */
1aa99e6b 121 I32 utf8;
6bda09f9 122 HV *charnames; /* cache of named sequences */
81714fb9 123 HV *paren_names; /* Paren names */
830247a4
IZ
124#if ADD_TO_REGEXEC
125 char *starttry; /* -Dr: where regtry was called. */
126#define RExC_starttry (pRExC_state->starttry)
127#endif
3dab1dad 128#ifdef DEBUGGING
be8e71aa 129 const char *lastparse;
3dab1dad
YO
130 I32 lastnum;
131#define RExC_lastparse (pRExC_state->lastparse)
132#define RExC_lastnum (pRExC_state->lastnum)
133#endif
830247a4
IZ
134} RExC_state_t;
135
e2509266 136#define RExC_flags (pRExC_state->flags)
830247a4
IZ
137#define RExC_precomp (pRExC_state->precomp)
138#define RExC_rx (pRExC_state->rx)
fac92740 139#define RExC_start (pRExC_state->start)
830247a4
IZ
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 143#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 144#define RExC_emit (pRExC_state->emit)
fac92740 145#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
146#define RExC_naughty (pRExC_state->naughty)
147#define RExC_sawback (pRExC_state->sawback)
148#define RExC_seen (pRExC_state->seen)
149#define RExC_size (pRExC_state->size)
150#define RExC_npar (pRExC_state->npar)
151#define RExC_extralen (pRExC_state->extralen)
152#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 154#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 155#define RExC_charnames (pRExC_state->charnames)
6bda09f9 156#define RExC_parens (pRExC_state->parens)
81714fb9 157#define RExC_paren_names (pRExC_state->paren_names)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
3dab1dad
YO
175#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
176
07be1b83
YO
177/* whether trie related optimizations are enabled */
178#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
179#define TRIE_STUDY_OPT
786e8c11 180#define FULL_TRIE_STUDY
07be1b83
YO
181#define TRIE_STCLASS
182#endif
1de06328
YO
183
184
185/* About scan_data_t.
186
187 During optimisation we recurse through the regexp program performing
188 various inplace (keyhole style) optimisations. In addition study_chunk
189 and scan_commit populate this data structure with information about
190 what strings MUST appear in the pattern. We look for the longest
191 string that must appear for at a fixed location, and we look for the
192 longest string that may appear at a floating location. So for instance
193 in the pattern:
194
195 /FOO[xX]A.*B[xX]BAR/
196
197 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
198 strings (because they follow a .* construct). study_chunk will identify
199 both FOO and BAR as being the longest fixed and floating strings respectively.
200
201 The strings can be composites, for instance
202
203 /(f)(o)(o)/
204
205 will result in a composite fixed substring 'foo'.
206
207 For each string some basic information is maintained:
208
209 - offset or min_offset
210 This is the position the string must appear at, or not before.
211 It also implicitly (when combined with minlenp) tells us how many
212 character must match before the string we are searching.
213 Likewise when combined with minlenp and the length of the string
214 tells us how many characters must appear after the string we have
215 found.
216
217 - max_offset
218 Only used for floating strings. This is the rightmost point that
219 the string can appear at. Ifset to I32 max it indicates that the
220 string can occur infinitely far to the right.
221
222 - minlenp
223 A pointer to the minimum length of the pattern that the string
224 was found inside. This is important as in the case of positive
225 lookahead or positive lookbehind we can have multiple patterns
226 involved. Consider
227
228 /(?=FOO).*F/
229
230 The minimum length of the pattern overall is 3, the minimum length
231 of the lookahead part is 3, but the minimum length of the part that
232 will actually match is 1. So 'FOO's minimum length is 3, but the
233 minimum length for the F is 1. This is important as the minimum length
234 is used to determine offsets in front of and behind the string being
235 looked for. Since strings can be composites this is the length of the
236 pattern at the time it was commited with a scan_commit. Note that
237 the length is calculated by study_chunk, so that the minimum lengths
238 are not known until the full pattern has been compiled, thus the
239 pointer to the value.
240
241 - lookbehind
242
243 In the case of lookbehind the string being searched for can be
244 offset past the start point of the final matching string.
245 If this value was just blithely removed from the min_offset it would
246 invalidate some of the calculations for how many chars must match
247 before or after (as they are derived from min_offset and minlen and
248 the length of the string being searched for).
249 When the final pattern is compiled and the data is moved from the
250 scan_data_t structure into the regexp structure the information
251 about lookbehind is factored in, with the information that would
252 have been lost precalculated in the end_shift field for the
253 associated string.
254
255 The fields pos_min and pos_delta are used to store the minimum offset
256 and the delta to the maximum offset at the current point in the pattern.
257
258*/
2c2d71f5
JH
259
260typedef struct scan_data_t {
1de06328
YO
261 /*I32 len_min; unused */
262 /*I32 len_delta; unused */
2c2d71f5
JH
263 I32 pos_min;
264 I32 pos_delta;
265 SV *last_found;
1de06328 266 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
267 I32 last_start_min;
268 I32 last_start_max;
1de06328
YO
269 SV **longest; /* Either &l_fixed, or &l_float. */
270 SV *longest_fixed; /* longest fixed string found in pattern */
271 I32 offset_fixed; /* offset where it starts */
272 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
273 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
274 SV *longest_float; /* longest floating string found in pattern */
275 I32 offset_float_min; /* earliest point in string it can appear */
276 I32 offset_float_max; /* latest point in string it can appear */
277 I32 *minlen_float; /* pointer to the minlen relevent to the string */
278 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
279 I32 flags;
280 I32 whilem_c;
cb434fcc 281 I32 *last_closep;
653099ff 282 struct regnode_charclass_class *start_class;
2c2d71f5
JH
283} scan_data_t;
284
a687059c 285/*
e50aee73 286 * Forward declarations for pregcomp()'s friends.
a687059c 287 */
a0d0e21e 288
27da23d5 289static const scan_data_t zero_scan_data =
1de06328 290 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
291
292#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
293#define SF_BEFORE_SEOL 0x0001
294#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
295#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
296#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
297
09b7f37c
CB
298#ifdef NO_UNARY_PLUS
299# define SF_FIX_SHIFT_EOL (0+2)
300# define SF_FL_SHIFT_EOL (0+4)
301#else
302# define SF_FIX_SHIFT_EOL (+2)
303# define SF_FL_SHIFT_EOL (+4)
304#endif
c277df42
IZ
305
306#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
307#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
308
309#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
310#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
311#define SF_IS_INF 0x0040
312#define SF_HAS_PAR 0x0080
313#define SF_IN_PAR 0x0100
314#define SF_HAS_EVAL 0x0200
315#define SCF_DO_SUBSTR 0x0400
653099ff
GS
316#define SCF_DO_STCLASS_AND 0x0800
317#define SCF_DO_STCLASS_OR 0x1000
318#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 319#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 320
786e8c11
YO
321#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
322
07be1b83 323
eb160463 324#define UTF (RExC_utf8 != 0)
e2509266
JH
325#define LOC ((RExC_flags & PMf_LOCALE) != 0)
326#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 327
ffc61ed2 328#define OOB_UNICODE 12345678
93733859 329#define OOB_NAMEDCLASS -1
b8c5462f 330
a0ed51b3
LW
331#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
332#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
333
8615cb43 334
b45f050a
JF
335/* length of regex to show in messages that don't mark a position within */
336#define RegexLengthToShowInErrorMessages 127
337
338/*
339 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
340 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
341 * op/pragma/warn/regcomp.
342 */
7253e4e3
RK
343#define MARKER1 "<-- HERE" /* marker as it appears in the description */
344#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 345
7253e4e3 346#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
347
348/*
349 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
350 * arg. Show regex, up to a maximum length. If it's too long, chop and add
351 * "...".
352 */
ccb2c380 353#define FAIL(msg) STMT_START { \
bfed75c6 354 const char *ellipses = ""; \
ccb2c380
MP
355 IV len = RExC_end - RExC_precomp; \
356 \
357 if (!SIZE_ONLY) \
358 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
359 if (len > RegexLengthToShowInErrorMessages) { \
360 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
361 len = RegexLengthToShowInErrorMessages - 10; \
362 ellipses = "..."; \
363 } \
364 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
365 msg, (int)len, RExC_precomp, ellipses); \
366} STMT_END
8615cb43 367
b45f050a 368/*
b45f050a
JF
369 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
370 */
ccb2c380 371#define Simple_vFAIL(m) STMT_START { \
a28509cc 372 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
373 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
374 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
b45f050a
JF
376
377/*
378 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
379 */
ccb2c380
MP
380#define vFAIL(m) STMT_START { \
381 if (!SIZE_ONLY) \
382 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
383 Simple_vFAIL(m); \
384} STMT_END
b45f050a
JF
385
386/*
387 * Like Simple_vFAIL(), but accepts two arguments.
388 */
ccb2c380 389#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 390 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
391 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
392 (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
b45f050a
JF
394
395/*
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
397 */
ccb2c380
MP
398#define vFAIL2(m,a1) STMT_START { \
399 if (!SIZE_ONLY) \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
401 Simple_vFAIL2(m, a1); \
402} STMT_END
b45f050a
JF
403
404
405/*
406 * Like Simple_vFAIL(), but accepts three arguments.
407 */
ccb2c380 408#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 409 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
410 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
411 (int)offset, RExC_precomp, RExC_precomp + offset); \
412} STMT_END
b45f050a
JF
413
414/*
415 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
416 */
ccb2c380
MP
417#define vFAIL3(m,a1,a2) STMT_START { \
418 if (!SIZE_ONLY) \
419 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
420 Simple_vFAIL3(m, a1, a2); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Like Simple_vFAIL(), but accepts four arguments.
425 */
ccb2c380 426#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a 431
ccb2c380 432#define vWARN(loc,m) STMT_START { \
a28509cc 433 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
434 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
435 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
436} STMT_END
437
438#define vWARNdep(loc,m) STMT_START { \
a28509cc 439 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
440 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
441 "%s" REPORT_LOCATION, \
442 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
443} STMT_END
444
445
446#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 447 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
448 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
449 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
451
452#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
455 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
461 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
463
464#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
9d1d55b5 469
8615cb43 470
cd439c50 471/* Allow for side effects in s */
ccb2c380
MP
472#define REGC(c,s) STMT_START { \
473 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
474} STMT_END
cd439c50 475
fac92740
MJD
476/* Macros for recording node offsets. 20001227 mjd@plover.com
477 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
478 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
479 * Element 0 holds the number n.
07be1b83 480 * Position is 1 indexed.
fac92740
MJD
481 */
482
ccb2c380
MP
483#define Set_Node_Offset_To_R(node,byte) STMT_START { \
484 if (! SIZE_ONLY) { \
485 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 486 __LINE__, (node), (int)(byte))); \
ccb2c380 487 if((node) < 0) { \
551405c4 488 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
489 } else { \
490 RExC_offsets[2*(node)-1] = (byte); \
491 } \
492 } \
493} STMT_END
494
495#define Set_Node_Offset(node,byte) \
496 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
497#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
498
499#define Set_Node_Length_To_R(node,len) STMT_START { \
500 if (! SIZE_ONLY) { \
501 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 502 __LINE__, (int)(node), (int)(len))); \
ccb2c380 503 if((node) < 0) { \
551405c4 504 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
505 } else { \
506 RExC_offsets[2*(node)] = (len); \
507 } \
508 } \
509} STMT_END
510
511#define Set_Node_Length(node,len) \
512 Set_Node_Length_To_R((node)-RExC_emit_start, len)
513#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
514#define Set_Node_Cur_Length(node) \
515 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
516
517/* Get offsets and lengths */
518#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
519#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
520
07be1b83
YO
521#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
522 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
523 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
524} STMT_END
525
526
527#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
528#define EXPERIMENTAL_INPLACESCAN
529#endif
530
1de06328 531#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 532DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
533 PerlIO_printf(Perl_debug_log, \
534 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
535 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
536 (int)(depth)*2, "", \
537 (IV)((data)->pos_min), \
538 (IV)((data)->pos_delta), \
539 (IV)((data)->flags), \
540 (IV)((data)->whilem_c), \
541 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
542 ); \
543 if ((data)->last_found) \
544 PerlIO_printf(Perl_debug_log, \
545 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
546 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
547 SvPVX_const((data)->last_found), \
548 (IV)((data)->last_end), \
549 (IV)((data)->last_start_min), \
550 (IV)((data)->last_start_max), \
551 ((data)->longest && \
552 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
553 SvPVX_const((data)->longest_fixed), \
554 (IV)((data)->offset_fixed), \
555 ((data)->longest && \
556 (data)->longest==&((data)->longest_float)) ? "*" : "", \
557 SvPVX_const((data)->longest_float), \
558 (IV)((data)->offset_float_min), \
559 (IV)((data)->offset_float_max) \
560 ); \
561 PerlIO_printf(Perl_debug_log,"\n"); \
562});
563
acfe0abc 564static void clear_re(pTHX_ void *r);
4327152a 565
653099ff 566/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 567 Update the longest found anchored substring and the longest found
653099ff
GS
568 floating substrings if needed. */
569
4327152a 570STATIC void
1de06328 571S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 572{
e1ec3a88
AL
573 const STRLEN l = CHR_SVLEN(data->last_found);
574 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 575 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 576
c277df42 577 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 578 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
579 if (*data->longest == data->longest_fixed) {
580 data->offset_fixed = l ? data->last_start_min : data->pos_min;
581 if (data->flags & SF_BEFORE_EOL)
b81d288d 582 data->flags
c277df42
IZ
583 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
584 else
585 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
586 data->minlen_fixed=minlenp;
587 data->lookbehind_fixed=0;
a0ed51b3
LW
588 }
589 else {
c277df42 590 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
591 data->offset_float_max = (l
592 ? data->last_start_max
c277df42 593 : data->pos_min + data->pos_delta);
9051bda5
HS
594 if ((U32)data->offset_float_max > (U32)I32_MAX)
595 data->offset_float_max = I32_MAX;
c277df42 596 if (data->flags & SF_BEFORE_EOL)
b81d288d 597 data->flags
c277df42
IZ
598 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
599 else
600 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
601 data->minlen_float=minlenp;
602 data->lookbehind_float=0;
c277df42
IZ
603 }
604 }
605 SvCUR_set(data->last_found, 0);
0eda9292 606 {
a28509cc 607 SV * const sv = data->last_found;
097eb12c
AL
608 if (SvUTF8(sv) && SvMAGICAL(sv)) {
609 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
610 if (mg)
611 mg->mg_len = 0;
612 }
0eda9292 613 }
c277df42
IZ
614 data->last_end = -1;
615 data->flags &= ~SF_BEFORE_EOL;
1de06328 616 DEBUG_STUDYDATA(data,0);
c277df42
IZ
617}
618
653099ff
GS
619/* Can match anything (initialization) */
620STATIC void
097eb12c 621S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 622{
653099ff 623 ANYOF_CLASS_ZERO(cl);
f8bef550 624 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 625 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
626 if (LOC)
627 cl->flags |= ANYOF_LOCALE;
628}
629
630/* Can match anything (initialization) */
631STATIC int
5f66b61c 632S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
633{
634 int value;
635
aaa51d5e 636 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
637 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
638 return 1;
1aa99e6b
IH
639 if (!(cl->flags & ANYOF_UNICODE_ALL))
640 return 0;
10edeb5d 641 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 642 return 0;
653099ff
GS
643 return 1;
644}
645
646/* Can match anything (initialization) */
647STATIC void
097eb12c 648S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 649{
8ecf7187 650 Zero(cl, 1, struct regnode_charclass_class);
653099ff 651 cl->type = ANYOF;
830247a4 652 cl_anything(pRExC_state, cl);
653099ff
GS
653}
654
655STATIC void
097eb12c 656S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 657{
8ecf7187 658 Zero(cl, 1, struct regnode_charclass_class);
653099ff 659 cl->type = ANYOF;
830247a4 660 cl_anything(pRExC_state, cl);
653099ff
GS
661 if (LOC)
662 cl->flags |= ANYOF_LOCALE;
663}
664
665/* 'And' a given class with another one. Can create false positives */
666/* We assume that cl is not inverted */
667STATIC void
5f66b61c 668S_cl_and(struct regnode_charclass_class *cl,
a28509cc 669 const struct regnode_charclass_class *and_with)
653099ff 670{
653099ff
GS
671 if (!(and_with->flags & ANYOF_CLASS)
672 && !(cl->flags & ANYOF_CLASS)
673 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
674 && !(and_with->flags & ANYOF_FOLD)
675 && !(cl->flags & ANYOF_FOLD)) {
676 int i;
677
678 if (and_with->flags & ANYOF_INVERT)
679 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
680 cl->bitmap[i] &= ~and_with->bitmap[i];
681 else
682 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
683 cl->bitmap[i] &= and_with->bitmap[i];
684 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
685 if (!(and_with->flags & ANYOF_EOS))
686 cl->flags &= ~ANYOF_EOS;
1aa99e6b 687
14ebb1a2
JH
688 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
689 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
690 cl->flags &= ~ANYOF_UNICODE_ALL;
691 cl->flags |= ANYOF_UNICODE;
692 ARG_SET(cl, ARG(and_with));
693 }
14ebb1a2
JH
694 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
695 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 696 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
697 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
698 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 699 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
700}
701
702/* 'OR' a given class with another one. Can create false positives */
703/* We assume that cl is not inverted */
704STATIC void
097eb12c 705S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 706{
653099ff
GS
707 if (or_with->flags & ANYOF_INVERT) {
708 /* We do not use
709 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
710 * <= (B1 | !B2) | (CL1 | !CL2)
711 * which is wasteful if CL2 is small, but we ignore CL2:
712 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
713 * XXXX Can we handle case-fold? Unclear:
714 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
715 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
716 */
717 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
718 && !(or_with->flags & ANYOF_FOLD)
719 && !(cl->flags & ANYOF_FOLD) ) {
720 int i;
721
722 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
723 cl->bitmap[i] |= ~or_with->bitmap[i];
724 } /* XXXX: logic is complicated otherwise */
725 else {
830247a4 726 cl_anything(pRExC_state, cl);
653099ff
GS
727 }
728 } else {
729 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
730 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 731 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
732 || (cl->flags & ANYOF_FOLD)) ) {
733 int i;
734
735 /* OR char bitmap and class bitmap separately */
736 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
737 cl->bitmap[i] |= or_with->bitmap[i];
738 if (or_with->flags & ANYOF_CLASS) {
739 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
740 cl->classflags[i] |= or_with->classflags[i];
741 cl->flags |= ANYOF_CLASS;
742 }
743 }
744 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 745 cl_anything(pRExC_state, cl);
653099ff
GS
746 }
747 }
748 if (or_with->flags & ANYOF_EOS)
749 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
750
751 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
752 ARG(cl) != ARG(or_with)) {
753 cl->flags |= ANYOF_UNICODE_ALL;
754 cl->flags &= ~ANYOF_UNICODE;
755 }
756 if (or_with->flags & ANYOF_UNICODE_ALL) {
757 cl->flags |= ANYOF_UNICODE_ALL;
758 cl->flags &= ~ANYOF_UNICODE;
759 }
653099ff
GS
760}
761
a3621e74
YO
762#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
763#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
764#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
765#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
766
3dab1dad
YO
767
768#ifdef DEBUGGING
07be1b83 769/*
3dab1dad
YO
770 dump_trie(trie)
771 dump_trie_interim_list(trie,next_alloc)
772 dump_trie_interim_table(trie,next_alloc)
773
774 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
775 The _interim_ variants are used for debugging the interim
776 tables that are used to generate the final compressed
777 representation which is what dump_trie expects.
778
3dab1dad
YO
779 Part of the reason for their existance is to provide a form
780 of documentation as to how the different representations function.
07be1b83
YO
781
782*/
3dab1dad
YO
783
784/*
785 dump_trie(trie)
786 Dumps the final compressed table form of the trie to Perl_debug_log.
787 Used for debugging make_trie().
788*/
789
790STATIC void
791S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
792{
793 U32 state;
ab3bbdeb
YO
794 SV *sv=sv_newmortal();
795 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
796 GET_RE_DEBUG_FLAGS_DECL;
797
ab3bbdeb 798
3dab1dad
YO
799 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
800 (int)depth * 2 + 2,"",
801 "Match","Base","Ofs" );
802
803 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 804 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 805 if ( tmp ) {
ab3bbdeb
YO
806 PerlIO_printf( Perl_debug_log, "%*s",
807 colwidth,
ddc5bc0f 808 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
809 PL_colors[0], PL_colors[1],
810 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
811 PERL_PV_ESCAPE_FIRSTCHAR
812 )
813 );
3dab1dad
YO
814 }
815 }
816 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
817 (int)depth * 2 + 2,"");
818
819 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 820 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "\n");
822
786e8c11 823 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 824 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
825
826 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
827
828 if ( trie->states[ state ].wordnum ) {
829 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
830 } else {
831 PerlIO_printf( Perl_debug_log, "%6s", "" );
832 }
833
834 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
835
836 if ( base ) {
837 U32 ofs = 0;
838
839 while( ( base + ofs < trie->uniquecharcount ) ||
840 ( base + ofs - trie->uniquecharcount < trie->lasttrans
841 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
842 ofs++;
843
844 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
845
846 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
847 if ( ( base + ofs >= trie->uniquecharcount ) &&
848 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
849 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
850 {
ab3bbdeb
YO
851 PerlIO_printf( Perl_debug_log, "%*"UVXf,
852 colwidth,
3dab1dad
YO
853 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
854 } else {
ab3bbdeb 855 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
856 }
857 }
858
859 PerlIO_printf( Perl_debug_log, "]");
860
861 }
862 PerlIO_printf( Perl_debug_log, "\n" );
863 }
864}
865/*
866 dump_trie_interim_list(trie,next_alloc)
867 Dumps a fully constructed but uncompressed trie in list form.
868 List tries normally only are used for construction when the number of
869 possible chars (trie->uniquecharcount) is very high.
870 Used for debugging make_trie().
871*/
872STATIC void
873S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
874{
875 U32 state;
ab3bbdeb
YO
876 SV *sv=sv_newmortal();
877 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
878 GET_RE_DEBUG_FLAGS_DECL;
879 /* print out the table precompression. */
ab3bbdeb
YO
880 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
881 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
882 "------:-----+-----------------\n" );
3dab1dad
YO
883
884 for( state=1 ; state < next_alloc ; state ++ ) {
885 U16 charid;
886
ab3bbdeb 887 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
888 (int)depth * 2 + 2,"", (UV)state );
889 if ( ! trie->states[ state ].wordnum ) {
890 PerlIO_printf( Perl_debug_log, "%5s| ","");
891 } else {
892 PerlIO_printf( Perl_debug_log, "W%4x| ",
893 trie->states[ state ].wordnum
894 );
895 }
896 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 897 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
898 if ( tmp ) {
899 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
900 colwidth,
ddc5bc0f 901 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
902 PL_colors[0], PL_colors[1],
903 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
904 PERL_PV_ESCAPE_FIRSTCHAR
905 ) ,
3dab1dad
YO
906 TRIE_LIST_ITEM(state,charid).forid,
907 (UV)TRIE_LIST_ITEM(state,charid).newstate
908 );
909 }
ab3bbdeb
YO
910 }
911 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
912 }
913}
914
915/*
916 dump_trie_interim_table(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in table form.
918 This is the normal DFA style state transition table, with a few
919 twists to facilitate compression later.
920 Used for debugging make_trie().
921*/
922STATIC void
923S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
924{
925 U32 state;
926 U16 charid;
ab3bbdeb
YO
927 SV *sv=sv_newmortal();
928 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
929 GET_RE_DEBUG_FLAGS_DECL;
930
931 /*
932 print out the table precompression so that we can do a visual check
933 that they are identical.
934 */
935
936 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
937
938 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 939 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 940 if ( tmp ) {
ab3bbdeb
YO
941 PerlIO_printf( Perl_debug_log, "%*s",
942 colwidth,
ddc5bc0f 943 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
944 PL_colors[0], PL_colors[1],
945 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
946 PERL_PV_ESCAPE_FIRSTCHAR
947 )
948 );
3dab1dad
YO
949 }
950 }
951
952 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
953
954 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 955 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
956 }
957
958 PerlIO_printf( Perl_debug_log, "\n" );
959
960 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
961
962 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
963 (int)depth * 2 + 2,"",
964 (UV)TRIE_NODENUM( state ) );
965
966 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
967 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
968 if (v)
969 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
970 else
971 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
972 }
973 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
974 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
975 } else {
976 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
977 trie->states[ TRIE_NODENUM( state ) ].wordnum );
978 }
979 }
07be1b83 980}
3dab1dad
YO
981
982#endif
983
786e8c11
YO
984/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
985 startbranch: the first branch in the whole branch sequence
986 first : start branch of sequence of branch-exact nodes.
987 May be the same as startbranch
988 last : Thing following the last branch.
989 May be the same as tail.
990 tail : item following the branch sequence
991 count : words in the sequence
992 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
993 depth : indent depth
3dab1dad 994
786e8c11 995Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 996
786e8c11
YO
997A trie is an N'ary tree where the branches are determined by digital
998decomposition of the key. IE, at the root node you look up the 1st character and
999follow that branch repeat until you find the end of the branches. Nodes can be
1000marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1001
786e8c11 1002 /he|she|his|hers/
72f13be8 1003
786e8c11
YO
1004would convert into the following structure. Numbers represent states, letters
1005following numbers represent valid transitions on the letter from that state, if
1006the number is in square brackets it represents an accepting state, otherwise it
1007will be in parenthesis.
07be1b83 1008
786e8c11
YO
1009 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1010 | |
1011 | (2)
1012 | |
1013 (1) +-i->(6)-+-s->[7]
1014 |
1015 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1016
786e8c11
YO
1017 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1018
1019This shows that when matching against the string 'hers' we will begin at state 1
1020read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1021then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1022is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1023single traverse. We store a mapping from accepting to state to which word was
1024matched, and then when we have multiple possibilities we try to complete the
1025rest of the regex in the order in which they occured in the alternation.
1026
1027The only prior NFA like behaviour that would be changed by the TRIE support is
1028the silent ignoring of duplicate alternations which are of the form:
1029
1030 / (DUPE|DUPE) X? (?{ ... }) Y /x
1031
1032Thus EVAL blocks follwing a trie may be called a different number of times with
1033and without the optimisation. With the optimisations dupes will be silently
1034ignored. This inconsistant behaviour of EVAL type nodes is well established as
1035the following demonstrates:
1036
1037 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1038
1039which prints out 'word' three times, but
1040
1041 'words'=~/(word|word|word)(?{ print $1 })S/
1042
1043which doesnt print it out at all. This is due to other optimisations kicking in.
1044
1045Example of what happens on a structural level:
1046
1047The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1048
1049 1: CURLYM[1] {1,32767}(18)
1050 5: BRANCH(8)
1051 6: EXACT <ac>(16)
1052 8: BRANCH(11)
1053 9: EXACT <ad>(16)
1054 11: BRANCH(14)
1055 12: EXACT <ab>(16)
1056 16: SUCCEED(0)
1057 17: NOTHING(18)
1058 18: END(0)
1059
1060This would be optimizable with startbranch=5, first=5, last=16, tail=16
1061and should turn into:
1062
1063 1: CURLYM[1] {1,32767}(18)
1064 5: TRIE(16)
1065 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1066 <ac>
1067 <ad>
1068 <ab>
1069 16: SUCCEED(0)
1070 17: NOTHING(18)
1071 18: END(0)
1072
1073Cases where tail != last would be like /(?foo|bar)baz/:
1074
1075 1: BRANCH(4)
1076 2: EXACT <foo>(8)
1077 4: BRANCH(7)
1078 5: EXACT <bar>(8)
1079 7: TAIL(8)
1080 8: EXACT <baz>(10)
1081 10: END(0)
1082
1083which would be optimizable with startbranch=1, first=1, last=7, tail=8
1084and would end up looking like:
1085
1086 1: TRIE(8)
1087 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1088 <foo>
1089 <bar>
1090 7: TAIL(8)
1091 8: EXACT <baz>(10)
1092 10: END(0)
1093
1094 d = uvuni_to_utf8_flags(d, uv, 0);
1095
1096is the recommended Unicode-aware way of saying
1097
1098 *(d++) = uv;
1099*/
1100
1101#define TRIE_STORE_REVCHAR \
1102 STMT_START { \
1103 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1104 if (UTF) SvUTF8_on(tmp); \
1105 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1106 } STMT_END
1107
1108#define TRIE_READ_CHAR STMT_START { \
1109 wordlen++; \
1110 if ( UTF ) { \
1111 if ( folder ) { \
1112 if ( foldlen > 0 ) { \
1113 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1114 foldlen -= len; \
1115 scan += len; \
1116 len = 0; \
1117 } else { \
1118 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1119 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1120 foldlen -= UNISKIP( uvc ); \
1121 scan = foldbuf + UNISKIP( uvc ); \
1122 } \
1123 } else { \
1124 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1125 } \
1126 } else { \
1127 uvc = (U32)*uc; \
1128 len = 1; \
1129 } \
1130} STMT_END
1131
1132
1133
1134#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1135 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1136 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1137 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1138 } \
1139 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1140 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1141 TRIE_LIST_CUR( state )++; \
1142} STMT_END
07be1b83 1143
786e8c11
YO
1144#define TRIE_LIST_NEW(state) STMT_START { \
1145 Newxz( trie->states[ state ].trans.list, \
1146 4, reg_trie_trans_le ); \
1147 TRIE_LIST_CUR( state ) = 1; \
1148 TRIE_LIST_LEN( state ) = 4; \
1149} STMT_END
07be1b83 1150
786e8c11
YO
1151#define TRIE_HANDLE_WORD(state) STMT_START { \
1152 U16 dupe= trie->states[ state ].wordnum; \
1153 regnode * const noper_next = regnext( noper ); \
1154 \
1155 if (trie->wordlen) \
1156 trie->wordlen[ curword ] = wordlen; \
1157 DEBUG_r({ \
1158 /* store the word for dumping */ \
1159 SV* tmp; \
1160 if (OP(noper) != NOTHING) \
1161 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1162 else \
1163 tmp = newSVpvn( "", 0 ); \
1164 if ( UTF ) SvUTF8_on( tmp ); \
1165 av_push( trie->words, tmp ); \
1166 }); \
1167 \
1168 curword++; \
1169 \
1170 if ( noper_next < tail ) { \
1171 if (!trie->jump) \
1172 Newxz( trie->jump, word_count + 1, U16); \
1173 trie->jump[curword] = (U16)(tail - noper_next); \
1174 if (!jumper) \
1175 jumper = noper_next; \
1176 if (!nextbranch) \
1177 nextbranch= regnext(cur); \
1178 } \
1179 \
1180 if ( dupe ) { \
1181 /* So it's a dupe. This means we need to maintain a */\
1182 /* linked-list from the first to the next. */\
1183 /* we only allocate the nextword buffer when there */\
1184 /* a dupe, so first time we have to do the allocation */\
1185 if (!trie->nextword) \
1186 Newxz( trie->nextword, word_count + 1, U16); \
1187 while ( trie->nextword[dupe] ) \
1188 dupe= trie->nextword[dupe]; \
1189 trie->nextword[dupe]= curword; \
1190 } else { \
1191 /* we haven't inserted this word yet. */ \
1192 trie->states[ state ].wordnum = curword; \
1193 } \
1194} STMT_END
07be1b83 1195
3dab1dad 1196
786e8c11
YO
1197#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1198 ( ( base + charid >= ucharcount \
1199 && base + charid < ubound \
1200 && state == trie->trans[ base - ucharcount + charid ].check \
1201 && trie->trans[ base - ucharcount + charid ].next ) \
1202 ? trie->trans[ base - ucharcount + charid ].next \
1203 : ( state==1 ? special : 0 ) \
1204 )
3dab1dad 1205
786e8c11
YO
1206#define MADE_TRIE 1
1207#define MADE_JUMP_TRIE 2
1208#define MADE_EXACT_TRIE 4
3dab1dad 1209
a3621e74 1210STATIC I32
786e8c11 1211S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1212{
27da23d5 1213 dVAR;
a3621e74
YO
1214 /* first pass, loop through and scan words */
1215 reg_trie_data *trie;
1216 regnode *cur;
9f7f3913 1217 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1218 STRLEN len = 0;
1219 UV uvc = 0;
1220 U16 curword = 0;
1221 U32 next_alloc = 0;
786e8c11
YO
1222 regnode *jumper = NULL;
1223 regnode *nextbranch = NULL;
a3621e74 1224 /* we just use folder as a flag in utf8 */
e1ec3a88 1225 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1226 ? PL_fold
1227 : ( flags == EXACTFL
1228 ? PL_fold_locale
1229 : NULL
1230 )
1231 );
1232
e1ec3a88 1233 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1234 SV *re_trie_maxbuff;
3dab1dad
YO
1235#ifndef DEBUGGING
1236 /* these are only used during construction but are useful during
8e11feef 1237 * debugging so we store them in the struct when debugging.
8e11feef 1238 */
3dab1dad 1239 STRLEN trie_charcount=0;
3dab1dad
YO
1240 AV *trie_revcharmap;
1241#endif
a3621e74 1242 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1243#ifndef DEBUGGING
1244 PERL_UNUSED_ARG(depth);
1245#endif
a3621e74 1246
a02a5408 1247 Newxz( trie, 1, reg_trie_data );
a3621e74 1248 trie->refcount = 1;
3dab1dad 1249 trie->startstate = 1;
786e8c11 1250 trie->wordcount = word_count;
a3621e74 1251 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1252 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1253 if (!(UTF && folder))
1254 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1255 DEBUG_r({
1256 trie->words = newAV();
a3621e74 1257 });
3dab1dad 1258 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1259
0111c4fd 1260 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1261 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1262 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1263 }
3dab1dad
YO
1264 DEBUG_OPTIMISE_r({
1265 PerlIO_printf( Perl_debug_log,
786e8c11 1266 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1267 (int)depth * 2 + 2, "",
1268 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1269 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1270 (int)depth);
3dab1dad 1271 });
a3621e74
YO
1272 /* -- First loop and Setup --
1273
1274 We first traverse the branches and scan each word to determine if it
1275 contains widechars, and how many unique chars there are, this is
1276 important as we have to build a table with at least as many columns as we
1277 have unique chars.
1278
1279 We use an array of integers to represent the character codes 0..255
1280 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1281 native representation of the character value as the key and IV's for the
1282 coded index.
1283
1284 *TODO* If we keep track of how many times each character is used we can
1285 remap the columns so that the table compression later on is more
1286 efficient in terms of memory by ensuring most common value is in the
1287 middle and the least common are on the outside. IMO this would be better
1288 than a most to least common mapping as theres a decent chance the most
1289 common letter will share a node with the least common, meaning the node
1290 will not be compressable. With a middle is most common approach the worst
1291 case is when we have the least common nodes twice.
1292
1293 */
1294
a3621e74 1295 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1296 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1297 const U8 *uc = (U8*)STRING( noper );
a28509cc 1298 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1299 STRLEN foldlen = 0;
1300 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1301 const U8 *scan = (U8*)NULL;
07be1b83 1302 U32 wordlen = 0; /* required init */
3dab1dad 1303 STRLEN chars=0;
a3621e74 1304
3dab1dad
YO
1305 if (OP(noper) == NOTHING) {
1306 trie->minlen= 0;
1307 continue;
1308 }
1309 if (trie->bitmap) {
1310 TRIE_BITMAP_SET(trie,*uc);
1311 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1312 }
a3621e74 1313 for ( ; uc < e ; uc += len ) {
3dab1dad 1314 TRIE_CHARCOUNT(trie)++;
a3621e74 1315 TRIE_READ_CHAR;
3dab1dad 1316 chars++;
a3621e74
YO
1317 if ( uvc < 256 ) {
1318 if ( !trie->charmap[ uvc ] ) {
1319 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1320 if ( folder )
1321 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1322 TRIE_STORE_REVCHAR;
a3621e74
YO
1323 }
1324 } else {
1325 SV** svpp;
1326 if ( !trie->widecharmap )
1327 trie->widecharmap = newHV();
1328
1329 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1330
1331 if ( !svpp )
e4584336 1332 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1333
1334 if ( !SvTRUE( *svpp ) ) {
1335 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1336 TRIE_STORE_REVCHAR;
a3621e74
YO
1337 }
1338 }
1339 }
3dab1dad
YO
1340 if( cur == first ) {
1341 trie->minlen=chars;
1342 trie->maxlen=chars;
1343 } else if (chars < trie->minlen) {
1344 trie->minlen=chars;
1345 } else if (chars > trie->maxlen) {
1346 trie->maxlen=chars;
1347 }
1348
a3621e74
YO
1349 } /* end first pass */
1350 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1351 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1352 (int)depth * 2 + 2,"",
85c3142d 1353 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1354 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1355 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1356 );
786e8c11 1357 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1358
1359 /*
1360 We now know what we are dealing with in terms of unique chars and
1361 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1362 representation using a flat table will take. If it's over a reasonable
1363 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1364 conservative but potentially much slower representation using an array
1365 of lists.
1366
1367 At the end we convert both representations into the same compressed
1368 form that will be used in regexec.c for matching with. The latter
1369 is a form that cannot be used to construct with but has memory
1370 properties similar to the list form and access properties similar
1371 to the table form making it both suitable for fast searches and
1372 small enough that its feasable to store for the duration of a program.
1373
1374 See the comment in the code where the compressed table is produced
1375 inplace from the flat tabe representation for an explanation of how
1376 the compression works.
1377
1378 */
1379
1380
3dab1dad 1381 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1382 /*
1383 Second Pass -- Array Of Lists Representation
1384
1385 Each state will be represented by a list of charid:state records
1386 (reg_trie_trans_le) the first such element holds the CUR and LEN
1387 points of the allocated array. (See defines above).
1388
1389 We build the initial structure using the lists, and then convert
1390 it into the compressed table form which allows faster lookups
1391 (but cant be modified once converted).
a3621e74
YO
1392 */
1393
a3621e74
YO
1394 STRLEN transcount = 1;
1395
3dab1dad 1396 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1397 TRIE_LIST_NEW(1);
1398 next_alloc = 2;
1399
1400 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1401
c445ea15
AL
1402 regnode * const noper = NEXTOPER( cur );
1403 U8 *uc = (U8*)STRING( noper );
1404 const U8 * const e = uc + STR_LEN( noper );
1405 U32 state = 1; /* required init */
1406 U16 charid = 0; /* sanity init */
1407 U8 *scan = (U8*)NULL; /* sanity init */
1408 STRLEN foldlen = 0; /* required init */
07be1b83 1409 U32 wordlen = 0; /* required init */
c445ea15
AL
1410 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1411
3dab1dad 1412 if (OP(noper) != NOTHING) {
786e8c11 1413 for ( ; uc < e ; uc += len ) {
c445ea15 1414
786e8c11 1415 TRIE_READ_CHAR;
c445ea15 1416
786e8c11
YO
1417 if ( uvc < 256 ) {
1418 charid = trie->charmap[ uvc ];
c445ea15 1419 } else {
786e8c11
YO
1420 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1421 if ( !svpp ) {
1422 charid = 0;
1423 } else {
1424 charid=(U16)SvIV( *svpp );
1425 }
c445ea15 1426 }
786e8c11
YO
1427 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1428 if ( charid ) {
a3621e74 1429
786e8c11
YO
1430 U16 check;
1431 U32 newstate = 0;
a3621e74 1432
786e8c11
YO
1433 charid--;
1434 if ( !trie->states[ state ].trans.list ) {
1435 TRIE_LIST_NEW( state );
c445ea15 1436 }
786e8c11
YO
1437 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1438 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1439 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1440 break;
1441 }
1442 }
1443 if ( ! newstate ) {
1444 newstate = next_alloc++;
1445 TRIE_LIST_PUSH( state, charid, newstate );
1446 transcount++;
1447 }
1448 state = newstate;
1449 } else {
1450 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1451 }
a28509cc 1452 }
c445ea15 1453 }
3dab1dad 1454 TRIE_HANDLE_WORD(state);
a3621e74
YO
1455
1456 } /* end second pass */
1457
786e8c11 1458 trie->laststate = next_alloc;
a3621e74
YO
1459 Renew( trie->states, next_alloc, reg_trie_state );
1460
3dab1dad
YO
1461 /* and now dump it out before we compress it */
1462 DEBUG_TRIE_COMPILE_MORE_r(
1463 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1464 );
a3621e74 1465
a02a5408 1466 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1467 {
1468 U32 state;
a3621e74
YO
1469 U32 tp = 0;
1470 U32 zp = 0;
1471
1472
1473 for( state=1 ; state < next_alloc ; state ++ ) {
1474 U32 base=0;
1475
1476 /*
1477 DEBUG_TRIE_COMPILE_MORE_r(
1478 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1479 );
1480 */
1481
1482 if (trie->states[state].trans.list) {
1483 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1484 U16 maxid=minid;
a28509cc 1485 U16 idx;
a3621e74
YO
1486
1487 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1488 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1489 if ( forid < minid ) {
1490 minid=forid;
1491 } else if ( forid > maxid ) {
1492 maxid=forid;
1493 }
a3621e74
YO
1494 }
1495 if ( transcount < tp + maxid - minid + 1) {
1496 transcount *= 2;
1497 Renew( trie->trans, transcount, reg_trie_trans );
1498 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1499 }
1500 base = trie->uniquecharcount + tp - minid;
1501 if ( maxid == minid ) {
1502 U32 set = 0;
1503 for ( ; zp < tp ; zp++ ) {
1504 if ( ! trie->trans[ zp ].next ) {
1505 base = trie->uniquecharcount + zp - minid;
1506 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1507 trie->trans[ zp ].check = state;
1508 set = 1;
1509 break;
1510 }
1511 }
1512 if ( !set ) {
1513 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1514 trie->trans[ tp ].check = state;
1515 tp++;
1516 zp = tp;
1517 }
1518 } else {
1519 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1520 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1521 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1522 trie->trans[ tid ].check = state;
1523 }
1524 tp += ( maxid - minid + 1 );
1525 }
1526 Safefree(trie->states[ state ].trans.list);
1527 }
1528 /*
1529 DEBUG_TRIE_COMPILE_MORE_r(
1530 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1531 );
1532 */
1533 trie->states[ state ].trans.base=base;
1534 }
cc601c31 1535 trie->lasttrans = tp + 1;
a3621e74
YO
1536 }
1537 } else {
1538 /*
1539 Second Pass -- Flat Table Representation.
1540
1541 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1542 We know that we will need Charcount+1 trans at most to store the data
1543 (one row per char at worst case) So we preallocate both structures
1544 assuming worst case.
1545
1546 We then construct the trie using only the .next slots of the entry
1547 structs.
1548
1549 We use the .check field of the first entry of the node temporarily to
1550 make compression both faster and easier by keeping track of how many non
1551 zero fields are in the node.
1552
1553 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1554 transition.
1555
1556 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1557 number representing the first entry of the node, and state as a
1558 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1559 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1560 are 2 entrys per node. eg:
1561
1562 A B A B
1563 1. 2 4 1. 3 7
1564 2. 0 3 3. 0 5
1565 3. 0 0 5. 0 0
1566 4. 0 0 7. 0 0
1567
1568 The table is internally in the right hand, idx form. However as we also
1569 have to deal with the states array which is indexed by nodenum we have to
1570 use TRIE_NODENUM() to convert.
1571
1572 */
1573
3dab1dad
YO
1574
1575 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1576 reg_trie_trans );
3dab1dad 1577 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1578 next_alloc = trie->uniquecharcount + 1;
1579
3dab1dad 1580
a3621e74
YO
1581 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1582
c445ea15 1583 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1584 const U8 *uc = (U8*)STRING( noper );
1585 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1586
1587 U32 state = 1; /* required init */
1588
1589 U16 charid = 0; /* sanity init */
1590 U32 accept_state = 0; /* sanity init */
1591 U8 *scan = (U8*)NULL; /* sanity init */
1592
1593 STRLEN foldlen = 0; /* required init */
07be1b83 1594 U32 wordlen = 0; /* required init */
a3621e74
YO
1595 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1596
3dab1dad 1597 if ( OP(noper) != NOTHING ) {
786e8c11 1598 for ( ; uc < e ; uc += len ) {
a3621e74 1599
786e8c11 1600 TRIE_READ_CHAR;
a3621e74 1601
786e8c11
YO
1602 if ( uvc < 256 ) {
1603 charid = trie->charmap[ uvc ];
1604 } else {
1605 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1606 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1607 }
786e8c11
YO
1608 if ( charid ) {
1609 charid--;
1610 if ( !trie->trans[ state + charid ].next ) {
1611 trie->trans[ state + charid ].next = next_alloc;
1612 trie->trans[ state ].check++;
1613 next_alloc += trie->uniquecharcount;
1614 }
1615 state = trie->trans[ state + charid ].next;
1616 } else {
1617 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1618 }
1619 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1620 }
a3621e74 1621 }
3dab1dad
YO
1622 accept_state = TRIE_NODENUM( state );
1623 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1624
1625 } /* end second pass */
1626
3dab1dad
YO
1627 /* and now dump it out before we compress it */
1628 DEBUG_TRIE_COMPILE_MORE_r(
1629 dump_trie_interim_table(trie,next_alloc,depth+1)
1630 );
a3621e74 1631
a3621e74
YO
1632 {
1633 /*
1634 * Inplace compress the table.*
1635
1636 For sparse data sets the table constructed by the trie algorithm will
1637 be mostly 0/FAIL transitions or to put it another way mostly empty.
1638 (Note that leaf nodes will not contain any transitions.)
1639
1640 This algorithm compresses the tables by eliminating most such
1641 transitions, at the cost of a modest bit of extra work during lookup:
1642
1643 - Each states[] entry contains a .base field which indicates the
1644 index in the state[] array wheres its transition data is stored.
1645
1646 - If .base is 0 there are no valid transitions from that node.
1647
1648 - If .base is nonzero then charid is added to it to find an entry in
1649 the trans array.
1650
1651 -If trans[states[state].base+charid].check!=state then the
1652 transition is taken to be a 0/Fail transition. Thus if there are fail
1653 transitions at the front of the node then the .base offset will point
1654 somewhere inside the previous nodes data (or maybe even into a node
1655 even earlier), but the .check field determines if the transition is
1656 valid.
1657
786e8c11 1658 XXX - wrong maybe?
a3621e74
YO
1659 The following process inplace converts the table to the compressed
1660 table: We first do not compress the root node 1,and mark its all its
1661 .check pointers as 1 and set its .base pointer as 1 as well. This
1662 allows to do a DFA construction from the compressed table later, and
1663 ensures that any .base pointers we calculate later are greater than
1664 0.
1665
1666 - We set 'pos' to indicate the first entry of the second node.
1667
1668 - We then iterate over the columns of the node, finding the first and
1669 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1670 and set the .check pointers accordingly, and advance pos
1671 appropriately and repreat for the next node. Note that when we copy
1672 the next pointers we have to convert them from the original
1673 NODEIDX form to NODENUM form as the former is not valid post
1674 compression.
1675
1676 - If a node has no transitions used we mark its base as 0 and do not
1677 advance the pos pointer.
1678
1679 - If a node only has one transition we use a second pointer into the
1680 structure to fill in allocated fail transitions from other states.
1681 This pointer is independent of the main pointer and scans forward
1682 looking for null transitions that are allocated to a state. When it
1683 finds one it writes the single transition into the "hole". If the
786e8c11 1684 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1685
1686 - Once compressed we can Renew/realloc the structures to release the
1687 excess space.
1688
1689 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1690 specifically Fig 3.47 and the associated pseudocode.
1691
1692 demq
1693 */
a3b680e6 1694 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1695 U32 state, charid;
a3621e74 1696 U32 pos = 0, zp=0;
786e8c11 1697 trie->laststate = laststate;
a3621e74
YO
1698
1699 for ( state = 1 ; state < laststate ; state++ ) {
1700 U8 flag = 0;
a28509cc
AL
1701 const U32 stateidx = TRIE_NODEIDX( state );
1702 const U32 o_used = trie->trans[ stateidx ].check;
1703 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1704 trie->trans[ stateidx ].check = 0;
1705
1706 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1707 if ( flag || trie->trans[ stateidx + charid ].next ) {
1708 if ( trie->trans[ stateidx + charid ].next ) {
1709 if (o_used == 1) {
1710 for ( ; zp < pos ; zp++ ) {
1711 if ( ! trie->trans[ zp ].next ) {
1712 break;
1713 }
1714 }
1715 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1716 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1717 trie->trans[ zp ].check = state;
1718 if ( ++zp > pos ) pos = zp;
1719 break;
1720 }
1721 used--;
1722 }
1723 if ( !flag ) {
1724 flag = 1;
1725 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1726 }
1727 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1728 trie->trans[ pos ].check = state;
1729 pos++;
1730 }
1731 }
1732 }
cc601c31 1733 trie->lasttrans = pos + 1;
a3621e74
YO
1734 Renew( trie->states, laststate + 1, reg_trie_state);
1735 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1736 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1737 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1738 (int)depth * 2 + 2,"",
1739 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1740 (IV)next_alloc,
1741 (IV)pos,
a3621e74
YO
1742 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1743 );
1744
1745 } /* end table compress */
1746 }
cc601c31
YO
1747 /* resize the trans array to remove unused space */
1748 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1749
3dab1dad
YO
1750 /* and now dump out the compressed format */
1751 DEBUG_TRIE_COMPILE_r(
1752 dump_trie(trie,depth+1)
1753 );
07be1b83 1754
3dab1dad
YO
1755 { /* Modify the program and insert the new TRIE node*/
1756 regnode *convert;
1757 U8 nodetype =(U8)(flags & 0xFF);
1758 char *str=NULL;
786e8c11 1759
07be1b83 1760#ifdef DEBUGGING
e62cc96a 1761 regnode *optimize = NULL;
b57a0404
JH
1762 U32 mjd_offset = 0;
1763 U32 mjd_nodelen = 0;
07be1b83 1764#endif
a3621e74 1765 /*
3dab1dad
YO
1766 This means we convert either the first branch or the first Exact,
1767 depending on whether the thing following (in 'last') is a branch
1768 or not and whther first is the startbranch (ie is it a sub part of
1769 the alternation or is it the whole thing.)
1770 Assuming its a sub part we conver the EXACT otherwise we convert
1771 the whole branch sequence, including the first.
a3621e74 1772 */
3dab1dad
YO
1773 /* Find the node we are going to overwrite */
1774 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1775 /* whole branch chain */
3dab1dad 1776 convert = first;
07be1b83
YO
1777 DEBUG_r({
1778 const regnode *nop = NEXTOPER( convert );
1779 mjd_offset= Node_Offset((nop));
1780 mjd_nodelen= Node_Length((nop));
1781 });
1782 } else {
1783 /* branch sub-chain */
3dab1dad
YO
1784 convert = NEXTOPER( first );
1785 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1786 DEBUG_r({
1787 mjd_offset= Node_Offset((convert));
1788 mjd_nodelen= Node_Length((convert));
1789 });
1790 }
1791 DEBUG_OPTIMISE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1793 (int)depth * 2 + 2, "",
786e8c11 1794 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1795 );
a3621e74 1796
3dab1dad
YO
1797 /* But first we check to see if there is a common prefix we can
1798 split out as an EXACT and put in front of the TRIE node. */
1799 trie->startstate= 1;
786e8c11 1800 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1801 U32 state;
1802 DEBUG_OPTIMISE_r(
8e11feef
RGS
1803 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1804 (int)depth * 2 + 2, "",
786e8c11 1805 (UV)trie->laststate)
8e11feef 1806 );
786e8c11 1807 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1808 U32 ofs = 0;
8e11feef
RGS
1809 I32 idx = -1;
1810 U32 count = 0;
1811 const U32 base = trie->states[ state ].trans.base;
a3621e74 1812
3dab1dad 1813 if ( trie->states[state].wordnum )
8e11feef 1814 count = 1;
a3621e74 1815
8e11feef 1816 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1817 if ( ( base + ofs >= trie->uniquecharcount ) &&
1818 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1819 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1820 {
3dab1dad 1821 if ( ++count > 1 ) {
8e11feef 1822 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1823 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1824 if ( state == 1 ) break;
3dab1dad
YO
1825 if ( count == 2 ) {
1826 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1827 DEBUG_OPTIMISE_r(
8e11feef
RGS
1828 PerlIO_printf(Perl_debug_log,
1829 "%*sNew Start State=%"UVuf" Class: [",
1830 (int)depth * 2 + 2, "",
786e8c11 1831 (UV)state));
be8e71aa
YO
1832 if (idx >= 0) {
1833 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1834 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1835
3dab1dad 1836 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1837 if ( folder )
1838 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1839 DEBUG_OPTIMISE_r(
07be1b83 1840 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1841 );
8e11feef
RGS
1842 }
1843 }
1844 TRIE_BITMAP_SET(trie,*ch);
1845 if ( folder )
1846 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1847 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1848 }
1849 idx = ofs;
1850 }
3dab1dad
YO
1851 }
1852 if ( count == 1 ) {
1853 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1854 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1855 DEBUG_OPTIMISE_r(
8e11feef
RGS
1856 PerlIO_printf( Perl_debug_log,
1857 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1858 (int)depth * 2 + 2, "",
786e8c11 1859 (UV)state, (UV)idx, ch)
3dab1dad
YO
1860 );
1861 if ( state==1 ) {
1862 OP( convert ) = nodetype;
1863 str=STRING(convert);
1864 STR_LEN(convert)=0;
1865 }
1866 *str++=*ch;
1867 STR_LEN(convert)++;
a3621e74 1868
8e11feef 1869 } else {
f9049ba1 1870#ifdef DEBUGGING
8e11feef
RGS
1871 if (state>1)
1872 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1873#endif
8e11feef
RGS
1874 break;
1875 }
1876 }
3dab1dad 1877 if (str) {
8e11feef 1878 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1879 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1880 trie->startstate = state;
07be1b83
YO
1881 trie->minlen -= (state - 1);
1882 trie->maxlen -= (state - 1);
1883 DEBUG_r({
1884 regnode *fix = convert;
1885 mjd_nodelen++;
1886 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1887 while( ++fix < n ) {
1888 Set_Node_Offset_Length(fix, 0, 0);
1889 }
1890 });
8e11feef
RGS
1891 if (trie->maxlen) {
1892 convert = n;
1893 } else {
3dab1dad 1894 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1895 DEBUG_r(optimize= n);
3dab1dad
YO
1896 }
1897 }
1898 }
a5ca303d
YO
1899 if (!jumper)
1900 jumper = last;
3dab1dad 1901 if ( trie->maxlen ) {
8e11feef
RGS
1902 NEXT_OFF( convert ) = (U16)(tail - convert);
1903 ARG_SET( convert, data_slot );
786e8c11
YO
1904 /* Store the offset to the first unabsorbed branch in
1905 jump[0], which is otherwise unused by the jump logic.
1906 We use this when dumping a trie and during optimisation. */
1907 if (trie->jump)
1908 trie->jump[0] = (U16)(tail - nextbranch);
a5ca303d 1909
786e8c11
YO
1910 /* XXXX */
1911 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1912 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1913 {
1914 OP( convert ) = TRIEC;
1915 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1916 Safefree(trie->bitmap);
1917 trie->bitmap= NULL;
1918 } else
1919 OP( convert ) = TRIE;
a3621e74 1920
3dab1dad
YO
1921 /* store the type in the flags */
1922 convert->flags = nodetype;
a5ca303d
YO
1923 DEBUG_r({
1924 optimize = convert
1925 + NODE_STEP_REGNODE
1926 + regarglen[ OP( convert ) ];
1927 });
1928 /* XXX We really should free up the resource in trie now,
1929 as we won't use them - (which resources?) dmq */
3dab1dad 1930 }
a3621e74 1931 /* needed for dumping*/
e62cc96a 1932 DEBUG_r(if (optimize) {
07be1b83 1933 regnode *opt = convert;
e62cc96a 1934 while ( ++opt < optimize) {
07be1b83
YO
1935 Set_Node_Offset_Length(opt,0,0);
1936 }
786e8c11
YO
1937 /*
1938 Try to clean up some of the debris left after the
1939 optimisation.
a3621e74 1940 */
786e8c11 1941 while( optimize < jumper ) {
07be1b83 1942 mjd_nodelen += Node_Length((optimize));
a3621e74 1943 OP( optimize ) = OPTIMIZED;
07be1b83 1944 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1945 optimize++;
1946 }
07be1b83 1947 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1948 });
1949 } /* end node insert */
07be1b83 1950#ifndef DEBUGGING
6e8b4190 1951 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1952#endif
786e8c11
YO
1953 return trie->jump
1954 ? MADE_JUMP_TRIE
1955 : trie->startstate>1
1956 ? MADE_EXACT_TRIE
1957 : MADE_TRIE;
1958}
1959
1960STATIC void
1961S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1962{
1963/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1964
1965 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1966 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1967 ISBN 0-201-10088-6
1968
1969 We find the fail state for each state in the trie, this state is the longest proper
1970 suffix of the current states 'word' that is also a proper prefix of another word in our
1971 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1972 the DFA not to have to restart after its tried and failed a word at a given point, it
1973 simply continues as though it had been matching the other word in the first place.
1974 Consider
1975 'abcdgu'=~/abcdefg|cdgu/
1976 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1977 fail, which would bring use to the state representing 'd' in the second word where we would
1978 try 'g' and succeed, prodceding to match 'cdgu'.
1979 */
1980 /* add a fail transition */
1981 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1982 U32 *q;
1983 const U32 ucharcount = trie->uniquecharcount;
1984 const U32 numstates = trie->laststate;
1985 const U32 ubound = trie->lasttrans + ucharcount;
1986 U32 q_read = 0;
1987 U32 q_write = 0;
1988 U32 charid;
1989 U32 base = trie->states[ 1 ].trans.base;
1990 U32 *fail;
1991 reg_ac_data *aho;
1992 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1993 GET_RE_DEBUG_FLAGS_DECL;
1994#ifndef DEBUGGING
1995 PERL_UNUSED_ARG(depth);
1996#endif
1997
1998
1999 ARG_SET( stclass, data_slot );
2000 Newxz( aho, 1, reg_ac_data );
2001 RExC_rx->data->data[ data_slot ] = (void*)aho;
2002 aho->trie=trie;
2003 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2004 (trie->laststate+1)*sizeof(reg_trie_state));
2005 Newxz( q, numstates, U32);
2006 Newxz( aho->fail, numstates, U32 );
2007 aho->refcount = 1;
2008 fail = aho->fail;
2009 /* initialize fail[0..1] to be 1 so that we always have
2010 a valid final fail state */
2011 fail[ 0 ] = fail[ 1 ] = 1;
2012
2013 for ( charid = 0; charid < ucharcount ; charid++ ) {
2014 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2015 if ( newstate ) {
2016 q[ q_write ] = newstate;
2017 /* set to point at the root */
2018 fail[ q[ q_write++ ] ]=1;
2019 }
2020 }
2021 while ( q_read < q_write) {
2022 const U32 cur = q[ q_read++ % numstates ];
2023 base = trie->states[ cur ].trans.base;
2024
2025 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2026 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2027 if (ch_state) {
2028 U32 fail_state = cur;
2029 U32 fail_base;
2030 do {
2031 fail_state = fail[ fail_state ];
2032 fail_base = aho->states[ fail_state ].trans.base;
2033 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2034
2035 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2036 fail[ ch_state ] = fail_state;
2037 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2038 {
2039 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2040 }
2041 q[ q_write++ % numstates] = ch_state;
2042 }
2043 }
2044 }
2045 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2046 when we fail in state 1, this allows us to use the
2047 charclass scan to find a valid start char. This is based on the principle
2048 that theres a good chance the string being searched contains lots of stuff
2049 that cant be a start char.
2050 */
2051 fail[ 0 ] = fail[ 1 ] = 0;
2052 DEBUG_TRIE_COMPILE_r({
2053 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2054 for( q_read=1; q_read<numstates; q_read++ ) {
2055 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2056 }
2057 PerlIO_printf(Perl_debug_log, "\n");
2058 });
2059 Safefree(q);
2060 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2061}
2062
786e8c11 2063
a3621e74 2064/*
5d1c421c
JH
2065 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2066 * These need to be revisited when a newer toolchain becomes available.
2067 */
2068#if defined(__sparc64__) && defined(__GNUC__)
2069# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2070# undef SPARC64_GCC_WORKAROUND
2071# define SPARC64_GCC_WORKAROUND 1
2072# endif
2073#endif
2074
07be1b83
YO
2075#define DEBUG_PEEP(str,scan,depth) \
2076 DEBUG_OPTIMISE_r({ \
2077 SV * const mysv=sv_newmortal(); \
2078 regnode *Next = regnext(scan); \
2079 regprop(RExC_rx, mysv, scan); \
2080 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2081 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2082 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2083 });
2084
1de06328
YO
2085
2086
2087
2088
07be1b83
YO
2089#define JOIN_EXACT(scan,min,flags) \
2090 if (PL_regkind[OP(scan)] == EXACT) \
2091 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2092
be8e71aa 2093STATIC U32
07be1b83
YO
2094S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2095 /* Merge several consecutive EXACTish nodes into one. */
2096 regnode *n = regnext(scan);
2097 U32 stringok = 1;
2098 regnode *next = scan + NODE_SZ_STR(scan);
2099 U32 merged = 0;
2100 U32 stopnow = 0;
2101#ifdef DEBUGGING
2102 regnode *stop = scan;
72f13be8 2103 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2104#else
d47053eb
RGS
2105 PERL_UNUSED_ARG(depth);
2106#endif
2107#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2108 PERL_UNUSED_ARG(flags);
2109 PERL_UNUSED_ARG(val);
07be1b83 2110#endif
07be1b83
YO
2111 DEBUG_PEEP("join",scan,depth);
2112
2113 /* Skip NOTHING, merge EXACT*. */
2114 while (n &&
2115 ( PL_regkind[OP(n)] == NOTHING ||
2116 (stringok && (OP(n) == OP(scan))))
2117 && NEXT_OFF(n)
2118 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2119
2120 if (OP(n) == TAIL || n > next)
2121 stringok = 0;
2122 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2123 DEBUG_PEEP("skip:",n,depth);
2124 NEXT_OFF(scan) += NEXT_OFF(n);
2125 next = n + NODE_STEP_REGNODE;
2126#ifdef DEBUGGING
2127 if (stringok)
2128 stop = n;
2129#endif
2130 n = regnext(n);
2131 }
2132 else if (stringok) {
786e8c11 2133 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2134 regnode * const nnext = regnext(n);
2135
2136 DEBUG_PEEP("merg",n,depth);
2137
2138 merged++;
2139 if (oldl + STR_LEN(n) > U8_MAX)
2140 break;
2141 NEXT_OFF(scan) += NEXT_OFF(n);
2142 STR_LEN(scan) += STR_LEN(n);
2143 next = n + NODE_SZ_STR(n);
2144 /* Now we can overwrite *n : */
2145 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2146#ifdef DEBUGGING
2147 stop = next - 1;
2148#endif
2149 n = nnext;
2150 if (stopnow) break;
2151 }
2152
d47053eb
RGS
2153#ifdef EXPERIMENTAL_INPLACESCAN
2154 if (flags && !NEXT_OFF(n)) {
2155 DEBUG_PEEP("atch", val, depth);
2156 if (reg_off_by_arg[OP(n)]) {
2157 ARG_SET(n, val - n);
2158 }
2159 else {
2160 NEXT_OFF(n) = val - n;
2161 }
2162 stopnow = 1;
2163 }
07be1b83
YO
2164#endif
2165 }
2166
2167 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2168 /*
2169 Two problematic code points in Unicode casefolding of EXACT nodes:
2170
2171 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2172 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2173
2174 which casefold to
2175
2176 Unicode UTF-8
2177
2178 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2179 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2180
2181 This means that in case-insensitive matching (or "loose matching",
2182 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2183 length of the above casefolded versions) can match a target string
2184 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2185 This would rather mess up the minimum length computation.
2186
2187 What we'll do is to look for the tail four bytes, and then peek
2188 at the preceding two bytes to see whether we need to decrease
2189 the minimum length by four (six minus two).
2190
2191 Thanks to the design of UTF-8, there cannot be false matches:
2192 A sequence of valid UTF-8 bytes cannot be a subsequence of
2193 another valid sequence of UTF-8 bytes.
2194
2195 */
2196 char * const s0 = STRING(scan), *s, *t;
2197 char * const s1 = s0 + STR_LEN(scan) - 1;
2198 char * const s2 = s1 - 4;
e294cc5d
JH
2199#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2200 const char t0[] = "\xaf\x49\xaf\x42";
2201#else
07be1b83 2202 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2203#endif
07be1b83
YO
2204 const char * const t1 = t0 + 3;
2205
2206 for (s = s0 + 2;
2207 s < s2 && (t = ninstr(s, s1, t0, t1));
2208 s = t + 4) {
e294cc5d
JH
2209#ifdef EBCDIC
2210 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2211 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2212#else
07be1b83
YO
2213 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2214 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2215#endif
07be1b83
YO
2216 *min -= 4;
2217 }
2218 }
2219
2220#ifdef DEBUGGING
2221 /* Allow dumping */
2222 n = scan + NODE_SZ_STR(scan);
2223 while (n <= stop) {
2224 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2225 OP(n) = OPTIMIZED;
2226 NEXT_OFF(n) = 0;
2227 }
2228 n++;
2229 }
2230#endif
2231 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2232 return stopnow;
2233}
2234
653099ff
GS
2235/* REx optimizer. Converts nodes into quickier variants "in place".
2236 Finds fixed substrings. */
2237
a0288114 2238/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2239 to the position after last scanned or to NULL. */
2240
07be1b83
YO
2241
2242
76e3520e 2243STATIC I32
1de06328
YO
2244S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2245 I32 *minlenp, I32 *deltap,
9a957fbc 2246 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2247 /* scanp: Start here (read-write). */
2248 /* deltap: Write maxlen-minlen here. */
2249 /* last: Stop before this one. */
2250{
97aff369 2251 dVAR;
c277df42
IZ
2252 I32 min = 0, pars = 0, code;
2253 regnode *scan = *scanp, *next;
2254 I32 delta = 0;
2255 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2256 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2257 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2258 scan_data_t data_fake;
653099ff 2259 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2260 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2261 regnode *first_non_open = scan;
2262
a3621e74
YO
2263
2264 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2265#ifdef DEBUGGING
2266 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2267#endif
786e8c11
YO
2268 if ( depth == 0 ) {
2269 while (first_non_open && OP(first_non_open) == OPEN)
2270 first_non_open=regnext(first_non_open);
2271 }
2272
b81d288d 2273
c277df42
IZ
2274 while (scan && OP(scan) != END && scan < last) {
2275 /* Peephole optimizer: */
1de06328 2276 DEBUG_STUDYDATA(data,depth);
07be1b83 2277 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2278 JOIN_EXACT(scan,&min,0);
a3621e74 2279
653099ff
GS
2280 /* Follow the next-chain of the current node and optimize
2281 away all the NOTHINGs from it. */
c277df42 2282 if (OP(scan) != CURLYX) {
a3b680e6 2283 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2284 ? I32_MAX
2285 /* I32 may be smaller than U16 on CRAYs! */
2286 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2287 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2288 int noff;
2289 regnode *n = scan;
b81d288d 2290
c277df42
IZ
2291 /* Skip NOTHING and LONGJMP. */
2292 while ((n = regnext(n))
3dab1dad 2293 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2294 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2295 && off + noff < max)
2296 off += noff;
2297 if (reg_off_by_arg[OP(scan)])
2298 ARG(scan) = off;
b81d288d 2299 else
c277df42
IZ
2300 NEXT_OFF(scan) = off;
2301 }
a3621e74 2302
07be1b83 2303
3dab1dad 2304
653099ff
GS
2305 /* The principal pseudo-switch. Cannot be a switch, since we
2306 look into several different things. */
b81d288d 2307 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2308 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2309 next = regnext(scan);
2310 code = OP(scan);
a3621e74 2311 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2312
2313 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2314 /* NOTE - There is similar code to this block below for handling
2315 TRIE nodes on a re-study. If you change stuff here check there
2316 too. */
c277df42 2317 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2318 struct regnode_charclass_class accum;
d4c19fe8 2319 regnode * const startbranch=scan;
c277df42 2320
653099ff 2321 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2322 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2323 if (flags & SCF_DO_STCLASS)
830247a4 2324 cl_init_zero(pRExC_state, &accum);
a3621e74 2325
c277df42 2326 while (OP(scan) == code) {
830247a4 2327 I32 deltanext, minnext, f = 0, fake;
653099ff 2328 struct regnode_charclass_class this_class;
c277df42
IZ
2329
2330 num++;
2331 data_fake.flags = 0;
b81d288d 2332 if (data) {
2c2d71f5 2333 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2334 data_fake.last_closep = data->last_closep;
2335 }
2336 else
2337 data_fake.last_closep = &fake;
c277df42
IZ
2338 next = regnext(scan);
2339 scan = NEXTOPER(scan);
2340 if (code != BRANCH)
2341 scan = NEXTOPER(scan);
653099ff 2342 if (flags & SCF_DO_STCLASS) {
830247a4 2343 cl_init(pRExC_state, &this_class);
653099ff
GS
2344 data_fake.start_class = &this_class;
2345 f = SCF_DO_STCLASS_AND;
b81d288d 2346 }
e1901655
IZ
2347 if (flags & SCF_WHILEM_VISITED_POS)
2348 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2349
653099ff 2350 /* we suppose the run is continuous, last=next...*/
1de06328 2351 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2352 next, &data_fake, f,depth+1);
b81d288d 2353 if (min1 > minnext)
c277df42
IZ
2354 min1 = minnext;
2355 if (max1 < minnext + deltanext)
2356 max1 = minnext + deltanext;
2357 if (deltanext == I32_MAX)
aca2d497 2358 is_inf = is_inf_internal = 1;
c277df42
IZ
2359 scan = next;
2360 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2361 pars++;
3dab1dad
YO
2362 if (data) {
2363 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2364 data->flags |= SF_HAS_EVAL;
2c2d71f5 2365 data->whilem_c = data_fake.whilem_c;
3dab1dad 2366 }
653099ff 2367 if (flags & SCF_DO_STCLASS)
830247a4 2368 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2369 if (code == SUSPEND)
c277df42
IZ
2370 break;
2371 }
2372 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2373 min1 = 0;
2374 if (flags & SCF_DO_SUBSTR) {
2375 data->pos_min += min1;
2376 data->pos_delta += max1 - min1;
2377 if (max1 != min1 || is_inf)
2378 data->longest = &(data->longest_float);
2379 }
2380 min += min1;
2381 delta += max1 - min1;
653099ff 2382 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2383 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2384 if (min1) {
2385 cl_and(data->start_class, &and_with);
2386 flags &= ~SCF_DO_STCLASS;
2387 }
2388 }
2389 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2390 if (min1) {
2391 cl_and(data->start_class, &accum);
653099ff 2392 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2393 }
2394 else {
b81d288d 2395 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2396 * data->start_class */
2397 StructCopy(data->start_class, &and_with,
2398 struct regnode_charclass_class);
2399 flags &= ~SCF_DO_STCLASS_AND;
2400 StructCopy(&accum, data->start_class,
2401 struct regnode_charclass_class);
2402 flags |= SCF_DO_STCLASS_OR;
2403 data->start_class->flags |= ANYOF_EOS;
2404 }
653099ff 2405 }
a3621e74 2406
786e8c11 2407 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2408 /* demq.
2409
2410 Assuming this was/is a branch we are dealing with: 'scan' now
2411 points at the item that follows the branch sequence, whatever
2412 it is. We now start at the beginning of the sequence and look
2413 for subsequences of
2414
786e8c11
YO
2415 BRANCH->EXACT=>x1
2416 BRANCH->EXACT=>x2
2417 tail
a3621e74
YO
2418
2419 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2420
2421 If we can find such a subseqence we need to turn the first
2422 element into a trie and then add the subsequent branch exact
2423 strings to the trie.
2424
2425 We have two cases
2426
786e8c11 2427 1. patterns where the whole set of branch can be converted.
a3621e74 2428
786e8c11 2429 2. patterns where only a subset can be converted.
a3621e74
YO
2430
2431 In case 1 we can replace the whole set with a single regop
2432 for the trie. In case 2 we need to keep the start and end
2433 branchs so
2434
2435 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2436 becomes BRANCH TRIE; BRANCH X;
2437
786e8c11
YO
2438 There is an additional case, that being where there is a
2439 common prefix, which gets split out into an EXACT like node
2440 preceding the TRIE node.
2441
2442 If x(1..n)==tail then we can do a simple trie, if not we make
2443 a "jump" trie, such that when we match the appropriate word
2444 we "jump" to the appopriate tail node. Essentailly we turn
2445 a nested if into a case structure of sorts.
a3621e74
YO
2446
2447 */
786e8c11 2448
3dab1dad 2449 int made=0;
0111c4fd
RGS
2450 if (!re_trie_maxbuff) {
2451 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2452 if (!SvIOK(re_trie_maxbuff))
2453 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2454 }
786e8c11 2455 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2456 regnode *cur;
2457 regnode *first = (regnode *)NULL;
2458 regnode *last = (regnode *)NULL;
2459 regnode *tail = scan;
2460 U8 optype = 0;
2461 U32 count=0;
2462
2463#ifdef DEBUGGING
c445ea15 2464 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2465#endif
2466 /* var tail is used because there may be a TAIL
2467 regop in the way. Ie, the exacts will point to the
2468 thing following the TAIL, but the last branch will
2469 point at the TAIL. So we advance tail. If we
2470 have nested (?:) we may have to move through several
2471 tails.
2472 */
2473
2474 while ( OP( tail ) == TAIL ) {
2475 /* this is the TAIL generated by (?:) */
2476 tail = regnext( tail );
2477 }
2478
3dab1dad 2479
a3621e74 2480 DEBUG_OPTIMISE_r({
32fc9b6a 2481 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2482 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2483 (int)depth * 2 + 2, "",
2484 "Looking for TRIE'able sequences. Tail node is: ",
2485 SvPV_nolen_const( mysv )
a3621e74
YO
2486 );
2487 });
3dab1dad 2488
a3621e74
YO
2489 /*
2490
2491 step through the branches, cur represents each
2492 branch, noper is the first thing to be matched
2493 as part of that branch and noper_next is the
2494 regnext() of that node. if noper is an EXACT
2495 and noper_next is the same as scan (our current
2496 position in the regex) then the EXACT branch is
2497 a possible optimization target. Once we have
2498 two or more consequetive such branches we can
2499 create a trie of the EXACT's contents and stich
2500 it in place. If the sequence represents all of
2501 the branches we eliminate the whole thing and
2502 replace it with a single TRIE. If it is a
2503 subsequence then we need to stitch it in. This
2504 means the first branch has to remain, and needs
2505 to be repointed at the item on the branch chain
2506 following the last branch optimized. This could
2507 be either a BRANCH, in which case the
2508 subsequence is internal, or it could be the
2509 item following the branch sequence in which
2510 case the subsequence is at the end.
2511
2512 */
2513
2514 /* dont use tail as the end marker for this traverse */
2515 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2516 regnode * const noper = NEXTOPER( cur );
be981c67 2517#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2518 regnode * const noper_next = regnext( noper );
be981c67 2519#endif
a3621e74 2520
a3621e74 2521 DEBUG_OPTIMISE_r({
32fc9b6a 2522 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2523 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2524 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2525
32fc9b6a 2526 regprop(RExC_rx, mysv, noper);
a3621e74 2527 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2528 SvPV_nolen_const(mysv));
a3621e74
YO
2529
2530 if ( noper_next ) {
32fc9b6a 2531 regprop(RExC_rx, mysv, noper_next );
a3621e74 2532 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2533 SvPV_nolen_const(mysv));
a3621e74 2534 }
3dab1dad
YO
2535 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2536 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2537 });
3dab1dad
YO
2538 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2539 : PL_regkind[ OP( noper ) ] == EXACT )
2540 || OP(noper) == NOTHING )
786e8c11
YO
2541#ifdef NOJUMPTRIE
2542 && noper_next == tail
2543#endif
2544 && count < U16_MAX)
a3621e74
YO
2545 {
2546 count++;
3dab1dad
YO
2547 if ( !first || optype == NOTHING ) {
2548 if (!first) first = cur;
a3621e74
YO
2549 optype = OP( noper );
2550 } else {
a3621e74 2551 last = cur;
a3621e74
YO
2552 }
2553 } else {
2554 if ( last ) {
786e8c11
YO
2555 make_trie( pRExC_state,
2556 startbranch, first, cur, tail, count,
2557 optype, depth+1 );
a3621e74 2558 }
3dab1dad 2559 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2560#ifdef NOJUMPTRIE
2561 && noper_next == tail
2562#endif
2563 ){
a3621e74
YO
2564 count = 1;
2565 first = cur;
2566 optype = OP( noper );
2567 } else {
2568 count = 0;
2569 first = NULL;
2570 optype = 0;
2571 }
2572 last = NULL;
2573 }
2574 }
2575 DEBUG_OPTIMISE_r({
32fc9b6a 2576 regprop(RExC_rx, mysv, cur);
a3621e74 2577 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2578 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2579 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2580
2581 });
2582 if ( last ) {
786e8c11 2583 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2584#ifdef TRIE_STUDY_OPT
786e8c11
YO
2585 if ( ((made == MADE_EXACT_TRIE &&
2586 startbranch == first)
2587 || ( first_non_open == first )) &&
2588 depth==0 )
2589 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2590#endif
07be1b83 2591 }
a3621e74 2592 }
3dab1dad
YO
2593
2594 } /* do trie */
786e8c11 2595
a0ed51b3 2596 }
a3621e74 2597 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2598 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2599 } else /* single branch is optimized. */
c277df42
IZ
2600 scan = NEXTOPER(scan);
2601 continue;
a0ed51b3
LW
2602 }
2603 else if (OP(scan) == EXACT) {
cd439c50 2604 I32 l = STR_LEN(scan);
c445ea15 2605 UV uc;
a0ed51b3 2606 if (UTF) {
a3b680e6 2607 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2608 l = utf8_length(s, s + l);
9041c2e3 2609 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2610 } else {
2611 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2612 }
2613 min += l;
c277df42 2614 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2615 /* The code below prefers earlier match for fixed
2616 offset, later match for variable offset. */
2617 if (data->last_end == -1) { /* Update the start info. */
2618 data->last_start_min = data->pos_min;
2619 data->last_start_max = is_inf
b81d288d 2620 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2621 }
cd439c50 2622 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2623 if (UTF)
2624 SvUTF8_on(data->last_found);
0eda9292 2625 {
9a957fbc 2626 SV * const sv = data->last_found;
a28509cc 2627 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2628 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2629 if (mg && mg->mg_len >= 0)
5e43f467
JH
2630 mg->mg_len += utf8_length((U8*)STRING(scan),
2631 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2632 }
c277df42
IZ
2633 data->last_end = data->pos_min + l;
2634 data->pos_min += l; /* As in the first entry. */
2635 data->flags &= ~SF_BEFORE_EOL;
2636 }
653099ff
GS
2637 if (flags & SCF_DO_STCLASS_AND) {
2638 /* Check whether it is compatible with what we know already! */
2639 int compat = 1;
2640
1aa99e6b 2641 if (uc >= 0x100 ||
516a5887 2642 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2643 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2644 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2645 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2646 )
653099ff
GS
2647 compat = 0;
2648 ANYOF_CLASS_ZERO(data->start_class);
2649 ANYOF_BITMAP_ZERO(data->start_class);
2650 if (compat)
1aa99e6b 2651 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2652 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2653 if (uc < 0x100)
2654 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2655 }
2656 else if (flags & SCF_DO_STCLASS_OR) {
2657 /* false positive possible if the class is case-folded */
1aa99e6b 2658 if (uc < 0x100)
9b877dbb
IH
2659 ANYOF_BITMAP_SET(data->start_class, uc);
2660 else
2661 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2662 data->start_class->flags &= ~ANYOF_EOS;
2663 cl_and(data->start_class, &and_with);
2664 }
2665 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2666 }
3dab1dad 2667 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2668 I32 l = STR_LEN(scan);
1aa99e6b 2669 UV uc = *((U8*)STRING(scan));
653099ff
GS
2670
2671 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2672 if (flags & SCF_DO_SUBSTR) {
2673 assert(data);
1de06328 2674 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2675 }
a0ed51b3 2676 if (UTF) {
6136c704 2677 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2678 l = utf8_length(s, s + l);
9041c2e3 2679 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2680 }
2681 min += l;
ecaa9b9c 2682 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2683 data->pos_min += l;
653099ff
GS
2684 if (flags & SCF_DO_STCLASS_AND) {
2685 /* Check whether it is compatible with what we know already! */
2686 int compat = 1;
2687
1aa99e6b 2688 if (uc >= 0x100 ||
516a5887 2689 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2690 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2691 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2692 compat = 0;
2693 ANYOF_CLASS_ZERO(data->start_class);
2694 ANYOF_BITMAP_ZERO(data->start_class);
2695 if (compat) {
1aa99e6b 2696 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2697 data->start_class->flags &= ~ANYOF_EOS;
2698 data->start_class->flags |= ANYOF_FOLD;
2699 if (OP(scan) == EXACTFL)
2700 data->start_class->flags |= ANYOF_LOCALE;
2701 }
2702 }
2703 else if (flags & SCF_DO_STCLASS_OR) {
2704 if (data->start_class->flags & ANYOF_FOLD) {
2705 /* false positive possible if the class is case-folded.
2706 Assume that the locale settings are the same... */
1aa99e6b
IH
2707 if (uc < 0x100)
2708 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2709 data->start_class->flags &= ~ANYOF_EOS;
2710 }
2711 cl_and(data->start_class, &and_with);
2712 }
2713 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2714 }
bfed75c6 2715 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2716 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2717 I32 f = flags, pos_before = 0;
d4c19fe8 2718 regnode * const oscan = scan;
653099ff
GS
2719 struct regnode_charclass_class this_class;
2720 struct regnode_charclass_class *oclass = NULL;
727f22e3 2721 I32 next_is_eval = 0;
653099ff 2722
3dab1dad 2723 switch (PL_regkind[OP(scan)]) {
653099ff 2724 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2725 scan = NEXTOPER(scan);
2726 goto finish;
2727 case PLUS:
653099ff 2728 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2729 next = NEXTOPER(scan);
653099ff 2730 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2731 mincount = 1;
2732 maxcount = REG_INFTY;
c277df42
IZ
2733 next = regnext(scan);
2734 scan = NEXTOPER(scan);
2735 goto do_curly;
2736 }
2737 }
2738 if (flags & SCF_DO_SUBSTR)
2739 data->pos_min++;
2740 min++;
2741 /* Fall through. */
2742 case STAR:
653099ff
GS
2743 if (flags & SCF_DO_STCLASS) {
2744 mincount = 0;
b81d288d 2745 maxcount = REG_INFTY;
653099ff
GS
2746 next = regnext(scan);
2747 scan = NEXTOPER(scan);
2748 goto do_curly;
2749 }
b81d288d 2750 is_inf = is_inf_internal = 1;
c277df42
IZ
2751 scan = regnext(scan);
2752 if (flags & SCF_DO_SUBSTR) {
1de06328 2753 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2754 data->longest = &(data->longest_float);
2755 }
2756 goto optimize_curly_tail;
2757 case CURLY:
b81d288d 2758 mincount = ARG1(scan);
c277df42
IZ
2759 maxcount = ARG2(scan);
2760 next = regnext(scan);
cb434fcc
IZ
2761 if (OP(scan) == CURLYX) {
2762 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2763 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2764 }
c277df42 2765 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2766 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2767 do_curly:
2768 if (flags & SCF_DO_SUBSTR) {
1de06328 2769 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2770 pos_before = data->pos_min;
2771 }
2772 if (data) {
2773 fl = data->flags;
2774 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2775 if (is_inf)
2776 data->flags |= SF_IS_INF;
2777 }
653099ff 2778 if (flags & SCF_DO_STCLASS) {
830247a4 2779 cl_init(pRExC_state, &this_class);
653099ff
GS
2780 oclass = data->start_class;
2781 data->start_class = &this_class;
2782 f |= SCF_DO_STCLASS_AND;
2783 f &= ~SCF_DO_STCLASS_OR;
2784 }
e1901655
IZ
2785 /* These are the cases when once a subexpression
2786 fails at a particular position, it cannot succeed
2787 even after backtracking at the enclosing scope.
b81d288d 2788
e1901655
IZ
2789 XXXX what if minimal match and we are at the
2790 initial run of {n,m}? */
2791 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2792 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2793
c277df42 2794 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2795 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2796 (mincount == 0
2797 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2798
2799 if (flags & SCF_DO_STCLASS)
2800 data->start_class = oclass;
2801 if (mincount == 0 || minnext == 0) {
2802 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2803 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2804 }
2805 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2806 /* Switch to OR mode: cache the old value of
653099ff
GS
2807 * data->start_class */
2808 StructCopy(data->start_class, &and_with,
2809 struct regnode_charclass_class);
2810 flags &= ~SCF_DO_STCLASS_AND;
2811 StructCopy(&this_class, data->start_class,
2812 struct regnode_charclass_class);
2813 flags |= SCF_DO_STCLASS_OR;
2814 data->start_class->flags |= ANYOF_EOS;
2815 }
2816 } else { /* Non-zero len */
2817 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2818 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2819 cl_and(data->start_class, &and_with);
2820 }
2821 else if (flags & SCF_DO_STCLASS_AND)
2822 cl_and(data->start_class, &this_class);
2823 flags &= ~SCF_DO_STCLASS;
2824 }
c277df42
IZ
2825 if (!scan) /* It was not CURLYX, but CURLY. */
2826 scan = next;
041457d9
DM
2827 if ( /* ? quantifier ok, except for (?{ ... }) */
2828 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2829 && (minnext == 0) && (deltanext == 0)
99799961 2830 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2831 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2832 && ckWARN(WARN_REGEXP))
b45f050a 2833 {
830247a4 2834 vWARN(RExC_parse,
b45f050a
JF
2835 "Quantifier unexpected on zero-length expression");
2836 }
2837
c277df42 2838 min += minnext * mincount;
b81d288d 2839 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2840 && (minnext + deltanext) > 0)
2841 || deltanext == I32_MAX);
aca2d497 2842 is_inf |= is_inf_internal;
c277df42
IZ
2843 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2844
2845 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2846 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2847 && data->flags & SF_IN_PAR
2848 && !(data->flags & SF_HAS_EVAL)
2849 && !deltanext && minnext == 1 ) {
2850 /* Try to optimize to CURLYN. */
2851 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2852 regnode * const nxt1 = nxt;
497b47a8
JH
2853#ifdef DEBUGGING
2854 regnode *nxt2;
2855#endif
c277df42
IZ
2856
2857 /* Skip open. */
2858 nxt = regnext(nxt);
bfed75c6 2859 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2860 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2861 && STR_LEN(nxt) == 1))
c277df42 2862 goto nogo;
497b47a8 2863#ifdef DEBUGGING
c277df42 2864 nxt2 = nxt;
497b47a8 2865#endif
c277df42 2866 nxt = regnext(nxt);
b81d288d 2867 if (OP(nxt) != CLOSE)
c277df42
IZ
2868 goto nogo;
2869 /* Now we know that nxt2 is the only contents: */
eb160463 2870 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2871 OP(oscan) = CURLYN;
2872 OP(nxt1) = NOTHING; /* was OPEN. */
2873#ifdef DEBUGGING
2874 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2875 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2876 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2877 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2878 OP(nxt + 1) = OPTIMIZED; /* was count. */
2879 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2880#endif
c277df42 2881 }
c277df42
IZ
2882 nogo:
2883
2884 /* Try optimization CURLYX => CURLYM. */
b81d288d 2885 if ( OP(oscan) == CURLYX && data
c277df42 2886 && !(data->flags & SF_HAS_PAR)
c277df42 2887 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2888 && !deltanext /* atom is fixed width */
2889 && minnext != 0 /* CURLYM can't handle zero width */
2890 ) {
c277df42
IZ
2891 /* XXXX How to optimize if data == 0? */
2892 /* Optimize to a simpler form. */
2893 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2894 regnode *nxt2;
2895
2896 OP(oscan) = CURLYM;
2897 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2898 && (OP(nxt2) != WHILEM))
c277df42
IZ
2899 nxt = nxt2;
2900 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2901 /* Need to optimize away parenths. */
2902 if (data->flags & SF_IN_PAR) {
2903 /* Set the parenth number. */
2904 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2905
b81d288d 2906 if (OP(nxt) != CLOSE)
b45f050a 2907 FAIL("Panic opt close");
eb160463 2908 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2909 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2910 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2911#ifdef DEBUGGING
2912 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2913 OP(nxt + 1) = OPTIMIZED; /* was count. */
2914 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2915 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2916#endif
c277df42
IZ
2917#if 0
2918 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2919 regnode *nnxt = regnext(nxt1);
b81d288d 2920
c277df42
IZ
2921 if (nnxt == nxt) {
2922 if (reg_off_by_arg[OP(nxt1)])
2923 ARG_SET(nxt1, nxt2 - nxt1);
2924 else if (nxt2 - nxt1 < U16_MAX)
2925 NEXT_OFF(nxt1) = nxt2 - nxt1;
2926 else
2927 OP(nxt) = NOTHING; /* Cannot beautify */
2928 }
2929 nxt1 = nnxt;
2930 }
2931#endif
2932 /* Optimize again: */
1de06328 2933 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2934 NULL, 0,depth+1);
a0ed51b3
LW
2935 }
2936 else
c277df42 2937 oscan->flags = 0;
c277df42 2938 }
e1901655
IZ
2939 else if ((OP(oscan) == CURLYX)
2940 && (flags & SCF_WHILEM_VISITED_POS)
2941 /* See the comment on a similar expression above.
2942 However, this time it not a subexpression
2943 we care about, but the expression itself. */
2944 && (maxcount == REG_INFTY)
2945 && data && ++data->whilem_c < 16) {
2946 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2947 /* Find WHILEM (as in regexec.c) */
2948 regnode *nxt = oscan + NEXT_OFF(oscan);
2949
2950 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2951 nxt += ARG(nxt);
eb160463
GS
2952 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2953 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2954 }
b81d288d 2955 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2956 pars++;
2957 if (flags & SCF_DO_SUBSTR) {
c445ea15 2958 SV *last_str = NULL;
c277df42
IZ
2959 int counted = mincount != 0;
2960
2961 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2962#if defined(SPARC64_GCC_WORKAROUND)
2963 I32 b = 0;
2964 STRLEN l = 0;
cfd0369c 2965 const char *s = NULL;
5d1c421c
JH
2966 I32 old = 0;
2967
2968 if (pos_before >= data->last_start_min)
2969 b = pos_before;
2970 else
2971 b = data->last_start_min;
2972
2973 l = 0;
cfd0369c 2974 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2975 old = b - data->last_start_min;
2976
2977#else
b81d288d 2978 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2979 ? pos_before : data->last_start_min;
2980 STRLEN l;
d4c19fe8 2981 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2982 I32 old = b - data->last_start_min;
5d1c421c 2983#endif
a0ed51b3
LW
2984
2985 if (UTF)
2986 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2987
a0ed51b3 2988 l -= old;
c277df42 2989 /* Get the added string: */
79cb57f6 2990 last_str = newSVpvn(s + old, l);
0e933229
IH
2991 if (UTF)
2992 SvUTF8_on(last_str);
c277df42
IZ
2993 if (deltanext == 0 && pos_before == b) {
2994 /* What was added is a constant string */
2995 if (mincount > 1) {
2996 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2997 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2998 SvPVX_const(last_str), l, mincount - 1);
b162af07 2999 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3000 /* Add additional parts. */
b81d288d 3001 SvCUR_set(data->last_found,
c277df42
IZ
3002 SvCUR(data->last_found) - l);
3003 sv_catsv(data->last_found, last_str);
0eda9292
JH
3004 {
3005 SV * sv = data->last_found;
3006 MAGIC *mg =
3007 SvUTF8(sv) && SvMAGICAL(sv) ?
3008 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3009 if (mg && mg->mg_len >= 0)
3010 mg->mg_len += CHR_SVLEN(last_str);
3011 }
c277df42
IZ
3012 data->last_end += l * (mincount - 1);
3013 }
2a8d9689
HS
3014 } else {
3015 /* start offset must point into the last copy */
3016 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3017 data->last_start_max += is_inf ? I32_MAX
3018 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3019 }
3020 }
3021 /* It is counted once already... */
3022 data->pos_min += minnext * (mincount - counted);
3023 data->pos_delta += - counted * deltanext +
3024 (minnext + deltanext) * maxcount - minnext * mincount;
3025 if (mincount != maxcount) {
653099ff
GS
3026 /* Cannot extend fixed substrings found inside
3027 the group. */
1de06328 3028 scan_commit(pRExC_state,data,minlenp);
c277df42 3029 if (mincount && last_str) {
d4c19fe8
AL
3030 SV * const sv = data->last_found;
3031 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3032 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3033
3034 if (mg)
3035 mg->mg_len = -1;
3036 sv_setsv(sv, last_str);
c277df42 3037 data->last_end = data->pos_min;
b81d288d 3038 data->last_start_min =
a0ed51b3 3039 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3040 data->last_start_max = is_inf
3041 ? I32_MAX
c277df42 3042 : data->pos_min + data->pos_delta
a0ed51b3 3043 - CHR_SVLEN(last_str);
c277df42
IZ
3044 }
3045 data->longest = &(data->longest_float);
3046 }
aca2d497 3047 SvREFCNT_dec(last_str);
c277df42 3048 }
405ff068 3049 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3050 data->flags |= SF_HAS_EVAL;
3051 optimize_curly_tail:
c277df42 3052 if (OP(oscan) != CURLYX) {
3dab1dad 3053 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3054 && NEXT_OFF(next))
3055 NEXT_OFF(oscan) += NEXT_OFF(next);
3056 }
c277df42 3057 continue;
653099ff 3058 default: /* REF and CLUMP only? */
c277df42 3059 if (flags & SCF_DO_SUBSTR) {
1de06328 3060 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3061 data->longest = &(data->longest_float);
3062 }
aca2d497 3063 is_inf = is_inf_internal = 1;
653099ff 3064 if (flags & SCF_DO_STCLASS_OR)
830247a4 3065 cl_anything(pRExC_state, data->start_class);
653099ff 3066 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3067 break;
3068 }
a0ed51b3 3069 }
bfed75c6 3070 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3071 int value = 0;
653099ff 3072
c277df42 3073 if (flags & SCF_DO_SUBSTR) {
1de06328 3074 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3075 data->pos_min++;
3076 }
3077 min++;
653099ff
GS
3078 if (flags & SCF_DO_STCLASS) {
3079 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3080
3081 /* Some of the logic below assumes that switching
3082 locale on will only add false positives. */
3dab1dad 3083 switch (PL_regkind[OP(scan)]) {
653099ff 3084 case SANY:
653099ff
GS
3085 default:
3086 do_default:
3087 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3088 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3089 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3090 break;
3091 case REG_ANY:
3092 if (OP(scan) == SANY)
3093 goto do_default;
3094 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3095 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3096 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3097 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3098 }
3099 if (flags & SCF_DO_STCLASS_AND || !value)
3100 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3101 break;
3102 case ANYOF:
3103 if (flags & SCF_DO_STCLASS_AND)
3104 cl_and(data->start_class,
3105 (struct regnode_charclass_class*)scan);
3106 else
830247a4 3107 cl_or(pRExC_state, data->start_class,
653099ff
GS
3108 (struct regnode_charclass_class*)scan);
3109 break;
3110 case ALNUM:
3111 if (flags & SCF_DO_STCLASS_AND) {
3112 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3113 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3114 for (value = 0; value < 256; value++)
3115 if (!isALNUM(value))
3116 ANYOF_BITMAP_CLEAR(data->start_class, value);
3117 }
3118 }
3119 else {
3120 if (data->start_class->flags & ANYOF_LOCALE)
3121 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3122 else {
3123 for (value = 0; value < 256; value++)
3124 if (isALNUM(value))
b81d288d 3125 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3126 }
3127 }
3128 break;
3129 case ALNUML:
3130 if (flags & SCF_DO_STCLASS_AND) {
3131 if (data->start_class->flags & ANYOF_LOCALE)
3132 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3133 }
3134 else {
3135 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3136 data->start_class->flags |= ANYOF_LOCALE;
3137 }
3138 break;
3139 case NALNUM:
3140 if (flags & SCF_DO_STCLASS_AND) {
3141 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3142 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3143 for (value = 0; value < 256; value++)
3144 if (isALNUM(value))
3145 ANYOF_BITMAP_CLEAR(data->start_class, value);
3146 }
3147 }
3148 else {
3149 if (data->start_class->flags & ANYOF_LOCALE)
3150 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3151 else {
3152 for (value = 0; value < 256; value++)
3153 if (!isALNUM(value))
b81d288d 3154 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3155 }
3156 }
3157 break;
3158 case NALNUML:
3159 if (flags & SCF_DO_STCLASS_AND) {
3160 if (data->start_class->flags & ANYOF_LOCALE)
3161 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3162 }
3163 else {
3164 data->start_class->flags |= ANYOF_LOCALE;
3165 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3166 }
3167 break;
3168 case SPACE:
3169 if (flags & SCF_DO_STCLASS_AND) {
3170 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3171 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3172 for (value = 0; value < 256; value++)
3173 if (!isSPACE(value))
3174 ANYOF_BITMAP_CLEAR(data->start_class, value);
3175 }
3176 }
3177 else {
3178 if (data->start_class->flags & ANYOF_LOCALE)
3179 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3180 else {
3181 for (value = 0; value < 256; value++)
3182 if (isSPACE(value))
b81d288d 3183 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3184 }
3185 }
3186 break;
3187 case SPACEL:
3188 if (flags & SCF_DO_STCLASS_AND) {
3189 if (data->start_class->flags & ANYOF_LOCALE)
3190 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3191 }
3192 else {
3193 data->start_class->flags |= ANYOF_LOCALE;
3194 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3195 }
3196 break;
3197 case NSPACE:
3198 if (flags & SCF_DO_STCLASS_AND) {
3199 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3200 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3201 for (value = 0; value < 256; value++)
3202 if (isSPACE(value))
3203 ANYOF_BITMAP_CLEAR(data->start_class, value);
3204 }
3205 }
3206 else {
3207 if (data->start_class->flags & ANYOF_LOCALE)
3208 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3209 else {
3210 for (value = 0; value < 256; value++)
3211 if (!isSPACE(value))
b81d288d 3212 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3213 }
3214 }
3215 break;
3216 case NSPACEL:
3217 if (flags & SCF_DO_STCLASS_AND) {
3218 if (data->start_class->flags & ANYOF_LOCALE) {
3219 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3220 for (value = 0; value < 256; value++)
3221 if (!isSPACE(value))
3222 ANYOF_BITMAP_CLEAR(data->start_class, value);
3223 }
3224 }
3225 else {
3226 data->start_class->flags |= ANYOF_LOCALE;
3227 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3228 }
3229 break;
3230 case DIGIT:
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3233 for (value = 0; value < 256; value++)
3234 if (!isDIGIT(value))
3235 ANYOF_BITMAP_CLEAR(data->start_class, value);
3236 }
3237 else {
3238 if (data->start_class->flags & ANYOF_LOCALE)
3239 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3240 else {
3241 for (value = 0; value < 256; value++)
3242 if (isDIGIT(value))
b81d288d 3243 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3244 }
3245 }
3246 break;
3247 case NDIGIT:
3248 if (flags & SCF_DO_STCLASS_AND) {
3249 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3250 for (value = 0; value < 256; value++)
3251 if (isDIGIT(value))
3252 ANYOF_BITMAP_CLEAR(data->start_class, value);
3253 }
3254 else {
3255 if (data->start_class->flags & ANYOF_LOCALE)
3256 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3257 else {
3258 for (value = 0; value < 256; value++)
3259 if (!isDIGIT(value))
b81d288d 3260 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3261 }
3262 }
3263 break;
3264 }
3265 if (flags & SCF_DO_STCLASS_OR)
3266 cl_and(data->start_class, &and_with);
3267 flags &= ~SCF_DO_STCLASS;
3268 }
a0ed51b3 3269 }
3dab1dad 3270 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3271 data->flags |= (OP(scan) == MEOL
3272 ? SF_BEFORE_MEOL
3273 : SF_BEFORE_SEOL);
a0ed51b3 3274 }
3dab1dad 3275 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3276 /* Lookbehind, or need to calculate parens/evals/stclass: */
3277 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3278 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3279 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3280 || OP(scan) == UNLESSM )
3281 {
3282 /* Negative Lookahead/lookbehind
3283 In this case we can't do fixed string optimisation.
3284 */
3285
3286 I32 deltanext, minnext, fake = 0;
3287 regnode *nscan;
3288 struct regnode_charclass_class intrnl;
3289 int f = 0;
3290
3291 data_fake.flags = 0;
3292 if (data) {
3293 data_fake.whilem_c = data->whilem_c;
3294 data_fake.last_closep = data->last_closep;
a0ed51b3 3295 }
1de06328
YO
3296 else
3297 data_fake.last_closep = &fake;
3298 if ( flags & SCF_DO_STCLASS && !scan->flags
3299 && OP(scan) == IFMATCH ) { /* Lookahead */
3300 cl_init(pRExC_state, &intrnl);
3301 data_fake.start_class = &intrnl;
3302 f |= SCF_DO_STCLASS_AND;
c277df42 3303 }
1de06328
YO
3304 if (flags & SCF_WHILEM_VISITED_POS)
3305 f |= SCF_WHILEM_VISITED_POS;
3306 next = regnext(scan);
3307 nscan = NEXTOPER(NEXTOPER(scan));
3308 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3309 if (scan->flags) {
3310 if (deltanext) {
3311 vFAIL("Variable length lookbehind not implemented");
3312 }
3313 else if (minnext > (I32)U8_MAX) {
3314 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3315 }
3316 scan->flags = (U8)minnext;
3317 }
3318 if (data) {
3319 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3320 pars++;
3321 if (data_fake.flags & SF_HAS_EVAL)
3322 data->flags |= SF_HAS_EVAL;
3323 data->whilem_c = data_fake.whilem_c;
3324 }
3325 if (f & SCF_DO_STCLASS_AND) {
3326 const int was = (data->start_class->flags & ANYOF_EOS);
3327
3328 cl_and(data->start_class, &intrnl);
3329 if (was)
3330 data->start_class->flags |= ANYOF_EOS;
3331 }
be8e71aa 3332 }
1de06328
YO
3333#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3334 else {
3335 /* Positive Lookahead/lookbehind
3336 In this case we can do fixed string optimisation,
3337 but we must be careful about it. Note in the case of
3338 lookbehind the positions will be offset by the minimum
3339 length of the pattern, something we won't know about
3340 until after the recurse.
3341 */
3342 I32 deltanext, fake = 0;
3343 regnode *nscan;
3344 struct regnode_charclass_class intrnl;
3345 int f = 0;
3346 /* We use SAVEFREEPV so that when the full compile
3347 is finished perl will clean up the allocated
3348 minlens when its all done. This was we don't
3349 have to worry about freeing them when we know
3350 they wont be used, which would be a pain.
3351 */
3352 I32 *minnextp;
3353 Newx( minnextp, 1, I32 );
3354 SAVEFREEPV(minnextp);
3355
3356 if (data) {
3357 StructCopy(data, &data_fake, scan_data_t);
3358 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3359 f |= SCF_DO_SUBSTR;
3360 if (scan->flags)
3361 scan_commit(pRExC_state, &data_fake,minlenp);
3362 data_fake.last_found=newSVsv(data->last_found);
3363 }
3364 }
3365 else
3366 data_fake.last_closep = &fake;
3367 data_fake.flags = 0;
3368 if (is_inf)
3369 data_fake.flags |= SF_IS_INF;
3370 if ( flags & SCF_DO_STCLASS && !scan->flags
3371 && OP(scan) == IFMATCH ) { /* Lookahead */
3372 cl_init(pRExC_state, &intrnl);
3373 data_fake.start_class = &intrnl;
3374 f |= SCF_DO_STCLASS_AND;
3375 }
3376 if (flags & SCF_WHILEM_VISITED_POS)
3377 f |= SCF_WHILEM_VISITED_POS;
3378 next = regnext(scan);
3379 nscan = NEXTOPER(NEXTOPER(scan));
3380
3381 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3382 if (scan->flags) {
3383 if (deltanext) {
3384 vFAIL("Variable length lookbehind not implemented");
3385 }
3386 else if (*minnextp > (I32)U8_MAX) {
3387 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3388 }
3389 scan->flags = (U8)*minnextp;
3390 }
3391
3392 *minnextp += min;
3393
3394
3395 if (f & SCF_DO_STCLASS_AND) {
3396 const int was = (data->start_class->flags & ANYOF_EOS);
3397
3398 cl_and(data->start_class, &intrnl);
3399 if (was)
3400 data->start_class->flags |= ANYOF_EOS;
3401 }
3402 if (data) {
3403 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3404 pars++;
3405 if (data_fake.flags & SF_HAS_EVAL)
3406 data->flags |= SF_HAS_EVAL;
3407 data->whilem_c = data_fake.whilem_c;
3408 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3409 if (RExC_rx->minlen<*minnextp)
3410 RExC_rx->minlen=*minnextp;
3411 scan_commit(pRExC_state, &data_fake, minnextp);
3412 SvREFCNT_dec(data_fake.last_found);
3413
3414 if ( data_fake.minlen_fixed != minlenp )
3415 {
3416 data->offset_fixed= data_fake.offset_fixed;
3417 data->minlen_fixed= data_fake.minlen_fixed;
3418 data->lookbehind_fixed+= scan->flags;
3419 }
3420 if ( data_fake.minlen_float != minlenp )
3421 {
3422 data->minlen_float= data_fake.minlen_float;
3423 data->offset_float_min=data_fake.offset_float_min;
3424 data->offset_float_max=data_fake.offset_float_max;
3425 data->lookbehind_float+= scan->flags;
3426 }
3427 }
3428 }
3429
653099ff 3430
653099ff 3431 }
1de06328 3432#endif
a0ed51b3
LW
3433 }
3434 else if (OP(scan) == OPEN) {
c277df42 3435 pars++;
a0ed51b3 3436 }
cb434fcc 3437 else if (OP(scan) == CLOSE) {
eb160463 3438 if ((I32)ARG(scan) == is_par) {
cb434fcc 3439 next = regnext(scan);
c277df42 3440
cb434fcc
IZ
3441 if ( next && (OP(next) != WHILEM) && next < last)
3442 is_par = 0; /* Disable optimization */
3443 }
3444 if (data)
3445 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3446 }
3447 else if (OP(scan) == EVAL) {
c277df42
IZ
3448 if (data)
3449 data->flags |= SF_HAS_EVAL;
3450 }
0a4db386
YO
3451 else if ( (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3452 || OP(scan)==RECURSE) /* recursion */
3453 {
3454 if (OP(scan)==RECURSE) {
3455 ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan );
3456 }
0f5d15d6 3457 if (flags & SCF_DO_SUBSTR) {
1de06328 3458 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3459 data->longest = &(data->longest_float);
3460 }
3461 is_inf = is_inf_internal = 1;
653099ff 3462 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3463 cl_anything(pRExC_state, data->start_class);
96776eda 3464 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3465 }
786e8c11
YO
3466#ifdef TRIE_STUDY_OPT
3467#ifdef FULL_TRIE_STUDY
3468 else if (PL_regkind[OP(scan)] == TRIE) {
3469 /* NOTE - There is similar code to this block above for handling
3470 BRANCH nodes on the initial study. If you change stuff here
3471 check there too. */
3472 regnode *tail= regnext(scan);
3473 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3474 I32 max1 = 0, min1 = I32_MAX;
3475 struct regnode_charclass_class accum;
3476
3477 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3478 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3479 if (flags & SCF_DO_STCLASS)
3480 cl_init_zero(pRExC_state, &accum);
3481
3482 if (!trie->jump) {
3483 min1= trie->minlen;
3484 max1= trie->maxlen;
3485 } else {
3486 const regnode *nextbranch= NULL;
3487 U32 word;
3488
3489 for ( word=1 ; word <= trie->wordcount ; word++)
3490 {
3491 I32 deltanext=0, minnext=0, f = 0, fake;
3492 struct regnode_charclass_class this_class;
3493
3494 data_fake.flags = 0;
3495 if (data) {
3496 data_fake.whilem_c = data->whilem_c;
3497 data_fake.last_closep = data->last_closep;
3498 }
3499 else
3500 data_fake.last_closep = &fake;
3501
3502 if (flags & SCF_DO_STCLASS) {
3503 cl_init(pRExC_state, &this_class);
3504 data_fake.start_class = &this_class;
3505 f = SCF_DO_STCLASS_AND;
3506 }
3507 if (flags & SCF_WHILEM_VISITED_POS)
3508 f |= SCF_WHILEM_VISITED_POS;
3509
3510 if (trie->jump[word]) {
3511 if (!nextbranch)
3512 nextbranch = tail - trie->jump[0];
3513 scan= tail - trie->jump[word];
3514 /* We go from the jump point to the branch that follows
3515 it. Note this means we need the vestigal unused branches
3516 even though they arent otherwise used.
3517 */
1de06328 3518 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3519 (regnode *)nextbranch, &data_fake, f,depth+1);
3520 }
3521 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3522 nextbranch= regnext((regnode*)nextbranch);
3523
3524 if (min1 > (I32)(minnext + trie->minlen))
3525 min1 = minnext + trie->minlen;
3526 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3527 max1 = minnext + deltanext + trie->maxlen;
3528 if (deltanext == I32_MAX)
3529 is_inf = is_inf_internal = 1;
3530
3531 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3532 pars++;
3533
3534 if (data) {
3535 if (data_fake.flags & SF_HAS_EVAL)
3536 data->flags |= SF_HAS_EVAL;
3537 data->whilem_c = data_fake.whilem_c;
3538 }
3539 if (flags & SCF_DO_STCLASS)
3540 cl_or(pRExC_state, &accum, &this_class);
3541 }
3542 }
3543 if (flags & SCF_DO_SUBSTR) {
3544 data->pos_min += min1;
3545 data->pos_delta += max1 - min1;
3546 if (max1 != min1 || is_inf)
3547 data->longest = &(data->longest_float);
3548 }
3549 min += min1;
3550 delta += max1 - min1;
3551 if (flags & SCF_DO_STCLASS_OR) {
3552 cl_or(pRExC_state, data->start_class, &accum);
3553 if (min1) {
3554 cl_and(data->start_class, &and_with);
3555 flags &= ~SCF_DO_STCLASS;
3556 }
3557 }
3558 else if (flags & SCF_DO_STCLASS_AND) {
3559 if (min1) {
3560 cl_and(data->start_class, &accum);
3561 flags &= ~SCF_DO_STCLASS;
3562 }
3563 else {
3564 /* Switch to OR mode: cache the old value of
3565 * data->start_class */
3566 StructCopy(data->start_class, &and_with,
3567 struct regnode_charclass_class);
3568 flags &= ~SCF_DO_STCLASS_AND;
3569 StructCopy(&accum, data->start_class,
3570 struct regnode_charclass_class);
3571 flags |= SCF_DO_STCLASS_OR;
3572 data->start_class->flags |= ANYOF_EOS;
3573 }
3574 }
3575 scan= tail;
3576 continue;
3577 }
3578#else
3579 else if (PL_regkind[OP(scan)] == TRIE) {
3580 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3581 U8*bang=NULL;
3582
3583 min += trie->minlen;
3584 delta += (trie->maxlen - trie->minlen);
3585 flags &= ~SCF_DO_STCLASS; /* xxx */
3586 if (flags & SCF_DO_SUBSTR) {
1de06328 3587 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3588 data->pos_min += trie->minlen;
3589 data->pos_delta += (trie->maxlen - trie->minlen);
3590 if (trie->maxlen != trie->minlen)
3591 data->longest = &(data->longest_float);
3592 }
3593 if (trie->jump) /* no more substrings -- for now /grr*/
3594 flags &= ~SCF_DO_SUBSTR;
3595 }
3596#endif /* old or new */
3597#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3598 /* Else: zero-length, ignore. */
3599 scan = regnext(scan);
3600 }
3601
3602 finish:
3603 *scanp = scan;
aca2d497 3604 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3605 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3606 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3607 if (is_par > (I32)U8_MAX)
c277df42
IZ
3608 is_par = 0;
3609 if (is_par && pars==1 && data) {
3610 data->flags |= SF_IN_PAR;
3611 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3612 }
3613 else if (pars && data) {
c277df42
IZ
3614 data->flags |= SF_HAS_PAR;
3615 data->flags &= ~SF_IN_PAR;
3616 }
653099ff
GS
3617 if (flags & SCF_DO_STCLASS_OR)
3618 cl_and(data->start_class, &and_with);
786e8c11
YO
3619 if (flags & SCF_TRIE_RESTUDY)
3620 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3621
3622 DEBUG_STUDYDATA(data,depth);
3623
c277df42
IZ
3624 return min;
3625}
3626
76e3520e 3627STATIC I32
5f66b61c 3628S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3629{
830247a4 3630 if (RExC_rx->data) {
2eb97020 3631 const U32 count = RExC_rx->data->count;
b81d288d 3632 Renewc(RExC_rx->data,
2eb97020 3633 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3634 char, struct reg_data);
2eb97020 3635 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3636 RExC_rx->data->count += n;
a0ed51b3
LW
3637 }
3638 else {
a02a5408 3639 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3640 char, struct reg_data);
a02a5408 3641 Newx(RExC_rx->data->what, n, U8);
830247a4 3642 RExC_rx->data->count = n;
c277df42 3643 }
830247a4
IZ
3644 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3645 return RExC_rx->data->count - n;
c277df42
IZ
3646}
3647
76234dfb 3648#ifndef PERL_IN_XSUB_RE
d88dccdf 3649void
864dbfa3 3650Perl_reginitcolors(pTHX)
d88dccdf 3651{
97aff369 3652 dVAR;
1df70142 3653 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3654 if (s) {
1df70142
AL
3655 char *t = savepv(s);
3656 int i = 0;
3657 PL_colors[0] = t;
d88dccdf 3658 while (++i < 6) {
1df70142
AL
3659 t = strchr(t, '\t');
3660 if (t) {
3661 *t = '\0';
3662 PL_colors[i] = ++t;
d88dccdf
IZ
3663 }
3664 else
1df70142 3665 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3666 }
3667 } else {
1df70142 3668 int i = 0;
b81d288d 3669 while (i < 6)
06b5626a 3670 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3671 }
3672 PL_colorset = 1;
3673}
76234dfb 3674#endif
8615cb43 3675
07be1b83 3676
786e8c11
YO
3677#ifdef TRIE_STUDY_OPT
3678#define CHECK_RESTUDY_GOTO \
3679 if ( \
3680 (data.flags & SCF_TRIE_RESTUDY) \
3681 && ! restudied++ \
3682 ) goto reStudy
3683#else
3684#define CHECK_RESTUDY_GOTO
3685#endif
f9f4320a 3686
a687059c 3687/*
e50aee73 3688 - pregcomp - compile a regular expression into internal code
a687059c
LW
3689 *
3690 * We can't allocate space until we know how big the compiled form will be,
3691 * but we can't compile it (and thus know how big it is) until we've got a
3692 * place to put the code. So we cheat: we compile it twice, once with code
3693 * generation turned off and size counting turned on, and once "for real".
3694 * This also means that we don't allocate space until we are sure that the
3695 * thing really will compile successfully, and we never have to move the
3696 * code and thus invalidate pointers into it. (Note that it has to be in
3697 * one piece because free() must be able to free it all.) [NB: not true in perl]
3698 *
3699 * Beware that the optimization-preparation code in here knows about some
3700 * of the structure of the compiled regexp. [I'll say.]
3701 */
b9b4dddf
YO
3702
3703
3704
f9f4320a 3705#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3706#define RE_ENGINE_PTR &PL_core_reg_engine
3707#else
f9f4320a
YO
3708extern const struct regexp_engine my_reg_engine;
3709#define RE_ENGINE_PTR &my_reg_engine
3710#endif
b9b4dddf
YO
3711/* these make a few things look better, to avoid indentation */
3712#define BEGIN_BLOCK {
f9f4320a
YO
3713#define END_BLOCK }
3714
a687059c 3715regexp *
864dbfa3 3716Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3717{
97aff369 3718 dVAR;
f9f4320a
YO
3719 GET_RE_DEBUG_FLAGS_DECL;
3720 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3721#ifndef PERL_IN_XSUB_RE
3722 BEGIN_BLOCK
f9f4320a
YO
3723 /* Dispatch a request to compile a regexp to correct
3724 regexp engine. */
3725 HV * const table = GvHV(PL_hintgv);
3726 if (table) {
3727 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3728 if (ptr && SvIOK(*ptr)) {
3729 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3730 DEBUG_COMPILE_r({
8d8756e7 3731 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3732 SvIV(*ptr));
3733 });
f2f78491 3734 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3735 }
b9b4dddf
YO
3736 }
3737 END_BLOCK
3738#endif
3739 BEGIN_BLOCK
a0d0e21e 3740 register regexp *r;
c277df42 3741 regnode *scan;
c277df42 3742 regnode *first;
a0d0e21e 3743 I32 flags;
a0d0e21e
LW
3744 I32 minlen = 0;
3745 I32 sawplus = 0;
3746 I32 sawopen = 0;
2c2d71f5 3747 scan_data_t data;
830247a4 3748 RExC_state_t RExC_state;
be8e71aa 3749 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3750#ifdef TRIE_STUDY_OPT
3751 int restudied= 0;
3752 RExC_state_t copyRExC_state;
3753#endif
a0d0e21e 3754 if (exp == NULL)
c277df42 3755 FAIL("NULL regexp argument");
a0d0e21e 3756
a5961de5 3757 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3758
5cfc7842 3759 RExC_precomp = exp;
a3621e74 3760 DEBUG_COMPILE_r({
ab3bbdeb
YO
3761 SV *dsv= sv_newmortal();
3762 RE_PV_QUOTED_DECL(s, RExC_utf8,
3763 dsv, RExC_precomp, (xend - exp), 60);
3764 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3765 PL_colors[4],PL_colors[5],s);
a5961de5 3766 });
e2509266 3767 RExC_flags = pm->op_pmflags;
830247a4 3768 RExC_sawback = 0;
bbce6d69 3769
830247a4
IZ
3770 RExC_seen = 0;
3771 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3772 RExC_seen_evals = 0;
3773 RExC_extralen = 0;
c277df42 3774
bbce6d69 3775 /* First pass: determine size, legality. */
830247a4 3776 RExC_parse = exp;
fac92740 3777 RExC_start = exp;
830247a4
IZ
3778 RExC_end = xend;
3779 RExC_naughty = 0;
3780 RExC_npar = 1;
3781 RExC_size = 0L;
3782 RExC_emit = &PL_regdummy;
3783 RExC_whilem_seen = 0;
fc8cd66c 3784 RExC_charnames = NULL;
81714fb9
YO
3785 RExC_parens = NULL;
3786 RExC_paren_names = NULL;
3787
85ddcde9
JH
3788#if 0 /* REGC() is (currently) a NOP at the first pass.
3789 * Clever compilers notice this and complain. --jhi */
830247a4 3790 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3791#endif
3dab1dad
YO
3792 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3793 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3794 RExC_precomp = NULL;
a0d0e21e
LW
3795 return(NULL);
3796 }
07be1b83 3797 DEBUG_PARSE_r({
81714fb9
YO
3798 PerlIO_printf(Perl_debug_log,
3799 "Required size %"IVdf" nodes\n"
3800 "Starting second pass (creation)\n",
3801 (IV)RExC_size);
07be1b83
YO
3802 RExC_lastnum=0;
3803 RExC_lastparse=NULL;
3804 });
c277df42
IZ
3805 /* Small enough for pointer-storage convention?
3806 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3807 if (RExC_size >= 0x10000L && RExC_extralen)
3808 RExC_size += RExC_extralen;
c277df42 3809 else
830247a4
IZ
3810 RExC_extralen = 0;
3811 if (RExC_whilem_seen > 15)
3812 RExC_whilem_seen = 15;
a0d0e21e 3813
f9f4320a
YO
3814 /* Allocate space and zero-initialize. Note, the two step process
3815 of zeroing when in debug mode, thus anything assigned has to
3816 happen after that */
a02a5408 3817 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3818 char, regexp);
a0d0e21e 3819 if (r == NULL)
b45f050a 3820 FAIL("Regexp out of space");
0f79a09d
GS
3821#ifdef DEBUGGING
3822 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3823 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3824#endif
f9f4320a
YO
3825 /* initialization begins here */
3826 r->engine= RE_ENGINE_PTR;
c277df42 3827 r->refcnt = 1;
bbce6d69 3828 r->prelen = xend - exp;
5cfc7842 3829 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3830 r->subbeg = NULL;
f8c7b90f 3831#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3832 r->saved_copy = NULL;
ed252734 3833#endif
cf93c79d 3834 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3835 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3836 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3837
3838 r->substrs = 0; /* Useful during FAIL. */
3839 r->startp = 0; /* Useful during FAIL. */
6bda09f9 3840 r->endp = 0;
81714fb9
YO
3841 r->paren_names = 0;
3842
6bda09f9
YO
3843 if (RExC_seen & REG_SEEN_RECURSE) {
3844 Newx(RExC_parens, RExC_npar,regnode *);
3845 SAVEFREEPV(RExC_parens);
3846 }
3847
3848 /* Useful during FAIL. */
a02a5408 3849 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3850 if (r->offsets) {
2af232bd 3851 r->offsets[0] = RExC_size;
fac92740 3852 }
a3621e74 3853 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3854 "%s %"UVuf" bytes for offset annotations.\n",
3855 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3856 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3857
830247a4 3858 RExC_rx = r;
bbce6d69 3859
3860 /* Second pass: emit code. */
e2509266 3861 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3862 RExC_parse = exp;
3863 RExC_end = xend;
3864 RExC_naughty = 0;
3865 RExC_npar = 1;
fac92740 3866 RExC_emit_start = r->program;
830247a4 3867 RExC_emit = r->program;
2cd61cdb 3868 /* Store the count of eval-groups for security checks: */
786e8c11 3869 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3870 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3871 r->data = 0;
3dab1dad 3872 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3873 return(NULL);
6bda09f9 3874
07be1b83
YO
3875 /* XXXX To minimize changes to RE engine we always allocate
3876 3-units-long substrs field. */
3877 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3878
07be1b83 3879reStudy:
1de06328 3880 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83
YO
3881 Zero(r->substrs, 1, struct reg_substr_data);
3882 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3883
07be1b83
YO
3884#ifdef TRIE_STUDY_OPT
3885 if ( restudied ) {
3886 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3887 RExC_state=copyRExC_state;
1de06328 3888 if (data.last_found) {
07be1b83 3889 SvREFCNT_dec(data.longest_fixed);
07be1b83 3890 SvREFCNT_dec(data.longest_float);
07be1b83 3891 SvREFCNT_dec(data.last_found);
1de06328 3892 }
07be1b83
YO
3893 } else {
3894 copyRExC_state=RExC_state;
3895 }
3896#endif
fc8cd66c 3897
a0d0e21e 3898 /* Dig out information for optimizations. */
cf93c79d 3899 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3900 pm->op_pmflags = RExC_flags;
a0ed51b3 3901 if (UTF)
5ff6fc6d 3902 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3903 r->regstclass = NULL;
830247a4 3904 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3905 r->reganch |= ROPT_NAUGHTY;
c277df42 3906 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3907
1de06328
YO
3908 /* testing for BRANCH here tells us whether there is "must appear"
3909 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 3910 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 3911 I32 fake;
c5254dd6 3912 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3913 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3914 int stclass_flag;
07be1b83 3915 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3916
3917 first = scan;
c277df42 3918 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3919 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3920 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3921 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3922 /* for now we can't handle lookbehind IFMATCH*/
3923 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3924 (OP(first) == PLUS) ||
3925 (OP(first) == MINMOD) ||
653099ff 3926 /* An {n,m} with n>0 */
07be1b83
YO
3927 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3928 {
786e8c11 3929
a0d0e21e
LW
3930 if (OP(first) == PLUS)
3931 sawplus = 1;
3932 else
3dab1dad 3933 first += regarglen[OP(first)];
07be1b83
YO
3934 if (OP(first) == IFMATCH) {
3935 first = NEXTOPER(first);
3936 first += EXTRA_STEP_2ARGS;
7c167cea 3937 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3938 first = NEXTOPER(first);
a687059c
LW
3939 }
3940
a0d0e21e
LW
3941 /* Starting-point info. */
3942 again:
786e8c11 3943 DEBUG_PEEP("first:",first,0);
07be1b83 3944 /* Ignore EXACT as we deal with it later. */
3dab1dad 3945 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3946 if (OP(first) == EXACT)
6f207bd3 3947 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3948 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3949 r->regstclass = first;
3950 }
07be1b83 3951#ifdef TRIE_STCLASS
786e8c11 3952 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3953 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3954 {
786e8c11 3955 regnode *trie_op;
07be1b83 3956 /* this can happen only on restudy */
786e8c11
YO
3957 if ( OP(first) == TRIE ) {
3958 struct regnode_1 *trieop;
3959 Newxz(trieop,1,struct regnode_1);
3960 StructCopy(first,trieop,struct regnode_1);
3961 trie_op=(regnode *)trieop;
3962 } else {
3963 struct regnode_charclass *trieop;
3964 Newxz(trieop,1,struct regnode_charclass);
3965 StructCopy(first,trieop,struct regnode_charclass);
3966 trie_op=(regnode *)trieop;
3967 }
1de06328 3968 OP(trie_op)+=2;
786e8c11
YO
3969 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3970 r->regstclass = trie_op;
07be1b83
YO
3971 }
3972#endif
bfed75c6 3973 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3974 r->regstclass = first;
3dab1dad
YO
3975 else if (PL_regkind[OP(first)] == BOUND ||
3976 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3977 r->regstclass = first;
3dab1dad 3978 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3979 r->reganch |= (OP(first) == MBOL
3980 ? ROPT_ANCH_MBOL
3981 : (OP(first) == SBOL
3982 ? ROPT_ANCH_SBOL
3983 : ROPT_ANCH_BOL));
a0d0e21e 3984 first = NEXTOPER(first);
774d564b 3985 goto again;
3986 }
3987 else if (OP(first) == GPOS) {
3988 r->reganch |= ROPT_ANCH_GPOS;
3989 first = NEXTOPER(first);
3990 goto again;
a0d0e21e 3991 }
e09294f4 3992 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3993 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3994 !(r->reganch & ROPT_ANCH) )
3995 {
3996 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3997 const int type =
3998 (OP(NEXTOPER(first)) == REG_ANY)
3999 ? ROPT_ANCH_MBOL
4000 : ROPT_ANCH_SBOL;
cad2e5aa 4001 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4002 first = NEXTOPER(first);
774d564b 4003 goto again;
a0d0e21e 4004 }
b81d288d 4005 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4006 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4007 /* x+ must match at the 1st pos of run of x's */
4008 r->reganch |= ROPT_SKIP;
a0d0e21e 4009
c277df42 4010 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4011#ifdef TRIE_STUDY_OPT
81714fb9 4012 DEBUG_PARSE_r(
be8e71aa
YO
4013 if (!restudied)
4014 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4015 (IV)(first - scan + 1))
4016 );
4017#else
81714fb9 4018 DEBUG_PARSE_r(
be8e71aa
YO
4019 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4020 (IV)(first - scan + 1))
4021 );
4022#endif
4023
4024
a0d0e21e
LW
4025 /*
4026 * If there's something expensive in the r.e., find the
4027 * longest literal string that must appear and make it the
4028 * regmust. Resolve ties in favor of later strings, since
4029 * the regstart check works with the beginning of the r.e.
4030 * and avoiding duplication strengthens checking. Not a
4031 * strong reason, but sufficient in the absence of others.
4032 * [Now we resolve ties in favor of the earlier string if
c277df42 4033 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4034 * earlier string may buy us something the later one won't.]
4035 */
a0d0e21e 4036 minlen = 0;
a687059c 4037
396482e1
GA
4038 data.longest_fixed = newSVpvs("");
4039 data.longest_float = newSVpvs("");
4040 data.last_found = newSVpvs("");
c277df42
IZ
4041 data.longest = &(data.longest_fixed);
4042 first = scan;
653099ff 4043 if (!r->regstclass) {
830247a4 4044 cl_init(pRExC_state, &ch_class);
653099ff
GS
4045 data.start_class = &ch_class;
4046 stclass_flag = SCF_DO_STCLASS_AND;
4047 } else /* XXXX Check for BOUND? */
4048 stclass_flag = 0;
cb434fcc 4049 data.last_closep = &last_close;
653099ff 4050
1de06328 4051 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
a3621e74 4052 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4053
07be1b83 4054
786e8c11
YO
4055 CHECK_RESTUDY_GOTO;
4056
4057
830247a4 4058 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4059 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4060 && !RExC_seen_zerolen
4061 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4062 r->reganch |= ROPT_CHECK_ALL;
1de06328 4063 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4064 SvREFCNT_dec(data.last_found);
4065
1de06328
YO
4066 /* Note that code very similar to this but for anchored string
4067 follows immediately below, changes may need to be made to both.
4068 Be careful.
4069 */
a0ed51b3 4070 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4071 if (longest_float_length
c277df42
IZ
4072 || (data.flags & SF_FL_BEFORE_EOL
4073 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4074 || (RExC_flags & PMf_MULTILINE))))
4075 {
1182767e 4076 I32 t,ml;
cf93c79d 4077
1de06328 4078 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4079 && data.offset_fixed == data.offset_float_min
4080 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4081 goto remove_float; /* As in (a)+. */
4082
1de06328
YO
4083 /* copy the information about the longest float from the reg_scan_data
4084 over to the program. */
33b8afdf
JH
4085 if (SvUTF8(data.longest_float)) {
4086 r->float_utf8 = data.longest_float;
c445ea15 4087 r->float_substr = NULL;
33b8afdf
JH
4088 } else {
4089 r->float_substr = data.longest_float;
c445ea15 4090 r->float_utf8 = NULL;
33b8afdf 4091 }
1de06328
YO
4092 /* float_end_shift is how many chars that must be matched that
4093 follow this item. We calculate it ahead of time as once the
4094 lookbehind offset is added in we lose the ability to correctly
4095 calculate it.*/
4096 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4097 : (I32)longest_float_length;
1de06328
YO
4098 r->float_end_shift = ml - data.offset_float_min
4099 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4100 + data.lookbehind_float;
4101 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4102 r->float_max_offset = data.offset_float_max;
1182767e 4103 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4104 r->float_max_offset -= data.lookbehind_float;
4105
cf93c79d
IZ
4106 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4107 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4108 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4109 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4110 }
4111 else {
aca2d497 4112 remove_float:
c445ea15 4113 r->float_substr = r->float_utf8 = NULL;
c277df42 4114 SvREFCNT_dec(data.longest_float);
c5254dd6 4115 longest_float_length = 0;
a0d0e21e 4116 }
c277df42 4117
1de06328
YO
4118 /* Note that code very similar to this but for floating string
4119 is immediately above, changes may need to be made to both.
4120 Be careful.
4121 */
a0ed51b3 4122 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4123 if (longest_fixed_length
c277df42
IZ
4124 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4125 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4126 || (RExC_flags & PMf_MULTILINE))))
4127 {
1182767e 4128 I32 t,ml;
cf93c79d 4129
1de06328
YO
4130 /* copy the information about the longest fixed
4131 from the reg_scan_data over to the program. */
33b8afdf
JH
4132 if (SvUTF8(data.longest_fixed)) {
4133 r->anchored_utf8 = data.longest_fixed;
c445ea15 4134 r->anchored_substr = NULL;
33b8afdf
JH
4135 } else {
4136 r->anchored_substr = data.longest_fixed;
c445ea15 4137 r->anchored_utf8 = NULL;
33b8afdf 4138 }
1de06328
YO
4139 /* fixed_end_shift is how many chars that must be matched that
4140 follow this item. We calculate it ahead of time as once the
4141 lookbehind offset is added in we lose the ability to correctly
4142 calculate it.*/
4143 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4144 : (I32)longest_fixed_length;
1de06328
YO
4145 r->anchored_end_shift = ml - data.offset_fixed
4146 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4147 + data.lookbehind_fixed;
4148 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4149
cf93c79d
IZ
4150 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4151 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4152 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4153 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4154 }
4155 else {
c445ea15 4156 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4157 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4158 longest_fixed_length = 0;
a0d0e21e 4159 }
b81d288d 4160 if (r->regstclass
ffc61ed2 4161 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4162 r->regstclass = NULL;
33b8afdf
JH
4163 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4164 && stclass_flag
653099ff 4165 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4166 && !cl_is_anything(data.start_class))
4167 {
1df70142 4168 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4169
a02a5408 4170 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4171 struct regnode_charclass_class);
4172 StructCopy(data.start_class,
830247a4 4173 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4174 struct regnode_charclass_class);
830247a4 4175 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4176 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4177 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4178 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4179 PerlIO_printf(Perl_debug_log,
a0288114 4180 "synthetic stclass \"%s\".\n",
3f7c398e 4181 SvPVX_const(sv));});
653099ff 4182 }
c277df42
IZ
4183
4184 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4185 if (longest_fixed_length > longest_float_length) {
1de06328 4186 r->check_end_shift = r->anchored_end_shift;
c277df42 4187 r->check_substr = r->anchored_substr;
33b8afdf 4188 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4189 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4190 if (r->reganch & ROPT_ANCH_SINGLE)
4191 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4192 }
4193 else {
1de06328 4194 r->check_end_shift = r->float_end_shift;
c277df42 4195 r->check_substr = r->float_substr;
33b8afdf 4196 r->check_utf8 = r->float_utf8;
1de06328
YO
4197 r->check_offset_min = r->float_min_offset;
4198 r->check_offset_max = r->float_max_offset;
a0d0e21e 4199 }
30382c73
IZ
4200 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4201 This should be changed ASAP! */
33b8afdf 4202 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4203 r->reganch |= RE_USE_INTUIT;
33b8afdf 4204 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4205 r->reganch |= RE_INTUIT_TAIL;
4206 }
1de06328
YO
4207 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4208 if ( (STRLEN)minlen < longest_float_length )
4209 minlen= longest_float_length;
4210 if ( (STRLEN)minlen < longest_fixed_length )
4211 minlen= longest_fixed_length;
4212 */
a0ed51b3
LW
4213 }
4214 else {
c277df42
IZ
4215 /* Several toplevels. Best we can is to set minlen. */
4216 I32 fake;
653099ff 4217 struct regnode_charclass_class ch_class;
cb434fcc 4218 I32 last_close = 0;
c277df42 4219
a3621e74 4220 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4221
c277df42 4222 scan = r->program + 1;
830247a4 4223 cl_init(pRExC_state, &ch_class);
653099ff 4224 data.start_class = &ch_class;
cb434fcc 4225 data.last_closep = &last_close;
07be1b83 4226
1de06328 4227 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
07be1b83
YO
4228 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4229
786e8c11 4230 CHECK_RESTUDY_GOTO;
07be1b83 4231
33b8afdf 4232 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4233 = r->float_substr = r->float_utf8 = NULL;
653099ff 4234 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4235 && !cl_is_anything(data.start_class))
4236 {
1df70142 4237 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4238
a02a5408 4239 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4240 struct regnode_charclass_class);
4241 StructCopy(data.start_class,
830247a4 4242 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4243 struct regnode_charclass_class);
830247a4 4244 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4245 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4246 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4247 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4248 PerlIO_printf(Perl_debug_log,
a0288114 4249 "synthetic stclass \"%s\".\n",
3f7c398e 4250 SvPVX_const(sv));});
653099ff 4251 }
a0d0e21e
LW
4252 }
4253
1de06328
YO
4254 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4255 the "real" pattern. */
4256 if (r->minlen < minlen)
4257 r->minlen = minlen;
4258
b81d288d 4259 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4260 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4261 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4262 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4263 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4264 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4265 if (RExC_seen & REG_SEEN_CANY)
4266 r->reganch |= ROPT_CANY_SEEN;
81714fb9
YO
4267 if (RExC_paren_names)
4268 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4269 else
4270 r->paren_names = NULL;
4271
a02a5408
JC
4272 Newxz(r->startp, RExC_npar, I32);
4273 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4274
f2278c82 4275 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4276 DEBUG_DUMP_r({
4277 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4278 regdump(r);
4279 });
8e9a8a48
YO
4280 DEBUG_OFFSETS_r(if (r->offsets) {
4281 const U32 len = r->offsets[0];
4282 U32 i;
4283 GET_RE_DEBUG_FLAGS_DECL;
4284 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4285 for (i = 1; i <= len; i++) {
4286 if (r->offsets[i*2-1] || r->offsets[i*2])
4287 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4288 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4289 }
4290 PerlIO_printf(Perl_debug_log, "\n");
4291 });
a0d0e21e 4292 return(r);
f9f4320a 4293 END_BLOCK
a687059c
LW
4294}
4295
f9f4320a
YO
4296#undef CORE_ONLY_BLOCK
4297#undef END_BLOCK
4298#undef RE_ENGINE_PTR
3dab1dad 4299
9af228c6 4300#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4301SV*
4302Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4303{
4304 I32 parno = 0; /* no match */
4305 if (PL_curpm) {
4306 const REGEXP * const rx = PM_GETRE(PL_curpm);
4307 if (rx && rx->paren_names) {
4308 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4309 if (he_str) {
4310 IV i;
4311 SV* sv_dat=HeVAL(he_str);
4312 I32 *nums=(I32*)SvPVX(sv_dat);
4313 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4314 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4315 rx->endp[nums[i]] != -1)
4316 {
4317 parno = nums[i];
4318 break;
4319 }
4320 }
4321 }
4322 }
4323 }
4324 if ( !parno ) {
4325 return 0;
4326 } else {
4327 GV *gv_paren;
4328 SV *sv= sv_newmortal();
4329 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4330 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4331 return GvSVn(gv_paren);
4332 }
4333}
9af228c6 4334#endif
0a4db386 4335
894be9b7 4336/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4337 * If flags is REG_RSN_RETURN_NULL returns null.
4338 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4339 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4340 * to the parsed name as looked up in the RExC_paren_names hash.
4341 * If there is an error throws a vFAIL().. type exception.
894be9b7 4342 */
0a4db386
YO
4343
4344#define REG_RSN_RETURN_NULL 0
4345#define REG_RSN_RETURN_NAME 1
4346#define REG_RSN_RETURN_DATA 2
4347
894be9b7
YO
4348STATIC SV*
4349S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4350 char *name_start = RExC_parse;
0a4db386 4351 if ( UTF ) {
894be9b7 4352 STRLEN numlen;
0a4db386
YO
4353 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4354 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4355 {
4356 RExC_parse += numlen;
4357 }
4358 } else {
4359 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4360 RExC_parse++;
4361 }
0a4db386
YO
4362 if ( flags ) {
4363 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4364 (int)(RExC_parse - name_start)));
894be9b7 4365 if (UTF)
0a4db386
YO
4366 SvUTF8_on(sv_name);
4367 if ( flags == REG_RSN_RETURN_NAME)
4368 return sv_name;
4369 else if (flags==REG_RSN_RETURN_DATA) {
4370 HE *he_str = NULL;
4371 SV *sv_dat = NULL;
4372 if ( ! sv_name ) /* should not happen*/
4373 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4374 if (RExC_paren_names)
4375 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4376 if ( he_str )
4377 sv_dat = HeVAL(he_str);
4378 if ( ! sv_dat )
4379 vFAIL("Reference to nonexistent named group");
4380 return sv_dat;
4381 }
4382 else {
4383 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4384 }
4385 /* NOT REACHED */
894be9b7 4386 }
0a4db386 4387 return NULL;
894be9b7
YO
4388}
4389
3dab1dad
YO
4390#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4391 int rem=(int)(RExC_end - RExC_parse); \
4392 int cut; \
4393 int num; \
4394 int iscut=0; \
4395 if (rem>10) { \
4396 rem=10; \
4397 iscut=1; \
4398 } \
4399 cut=10-rem; \
4400 if (RExC_lastparse!=RExC_parse) \
4401 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4402 rem, RExC_parse, \
4403 cut + 4, \
4404 iscut ? "..." : "<" \
4405 ); \
4406 else \
4407 PerlIO_printf(Perl_debug_log,"%16s",""); \
4408 \
4409 if (SIZE_ONLY) \
4410 num=RExC_size; \
4411 else \
4412 num=REG_NODE_NUM(RExC_emit); \
4413 if (RExC_lastnum!=num) \
0a4db386 4414 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4415 else \
0a4db386 4416 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4417 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4418 (int)((depth*2)), "", \
3dab1dad
YO
4419 (funcname) \
4420 ); \
4421 RExC_lastnum=num; \
4422 RExC_lastparse=RExC_parse; \
4423})
4424
07be1b83
YO
4425
4426
3dab1dad
YO
4427#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4428 DEBUG_PARSE_MSG((funcname)); \
4429 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4430})
6bda09f9
YO
4431#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4432 DEBUG_PARSE_MSG((funcname)); \
4433 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4434})
a687059c
LW
4435/*
4436 - reg - regular expression, i.e. main body or parenthesized thing
4437 *
4438 * Caller must absorb opening parenthesis.
4439 *
4440 * Combining parenthesis handling with the base level of regular expression
4441 * is a trifle forced, but the need to tie the tails of the branches to what
4442 * follows makes it hard to avoid.
4443 */
07be1b83
YO
4444#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4445#ifdef DEBUGGING
4446#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4447#else
4448#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4449#endif
3dab1dad 4450
76e3520e 4451STATIC regnode *
3dab1dad 4452S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4453 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4454{
27da23d5 4455 dVAR;
c277df42
IZ
4456 register regnode *ret; /* Will be the head of the group. */
4457 register regnode *br;
4458 register regnode *lastbr;
cbbf8932 4459 register regnode *ender = NULL;
a0d0e21e 4460 register I32 parno = 0;
cbbf8932
AL
4461 I32 flags;
4462 const I32 oregflags = RExC_flags;
6136c704
AL
4463 bool have_branch = 0;
4464 bool is_open = 0;
9d1d55b5
JP
4465
4466 /* for (?g), (?gc), and (?o) warnings; warning
4467 about (?c) will warn about (?g) -- japhy */
4468
6136c704
AL
4469#define WASTED_O 0x01
4470#define WASTED_G 0x02
4471#define WASTED_C 0x04
4472#define WASTED_GC (0x02|0x04)
cbbf8932 4473 I32 wastedflags = 0x00;
9d1d55b5 4474
fac92740 4475 char * parse_start = RExC_parse; /* MJD */
a28509cc 4476 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4477
3dab1dad
YO
4478 GET_RE_DEBUG_FLAGS_DECL;
4479 DEBUG_PARSE("reg ");
4480
4481
821b33a5 4482 *flagp = 0; /* Tentatively. */
a0d0e21e 4483
9d1d55b5 4484
a0d0e21e
LW
4485 /* Make an OPEN node, if parenthesized. */
4486 if (paren) {
fac92740 4487 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4488 U32 posflags = 0, negflags = 0;
4489 U32 *flagsp = &posflags;
6136c704 4490 bool is_logical = 0;
a28509cc 4491 const char * const seqstart = RExC_parse;
ca9dfc88 4492
830247a4
IZ
4493 RExC_parse++;
4494 paren = *RExC_parse++;
c277df42 4495 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4496 switch (paren) {
894be9b7 4497
fac92740 4498 case '<': /* (?<...) */
b81d288d 4499 if (*RExC_parse == '!')
c277df42 4500 paren = ',';
0a4db386
YO
4501 else if (*RExC_parse != '=')
4502 { /* (?<...>) */
81714fb9 4503 char *name_start;
894be9b7 4504 SV *svname;
81714fb9
YO
4505 paren= '>';
4506 case '\'': /* (?'...') */
4507 name_start= RExC_parse;
0a4db386
YO
4508 svname = reg_scan_name(pRExC_state,
4509 SIZE_ONLY ? /* reverse test from the others */
4510 REG_RSN_RETURN_NAME :
4511 REG_RSN_RETURN_NULL);
81714fb9
YO
4512 if (RExC_parse == name_start)
4513 goto unknown;
4514 if (*RExC_parse != paren)
4515 vFAIL2("Sequence (?%c... not terminated",
4516 paren=='>' ? '<' : paren);
4517 if (SIZE_ONLY) {
e62cc96a
YO
4518 HE *he_str;
4519 SV *sv_dat = NULL;
894be9b7
YO
4520 if (!svname) /* shouldnt happen */
4521 Perl_croak(aTHX_
4522 "panic: reg_scan_name returned NULL");
81714fb9
YO
4523 if (!RExC_paren_names) {
4524 RExC_paren_names= newHV();
4525 sv_2mortal((SV*)RExC_paren_names);
4526 }
4527 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4528 if ( he_str )
81714fb9 4529 sv_dat = HeVAL(he_str);
e62cc96a 4530 if ( ! sv_dat ) {
81714fb9 4531 /* croak baby croak */
e62cc96a
YO
4532 Perl_croak(aTHX_
4533 "panic: paren_name hash element allocation failed");
4534 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4535 IV count=SvIV(sv_dat);
4536 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4537 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4538 pv[count]=RExC_npar;
4539 SvIVX(sv_dat)++;
4540 } else {
4541 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4542 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4543 SvIOK_on(sv_dat);
4544 SvIVX(sv_dat)= 1;
e62cc96a
YO
4545 }
4546
81714fb9
YO
4547 /*sv_dump(sv_dat);*/
4548 }
4549 nextchar(pRExC_state);
4550 paren = 1;
4551 goto capturing_parens;
4552 }
4553 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4554 RExC_parse++;
fac92740
MJD
4555 case '=': /* (?=...) */
4556 case '!': /* (?!...) */
830247a4 4557 RExC_seen_zerolen++;
fac92740
MJD
4558 case ':': /* (?:...) */
4559 case '>': /* (?>...) */
a0d0e21e 4560 break;
fac92740
MJD
4561 case '$': /* (?$...) */
4562 case '@': /* (?@...) */
8615cb43 4563 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4564 break;
fac92740 4565 case '#': /* (?#...) */
830247a4
IZ
4566 while (*RExC_parse && *RExC_parse != ')')
4567 RExC_parse++;
4568 if (*RExC_parse != ')')
c277df42 4569 FAIL("Sequence (?#... not terminated");
830247a4 4570 nextchar(pRExC_state);
a0d0e21e
LW
4571 *flagp = TRYAGAIN;
4572 return NULL;
894be9b7
YO
4573 case '0' : /* (?0) */
4574 case 'R' : /* (?R) */
4575 if (*RExC_parse != ')')
6bda09f9
YO
4576 FAIL("Sequence (?R) not terminated");
4577 reg_node(pRExC_state, SRECURSE);
894be9b7
YO
4578 break; /* (?PARNO) */
4579 { /* named and numeric backreferences */
4580 I32 num;
4581 char * parse_start;
4582 case '&': /* (?&NAME) */
4583 parse_start = RExC_parse - 1;
4584 {
0a4db386
YO
4585 SV *sv_dat = reg_scan_name(pRExC_state,
4586 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4587 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4588 }
4589 goto gen_recurse_regop;
4590 /* NOT REACHED */
6bda09f9
YO
4591 case '1': case '2': case '3': case '4': /* (?1) */
4592 case '5': case '6': case '7': case '8': case '9':
4593 RExC_parse--;
894be9b7
YO
4594 num = atoi(RExC_parse);
4595 parse_start = RExC_parse - 1; /* MJD */
6bda09f9
YO
4596 while (isDIGIT(*RExC_parse))
4597 RExC_parse++;
4598 if (*RExC_parse!=')')
4599 vFAIL("Expecting close bracket");
894be9b7
YO
4600
4601 gen_recurse_regop:
6bda09f9
YO
4602 ret = reganode(pRExC_state, RECURSE, num);
4603 if (!SIZE_ONLY) {
4604 if (num > (I32)RExC_rx->nparens) {
4605 RExC_parse++;
4606 vFAIL("Reference to nonexistent group");
4607 }
4608 ARG2L_SET( ret, 0);
4609 RExC_emit++;
226de585 4610 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4611 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4612 } else {
6bda09f9 4613 RExC_size++;
6bda09f9 4614 }
0a4db386 4615 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4616 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4617 Set_Node_Offset(ret, parse_start); /* MJD */
4618
6bda09f9
YO
4619 nextchar(pRExC_state);
4620 return ret;
894be9b7
YO
4621 } /* named and numeric backreferences */
4622 /* NOT REACHED */
4623
fac92740 4624 case 'p': /* (?p...) */
9014280d 4625 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4626 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4627 /* FALL THROUGH*/
fac92740 4628 case '?': /* (??...) */
6136c704 4629 is_logical = 1;
438a3801
YST
4630 if (*RExC_parse != '{')
4631 goto unknown;
830247a4 4632 paren = *RExC_parse++;
0f5d15d6 4633 /* FALL THROUGH */
fac92740 4634 case '{': /* (?{...}) */
c277df42 4635 {
c277df42
IZ
4636 I32 count = 1, n = 0;
4637 char c;
830247a4 4638 char *s = RExC_parse;
c277df42 4639
830247a4
IZ
4640 RExC_seen_zerolen++;
4641 RExC_seen |= REG_SEEN_EVAL;
4642 while (count && (c = *RExC_parse)) {
6136c704
AL
4643 if (c == '\\') {
4644 if (RExC_parse[1])
4645 RExC_parse++;
4646 }
b81d288d 4647 else if (c == '{')
c277df42 4648 count++;
b81d288d 4649 else if (c == '}')
c277df42 4650 count--;
830247a4 4651 RExC_parse++;
c277df42 4652 }
6136c704 4653 if (*RExC_parse != ')') {
b81d288d 4654 RExC_parse = s;
b45f050a
JF
4655 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4656 }
c277df42 4657 if (!SIZE_ONLY) {
f3548bdc 4658 PAD *pad;
6136c704
AL
4659 OP_4tree *sop, *rop;
4660 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4661
569233ed
SB
4662 ENTER;
4663 Perl_save_re_context(aTHX);
f3548bdc 4664 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4665 sop->op_private |= OPpREFCOUNTED;
4666 /* re_dup will OpREFCNT_inc */
4667 OpREFCNT_set(sop, 1);
569233ed 4668 LEAVE;
c277df42 4669
830247a4
IZ
4670 n = add_data(pRExC_state, 3, "nop");
4671 RExC_rx->data->data[n] = (void*)rop;
4672 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4673 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4674 SvREFCNT_dec(sv);
a0ed51b3 4675 }
e24b16f9 4676 else { /* First pass */
830247a4 4677 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4678 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4679 /* No compiled RE interpolated, has runtime
4680 components ===> unsafe. */
4681 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4682 if (PL_tainting && PL_tainted)
cc6b7395 4683 FAIL("Eval-group in insecure regular expression");
54df2634 4684#if PERL_VERSION > 8
923e4eb5 4685 if (IN_PERL_COMPILETIME)
b5c19bd7 4686 PL_cv_has_eval = 1;
54df2634 4687#endif
c277df42 4688 }
b5c19bd7 4689
830247a4 4690 nextchar(pRExC_state);
6136c704 4691 if (is_logical) {
830247a4 4692 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4693 if (!SIZE_ONLY)
4694 ret->flags = 2;
3dab1dad 4695 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4696 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4697 return ret;
4698 }
ccb2c380
MP
4699 ret = reganode(pRExC_state, EVAL, n);
4700 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4701 Set_Node_Offset(ret, parse_start);
4702 return ret;
c277df42 4703 }
fac92740 4704 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4705 {
0a4db386 4706 int is_define= 0;
fac92740 4707 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4708 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4709 || RExC_parse[1] == '<'
830247a4 4710 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4711 I32 flag;
4712
830247a4 4713 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4714 if (!SIZE_ONLY)
4715 ret->flags = 1;
3dab1dad 4716 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4717 goto insert_if;
b81d288d 4718 }
a0ed51b3 4719 }
0a4db386
YO
4720 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
4721 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
4722 {
4723 char ch = RExC_parse[0] == '<' ? '>' : '\'';
4724 char *name_start= RExC_parse++;
4725 I32 num = 0;
4726 SV *sv_dat=reg_scan_name(pRExC_state,
4727 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4728 if (RExC_parse == name_start || *RExC_parse != ch)
4729 vFAIL2("Sequence (?(%c... not terminated",
4730 (ch == '>' ? '<' : ch));
4731 RExC_parse++;
4732 if (!SIZE_ONLY) {
4733 num = add_data( pRExC_state, 1, "S" );
4734 RExC_rx->data->data[num]=(void*)sv_dat;
4735 SvREFCNT_inc(sv_dat);
4736 }
4737 ret = reganode(pRExC_state,NGROUPP,num);
4738 goto insert_if_check_paren;
4739 }
4740 else if (RExC_parse[0] == 'D' &&
4741 RExC_parse[1] == 'E' &&
4742 RExC_parse[2] == 'F' &&
4743 RExC_parse[3] == 'I' &&
4744 RExC_parse[4] == 'N' &&
4745 RExC_parse[5] == 'E')
4746 {
4747 ret = reganode(pRExC_state,DEFINEP,0);
4748 RExC_parse +=6 ;
4749 is_define = 1;
4750 goto insert_if_check_paren;
4751 }
4752 else if (RExC_parse[0] == 'R') {
4753 RExC_parse++;
4754 parno = 0;
4755 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
4756 parno = atoi(RExC_parse++);
4757 while (isDIGIT(*RExC_parse))
4758 RExC_parse++;
4759 } else if (RExC_parse[0] == '&') {
4760 SV *sv_dat;
4761 RExC_parse++;
4762 sv_dat = reg_scan_name(pRExC_state,
4763 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4764 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4765 }
4766 ret = reganode(pRExC_state,RECURSEP,parno);
4767 goto insert_if_check_paren;
4768 }
830247a4 4769 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4770 /* (?(1)...) */
6136c704 4771 char c;
830247a4 4772 parno = atoi(RExC_parse++);
c277df42 4773
830247a4
IZ
4774 while (isDIGIT(*RExC_parse))
4775 RExC_parse++;
fac92740 4776 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4777
0a4db386 4778 insert_if_check_paren:
830247a4 4779 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4780 vFAIL("Switch condition not recognized");
c277df42 4781 insert_if:
3dab1dad
YO
4782 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4783 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4784 if (br == NULL)
830247a4 4785 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4786 else
3dab1dad 4787 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4788 c = *nextchar(pRExC_state);
d1b80229
IZ
4789 if (flags&HASWIDTH)
4790 *flagp |= HASWIDTH;
c277df42 4791 if (c == '|') {
0a4db386
YO
4792 if (is_define)
4793 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 4794 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4795 regbranch(pRExC_state, &flags, 1,depth+1);
4796 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4797 if (flags&HASWIDTH)
4798 *flagp |= HASWIDTH;
830247a4 4799 c = *nextchar(pRExC_state);
a0ed51b3
LW
4800 }
4801 else
c277df42
IZ
4802 lastbr = NULL;
4803 if (c != ')')
8615cb43 4804 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4805 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4806 REGTAIL(pRExC_state, br, ender);
c277df42 4807 if (lastbr) {
3dab1dad
YO
4808 REGTAIL(pRExC_state, lastbr, ender);
4809 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4810 }
4811 else
3dab1dad 4812 REGTAIL(pRExC_state, ret, ender);
c277df42 4813 return ret;
a0ed51b3
LW
4814 }
4815 else {
830247a4 4816 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4817 }
4818 }
1b1626e4 4819 case 0:
830247a4 4820 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4821 vFAIL("Sequence (? incomplete");
1b1626e4 4822 break;
a0d0e21e 4823 default:
830247a4 4824 --RExC_parse;
fac92740 4825 parse_flags: /* (?i) */
830247a4 4826 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4827 /* (?g), (?gc) and (?o) are useless here
4828 and must be globally applied -- japhy */
4829
4830 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4831 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4832 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4833 if (! (wastedflags & wflagbit) ) {
4834 wastedflags |= wflagbit;
4835 vWARN5(
4836 RExC_parse + 1,
4837 "Useless (%s%c) - %suse /%c modifier",
4838 flagsp == &negflags ? "?-" : "?",
4839 *RExC_parse,
4840 flagsp == &negflags ? "don't " : "",
4841 *RExC_parse
4842 );
4843 }
4844 }
4845 }
4846 else if (*RExC_parse == 'c') {
4847 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4848 if (! (wastedflags & WASTED_C) ) {
4849 wastedflags |= WASTED_GC;
9d1d55b5
JP
4850 vWARN3(
4851 RExC_parse + 1,
4852 "Useless (%sc) - %suse /gc modifier",
4853 flagsp == &negflags ? "?-" : "?",
4854 flagsp == &negflags ? "don't " : ""
4855 );
4856 }
4857 }
4858 }
4859 else { pmflag(flagsp, *RExC_parse); }
4860
830247a4 4861 ++RExC_parse;
ca9dfc88 4862 }
830247a4 4863 if (*RExC_parse == '-') {
ca9dfc88 4864 flagsp = &negflags;
9d1d55b5 4865 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4866 ++RExC_parse;
ca9dfc88 4867 goto parse_flags;
48c036b1 4868 }
e2509266
JH
4869 RExC_flags |= posflags;
4870 RExC_flags &= ~negflags;
830247a4
IZ
4871 if (*RExC_parse == ':') {
4872 RExC_parse++;
ca9dfc88
IZ
4873 paren = ':';
4874 break;
4875 }
c277df42 4876 unknown:
830247a4
IZ
4877 if (*RExC_parse != ')') {
4878 RExC_parse++;
4879 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4880 }
830247a4 4881 nextchar(pRExC_state);
a0d0e21e
LW
4882 *flagp = TRYAGAIN;
4883 return NULL;
4884 }
4885 }
fac92740 4886 else { /* (...) */
81714fb9 4887 capturing_parens:
830247a4
IZ
4888 parno = RExC_npar;
4889 RExC_npar++;
4890 ret = reganode(pRExC_state, OPEN, parno);
6bda09f9 4891 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
226de585 4892 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4893 "Setting paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret)));
6bda09f9 4894 RExC_parens[parno-1]= ret;
226de585 4895
6bda09f9 4896 }
fac92740
MJD
4897 Set_Node_Length(ret, 1); /* MJD */
4898 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4899 is_open = 1;
a0d0e21e 4900 }
a0ed51b3 4901 }
fac92740 4902 else /* ! paren */
a0d0e21e
LW
4903 ret = NULL;
4904
4905 /* Pick up the branches, linking them together. */
fac92740 4906 parse_start = RExC_parse; /* MJD */
3dab1dad 4907 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4908 /* branch_len = (paren != 0); */
2af232bd 4909
a0d0e21e
LW
4910 if (br == NULL)
4911 return(NULL);
830247a4
IZ
4912 if (*RExC_parse == '|') {
4913 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 4914 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 4915 }
fac92740 4916 else { /* MJD */
6bda09f9 4917 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
4918 Set_Node_Length(br, paren != 0);
4919 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4920 }
c277df42
IZ
4921 have_branch = 1;
4922 if (SIZE_ONLY)
830247a4 4923 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4924 }
4925 else if (paren == ':') {
c277df42
IZ
4926 *flagp |= flags&SIMPLE;
4927 }
6136c704 4928 if (is_open) { /* Starts with OPEN. */
3dab1dad 4929 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4930 }
4931 else if (paren != '?') /* Not Conditional */
a0d0e21e 4932 ret = br;
32a0ca98 4933 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4934 lastbr = br;
830247a4
IZ
4935 while (*RExC_parse == '|') {
4936 if (!SIZE_ONLY && RExC_extralen) {
4937 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4938 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4939 }
4940 if (SIZE_ONLY)
830247a4
IZ
4941 RExC_extralen += 2; /* Account for LONGJMP. */
4942 nextchar(pRExC_state);
3dab1dad 4943 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4944
a687059c 4945 if (br == NULL)
a0d0e21e 4946 return(NULL);
3dab1dad 4947 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4948 lastbr = br;
821b33a5
IZ
4949 if (flags&HASWIDTH)
4950 *flagp |= HASWIDTH;
a687059c 4951 *flagp |= flags&SPSTART;
a0d0e21e
LW
4952 }
4953
c277df42
IZ
4954 if (have_branch || paren != ':') {
4955 /* Make a closing node, and hook it on the end. */
4956 switch (paren) {
4957 case ':':
830247a4 4958 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4959 break;
4960 case 1:
830247a4 4961 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4962 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4963 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4964 break;
4965 case '<':
c277df42
IZ
4966 case ',':
4967 case '=':
4968 case '!':
c277df42 4969 *flagp &= ~HASWIDTH;
821b33a5
IZ
4970 /* FALL THROUGH */
4971 case '>':
830247a4 4972 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4973 break;
4974 case 0:
830247a4 4975 ender = reg_node(pRExC_state, END);
c277df42
IZ
4976 break;
4977 }
eaf3ca90 4978 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 4979
9674d46a 4980 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
4981 if (depth==1)
4982 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4983
c277df42 4984 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4985 for (br = ret; br; br = regnext(br)) {
4986 const U8 op = PL_regkind[OP(br)];
4987 if (op == BRANCH) {
07be1b83 4988 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4989 }
4990 else if (op == BRANCHJ) {
07be1b83 4991 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4992 }
c277df42
IZ
4993 }
4994 }
a0d0e21e 4995 }
c277df42
IZ
4996
4997 {
e1ec3a88
AL
4998 const char *p;
4999 static const char parens[] = "=!<,>";
c277df42
IZ
5000
5001 if (paren && (p = strchr(parens, paren))) {
eb160463 5002 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5003 int flag = (p - parens) > 1;
5004
5005 if (paren == '>')
5006 node = SUSPEND, flag = 0;
6bda09f9 5007 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5008 Set_Node_Cur_Length(ret);
5009 Set_Node_Offset(ret, parse_start + 1);
c277df42 5010 ret->flags = flag;
07be1b83 5011 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5012 }
a0d0e21e
LW
5013 }
5014
5015 /* Check for proper termination. */
ce3e6498 5016 if (paren) {
e2509266 5017 RExC_flags = oregflags;
830247a4
IZ
5018 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5019 RExC_parse = oregcomp_parse;
380a0633 5020 vFAIL("Unmatched (");
ce3e6498 5021 }
a0ed51b3 5022 }
830247a4
IZ
5023 else if (!paren && RExC_parse < RExC_end) {
5024 if (*RExC_parse == ')') {
5025 RExC_parse++;
380a0633 5026 vFAIL("Unmatched )");
a0ed51b3
LW
5027 }
5028 else
b45f050a 5029 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5030 /* NOTREACHED */
5031 }
a687059c 5032
a0d0e21e 5033 return(ret);
a687059c
LW
5034}
5035
5036/*
5037 - regbranch - one alternative of an | operator
5038 *
5039 * Implements the concatenation operator.
5040 */
76e3520e 5041STATIC regnode *
3dab1dad 5042S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5043{
97aff369 5044 dVAR;
c277df42
IZ
5045 register regnode *ret;
5046 register regnode *chain = NULL;
5047 register regnode *latest;
5048 I32 flags = 0, c = 0;
3dab1dad
YO
5049 GET_RE_DEBUG_FLAGS_DECL;
5050 DEBUG_PARSE("brnc");
b81d288d 5051 if (first)
c277df42
IZ
5052 ret = NULL;
5053 else {
b81d288d 5054 if (!SIZE_ONLY && RExC_extralen)
830247a4 5055 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5056 else {
830247a4 5057 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5058 Set_Node_Length(ret, 1);
5059 }
c277df42
IZ
5060 }
5061
b81d288d 5062 if (!first && SIZE_ONLY)
830247a4 5063 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5064
c277df42 5065 *flagp = WORST; /* Tentatively. */
a0d0e21e 5066
830247a4
IZ
5067 RExC_parse--;
5068 nextchar(pRExC_state);
5069 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5070 flags &= ~TRYAGAIN;
3dab1dad 5071 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5072 if (latest == NULL) {
5073 if (flags & TRYAGAIN)
5074 continue;
5075 return(NULL);
a0ed51b3
LW
5076 }
5077 else if (ret == NULL)
c277df42 5078 ret = latest;
a0d0e21e 5079 *flagp |= flags&HASWIDTH;
c277df42 5080 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5081 *flagp |= flags&SPSTART;
5082 else {
830247a4 5083 RExC_naughty++;
3dab1dad 5084 REGTAIL(pRExC_state, chain, latest);
a687059c 5085 }
a0d0e21e 5086 chain = latest;
c277df42
IZ
5087 c++;
5088 }
5089 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5090 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5091 if (ret == NULL)
5092 ret = chain;
5093 }
5094 if (c == 1) {
5095 *flagp |= flags&SIMPLE;
a0d0e21e 5096 }
a687059c 5097
d4c19fe8 5098 return ret;
a687059c
LW
5099}
5100
5101/*
5102 - regpiece - something followed by possible [*+?]
5103 *
5104 * Note that the branching code sequences used for ? and the general cases
5105 * of * and + are somewhat optimized: they use the same NOTHING node as
5106 * both the endmarker for their branch list and the body of the last branch.
5107 * It might seem that this node could be dispensed with entirely, but the
5108 * endmarker role is not redundant.
5109 */
76e3520e 5110STATIC regnode *
3dab1dad 5111S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5112{
97aff369 5113 dVAR;
c277df42 5114 register regnode *ret;
a0d0e21e
LW
5115 register char op;
5116 register char *next;
5117 I32 flags;
1df70142 5118 const char * const origparse = RExC_parse;
a0d0e21e 5119 I32 min;
c277df42 5120 I32 max = REG_INFTY;
fac92740 5121 char *parse_start;
10edeb5d 5122 const char *maxpos = NULL;
3dab1dad
YO
5123 GET_RE_DEBUG_FLAGS_DECL;
5124 DEBUG_PARSE("piec");
a0d0e21e 5125
3dab1dad 5126 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5127 if (ret == NULL) {
5128 if (flags & TRYAGAIN)
5129 *flagp |= TRYAGAIN;
5130 return(NULL);
5131 }
5132
830247a4 5133 op = *RExC_parse;
a0d0e21e 5134
830247a4 5135 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5136 maxpos = NULL;
fac92740 5137 parse_start = RExC_parse; /* MJD */
830247a4 5138 next = RExC_parse + 1;
a0d0e21e
LW
5139 while (isDIGIT(*next) || *next == ',') {
5140 if (*next == ',') {
5141 if (maxpos)
5142 break;
5143 else
5144 maxpos = next;
a687059c 5145 }
a0d0e21e
LW
5146 next++;
5147 }
5148 if (*next == '}') { /* got one */
5149 if (!maxpos)
5150 maxpos = next;
830247a4
IZ
5151 RExC_parse++;
5152 min = atoi(RExC_parse);
a0d0e21e
LW
5153 if (*maxpos == ',')
5154 maxpos++;
5155 else
830247a4 5156 maxpos = RExC_parse;
a0d0e21e
LW
5157 max = atoi(maxpos);
5158 if (!max && *maxpos != '0')
c277df42
IZ
5159 max = REG_INFTY; /* meaning "infinity" */
5160 else if (max >= REG_INFTY)
8615cb43 5161 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5162 RExC_parse = next;
5163 nextchar(pRExC_state);
a0d0e21e
LW
5164
5165 do_curly:
5166 if ((flags&SIMPLE)) {
830247a4 5167 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5168 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5169 Set_Node_Offset(ret, parse_start+1); /* MJD */
5170 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5171 }
5172 else {
3dab1dad 5173 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5174
5175 w->flags = 0;
3dab1dad 5176 REGTAIL(pRExC_state, ret, w);
830247a4 5177 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5178 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5179 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5180 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5181 }
6bda09f9 5182 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5183 /* MJD hk */
5184 Set_Node_Offset(ret, parse_start+1);
2af232bd 5185 Set_Node_Length(ret,
fac92740 5186 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5187
830247a4 5188 if (!SIZE_ONLY && RExC_extralen)
c277df42 5189 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5190 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5191 if (SIZE_ONLY)
830247a4
IZ
5192 RExC_whilem_seen++, RExC_extralen += 3;
5193 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5194 }
c277df42 5195 ret->flags = 0;
a0d0e21e
LW
5196
5197 if (min > 0)
821b33a5
IZ
5198 *flagp = WORST;
5199 if (max > 0)
5200 *flagp |= HASWIDTH;
a0d0e21e 5201 if (max && max < min)
8615cb43 5202 vFAIL("Can't do {n,m} with n > m");
c277df42 5203 if (!SIZE_ONLY) {
eb160463
GS
5204 ARG1_SET(ret, (U16)min);
5205 ARG2_SET(ret, (U16)max);
a687059c 5206 }
a687059c 5207
a0d0e21e 5208 goto nest_check;
a687059c 5209 }
a0d0e21e 5210 }
a687059c 5211
a0d0e21e
LW
5212 if (!ISMULT1(op)) {
5213 *flagp = flags;
a687059c 5214 return(ret);
a0d0e21e 5215 }
bb20fd44 5216
c277df42 5217#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5218
5219 /* if this is reinstated, don't forget to put this back into perldiag:
5220
5221 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5222
5223 (F) The part of the regexp subject to either the * or + quantifier
5224 could match an empty string. The {#} shows in the regular
5225 expression about where the problem was discovered.
5226
5227 */
5228
bb20fd44 5229 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5230 vFAIL("Regexp *+ operand could be empty");
b81d288d 5231#endif
bb20fd44 5232
fac92740 5233 parse_start = RExC_parse;
830247a4 5234 nextchar(pRExC_state);
a0d0e21e 5235
821b33a5 5236 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5237
5238 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5239 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5240 ret->flags = 0;
830247a4 5241 RExC_naughty += 4;
a0d0e21e
LW
5242 }
5243 else if (op == '*') {
5244 min = 0;
5245 goto do_curly;
a0ed51b3
LW
5246 }
5247 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5248 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5249 ret->flags = 0;
830247a4 5250 RExC_naughty += 3;
a0d0e21e
LW
5251 }
5252 else if (op == '+') {
5253 min = 1;
5254 goto do_curly;
a0ed51b3
LW
5255 }
5256 else if (op == '?') {
a0d0e21e
LW
5257 min = 0; max = 1;
5258 goto do_curly;
5259 }
5260 nest_check:
041457d9 5261 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5262 vWARN3(RExC_parse,
b45f050a 5263 "%.*s matches null string many times",
afd78fd5 5264 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5265 origparse);
a0d0e21e
LW
5266 }
5267
b9b4dddf 5268 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5269 nextchar(pRExC_state);
6bda09f9 5270 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5271 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5272 }
b9b4dddf
YO
5273#ifndef REG_ALLOW_MINMOD_SUSPEND
5274 else
5275#endif
5276 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5277 regnode *ender;
5278 nextchar(pRExC_state);
5279 ender = reg_node(pRExC_state, SUCCEED);
5280 REGTAIL(pRExC_state, ret, ender);
5281 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5282 ret->flags = 0;
5283 ender = reg_node(pRExC_state, TAIL);
5284 REGTAIL(pRExC_state, ret, ender);
5285 /*ret= ender;*/
5286 }
5287
5288 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5289 RExC_parse++;
b45f050a
JF
5290 vFAIL("Nested quantifiers");
5291 }
a0d0e21e
LW
5292
5293 return(ret);
a687059c
LW
5294}
5295
fc8cd66c
YO
5296
5297/* reg_namedseq(pRExC_state,UVp)
5298
5299 This is expected to be called by a parser routine that has
5300 recognized'\N' and needs to handle the rest. RExC_parse is
5301 expected to point at the first char following the N at the time
5302 of the call.
5303
5304 If valuep is non-null then it is assumed that we are parsing inside
5305 of a charclass definition and the first codepoint in the resolved
5306 string is returned via *valuep and the routine will return NULL.
5307 In this mode if a multichar string is returned from the charnames
5308 handler a warning will be issued, and only the first char in the
5309 sequence will be examined. If the string returned is zero length
5310 then the value of *valuep is undefined and NON-NULL will
5311 be returned to indicate failure. (This will NOT be a valid pointer
5312 to a regnode.)
5313
5314 If value is null then it is assumed that we are parsing normal text
5315 and inserts a new EXACT node into the program containing the resolved
5316 string and returns a pointer to the new node. If the string is
5317 zerolength a NOTHING node is emitted.
5318
5319 On success RExC_parse is set to the char following the endbrace.
5320 Parsing failures will generate a fatal errorvia vFAIL(...)
5321
5322 NOTE: We cache all results from the charnames handler locally in
5323 the RExC_charnames hash (created on first use) to prevent a charnames
5324 handler from playing silly-buggers and returning a short string and
5325 then a long string for a given pattern. Since the regexp program
5326 size is calculated during an initial parse this would result
5327 in a buffer overrun so we cache to prevent the charname result from
5328 changing during the course of the parse.
5329
5330 */
5331STATIC regnode *
5332S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5333{
5334 char * name; /* start of the content of the name */
5335 char * endbrace; /* endbrace following the name */
5336 SV *sv_str = NULL;
5337 SV *sv_name = NULL;
5338 STRLEN len; /* this has various purposes throughout the code */
5339 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5340 regnode *ret = NULL;
5341
5342 if (*RExC_parse != '{') {
5343 vFAIL("Missing braces on \\N{}");
5344 }
5345 name = RExC_parse+1;
5346 endbrace = strchr(RExC_parse, '}');
5347 if ( ! endbrace ) {
5348 RExC_parse++;
5349 vFAIL("Missing right brace on \\N{}");
5350 }
5351 RExC_parse = endbrace + 1;
5352
5353
5354 /* RExC_parse points at the beginning brace,
5355 endbrace points at the last */
5356 if ( name[0]=='U' && name[1]=='+' ) {
5357 /* its a "unicode hex" notation {U+89AB} */
5358 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5359 | PERL_SCAN_DISALLOW_PREFIX
5360 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5361 UV cp;
196f1508 5362 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5363 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5364 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5365 cp = 0xFFFD;
5366 }
5367 if (cp > 0xff)
5368 RExC_utf8 = 1;
5369 if ( valuep ) {
5370 *valuep = cp;
5371 return NULL;
5372 }
5373 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5374 } else {
5375 /* fetch the charnames handler for this scope */
5376 HV * const table = GvHV(PL_hintgv);
5377 SV **cvp= table ?
5378 hv_fetchs(table, "charnames", FALSE) :
5379 NULL;
5380 SV *cv= cvp ? *cvp : NULL;
5381 HE *he_str;
5382 int count;
5383 /* create an SV with the name as argument */
5384 sv_name = newSVpvn(name, endbrace - name);
5385
5386 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5387 vFAIL2("Constant(\\N{%s}) unknown: "
5388 "(possibly a missing \"use charnames ...\")",
5389 SvPVX(sv_name));
5390 }
5391 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5392 vFAIL2("Constant(\\N{%s}): "
5393 "$^H{charnames} is not defined",SvPVX(sv_name));
5394 }
5395
5396
5397
5398 if (!RExC_charnames) {
5399 /* make sure our cache is allocated */
5400 RExC_charnames = newHV();
6bda09f9 5401 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5402 }
5403 /* see if we have looked this one up before */
5404 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5405 if ( he_str ) {
5406 sv_str = HeVAL(he_str);
5407 cached = 1;
5408 } else {
5409 dSP ;
5410
5411 ENTER ;
5412 SAVETMPS ;
5413 PUSHMARK(SP) ;
5414
5415 XPUSHs(sv_name);
5416
5417 PUTBACK ;
5418
5419 count= call_sv(cv, G_SCALAR);
5420
5421 if (count == 1) { /* XXXX is this right? dmq */
5422 sv_str = POPs;
5423 SvREFCNT_inc_simple_void(sv_str);
5424 }
5425
5426 SPAGAIN ;
5427 PUTBACK ;
5428 FREETMPS ;
5429 LEAVE ;
5430
5431 if ( !sv_str || !SvOK(sv_str) ) {
5432 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5433 "did not return a defined value",SvPVX(sv_name));
5434 }
5435 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5436 cached = 1;
5437 }
5438 }
5439 if (valuep) {
5440 char *p = SvPV(sv_str, len);
5441 if (len) {
5442 STRLEN numlen = 1;
5443 if ( SvUTF8(sv_str) ) {
196f1508 5444 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5445 if (*valuep > 0x7F)
5446 RExC_utf8 = 1;
5447 /* XXXX
5448 We have to turn on utf8 for high bit chars otherwise
5449 we get failures with
5450
5451 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5452 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5453
5454 This is different from what \x{} would do with the same
5455 codepoint, where the condition is > 0xFF.
5456 - dmq
5457 */
5458
5459
5460 } else {
5461 *valuep = (UV)*p;
5462 /* warn if we havent used the whole string? */
5463 }
5464 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5465 vWARN2(RExC_parse,
5466 "Ignoring excess chars from \\N{%s} in character class",
5467 SvPVX(sv_name)
5468 );
5469 }
5470 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5471 vWARN2(RExC_parse,
5472 "Ignoring zero length \\N{%s} in character class",
5473 SvPVX(sv_name)
5474 );
5475 }
5476 if (sv_name)
5477 SvREFCNT_dec(sv_name);
5478 if (!cached)
5479 SvREFCNT_dec(sv_str);
5480 return len ? NULL : (regnode *)&len;
5481 } else if(SvCUR(sv_str)) {
5482
5483 char *s;
5484 char *p, *pend;
5485 STRLEN charlen = 1;
5486 char * parse_start = name-3; /* needed for the offsets */
5487 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5488
5489 ret = reg_node(pRExC_state,
5490 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5491 s= STRING(ret);
5492
5493 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5494 sv_utf8_upgrade(sv_str);
5495 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5496 RExC_utf8= 1;
5497 }
5498
5499 p = SvPV(sv_str, len);
5500 pend = p + len;
5501 /* len is the length written, charlen is the size the char read */
5502 for ( len = 0; p < pend; p += charlen ) {
5503 if (UTF) {
196f1508 5504 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5505 if (FOLD) {
5506 STRLEN foldlen,numlen;
5507 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5508 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5509 /* Emit all the Unicode characters. */
5510
5511 for (foldbuf = tmpbuf;
5512 foldlen;
5513 foldlen -= numlen)
5514 {
5515 uvc = utf8_to_uvchr(foldbuf, &numlen);
5516 if (numlen > 0) {
5517 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5518 s += unilen;
5519 len += unilen;
5520 /* In EBCDIC the numlen
5521 * and unilen can differ. */
5522 foldbuf += numlen;
5523 if (numlen >= foldlen)
5524 break;
5525 }
5526 else
5527 break; /* "Can't happen." */
5528 }
5529 } else {
5530 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5531 if (unilen > 0) {
5532 s += unilen;
5533 len += unilen;
5534 }
5535 }
5536 } else {
5537 len++;
5538 REGC(*p, s++);
5539 }
5540 }
5541 if (SIZE_ONLY) {
5542 RExC_size += STR_SZ(len);
5543 } else {
5544 STR_LEN(ret) = len;
5545 RExC_emit += STR_SZ(len);
5546 }
5547 Set_Node_Cur_Length(ret); /* MJD */
5548 RExC_parse--;
5549 nextchar(pRExC_state);
5550 } else {
5551 ret = reg_node(pRExC_state,NOTHING);
5552 }
5553 if (!cached) {
5554 SvREFCNT_dec(sv_str);
5555 }
5556 if (sv_name) {
5557 SvREFCNT_dec(sv_name);
5558 }
5559 return ret;
5560
5561}
5562
5563
5564
a687059c
LW
5565/*
5566 - regatom - the lowest level
5567 *
5568 * Optimization: gobbles an entire sequence of ordinary characters so that
5569 * it can turn them into a single node, which is smaller to store and
5570 * faster to run. Backslashed characters are exceptions, each becoming a
5571 * separate node; the code is simpler that way and it's not worth fixing.
5572 *
7f6f358c
YO
5573 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5574 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5575 */
76e3520e 5576STATIC regnode *
3dab1dad 5577S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5578{
97aff369 5579 dVAR;
cbbf8932 5580 register regnode *ret = NULL;
a0d0e21e 5581 I32 flags;
45948336 5582 char *parse_start = RExC_parse;
3dab1dad
YO
5583 GET_RE_DEBUG_FLAGS_DECL;
5584 DEBUG_PARSE("atom");
a0d0e21e
LW
5585 *flagp = WORST; /* Tentatively. */
5586
5587tryagain:
830247a4 5588 switch (*RExC_parse) {
a0d0e21e 5589 case '^':
830247a4
IZ
5590 RExC_seen_zerolen++;
5591 nextchar(pRExC_state);
e2509266 5592 if (RExC_flags & PMf_MULTILINE)
830247a4 5593 ret = reg_node(pRExC_state, MBOL);
e2509266 5594 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5595 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5596 else
830247a4 5597 ret = reg_node(pRExC_state, BOL);
fac92740 5598 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5599 break;
5600 case '$':
830247a4 5601 nextchar(pRExC_state);
b81d288d 5602 if (*RExC_parse)
830247a4 5603 RExC_seen_zerolen++;
e2509266 5604 if (RExC_flags & PMf_MULTILINE)
830247a4 5605 ret = reg_node(pRExC_state, MEOL);
e2509266 5606 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5607 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5608 else
830247a4 5609 ret = reg_node(pRExC_state, EOL);
fac92740 5610 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5611 break;
5612 case '.':
830247a4 5613 nextchar(pRExC_state);
e2509266 5614 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5615 ret = reg_node(pRExC_state, SANY);
5616 else
5617 ret = reg_node(pRExC_state, REG_ANY);
5618 *flagp |= HASWIDTH|SIMPLE;
830247a4 5619 RExC_naughty++;
fac92740 5620 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5621 break;
5622 case '[':
b45f050a 5623 {
3dab1dad
YO
5624 char * const oregcomp_parse = ++RExC_parse;
5625 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5626 if (*RExC_parse != ']') {
5627 RExC_parse = oregcomp_parse;
b45f050a
JF
5628 vFAIL("Unmatched [");
5629 }
830247a4 5630 nextchar(pRExC_state);
a0d0e21e 5631 *flagp |= HASWIDTH|SIMPLE;
fac92740 5632 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5633 break;
b45f050a 5634 }
a0d0e21e 5635 case '(':
830247a4 5636 nextchar(pRExC_state);
3dab1dad 5637 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5638 if (ret == NULL) {
bf93d4cc 5639 if (flags & TRYAGAIN) {
830247a4 5640 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5641 /* Make parent create an empty node if needed. */
5642 *flagp |= TRYAGAIN;
5643 return(NULL);
5644 }
a0d0e21e 5645 goto tryagain;
bf93d4cc 5646 }
a0d0e21e
LW
5647 return(NULL);
5648 }
c277df42 5649 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5650 break;
5651 case '|':
5652 case ')':
5653 if (flags & TRYAGAIN) {
5654 *flagp |= TRYAGAIN;
5655 return NULL;
5656 }
b45f050a 5657 vFAIL("Internal urp");
a0d0e21e
LW
5658 /* Supposed to be caught earlier. */
5659 break;
85afd4ae 5660 case '{':
830247a4
IZ
5661 if (!regcurly(RExC_parse)) {
5662 RExC_parse++;
85afd4ae
CS
5663 goto defchar;
5664 }
5665 /* FALL THROUGH */
a0d0e21e
LW
5666 case '?':
5667 case '+':
5668 case '*':
830247a4 5669 RExC_parse++;
b45f050a 5670 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5671 break;
5672 case '\\':
830247a4 5673 switch (*++RExC_parse) {
a0d0e21e 5674 case 'A':
830247a4
IZ
5675 RExC_seen_zerolen++;
5676 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5677 *flagp |= SIMPLE;
830247a4 5678 nextchar(pRExC_state);
fac92740 5679 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5680 break;
5681 case 'G':
830247a4
IZ
5682 ret = reg_node(pRExC_state, GPOS);
5683 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5684 *flagp |= SIMPLE;
830247a4 5685 nextchar(pRExC_state);
fac92740 5686 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5687 break;
5688 case 'Z':
830247a4 5689 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5690 *flagp |= SIMPLE;
a1917ab9 5691 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5692 nextchar(pRExC_state);
a0d0e21e 5693 break;
b85d18e9 5694 case 'z':
830247a4 5695 ret = reg_node(pRExC_state, EOS);
b85d18e9 5696 *flagp |= SIMPLE;
830247a4
IZ
5697 RExC_seen_zerolen++; /* Do not optimize RE away */
5698 nextchar(pRExC_state);
fac92740 5699 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5700 break;
4a2d328f 5701 case 'C':
f33976b4
DB
5702 ret = reg_node(pRExC_state, CANY);
5703 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5704 *flagp |= HASWIDTH|SIMPLE;
830247a4 5705 nextchar(pRExC_state);
fac92740 5706 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5707 break;
5708 case 'X':
830247a4 5709 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5710 *flagp |= HASWIDTH;
830247a4 5711 nextchar(pRExC_state);
fac92740 5712 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5713 break;
a0d0e21e 5714 case 'w':
eb160463 5715 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5716 *flagp |= HASWIDTH|SIMPLE;
830247a4 5717 nextchar(pRExC_state);
fac92740 5718 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5719 break;
5720 case 'W':
eb160463 5721 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5722 *flagp |= HASWIDTH|SIMPLE;
830247a4 5723 nextchar(pRExC_state);
fac92740 5724 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5725 break;
5726 case 'b':
830247a4
IZ
5727 RExC_seen_zerolen++;
5728 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5729 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5730 *flagp |= SIMPLE;
830247a4 5731 nextchar(pRExC_state);
fac92740 5732 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5733 break;
5734 case 'B':
830247a4
IZ
5735 RExC_seen_zerolen++;
5736 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5737 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5738 *flagp |= SIMPLE;
830247a4 5739 nextchar(pRExC_state);
fac92740 5740 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5741 break;
5742 case 's':
eb160463 5743 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5744 *flagp |= HASWIDTH|SIMPLE;
830247a4 5745 nextchar(pRExC_state);
fac92740 5746 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5747 break;
5748 case 'S':
eb160463 5749 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5750 *flagp |= HASWIDTH|SIMPLE;
830247a4 5751 nextchar(pRExC_state);
fac92740 5752 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5753 break;
5754 case 'd':
ffc61ed2 5755 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5756 *flagp |= HASWIDTH|SIMPLE;
830247a4 5757 nextchar(pRExC_state);
fac92740 5758 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5759 break;
5760 case 'D':
ffc61ed2 5761 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5762 *flagp |= HASWIDTH|SIMPLE;
830247a4 5763 nextchar(pRExC_state);
fac92740 5764 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5765 break;
a14b48bc
LW
5766 case 'p':
5767 case 'P':
3568d838 5768 {
3dab1dad 5769 char* const oldregxend = RExC_end;
ccb2c380 5770 char* parse_start = RExC_parse - 2;
a14b48bc 5771
830247a4 5772 if (RExC_parse[1] == '{') {
3568d838 5773 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5774 RExC_end = strchr(RExC_parse, '}');
5775 if (!RExC_end) {
3dab1dad 5776 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5777 RExC_parse += 2;
5778 RExC_end = oldregxend;
0da60cf5 5779 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 5780 }
830247a4 5781 RExC_end++;
a14b48bc 5782 }
af6f566e 5783 else {
830247a4 5784 RExC_end = RExC_parse + 2;
af6f566e
HS
5785 if (RExC_end > oldregxend)
5786 RExC_end = oldregxend;
5787 }
830247a4 5788 RExC_parse--;
a14b48bc 5789
3dab1dad 5790 ret = regclass(pRExC_state,depth+1);
a14b48bc 5791
830247a4
IZ
5792 RExC_end = oldregxend;
5793 RExC_parse--;
ccb2c380
MP
5794
5795 Set_Node_Offset(ret, parse_start + 2);
5796 Set_Node_Cur_Length(ret);
830247a4 5797 nextchar(pRExC_state);
a14b48bc
LW
5798 *flagp |= HASWIDTH|SIMPLE;
5799 }
5800 break;
fc8cd66c
YO
5801 case 'N':
5802 /* Handle \N{NAME} here and not below because it can be
5803 multicharacter. join_exact() will join them up later on.
5804 Also this makes sure that things like /\N{BLAH}+/ and
5805 \N{BLAH} being multi char Just Happen. dmq*/
5806 ++RExC_parse;
5807 ret= reg_namedseq(pRExC_state, NULL);
5808 break;
0a4db386 5809 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
5810 {
5811 char ch= RExC_parse[1];
5812 if (ch != '<' && ch != '\'') {
5813 if (SIZE_ONLY)
5814 vWARN( RExC_parse + 1,
5815 "Possible broken named back reference treated as literal k");
5816 parse_start--;
5817 goto defchar;
5818 } else {
5819 char* name_start = (RExC_parse += 2);
5820 I32 num = 0;
0a4db386
YO
5821 SV *sv_dat = reg_scan_name(pRExC_state,
5822 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
5823 ch= (ch == '<') ? '>' : '\'';
5824
81714fb9
YO
5825 if (RExC_parse == name_start || *RExC_parse != ch)
5826 vFAIL2("Sequence \\k%c... not terminated",
5827 (ch == '>' ? '<' : ch));
5828
5829 RExC_sawback = 1;
5830 ret = reganode(pRExC_state,
5831 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5832 num);
5833 *flagp |= HASWIDTH;
5834
5835
5836 if (!SIZE_ONLY) {
81714fb9
YO
5837 num = add_data( pRExC_state, 1, "S" );
5838 ARG_SET(ret,num);
5839 RExC_rx->data->data[num]=(void*)sv_dat;
5840 SvREFCNT_inc(sv_dat);
5841 }
5842 /* override incorrect value set in reganode MJD */
5843 Set_Node_Offset(ret, parse_start+1);
5844 Set_Node_Cur_Length(ret); /* MJD */
5845 nextchar(pRExC_state);
5846
5847 }
5848 break;
5849 }
a0d0e21e
LW
5850 case 'n':
5851 case 'r':
5852 case 't':
5853 case 'f':
5854 case 'e':
5855 case 'a':
5856 case 'x':
5857 case 'c':
5858 case '0':
5859 goto defchar;
5860 case '1': case '2': case '3': case '4':
5861 case '5': case '6': case '7': case '8': case '9':
5862 {
1df70142 5863 const I32 num = atoi(RExC_parse);
a0d0e21e 5864
830247a4 5865 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
5866 goto defchar;
5867 else {
3dab1dad 5868 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
5869 while (isDIGIT(*RExC_parse))
5870 RExC_parse++;
b45f050a 5871
eb160463 5872 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 5873 vFAIL("Reference to nonexistent group");
830247a4 5874 RExC_sawback = 1;
eb160463
GS
5875 ret = reganode(pRExC_state,
5876 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5877 num);
a0d0e21e 5878 *flagp |= HASWIDTH;
2af232bd 5879
fac92740 5880 /* override incorrect value set in reganode MJD */
2af232bd 5881 Set_Node_Offset(ret, parse_start+1);
fac92740 5882 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
5883 RExC_parse--;
5884 nextchar(pRExC_state);
a0d0e21e
LW
5885 }
5886 }
5887 break;
5888 case '\0':
830247a4 5889 if (RExC_parse >= RExC_end)
b45f050a 5890 FAIL("Trailing \\");
a0d0e21e
LW
5891 /* FALL THROUGH */
5892 default:
a0288114 5893 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 5894 back into the quick-grab loop below */
45948336 5895 parse_start--;
a0d0e21e
LW
5896 goto defchar;
5897 }
5898 break;
4633a7c4
LW
5899
5900 case '#':
e2509266 5901 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
5902 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5903 RExC_parse++;
830247a4 5904 if (RExC_parse < RExC_end)
4633a7c4
LW
5905 goto tryagain;
5906 }
5907 /* FALL THROUGH */
5908
a0d0e21e 5909 default: {
ba210ebe 5910 register STRLEN len;
58ae7d3f 5911 register UV ender;
a0d0e21e 5912 register char *p;
3dab1dad 5913 char *s;
80aecb99 5914 STRLEN foldlen;
89ebb4a3 5915 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
5916
5917 parse_start = RExC_parse - 1;
a0d0e21e 5918
830247a4 5919 RExC_parse++;
a0d0e21e
LW
5920
5921 defchar:
58ae7d3f 5922 ender = 0;
eb160463
GS
5923 ret = reg_node(pRExC_state,
5924 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 5925 s = STRING(ret);
830247a4
IZ
5926 for (len = 0, p = RExC_parse - 1;
5927 len < 127 && p < RExC_end;
a0d0e21e
LW
5928 len++)
5929 {
3dab1dad 5930 char * const oldp = p;
5b5a24f7 5931
e2509266 5932 if (RExC_flags & PMf_EXTENDED)
830247a4 5933 p = regwhite(p, RExC_end);
a0d0e21e
LW
5934 switch (*p) {
5935 case '^':
5936 case '$':
5937 case '.':
5938 case '[':
5939 case '(':
5940 case ')':
5941 case '|':
5942 goto loopdone;
5943 case '\\':
5944 switch (*++p) {
5945 case 'A':
1ed8eac0
JF
5946 case 'C':
5947 case 'X':
a0d0e21e
LW
5948 case 'G':
5949 case 'Z':
b85d18e9 5950 case 'z':
a0d0e21e
LW
5951 case 'w':
5952 case 'W':
5953 case 'b':
5954 case 'B':
5955 case 's':
5956 case 'S':
5957 case 'd':
5958 case 'D':
a14b48bc
LW
5959 case 'p':
5960 case 'P':
fc8cd66c 5961 case 'N':
a0d0e21e
LW
5962 --p;
5963 goto loopdone;
5964 case 'n':
5965 ender = '\n';
5966 p++;
a687059c 5967 break;
a0d0e21e
LW
5968 case 'r':
5969 ender = '\r';
5970 p++;
a687059c 5971 break;
a0d0e21e
LW
5972 case 't':
5973 ender = '\t';
5974 p++;
a687059c 5975 break;
a0d0e21e
LW
5976 case 'f':
5977 ender = '\f';
5978 p++;
a687059c 5979 break;
a0d0e21e 5980 case 'e':
c7f1f016 5981 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 5982 p++;
a687059c 5983 break;
a0d0e21e 5984 case 'a':
c7f1f016 5985 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 5986 p++;
a687059c 5987 break;
a0d0e21e 5988 case 'x':
a0ed51b3 5989 if (*++p == '{') {
1df70142 5990 char* const e = strchr(p, '}');
b81d288d 5991
b45f050a 5992 if (!e) {
830247a4 5993 RExC_parse = p + 1;
b45f050a
JF
5994 vFAIL("Missing right brace on \\x{}");
5995 }
de5f0749 5996 else {
a4c04bdc
NC
5997 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5998 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 5999 STRLEN numlen = e - p - 1;
53305cf1 6000 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6001 if (ender > 0xff)
6002 RExC_utf8 = 1;
a0ed51b3
LW
6003 p = e + 1;
6004 }
a0ed51b3
LW
6005 }
6006 else {
a4c04bdc 6007 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6008 STRLEN numlen = 2;
53305cf1 6009 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6010 p += numlen;
6011 }
a687059c 6012 break;
a0d0e21e
LW
6013 case 'c':
6014 p++;
bbce6d69 6015 ender = UCHARAT(p++);
6016 ender = toCTRL(ender);
a687059c 6017 break;
a0d0e21e
LW
6018 case '0': case '1': case '2': case '3':case '4':
6019 case '5': case '6': case '7': case '8':case '9':
6020 if (*p == '0' ||
830247a4 6021 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6022 I32 flags = 0;
1df70142 6023 STRLEN numlen = 3;
53305cf1 6024 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6025 p += numlen;
6026 }
6027 else {
6028 --p;
6029 goto loopdone;
a687059c
LW
6030 }
6031 break;
a0d0e21e 6032 case '\0':
830247a4 6033 if (p >= RExC_end)
b45f050a 6034 FAIL("Trailing \\");
a687059c 6035 /* FALL THROUGH */
a0d0e21e 6036 default:
041457d9 6037 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6038 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6039 goto normal_default;
a0d0e21e
LW
6040 }
6041 break;
a687059c 6042 default:
a0ed51b3 6043 normal_default:
fd400ab9 6044 if (UTF8_IS_START(*p) && UTF) {
1df70142 6045 STRLEN numlen;
5e12f4fb 6046 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6047 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6048 p += numlen;
6049 }
6050 else
6051 ender = *p++;
a0d0e21e 6052 break;
a687059c 6053 }
e2509266 6054 if (RExC_flags & PMf_EXTENDED)
830247a4 6055 p = regwhite(p, RExC_end);
60a8b682
JH
6056 if (UTF && FOLD) {
6057 /* Prime the casefolded buffer. */
ac7e0132 6058 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6059 }
a0d0e21e
LW
6060 if (ISMULT2(p)) { /* Back off on ?+*. */
6061 if (len)
6062 p = oldp;
16ea2a2e 6063 else if (UTF) {
80aecb99 6064 if (FOLD) {
60a8b682 6065 /* Emit all the Unicode characters. */
1df70142 6066 STRLEN numlen;
80aecb99
JH
6067 for (foldbuf = tmpbuf;
6068 foldlen;
6069 foldlen -= numlen) {
6070 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6071 if (numlen > 0) {
71207a34 6072 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6073 s += unilen;
6074 len += unilen;
6075 /* In EBCDIC the numlen
6076 * and unilen can differ. */
9dc45d57 6077 foldbuf += numlen;
47654450
JH
6078 if (numlen >= foldlen)
6079 break;
9dc45d57
JH
6080 }
6081 else
6082 break; /* "Can't happen." */
80aecb99
JH
6083 }
6084 }
6085 else {
71207a34 6086 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6087 if (unilen > 0) {
0ebc6274
JH
6088 s += unilen;
6089 len += unilen;
9dc45d57 6090 }
80aecb99 6091 }
a0ed51b3 6092 }
a0d0e21e
LW
6093 else {
6094 len++;
eb160463 6095 REGC((char)ender, s++);
a0d0e21e
LW
6096 }
6097 break;
a687059c 6098 }
16ea2a2e 6099 if (UTF) {
80aecb99 6100 if (FOLD) {
60a8b682 6101 /* Emit all the Unicode characters. */
1df70142 6102 STRLEN numlen;
80aecb99
JH
6103 for (foldbuf = tmpbuf;
6104 foldlen;
6105 foldlen -= numlen) {
6106 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6107 if (numlen > 0) {
71207a34 6108 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6109 len += unilen;
6110 s += unilen;
6111 /* In EBCDIC the numlen
6112 * and unilen can differ. */
9dc45d57 6113 foldbuf += numlen;
47654450
JH
6114 if (numlen >= foldlen)
6115 break;
9dc45d57
JH
6116 }
6117 else
6118 break;
80aecb99
JH
6119 }
6120 }
6121 else {
71207a34 6122 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6123 if (unilen > 0) {
0ebc6274
JH
6124 s += unilen;
6125 len += unilen;
9dc45d57 6126 }
80aecb99
JH
6127 }
6128 len--;
a0ed51b3
LW
6129 }
6130 else
eb160463 6131 REGC((char)ender, s++);
a0d0e21e
LW
6132 }
6133 loopdone:
830247a4 6134 RExC_parse = p - 1;
fac92740 6135 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6136 nextchar(pRExC_state);
793db0cb
JH
6137 {
6138 /* len is STRLEN which is unsigned, need to copy to signed */
6139 IV iv = len;
6140 if (iv < 0)
6141 vFAIL("Internal disaster");
6142 }
a0d0e21e
LW
6143 if (len > 0)
6144 *flagp |= HASWIDTH;
090f7165 6145 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6146 *flagp |= SIMPLE;
3dab1dad 6147
cd439c50 6148 if (SIZE_ONLY)
830247a4 6149 RExC_size += STR_SZ(len);
3dab1dad
YO
6150 else {
6151 STR_LEN(ret) = len;
830247a4 6152 RExC_emit += STR_SZ(len);
07be1b83 6153 }
3dab1dad 6154 }
a0d0e21e
LW
6155 break;
6156 }
a687059c 6157
60a8b682
JH
6158 /* If the encoding pragma is in effect recode the text of
6159 * any EXACT-kind nodes. */
fc8cd66c 6160 if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) {
3dab1dad
YO
6161 const STRLEN oldlen = STR_LEN(ret);
6162 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
6163
6164 if (RExC_utf8)
6165 SvUTF8_on(sv);
6166 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
6167 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
6168 const STRLEN newlen = SvCUR(sv);
d0063567
DK
6169
6170 if (SvUTF8(sv))
6171 RExC_utf8 = 1;
6172 if (!SIZE_ONLY) {
a3621e74
YO
6173 GET_RE_DEBUG_FLAGS_DECL;
6174 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
6175 (int)oldlen, STRING(ret),
6176 (int)newlen, s));
6177 Copy(s, STRING(ret), newlen, char);
6178 STR_LEN(ret) += newlen - oldlen;
6179 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
6180 } else
6181 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
6182 }
a72c7584
JH
6183 }
6184
a0d0e21e 6185 return(ret);
a687059c
LW
6186}
6187
873ef191 6188STATIC char *
5f66b61c 6189S_regwhite(char *p, const char *e)
5b5a24f7
CS
6190{
6191 while (p < e) {
6192 if (isSPACE(*p))
6193 ++p;
6194 else if (*p == '#') {
6195 do {
6196 p++;
6197 } while (p < e && *p != '\n');
6198 }
6199 else
6200 break;
6201 }
6202 return p;
6203}
6204
b8c5462f
JH
6205/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6206 Character classes ([:foo:]) can also be negated ([:^foo:]).
6207 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6208 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6209 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6210
6211#define POSIXCC_DONE(c) ((c) == ':')
6212#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6213#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6214
b8c5462f 6215STATIC I32
830247a4 6216S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6217{
97aff369 6218 dVAR;
936ed897 6219 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6220
830247a4 6221 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6222 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6223 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6224 const char c = UCHARAT(RExC_parse);
097eb12c 6225 char* const s = RExC_parse++;
b81d288d 6226
9a86a77b 6227 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6228 RExC_parse++;
6229 if (RExC_parse == RExC_end)
620e46c5 6230 /* Grandfather lone [:, [=, [. */
830247a4 6231 RExC_parse = s;
620e46c5 6232 else {
3dab1dad 6233 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6234 assert(*t == c);
6235
9a86a77b 6236 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6237 const char *posixcc = s + 1;
830247a4 6238 RExC_parse++; /* skip over the ending ] */
3dab1dad 6239
b8c5462f 6240 if (*s == ':') {
1df70142
AL
6241 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6242 const I32 skip = t - posixcc;
80916619
NC
6243
6244 /* Initially switch on the length of the name. */
6245 switch (skip) {
6246 case 4:
3dab1dad
YO
6247 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6248 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6249 break;
80916619
NC
6250 case 5:
6251 /* Names all of length 5. */
6252 /* alnum alpha ascii blank cntrl digit graph lower
6253 print punct space upper */
6254 /* Offset 4 gives the best switch position. */
6255 switch (posixcc[4]) {
6256 case 'a':
3dab1dad
YO
6257 if (memEQ(posixcc, "alph", 4)) /* alpha */
6258 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6259 break;
6260 case 'e':
3dab1dad
YO
6261 if (memEQ(posixcc, "spac", 4)) /* space */
6262 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6263 break;
6264 case 'h':
3dab1dad
YO
6265 if (memEQ(posixcc, "grap", 4)) /* graph */
6266 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6267 break;
6268 case 'i':
3dab1dad
YO
6269 if (memEQ(posixcc, "asci", 4)) /* ascii */
6270 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6271 break;
6272 case 'k':
3dab1dad
YO
6273 if (memEQ(posixcc, "blan", 4)) /* blank */
6274 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6275 break;
6276 case 'l':
3dab1dad
YO
6277 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6278 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6279 break;
6280 case 'm':
3dab1dad
YO
6281 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6282 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6283 break;
6284 case 'r':
3dab1dad
YO
6285 if (memEQ(posixcc, "lowe", 4)) /* lower */
6286 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6287 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6288 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6289 break;
6290 case 't':
3dab1dad
YO
6291 if (memEQ(posixcc, "digi", 4)) /* digit */
6292 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6293 else if (memEQ(posixcc, "prin", 4)) /* print */
6294 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6295 else if (memEQ(posixcc, "punc", 4)) /* punct */
6296 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6297 break;
b8c5462f
JH
6298 }
6299 break;
80916619 6300 case 6:
3dab1dad
YO
6301 if (memEQ(posixcc, "xdigit", 6))
6302 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6303 break;
6304 }
80916619
NC
6305
6306 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6307 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6308 t - s - 1, s + 1);
80916619
NC
6309 assert (posixcc[skip] == ':');
6310 assert (posixcc[skip+1] == ']');
b45f050a 6311 } else if (!SIZE_ONLY) {
b8c5462f 6312 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6313
830247a4 6314 /* adjust RExC_parse so the warning shows after
b45f050a 6315 the class closes */
9a86a77b 6316 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6317 RExC_parse++;
b45f050a
JF
6318 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6319 }
b8c5462f
JH
6320 } else {
6321 /* Maternal grandfather:
6322 * "[:" ending in ":" but not in ":]" */
830247a4 6323 RExC_parse = s;
767d463e 6324 }
620e46c5
JH
6325 }
6326 }
6327
b8c5462f
JH
6328 return namedclass;
6329}
6330
6331STATIC void
830247a4 6332S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6333{
97aff369 6334 dVAR;
3dab1dad 6335 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6336 const char *s = RExC_parse;
6337 const char c = *s++;
b8c5462f 6338
3dab1dad 6339 while (isALNUM(*s))
b8c5462f
JH
6340 s++;
6341 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6342 if (ckWARN(WARN_REGEXP))
6343 vWARN3(s+2,
6344 "POSIX syntax [%c %c] belongs inside character classes",
6345 c, c);
b45f050a
JF
6346
6347 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6348 if (POSIXCC_NOTYET(c)) {
830247a4 6349 /* adjust RExC_parse so the error shows after
b45f050a 6350 the class closes */
9a86a77b 6351 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6352 NOOP;
b45f050a
JF
6353 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6354 }
b8c5462f
JH
6355 }
6356 }
620e46c5
JH
6357}
6358
7f6f358c
YO
6359
6360/*
6361 parse a class specification and produce either an ANYOF node that
6362 matches the pattern. If the pattern matches a single char only and
6363 that char is < 256 then we produce an EXACT node instead.
6364*/
76e3520e 6365STATIC regnode *
3dab1dad 6366S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6367{
97aff369 6368 dVAR;
9ef43ace 6369 register UV value = 0;
9a86a77b 6370 register UV nextvalue;
3568d838 6371 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6372 register IV range = 0;
c277df42 6373 register regnode *ret;
ba210ebe 6374 STRLEN numlen;
ffc61ed2 6375 IV namedclass;
cbbf8932 6376 char *rangebegin = NULL;
936ed897 6377 bool need_class = 0;
c445ea15 6378 SV *listsv = NULL;
ffc61ed2 6379 UV n;
9e55ce06 6380 bool optimize_invert = TRUE;
cbbf8932 6381 AV* unicode_alternate = NULL;
1b2d223b
JH
6382#ifdef EBCDIC
6383 UV literal_endpoint = 0;
6384#endif
7f6f358c 6385 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6386
3dab1dad 6387 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6388 case we need to change the emitted regop to an EXACT. */
07be1b83 6389 const char * orig_parse = RExC_parse;
72f13be8 6390 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6391#ifndef DEBUGGING
6392 PERL_UNUSED_ARG(depth);
6393#endif
72f13be8 6394
3dab1dad 6395 DEBUG_PARSE("clas");
7f6f358c
YO
6396
6397 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6398 ret = reganode(pRExC_state, ANYOF, 0);
6399
6400 if (!SIZE_ONLY)
6401 ANYOF_FLAGS(ret) = 0;
6402
9a86a77b 6403 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6404 RExC_naughty++;
6405 RExC_parse++;
6406 if (!SIZE_ONLY)
6407 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6408 }
a0d0e21e 6409
73060fc4 6410 if (SIZE_ONLY) {
830247a4 6411 RExC_size += ANYOF_SKIP;
73060fc4
JH
6412 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6413 }
936ed897 6414 else {
830247a4 6415 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6416 if (FOLD)
6417 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6418 if (LOC)
6419 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6420 ANYOF_BITMAP_ZERO(ret);
396482e1 6421 listsv = newSVpvs("# comment\n");
a0d0e21e 6422 }
b8c5462f 6423
9a86a77b
JH
6424 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6425
b938889d 6426 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6427 checkposixcc(pRExC_state);
b8c5462f 6428
f064b6ad
HS
6429 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6430 if (UCHARAT(RExC_parse) == ']')
6431 goto charclassloop;
ffc61ed2 6432
fc8cd66c 6433parseit:
9a86a77b 6434 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6435
6436 charclassloop:
6437
6438 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6439
73b437c8 6440 if (!range)
830247a4 6441 rangebegin = RExC_parse;
ffc61ed2 6442 if (UTF) {
5e12f4fb 6443 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6444 RExC_end - RExC_parse,
9f7f3913 6445 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6446 RExC_parse += numlen;
6447 }
6448 else
6449 value = UCHARAT(RExC_parse++);
7f6f358c 6450
9a86a77b
JH
6451 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6452 if (value == '[' && POSIXCC(nextvalue))
830247a4 6453 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6454 else if (value == '\\') {
ffc61ed2 6455 if (UTF) {
5e12f4fb 6456 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6457 RExC_end - RExC_parse,
9f7f3913 6458 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6459 RExC_parse += numlen;
6460 }
6461 else
6462 value = UCHARAT(RExC_parse++);
470c3474 6463 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6464 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6465 * be a problem later if we want switch on Unicode.
6466 * A similar issue a little bit later when switching on
6467 * namedclass. --jhi */
ffc61ed2 6468 switch ((I32)value) {
b8c5462f
JH
6469 case 'w': namedclass = ANYOF_ALNUM; break;
6470 case 'W': namedclass = ANYOF_NALNUM; break;
6471 case 's': namedclass = ANYOF_SPACE; break;
6472 case 'S': namedclass = ANYOF_NSPACE; break;
6473 case 'd': namedclass = ANYOF_DIGIT; break;
6474 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6475 case 'N': /* Handle \N{NAME} in class */
6476 {
6477 /* We only pay attention to the first char of
6478 multichar strings being returned. I kinda wonder
6479 if this makes sense as it does change the behaviour
6480 from earlier versions, OTOH that behaviour was broken
6481 as well. */
6482 UV v; /* value is register so we cant & it /grrr */
6483 if (reg_namedseq(pRExC_state, &v)) {
6484 goto parseit;
6485 }
6486 value= v;
6487 }
6488 break;
ffc61ed2
JH
6489 case 'p':
6490 case 'P':
3dab1dad
YO
6491 {
6492 char *e;
af6f566e 6493 if (RExC_parse >= RExC_end)
2a4859cd 6494 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6495 if (*RExC_parse == '{') {
1df70142 6496 const U8 c = (U8)value;
ffc61ed2
JH
6497 e = strchr(RExC_parse++, '}');
6498 if (!e)
0da60cf5 6499 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6500 while (isSPACE(UCHARAT(RExC_parse)))
6501 RExC_parse++;
6502 if (e == RExC_parse)
0da60cf5 6503 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6504 n = e - RExC_parse;
ab13f0c7
JH
6505 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6506 n--;
ffc61ed2
JH
6507 }
6508 else {
6509 e = RExC_parse;
6510 n = 1;
6511 }
6512 if (!SIZE_ONLY) {
ab13f0c7
JH
6513 if (UCHARAT(RExC_parse) == '^') {
6514 RExC_parse++;
6515 n--;
6516 value = value == 'p' ? 'P' : 'p'; /* toggle */
6517 while (isSPACE(UCHARAT(RExC_parse))) {
6518 RExC_parse++;
6519 n--;
6520 }
6521 }
097eb12c
AL
6522 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6523 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6524 }
6525 RExC_parse = e + 1;
6526 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6527 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6528 }
f81125e2 6529 break;
b8c5462f
JH
6530 case 'n': value = '\n'; break;
6531 case 'r': value = '\r'; break;
6532 case 't': value = '\t'; break;
6533 case 'f': value = '\f'; break;
6534 case 'b': value = '\b'; break;
c7f1f016
NIS
6535 case 'e': value = ASCII_TO_NATIVE('\033');break;
6536 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6537 case 'x':
ffc61ed2 6538 if (*RExC_parse == '{') {
a4c04bdc
NC
6539 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6540 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6541 char * const e = strchr(RExC_parse++, '}');
b81d288d 6542 if (!e)
ffc61ed2 6543 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6544
6545 numlen = e - RExC_parse;
6546 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6547 RExC_parse = e + 1;
6548 }
6549 else {
a4c04bdc 6550 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6551 numlen = 2;
6552 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6553 RExC_parse += numlen;
6554 }
b8c5462f
JH
6555 break;
6556 case 'c':
830247a4 6557 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6558 value = toCTRL(value);
6559 break;
6560 case '0': case '1': case '2': case '3': case '4':
6561 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
6562 {
6563 I32 flags = 0;
6564 numlen = 3;
6565 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 6566 RExC_parse += numlen;
b8c5462f 6567 break;
53305cf1 6568 }
1028017a 6569 default:
041457d9 6570 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6571 vWARN2(RExC_parse,
6572 "Unrecognized escape \\%c in character class passed through",
6573 (int)value);
1028017a 6574 break;
b8c5462f 6575 }
ffc61ed2 6576 } /* end of \blah */
1b2d223b
JH
6577#ifdef EBCDIC
6578 else
6579 literal_endpoint++;
6580#endif
ffc61ed2
JH
6581
6582 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6583
6584 if (!SIZE_ONLY && !need_class)
936ed897 6585 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6586
936ed897 6587 need_class = 1;
ffc61ed2
JH
6588
6589 /* a bad range like a-\d, a-[:digit:] ? */
6590 if (range) {
73b437c8 6591 if (!SIZE_ONLY) {
afd78fd5 6592 if (ckWARN(WARN_REGEXP)) {
097eb12c 6593 const int w =
afd78fd5
JH
6594 RExC_parse >= rangebegin ?
6595 RExC_parse - rangebegin : 0;
830247a4 6596 vWARN4(RExC_parse,
b45f050a 6597 "False [] range \"%*.*s\"",
097eb12c 6598 w, w, rangebegin);
afd78fd5 6599 }
3568d838
JH
6600 if (prevvalue < 256) {
6601 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6602 ANYOF_BITMAP_SET(ret, '-');
6603 }
6604 else {
6605 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6606 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6607 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6608 }
b8c5462f 6609 }
ffc61ed2
JH
6610
6611 range = 0; /* this was not a true range */
73b437c8 6612 }
ffc61ed2 6613
73b437c8 6614 if (!SIZE_ONLY) {
c49a72a9
NC
6615 const char *what = NULL;
6616 char yesno = 0;
6617
3568d838
JH
6618 if (namedclass > OOB_NAMEDCLASS)
6619 optimize_invert = FALSE;
e2962f66
JH
6620 /* Possible truncation here but in some 64-bit environments
6621 * the compiler gets heartburn about switch on 64-bit values.
6622 * A similar issue a little earlier when switching on value.
98f323fa 6623 * --jhi */
e2962f66 6624 switch ((I32)namedclass) {
73b437c8
JH
6625 case ANYOF_ALNUM:
6626 if (LOC)
936ed897 6627 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6628 else {
6629 for (value = 0; value < 256; value++)
6630 if (isALNUM(value))
936ed897 6631 ANYOF_BITMAP_SET(ret, value);
73b437c8 6632 }
c49a72a9
NC
6633 yesno = '+';
6634 what = "Word";
73b437c8
JH
6635 break;
6636 case ANYOF_NALNUM:
6637 if (LOC)
936ed897 6638 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6639 else {
6640 for (value = 0; value < 256; value++)
6641 if (!isALNUM(value))
936ed897 6642 ANYOF_BITMAP_SET(ret, value);
73b437c8 6643 }
c49a72a9
NC
6644 yesno = '!';
6645 what = "Word";
73b437c8 6646 break;
ffc61ed2 6647 case ANYOF_ALNUMC:
73b437c8 6648 if (LOC)
ffc61ed2 6649 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6650 else {
6651 for (value = 0; value < 256; value++)
ffc61ed2 6652 if (isALNUMC(value))
936ed897 6653 ANYOF_BITMAP_SET(ret, value);
73b437c8 6654 }
c49a72a9
NC
6655 yesno = '+';
6656 what = "Alnum";
73b437c8
JH
6657 break;
6658 case ANYOF_NALNUMC:
6659 if (LOC)
936ed897 6660 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
6661 else {
6662 for (value = 0; value < 256; value++)
6663 if (!isALNUMC(value))
936ed897 6664 ANYOF_BITMAP_SET(ret, value);
73b437c8 6665 }
c49a72a9
NC
6666 yesno = '!';
6667 what = "Alnum";
73b437c8
JH
6668 break;
6669 case ANYOF_ALPHA:
6670 if (LOC)
936ed897 6671 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
6672 else {
6673 for (value = 0; value < 256; value++)
6674 if (isALPHA(value))
936ed897 6675 ANYOF_BITMAP_SET(ret, value);
73b437c8 6676 }
c49a72a9
NC
6677 yesno = '+';
6678 what = "Alpha";
73b437c8
JH
6679 break;
6680 case ANYOF_NALPHA:
6681 if (LOC)
936ed897 6682 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6683 else {
6684 for (value = 0; value < 256; value++)
6685 if (!isALPHA(value))
936ed897 6686 ANYOF_BITMAP_SET(ret, value);
73b437c8 6687 }
c49a72a9
NC
6688 yesno = '!';
6689 what = "Alpha";
73b437c8
JH
6690 break;
6691 case ANYOF_ASCII:
6692 if (LOC)
936ed897 6693 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6694 else {
c7f1f016 6695#ifndef EBCDIC
1ba5c669
JH
6696 for (value = 0; value < 128; value++)
6697 ANYOF_BITMAP_SET(ret, value);
6698#else /* EBCDIC */
ffbc6a93 6699 for (value = 0; value < 256; value++) {
3a3c4447
JH
6700 if (isASCII(value))
6701 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6702 }
1ba5c669 6703#endif /* EBCDIC */
73b437c8 6704 }
c49a72a9
NC
6705 yesno = '+';
6706 what = "ASCII";
73b437c8
JH
6707 break;
6708 case ANYOF_NASCII:
6709 if (LOC)
936ed897 6710 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6711 else {
c7f1f016 6712#ifndef EBCDIC
1ba5c669
JH
6713 for (value = 128; value < 256; value++)
6714 ANYOF_BITMAP_SET(ret, value);
6715#else /* EBCDIC */
ffbc6a93 6716 for (value = 0; value < 256; value++) {
3a3c4447
JH
6717 if (!isASCII(value))
6718 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6719 }
1ba5c669 6720#endif /* EBCDIC */
73b437c8 6721 }
c49a72a9
NC
6722 yesno = '!';
6723 what = "ASCII";
73b437c8 6724 break;
aaa51d5e
JF
6725 case ANYOF_BLANK:
6726 if (LOC)
6727 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6728 else {
6729 for (value = 0; value < 256; value++)
6730 if (isBLANK(value))
6731 ANYOF_BITMAP_SET(ret, value);
6732 }
c49a72a9
NC
6733 yesno = '+';
6734 what = "Blank";
aaa51d5e
JF
6735 break;
6736 case ANYOF_NBLANK:
6737 if (LOC)
6738 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6739 else {
6740 for (value = 0; value < 256; value++)
6741 if (!isBLANK(value))
6742 ANYOF_BITMAP_SET(ret, value);
6743 }
c49a72a9
NC
6744 yesno = '!';
6745 what = "Blank";
aaa51d5e 6746 break;
73b437c8
JH
6747 case ANYOF_CNTRL:
6748 if (LOC)
936ed897 6749 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6750 else {
6751 for (value = 0; value < 256; value++)
6752 if (isCNTRL(value))
936ed897 6753 ANYOF_BITMAP_SET(ret, value);
73b437c8 6754 }
c49a72a9
NC
6755 yesno = '+';
6756 what = "Cntrl";
73b437c8
JH
6757 break;
6758 case ANYOF_NCNTRL:
6759 if (LOC)
936ed897 6760 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6761 else {
6762 for (value = 0; value < 256; value++)
6763 if (!isCNTRL(value))
936ed897 6764 ANYOF_BITMAP_SET(ret, value);
73b437c8 6765 }
c49a72a9
NC
6766 yesno = '!';
6767 what = "Cntrl";
ffc61ed2
JH
6768 break;
6769 case ANYOF_DIGIT:
6770 if (LOC)
6771 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6772 else {
6773 /* consecutive digits assumed */
6774 for (value = '0'; value <= '9'; value++)
6775 ANYOF_BITMAP_SET(ret, value);
6776 }
c49a72a9
NC
6777 yesno = '+';
6778 what = "Digit";
ffc61ed2
JH
6779 break;
6780 case ANYOF_NDIGIT:
6781 if (LOC)
6782 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6783 else {
6784 /* consecutive digits assumed */
6785 for (value = 0; value < '0'; value++)
6786 ANYOF_BITMAP_SET(ret, value);
6787 for (value = '9' + 1; value < 256; value++)
6788 ANYOF_BITMAP_SET(ret, value);
6789 }
c49a72a9
NC
6790 yesno = '!';
6791 what = "Digit";
73b437c8
JH
6792 break;
6793 case ANYOF_GRAPH:
6794 if (LOC)
936ed897 6795 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
6796 else {
6797 for (value = 0; value < 256; value++)
6798 if (isGRAPH(value))
936ed897 6799 ANYOF_BITMAP_SET(ret, value);
73b437c8 6800 }
c49a72a9
NC
6801 yesno = '+';
6802 what = "Graph";
73b437c8
JH
6803 break;
6804 case ANYOF_NGRAPH:
6805 if (LOC)
936ed897 6806 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
6807 else {
6808 for (value = 0; value < 256; value++)
6809 if (!isGRAPH(value))
936ed897 6810 ANYOF_BITMAP_SET(ret, value);
73b437c8 6811 }
c49a72a9
NC
6812 yesno = '!';
6813 what = "Graph";
73b437c8
JH
6814 break;
6815 case ANYOF_LOWER:
6816 if (LOC)
936ed897 6817 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
6818 else {
6819 for (value = 0; value < 256; value++)
6820 if (isLOWER(value))
936ed897 6821 ANYOF_BITMAP_SET(ret, value);
73b437c8 6822 }
c49a72a9
NC
6823 yesno = '+';
6824 what = "Lower";
73b437c8
JH
6825 break;
6826 case ANYOF_NLOWER:
6827 if (LOC)
936ed897 6828 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
6829 else {
6830 for (value = 0; value < 256; value++)
6831 if (!isLOWER(value))
936ed897 6832 ANYOF_BITMAP_SET(ret, value);
73b437c8 6833 }
c49a72a9
NC
6834 yesno = '!';
6835 what = "Lower";
73b437c8
JH
6836 break;
6837 case ANYOF_PRINT:
6838 if (LOC)
936ed897 6839 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
6840 else {
6841 for (value = 0; value < 256; value++)
6842 if (isPRINT(value))
936ed897 6843 ANYOF_BITMAP_SET(ret, value);
73b437c8 6844 }
c49a72a9
NC
6845 yesno = '+';
6846 what = "Print";
73b437c8
JH
6847 break;
6848 case ANYOF_NPRINT:
6849 if (LOC)
936ed897 6850 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
6851 else {
6852 for (value = 0; value < 256; value++)
6853 if (!isPRINT(value))
936ed897 6854 ANYOF_BITMAP_SET(ret, value);
73b437c8 6855 }
c49a72a9
NC
6856 yesno = '!';
6857 what = "Print";
73b437c8 6858 break;
aaa51d5e
JF
6859 case ANYOF_PSXSPC:
6860 if (LOC)
6861 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6862 else {
6863 for (value = 0; value < 256; value++)
6864 if (isPSXSPC(value))
6865 ANYOF_BITMAP_SET(ret, value);
6866 }
c49a72a9
NC
6867 yesno = '+';
6868 what = "Space";
aaa51d5e
JF
6869 break;
6870 case ANYOF_NPSXSPC:
6871 if (LOC)
6872 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6873 else {
6874 for (value = 0; value < 256; value++)
6875 if (!isPSXSPC(value))
6876 ANYOF_BITMAP_SET(ret, value);
6877 }
c49a72a9
NC
6878 yesno = '!';
6879 what = "Space";
aaa51d5e 6880 break;
73b437c8
JH
6881 case ANYOF_PUNCT:
6882 if (LOC)
936ed897 6883 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
6884 else {
6885 for (value = 0; value < 256; value++)
6886 if (isPUNCT(value))
936ed897 6887 ANYOF_BITMAP_SET(ret, value);
73b437c8 6888 }
c49a72a9
NC
6889 yesno = '+';
6890 what = "Punct";
73b437c8
JH
6891 break;
6892 case ANYOF_NPUNCT:
6893 if (LOC)
936ed897 6894 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
6895 else {
6896 for (value = 0; value < 256; value++)
6897 if (!isPUNCT(value))
936ed897 6898 ANYOF_BITMAP_SET(ret, value);
73b437c8 6899 }
c49a72a9
NC
6900 yesno = '!';
6901 what = "Punct";
ffc61ed2
JH
6902 break;
6903 case ANYOF_SPACE:
6904 if (LOC)
6905 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6906 else {
6907 for (value = 0; value < 256; value++)
6908 if (isSPACE(value))
6909 ANYOF_BITMAP_SET(ret, value);
6910 }
c49a72a9
NC
6911 yesno = '+';
6912 what = "SpacePerl";
ffc61ed2
JH
6913 break;
6914 case ANYOF_NSPACE:
6915 if (LOC)
6916 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6917 else {
6918 for (value = 0; value < 256; value++)
6919 if (!isSPACE(value))
6920 ANYOF_BITMAP_SET(ret, value);
6921 }
c49a72a9
NC
6922 yesno = '!';
6923 what = "SpacePerl";
73b437c8
JH
6924 break;
6925 case ANYOF_UPPER:
6926 if (LOC)
936ed897 6927 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
6928 else {
6929 for (value = 0; value < 256; value++)
6930 if (isUPPER(value))
936ed897 6931 ANYOF_BITMAP_SET(ret, value);
73b437c8 6932 }
c49a72a9
NC
6933 yesno = '+';
6934 what = "Upper";
73b437c8
JH
6935 break;
6936 case ANYOF_NUPPER:
6937 if (LOC)
936ed897 6938 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
6939 else {
6940 for (value = 0; value < 256; value++)
6941 if (!isUPPER(value))
936ed897 6942 ANYOF_BITMAP_SET(ret, value);
73b437c8 6943 }
c49a72a9
NC
6944 yesno = '!';
6945 what = "Upper";
73b437c8
JH
6946 break;
6947 case ANYOF_XDIGIT:
6948 if (LOC)
936ed897 6949 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
6950 else {
6951 for (value = 0; value < 256; value++)
6952 if (isXDIGIT(value))
936ed897 6953 ANYOF_BITMAP_SET(ret, value);
73b437c8 6954 }
c49a72a9
NC
6955 yesno = '+';
6956 what = "XDigit";
73b437c8
JH
6957 break;
6958 case ANYOF_NXDIGIT:
6959 if (LOC)
936ed897 6960 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
6961 else {
6962 for (value = 0; value < 256; value++)
6963 if (!isXDIGIT(value))
936ed897 6964 ANYOF_BITMAP_SET(ret, value);
73b437c8 6965 }
c49a72a9
NC
6966 yesno = '!';
6967 what = "XDigit";
73b437c8 6968 break;
f81125e2
JP
6969 case ANYOF_MAX:
6970 /* this is to handle \p and \P */
6971 break;
73b437c8 6972 default:
b45f050a 6973 vFAIL("Invalid [::] class");
73b437c8 6974 break;
b8c5462f 6975 }
c49a72a9
NC
6976 if (what) {
6977 /* Strings such as "+utf8::isWord\n" */
6978 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6979 }
b8c5462f 6980 if (LOC)
936ed897 6981 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 6982 continue;
a0d0e21e 6983 }
ffc61ed2
JH
6984 } /* end of namedclass \blah */
6985
a0d0e21e 6986 if (range) {
eb160463 6987 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
6988 const int w = RExC_parse - rangebegin;
6989 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 6990 range = 0; /* not a valid range */
73b437c8 6991 }
a0d0e21e
LW
6992 }
6993 else {
3568d838 6994 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
6995 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6996 RExC_parse[1] != ']') {
6997 RExC_parse++;
ffc61ed2
JH
6998
6999 /* a bad range like \w-, [:word:]- ? */
7000 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7001 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7002 const int w =
afd78fd5
JH
7003 RExC_parse >= rangebegin ?
7004 RExC_parse - rangebegin : 0;
830247a4 7005 vWARN4(RExC_parse,
b45f050a 7006 "False [] range \"%*.*s\"",
097eb12c 7007 w, w, rangebegin);
afd78fd5 7008 }
73b437c8 7009 if (!SIZE_ONLY)
936ed897 7010 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7011 } else
ffc61ed2
JH
7012 range = 1; /* yeah, it's a range! */
7013 continue; /* but do it the next time */
a0d0e21e 7014 }
a687059c 7015 }
ffc61ed2 7016
93733859 7017 /* now is the next time */
07be1b83 7018 /*stored += (value - prevvalue + 1);*/
ae5c130c 7019 if (!SIZE_ONLY) {
3568d838 7020 if (prevvalue < 256) {
1df70142 7021 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7022 IV i;
3568d838 7023#ifdef EBCDIC
1b2d223b
JH
7024 /* In EBCDIC [\x89-\x91] should include
7025 * the \x8e but [i-j] should not. */
7026 if (literal_endpoint == 2 &&
7027 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7028 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7029 {
3568d838
JH
7030 if (isLOWER(prevvalue)) {
7031 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7032 if (isLOWER(i))
7033 ANYOF_BITMAP_SET(ret, i);
7034 } else {
3568d838 7035 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7036 if (isUPPER(i))
7037 ANYOF_BITMAP_SET(ret, i);
7038 }
8ada0baa 7039 }
ffc61ed2 7040 else
8ada0baa 7041#endif
07be1b83
YO
7042 for (i = prevvalue; i <= ceilvalue; i++) {
7043 if (!ANYOF_BITMAP_TEST(ret,i)) {
7044 stored++;
7045 ANYOF_BITMAP_SET(ret, i);
7046 }
7047 }
3568d838 7048 }
a5961de5 7049 if (value > 255 || UTF) {
1df70142
AL
7050 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7051 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7052 stored+=2; /* can't optimize this class */
ffc61ed2 7053 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7054 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7055 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7056 prevnatvalue, natvalue);
7057 }
7058 else if (prevnatvalue == natvalue) {
7059 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7060 if (FOLD) {
89ebb4a3 7061 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7062 STRLEN foldlen;
1df70142 7063 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7064
e294cc5d
JH
7065#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7066 if (RExC_precomp[0] == ':' &&
7067 RExC_precomp[1] == '[' &&
7068 (f == 0xDF || f == 0x92)) {
7069 f = NATIVE_TO_UNI(f);
7070 }
7071#endif
c840d2a2
JH
7072 /* If folding and foldable and a single
7073 * character, insert also the folded version
7074 * to the charclass. */
9e55ce06 7075 if (f != value) {
e294cc5d
JH
7076#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7077 if ((RExC_precomp[0] == ':' &&
7078 RExC_precomp[1] == '[' &&
7079 (f == 0xA2 &&
7080 (value == 0xFB05 || value == 0xFB06))) ?
7081 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7082 foldlen == (STRLEN)UNISKIP(f) )
7083#else
eb160463 7084 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7085#endif
9e55ce06
JH
7086 Perl_sv_catpvf(aTHX_ listsv,
7087 "%04"UVxf"\n", f);
7088 else {
7089 /* Any multicharacter foldings
7090 * require the following transform:
7091 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7092 * where E folds into "pq" and F folds
7093 * into "rst", all other characters
7094 * fold to single characters. We save
7095 * away these multicharacter foldings,
7096 * to be later saved as part of the
7097 * additional "s" data. */
7098 SV *sv;
7099
7100 if (!unicode_alternate)
7101 unicode_alternate = newAV();
7102 sv = newSVpvn((char*)foldbuf, foldlen);
7103 SvUTF8_on(sv);
7104 av_push(unicode_alternate, sv);
7105 }
7106 }
254ba52a 7107
60a8b682
JH
7108 /* If folding and the value is one of the Greek
7109 * sigmas insert a few more sigmas to make the
7110 * folding rules of the sigmas to work right.
7111 * Note that not all the possible combinations
7112 * are handled here: some of them are handled
9e55ce06
JH
7113 * by the standard folding rules, and some of
7114 * them (literal or EXACTF cases) are handled
7115 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7116 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7117 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7118 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7119 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7120 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7121 }
7122 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7123 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7124 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7125 }
7126 }
ffc61ed2 7127 }
1b2d223b
JH
7128#ifdef EBCDIC
7129 literal_endpoint = 0;
7130#endif
8ada0baa 7131 }
ffc61ed2
JH
7132
7133 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7134 }
ffc61ed2 7135
936ed897 7136 if (need_class) {
4f66b38d 7137 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7138 if (SIZE_ONLY)
830247a4 7139 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7140 else
830247a4 7141 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7142 }
ffc61ed2 7143
7f6f358c
YO
7144
7145 if (SIZE_ONLY)
7146 return ret;
7147 /****** !SIZE_ONLY AFTER HERE *********/
7148
7149 if( stored == 1 && value < 256
7150 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7151 ) {
7152 /* optimize single char class to an EXACT node
7153 but *only* when its not a UTF/high char */
07be1b83
YO
7154 const char * cur_parse= RExC_parse;
7155 RExC_emit = (regnode *)orig_emit;
7156 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7157 ret = reg_node(pRExC_state,
7158 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7159 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7160 *STRING(ret)= (char)value;
7161 STR_LEN(ret)= 1;
7162 RExC_emit += STR_SZ(1);
7163 return ret;
7164 }
ae5c130c 7165 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7166 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7167 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7168 ) {
a0ed51b3 7169 for (value = 0; value < 256; ++value) {
936ed897 7170 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7171 UV fold = PL_fold[value];
ffc61ed2
JH
7172
7173 if (fold != value)
7174 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7175 }
7176 }
936ed897 7177 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7178 }
ffc61ed2 7179
ae5c130c 7180 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7181 if (optimize_invert &&
ffc61ed2
JH
7182 /* If the only flag is inversion. */
7183 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7184 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7185 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7186 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7187 }
7f6f358c 7188 {
097eb12c 7189 AV * const av = newAV();
ffc61ed2 7190 SV *rv;
9e55ce06 7191 /* The 0th element stores the character class description
6a0407ee 7192 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7193 * to initialize the appropriate swash (which gets stored in
7194 * the 1st element), and also useful for dumping the regnode.
7195 * The 2nd element stores the multicharacter foldings,
6a0407ee 7196 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7197 av_store(av, 0, listsv);
7198 av_store(av, 1, NULL);
9e55ce06 7199 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7200 rv = newRV_noinc((SV*)av);
19860706 7201 n = add_data(pRExC_state, 1, "s");
830247a4 7202 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7203 ARG_SET(ret, n);
a0ed51b3 7204 }
a0ed51b3
LW
7205 return ret;
7206}
7207
76e3520e 7208STATIC char*
830247a4 7209S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7210{
097eb12c 7211 char* const retval = RExC_parse++;
a0d0e21e 7212
4633a7c4 7213 for (;;) {
830247a4
IZ
7214 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7215 RExC_parse[2] == '#') {
e994fd66
AE
7216 while (*RExC_parse != ')') {
7217 if (RExC_parse == RExC_end)
7218 FAIL("Sequence (?#... not terminated");
830247a4 7219 RExC_parse++;
e994fd66 7220 }
830247a4 7221 RExC_parse++;
4633a7c4
LW
7222 continue;
7223 }
e2509266 7224 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7225 if (isSPACE(*RExC_parse)) {
7226 RExC_parse++;
748a9306
LW
7227 continue;
7228 }
830247a4 7229 else if (*RExC_parse == '#') {
e994fd66
AE
7230 while (RExC_parse < RExC_end)
7231 if (*RExC_parse++ == '\n') break;
748a9306
LW
7232 continue;
7233 }
748a9306 7234 }
4633a7c4 7235 return retval;
a0d0e21e 7236 }
a687059c
LW
7237}
7238
7239/*
c277df42 7240- reg_node - emit a node
a0d0e21e 7241*/
76e3520e 7242STATIC regnode * /* Location. */
830247a4 7243S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7244{
97aff369 7245 dVAR;
c277df42 7246 register regnode *ptr;
504618e9 7247 regnode * const ret = RExC_emit;
07be1b83 7248 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7249
c277df42 7250 if (SIZE_ONLY) {
830247a4
IZ
7251 SIZE_ALIGN(RExC_size);
7252 RExC_size += 1;
a0d0e21e
LW
7253 return(ret);
7254 }
c277df42 7255 NODE_ALIGN_FILL(ret);
a0d0e21e 7256 ptr = ret;
c277df42 7257 FILL_ADVANCE_NODE(ptr, op);
fac92740 7258 if (RExC_offsets) { /* MJD */
07be1b83 7259 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7260 "reg_node", __LINE__,
7261 reg_name[op],
07be1b83
YO
7262 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7263 ? "Overwriting end of array!\n" : "OK",
7264 (UV)(RExC_emit - RExC_emit_start),
7265 (UV)(RExC_parse - RExC_start),
7266 (UV)RExC_offsets[0]));
ccb2c380 7267 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7268 }
07be1b83 7269
830247a4 7270 RExC_emit = ptr;
a687059c 7271
a0d0e21e 7272 return(ret);
a687059c
LW
7273}
7274
7275/*
a0d0e21e
LW
7276- reganode - emit a node with an argument
7277*/
76e3520e 7278STATIC regnode * /* Location. */
830247a4 7279S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7280{
97aff369 7281 dVAR;
c277df42 7282 register regnode *ptr;
504618e9 7283 regnode * const ret = RExC_emit;
07be1b83 7284 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7285
c277df42 7286 if (SIZE_ONLY) {
830247a4
IZ
7287 SIZE_ALIGN(RExC_size);
7288 RExC_size += 2;
6bda09f9
YO
7289 /*
7290 We can't do this:
7291
7292 assert(2==regarglen[op]+1);
7293
7294 Anything larger than this has to allocate the extra amount.
7295 If we changed this to be:
7296
7297 RExC_size += (1 + regarglen[op]);
7298
7299 then it wouldn't matter. Its not clear what side effect
7300 might come from that so its not done so far.
7301 -- dmq
7302 */
a0d0e21e
LW
7303 return(ret);
7304 }
fe14fcc3 7305
c277df42 7306 NODE_ALIGN_FILL(ret);
a0d0e21e 7307 ptr = ret;
c277df42 7308 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7309 if (RExC_offsets) { /* MJD */
07be1b83 7310 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7311 "reganode",
ccb2c380
MP
7312 __LINE__,
7313 reg_name[op],
07be1b83 7314 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7315 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7316 (UV)(RExC_emit - RExC_emit_start),
7317 (UV)(RExC_parse - RExC_start),
7318 (UV)RExC_offsets[0]));
ccb2c380 7319 Set_Cur_Node_Offset;
fac92740
MJD
7320 }
7321
830247a4 7322 RExC_emit = ptr;
fe14fcc3 7323
a0d0e21e 7324 return(ret);
fe14fcc3
LW
7325}
7326
7327/*
cd439c50 7328- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7329*/
71207a34
AL
7330STATIC STRLEN
7331S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7332{
97aff369 7333 dVAR;
71207a34 7334 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7335}
7336
7337/*
a0d0e21e
LW
7338- reginsert - insert an operator in front of already-emitted operand
7339*
7340* Means relocating the operand.
7341*/
76e3520e 7342STATIC void
6bda09f9 7343S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7344{
97aff369 7345 dVAR;
c277df42
IZ
7346 register regnode *src;
7347 register regnode *dst;
7348 register regnode *place;
504618e9 7349 const int offset = regarglen[(U8)op];
6bda09f9 7350 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7351 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7352/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7353 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7354 if (SIZE_ONLY) {
6bda09f9 7355 RExC_size += size;
a0d0e21e
LW
7356 return;
7357 }
a687059c 7358
830247a4 7359 src = RExC_emit;
6bda09f9 7360 RExC_emit += size;
830247a4 7361 dst = RExC_emit;
6bda09f9
YO
7362 if (RExC_parens) {
7363 int paren;
7364 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7365 if ( RExC_parens[paren] >= src )
7366 RExC_parens[paren] += size;
7367 }
7368 }
7369
fac92740 7370 while (src > opnd) {
c277df42 7371 StructCopy(--src, --dst, regnode);
fac92740 7372 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7373 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7374 "reg_insert",
ccb2c380
MP
7375 __LINE__,
7376 reg_name[op],
07be1b83
YO
7377 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7378 ? "Overwriting end of array!\n" : "OK",
7379 (UV)(src - RExC_emit_start),
7380 (UV)(dst - RExC_emit_start),
7381 (UV)RExC_offsets[0]));
ccb2c380
MP
7382 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7383 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7384 }
7385 }
7386
a0d0e21e
LW
7387
7388 place = opnd; /* Op node, where operand used to be. */
fac92740 7389 if (RExC_offsets) { /* MJD */
07be1b83 7390 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7391 "reginsert",
ccb2c380
MP
7392 __LINE__,
7393 reg_name[op],
07be1b83 7394 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7395 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7396 (UV)(place - RExC_emit_start),
7397 (UV)(RExC_parse - RExC_start),
786e8c11 7398 (UV)RExC_offsets[0]));
ccb2c380 7399 Set_Node_Offset(place, RExC_parse);
45948336 7400 Set_Node_Length(place, 1);
fac92740 7401 }
c277df42
IZ
7402 src = NEXTOPER(place);
7403 FILL_ADVANCE_NODE(place, op);
7404 Zero(src, offset, regnode);
a687059c
LW
7405}
7406
7407/*
c277df42 7408- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7409- SEE ALSO: regtail_study
a0d0e21e 7410*/
097eb12c 7411/* TODO: All three parms should be const */
76e3520e 7412STATIC void
3dab1dad 7413S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7414{
97aff369 7415 dVAR;
c277df42 7416 register regnode *scan;
72f13be8 7417 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7418#ifndef DEBUGGING
7419 PERL_UNUSED_ARG(depth);
7420#endif
a0d0e21e 7421
c277df42 7422 if (SIZE_ONLY)
a0d0e21e
LW
7423 return;
7424
7425 /* Find last node. */
7426 scan = p;
7427 for (;;) {
504618e9 7428 regnode * const temp = regnext(scan);
3dab1dad
YO
7429 DEBUG_PARSE_r({
7430 SV * const mysv=sv_newmortal();
7431 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7432 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7433 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7434 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7435 (temp == NULL ? "->" : ""),
7436 (temp == NULL ? reg_name[OP(val)] : "")
7437 );
3dab1dad
YO
7438 });
7439 if (temp == NULL)
7440 break;
7441 scan = temp;
7442 }
7443
7444 if (reg_off_by_arg[OP(scan)]) {
7445 ARG_SET(scan, val - scan);
7446 }
7447 else {
7448 NEXT_OFF(scan) = val - scan;
7449 }
7450}
7451
07be1b83 7452#ifdef DEBUGGING
3dab1dad
YO
7453/*
7454- regtail_study - set the next-pointer at the end of a node chain of p to val.
7455- Look for optimizable sequences at the same time.
7456- currently only looks for EXACT chains.
07be1b83
YO
7457
7458This is expermental code. The idea is to use this routine to perform
7459in place optimizations on branches and groups as they are constructed,
7460with the long term intention of removing optimization from study_chunk so
7461that it is purely analytical.
7462
7463Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7464to control which is which.
7465
3dab1dad
YO
7466*/
7467/* TODO: All four parms should be const */
07be1b83 7468
3dab1dad
YO
7469STATIC U8
7470S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7471{
7472 dVAR;
7473 register regnode *scan;
07be1b83
YO
7474 U8 exact = PSEUDO;
7475#ifdef EXPERIMENTAL_INPLACESCAN
7476 I32 min = 0;
7477#endif
7478
3dab1dad
YO
7479 GET_RE_DEBUG_FLAGS_DECL;
7480
07be1b83 7481
3dab1dad
YO
7482 if (SIZE_ONLY)
7483 return exact;
7484
7485 /* Find last node. */
7486
7487 scan = p;
7488 for (;;) {
7489 regnode * const temp = regnext(scan);
07be1b83
YO
7490#ifdef EXPERIMENTAL_INPLACESCAN
7491 if (PL_regkind[OP(scan)] == EXACT)
7492 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7493 return EXACT;
7494#endif
3dab1dad
YO
7495 if ( exact ) {
7496 switch (OP(scan)) {
7497 case EXACT:
7498 case EXACTF:
7499 case EXACTFL:
7500 if( exact == PSEUDO )
7501 exact= OP(scan);
07be1b83
YO
7502 else if ( exact != OP(scan) )
7503 exact= 0;
3dab1dad
YO
7504 case NOTHING:
7505 break;
7506 default:
7507 exact= 0;
7508 }
7509 }
7510 DEBUG_PARSE_r({
7511 SV * const mysv=sv_newmortal();
7512 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7513 regprop(RExC_rx, mysv, scan);
eaf3ca90 7514 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7515 SvPV_nolen_const(mysv),
eaf3ca90
YO
7516 REG_NODE_NUM(scan),
7517 reg_name[exact]);
3dab1dad 7518 });
a0d0e21e
LW
7519 if (temp == NULL)
7520 break;
7521 scan = temp;
7522 }
07be1b83
YO
7523 DEBUG_PARSE_r({
7524 SV * const mysv_val=sv_newmortal();
7525 DEBUG_PARSE_MSG("");
7526 regprop(RExC_rx, mysv_val, val);
7527 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7528 SvPV_nolen_const(mysv_val),
7529 REG_NODE_NUM(val),
7530 val - scan
7531 );
7532 });
c277df42
IZ
7533 if (reg_off_by_arg[OP(scan)]) {
7534 ARG_SET(scan, val - scan);
a0ed51b3
LW
7535 }
7536 else {
c277df42
IZ
7537 NEXT_OFF(scan) = val - scan;
7538 }
3dab1dad
YO
7539
7540 return exact;
a687059c 7541}
07be1b83 7542#endif
a687059c
LW
7543
7544/*
a687059c
LW
7545 - regcurly - a little FSA that accepts {\d+,?\d*}
7546 */
79072805 7547STATIC I32
5f66b61c 7548S_regcurly(register const char *s)
a687059c
LW
7549{
7550 if (*s++ != '{')
7551 return FALSE;
f0fcb552 7552 if (!isDIGIT(*s))
a687059c 7553 return FALSE;
f0fcb552 7554 while (isDIGIT(*s))
a687059c
LW
7555 s++;
7556 if (*s == ',')
7557 s++;
f0fcb552 7558 while (isDIGIT(*s))
a687059c
LW
7559 s++;
7560 if (*s != '}')
7561 return FALSE;
7562 return TRUE;
7563}
7564
a687059c
LW
7565
7566/*
fd181c75 7567 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7568 */
7569void
097eb12c 7570Perl_regdump(pTHX_ const regexp *r)
a687059c 7571{
35ff7856 7572#ifdef DEBUGGING
97aff369 7573 dVAR;
c445ea15 7574 SV * const sv = sv_newmortal();
ab3bbdeb 7575 SV *dsv= sv_newmortal();
a687059c 7576
786e8c11 7577 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7578
7579 /* Header fields of interest. */
ab3bbdeb
YO
7580 if (r->anchored_substr) {
7581 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7582 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7583 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7584 "anchored %s%s at %"IVdf" ",
7585 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7586 (IV)r->anchored_offset);
ab3bbdeb
YO
7587 } else if (r->anchored_utf8) {
7588 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7589 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7590 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7591 "anchored utf8 %s%s at %"IVdf" ",
7592 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7593 (IV)r->anchored_offset);
ab3bbdeb
YO
7594 }
7595 if (r->float_substr) {
7596 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7597 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7598 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7599 "floating %s%s at %"IVdf"..%"UVuf" ",
7600 s, RE_SV_TAIL(r->float_substr),
7b0972df 7601 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7602 } else if (r->float_utf8) {
7603 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7604 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7605 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7606 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7607 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7608 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7609 }
33b8afdf 7610 if (r->check_substr || r->check_utf8)
b81d288d 7611 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7612 (const char *)
7613 (r->check_substr == r->float_substr
7614 && r->check_utf8 == r->float_utf8
7615 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7616 if (r->reganch & ROPT_NOSCAN)
7617 PerlIO_printf(Perl_debug_log, " noscan");
7618 if (r->reganch & ROPT_CHECK_ALL)
7619 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7620 if (r->check_substr || r->check_utf8)
c277df42
IZ
7621 PerlIO_printf(Perl_debug_log, ") ");
7622
46fc3d4c 7623 if (r->regstclass) {
32fc9b6a 7624 regprop(r, sv, r->regstclass);
1de06328 7625 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7626 }
774d564b 7627 if (r->reganch & ROPT_ANCH) {
7628 PerlIO_printf(Perl_debug_log, "anchored");
7629 if (r->reganch & ROPT_ANCH_BOL)
7630 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7631 if (r->reganch & ROPT_ANCH_MBOL)
7632 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7633 if (r->reganch & ROPT_ANCH_SBOL)
7634 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7635 if (r->reganch & ROPT_ANCH_GPOS)
7636 PerlIO_printf(Perl_debug_log, "(GPOS)");
7637 PerlIO_putc(Perl_debug_log, ' ');
7638 }
c277df42
IZ
7639 if (r->reganch & ROPT_GPOS_SEEN)
7640 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 7641 if (r->reganch & ROPT_SKIP)
760ac839 7642 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 7643 if (r->reganch & ROPT_IMPLICIT)
760ac839 7644 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 7645 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
7646 if (r->reganch & ROPT_EVAL_SEEN)
7647 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 7648 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 7649#else
96a5add6 7650 PERL_UNUSED_CONTEXT;
65e66c80 7651 PERL_UNUSED_ARG(r);
17c3b450 7652#endif /* DEBUGGING */
a687059c
LW
7653}
7654
7655/*
a0d0e21e
LW
7656- regprop - printable representation of opcode
7657*/
46fc3d4c 7658void
32fc9b6a 7659Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 7660{
35ff7856 7661#ifdef DEBUGGING
97aff369 7662 dVAR;
9b155405 7663 register int k;
1de06328 7664 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 7665
54dc92de 7666 sv_setpvn(sv, "", 0);
03363afd 7667 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
7668 /* It would be nice to FAIL() here, but this may be called from
7669 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 7670 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 7671 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 7672
3dab1dad 7673 k = PL_regkind[OP(o)];
9b155405 7674
2a782b5b 7675 if (k == EXACT) {
396482e1 7676 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
7677 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
7678 * is a crude hack but it may be the best for now since
7679 * we have no flag "this EXACTish node was UTF-8"
7680 * --jhi */
7681 const char * const s =
ddc5bc0f 7682 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
7683 PL_colors[0], PL_colors[1],
7684 PERL_PV_ESCAPE_UNI_DETECT |
7685 PERL_PV_PRETTY_ELIPSES |
7686 PERL_PV_PRETTY_LTGT
7687 );
7688 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 7689 } else if (k == TRIE) {
3dab1dad 7690 /* print the details of the trie in dumpuntil instead, as
4f639d21 7691 * prog->data isn't available here */
1de06328
YO
7692 const char op = OP(o);
7693 const I32 n = ARG(o);
7694 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
7695 (reg_ac_data *)prog->data->data[n] :
7696 NULL;
7697 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
7698 (reg_trie_data*)prog->data->data[n] :
7699 ac->trie;
7700
7701 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
7702 DEBUG_TRIE_COMPILE_r(
7703 Perl_sv_catpvf(aTHX_ sv,
7704 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7705 (UV)trie->startstate,
7706 (IV)trie->laststate-1,
7707 (UV)trie->wordcount,
7708 (UV)trie->minlen,
7709 (UV)trie->maxlen,
7710 (UV)TRIE_CHARCOUNT(trie),
7711 (UV)trie->uniquecharcount
7712 )
7713 );
7714 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7715 int i;
7716 int rangestart = -1;
f46cb337 7717 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7718 Perl_sv_catpvf(aTHX_ sv, "[");
7719 for (i = 0; i <= 256; i++) {
7720 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7721 if (rangestart == -1)
7722 rangestart = i;
7723 } else if (rangestart != -1) {
7724 if (i <= rangestart + 3)
7725 for (; rangestart < i; rangestart++)
7726 put_byte(sv, rangestart);
7727 else {
7728 put_byte(sv, rangestart);
7729 sv_catpvs(sv, "-");
7730 put_byte(sv, i - 1);
7731 }
7732 rangestart = -1;
7733 }
7734 }
7735 Perl_sv_catpvf(aTHX_ sv, "]");
7736 }
7737
a3621e74 7738 } else if (k == CURLY) {
cb434fcc 7739 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7740 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7741 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7742 }
2c2d71f5
JH
7743 else if (k == WHILEM && o->flags) /* Ordinal/of */
7744 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6bda09f9 7745 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP)
894356b3 7746 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6bda09f9
YO
7747 else if (k == RECURSE)
7748 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9b155405 7749 else if (k == LOGICAL)
04ebc1ab 7750 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7751 else if (k == ANYOF) {
7752 int i, rangestart = -1;
2d03de9c 7753 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7754
7755 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7756 static const char * const anyofs[] = {
653099ff
GS
7757 "\\w",
7758 "\\W",
7759 "\\s",
7760 "\\S",
7761 "\\d",
7762 "\\D",
7763 "[:alnum:]",
7764 "[:^alnum:]",
7765 "[:alpha:]",
7766 "[:^alpha:]",
7767 "[:ascii:]",
7768 "[:^ascii:]",
7769 "[:ctrl:]",
7770 "[:^ctrl:]",
7771 "[:graph:]",
7772 "[:^graph:]",
7773 "[:lower:]",
7774 "[:^lower:]",
7775 "[:print:]",
7776 "[:^print:]",
7777 "[:punct:]",
7778 "[:^punct:]",
7779 "[:upper:]",
aaa51d5e 7780 "[:^upper:]",
653099ff 7781 "[:xdigit:]",
aaa51d5e
JF
7782 "[:^xdigit:]",
7783 "[:space:]",
7784 "[:^space:]",
7785 "[:blank:]",
7786 "[:^blank:]"
653099ff
GS
7787 };
7788
19860706 7789 if (flags & ANYOF_LOCALE)
396482e1 7790 sv_catpvs(sv, "{loc}");
19860706 7791 if (flags & ANYOF_FOLD)
396482e1 7792 sv_catpvs(sv, "{i}");
653099ff 7793 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 7794 if (flags & ANYOF_INVERT)
396482e1 7795 sv_catpvs(sv, "^");
ffc61ed2
JH
7796 for (i = 0; i <= 256; i++) {
7797 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7798 if (rangestart == -1)
7799 rangestart = i;
7800 } else if (rangestart != -1) {
7801 if (i <= rangestart + 3)
7802 for (; rangestart < i; rangestart++)
653099ff 7803 put_byte(sv, rangestart);
ffc61ed2
JH
7804 else {
7805 put_byte(sv, rangestart);
396482e1 7806 sv_catpvs(sv, "-");
ffc61ed2 7807 put_byte(sv, i - 1);
653099ff 7808 }
ffc61ed2 7809 rangestart = -1;
653099ff 7810 }
847a199f 7811 }
ffc61ed2
JH
7812
7813 if (o->flags & ANYOF_CLASS)
bb7a0f54 7814 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
7815 if (ANYOF_CLASS_TEST(o,i))
7816 sv_catpv(sv, anyofs[i]);
7817
7818 if (flags & ANYOF_UNICODE)
396482e1 7819 sv_catpvs(sv, "{unicode}");
1aa99e6b 7820 else if (flags & ANYOF_UNICODE_ALL)
396482e1 7821 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
7822
7823 {
7824 SV *lv;
32fc9b6a 7825 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 7826
ffc61ed2
JH
7827 if (lv) {
7828 if (sw) {
89ebb4a3 7829 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 7830
ffc61ed2 7831 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 7832 uvchr_to_utf8(s, i);
ffc61ed2 7833
3568d838 7834 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
7835 if (rangestart == -1)
7836 rangestart = i;
7837 } else if (rangestart != -1) {
ffc61ed2
JH
7838 if (i <= rangestart + 3)
7839 for (; rangestart < i; rangestart++) {
2d03de9c
AL
7840 const U8 * const e = uvchr_to_utf8(s,rangestart);
7841 U8 *p;
7842 for(p = s; p < e; p++)
ffc61ed2
JH
7843 put_byte(sv, *p);
7844 }
7845 else {
2d03de9c
AL
7846 const U8 *e = uvchr_to_utf8(s,rangestart);
7847 U8 *p;
7848 for (p = s; p < e; p++)
ffc61ed2 7849 put_byte(sv, *p);
396482e1 7850 sv_catpvs(sv, "-");
2d03de9c
AL
7851 e = uvchr_to_utf8(s, i-1);
7852 for (p = s; p < e; p++)
1df70142 7853 put_byte(sv, *p);
ffc61ed2
JH
7854 }
7855 rangestart = -1;
7856 }
19860706 7857 }
ffc61ed2 7858
396482e1 7859 sv_catpvs(sv, "..."); /* et cetera */
19860706 7860 }
fde631ed 7861
ffc61ed2 7862 {
2e0de35c 7863 char *s = savesvpv(lv);
c445ea15 7864 char * const origs = s;
b81d288d 7865
3dab1dad
YO
7866 while (*s && *s != '\n')
7867 s++;
b81d288d 7868
ffc61ed2 7869 if (*s == '\n') {
2d03de9c 7870 const char * const t = ++s;
ffc61ed2
JH
7871
7872 while (*s) {
7873 if (*s == '\n')
7874 *s = ' ';
7875 s++;
7876 }
7877 if (s[-1] == ' ')
7878 s[-1] = 0;
7879
7880 sv_catpv(sv, t);
fde631ed 7881 }
b81d288d 7882
ffc61ed2 7883 Safefree(origs);
fde631ed
JH
7884 }
7885 }
653099ff 7886 }
ffc61ed2 7887
653099ff
GS
7888 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7889 }
9b155405 7890 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 7891 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 7892#else
96a5add6 7893 PERL_UNUSED_CONTEXT;
65e66c80
SP
7894 PERL_UNUSED_ARG(sv);
7895 PERL_UNUSED_ARG(o);
f9049ba1 7896 PERL_UNUSED_ARG(prog);
17c3b450 7897#endif /* DEBUGGING */
35ff7856 7898}
a687059c 7899
cad2e5aa
JH
7900SV *
7901Perl_re_intuit_string(pTHX_ regexp *prog)
7902{ /* Assume that RE_INTUIT is set */
97aff369 7903 dVAR;
a3621e74 7904 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
7905 PERL_UNUSED_CONTEXT;
7906
a3621e74 7907 DEBUG_COMPILE_r(
cfd0369c 7908 {
2d03de9c 7909 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 7910 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
7911
7912 if (!PL_colorset) reginitcolors();
7913 PerlIO_printf(Perl_debug_log,
a0288114 7914 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
7915 PL_colors[4],
7916 prog->check_substr ? "" : "utf8 ",
7917 PL_colors[5],PL_colors[0],
cad2e5aa
JH
7918 s,
7919 PL_colors[1],
7920 (strlen(s) > 60 ? "..." : ""));
7921 } );
7922
33b8afdf 7923 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
7924}
7925
84da74a7
YO
7926/*
7927 pregfree - free a regexp
7928
7929 See regdupe below if you change anything here.
7930*/
7931
2b69d0c2 7932void
864dbfa3 7933Perl_pregfree(pTHX_ struct regexp *r)
a687059c 7934{
27da23d5 7935 dVAR;
0df25f3d 7936
fc32ee4a 7937 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 7938
7821416a
IZ
7939 if (!r || (--r->refcnt > 0))
7940 return;
ab3bbdeb 7941 DEBUG_COMPILE_r({
0df25f3d
YO
7942 if (!PL_colorset)
7943 reginitcolors();
ab3bbdeb
YO
7944 if (RX_DEBUG(r)){
7945 SV *dsv= sv_newmortal();
7946 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7947 dsv, r->precomp, r->prelen, 60);
7948 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7949 PL_colors[4],PL_colors[5],s);
7950 }
9e55ce06 7951 });
cad2e5aa 7952
43c5f42d
NC
7953 /* gcov results gave these as non-null 100% of the time, so there's no
7954 optimisation in checking them before calling Safefree */
7955 Safefree(r->precomp);
7956 Safefree(r->offsets); /* 20010421 MJD */
ed252734 7957 RX_MATCH_COPY_FREE(r);
f8c7b90f 7958#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
7959 if (r->saved_copy)
7960 SvREFCNT_dec(r->saved_copy);
7961#endif
a193d654
GS
7962 if (r->substrs) {
7963 if (r->anchored_substr)
7964 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
7965 if (r->anchored_utf8)
7966 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
7967 if (r->float_substr)
7968 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
7969 if (r->float_utf8)
7970 SvREFCNT_dec(r->float_utf8);
2779dcf1 7971 Safefree(r->substrs);
a193d654 7972 }
81714fb9
YO
7973 if (r->paren_names)
7974 SvREFCNT_dec(r->paren_names);
c277df42
IZ
7975 if (r->data) {
7976 int n = r->data->count;
f3548bdc
DM
7977 PAD* new_comppad = NULL;
7978 PAD* old_comppad;
4026c95a 7979 PADOFFSET refcnt;
dfad63ad 7980
c277df42 7981 while (--n >= 0) {
261faec3 7982 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
7983 switch (r->data->what[n]) {
7984 case 's':
81714fb9 7985 case 'S':
c277df42
IZ
7986 SvREFCNT_dec((SV*)r->data->data[n]);
7987 break;
653099ff
GS
7988 case 'f':
7989 Safefree(r->data->data[n]);
7990 break;
dfad63ad
HS
7991 case 'p':
7992 new_comppad = (AV*)r->data->data[n];
7993 break;
c277df42 7994 case 'o':
dfad63ad 7995 if (new_comppad == NULL)
cea2e8a9 7996 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
7997 PAD_SAVE_LOCAL(old_comppad,
7998 /* Watch out for global destruction's random ordering. */
c445ea15 7999 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8000 );
b34c0dd4 8001 OP_REFCNT_LOCK;
4026c95a
SH
8002 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8003 OP_REFCNT_UNLOCK;
8004 if (!refcnt)
9b978d73 8005 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8006
f3548bdc 8007 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8008 SvREFCNT_dec((SV*)new_comppad);
8009 new_comppad = NULL;
c277df42
IZ
8010 break;
8011 case 'n':
9e55ce06 8012 break;
07be1b83 8013 case 'T':
be8e71aa
YO
8014 { /* Aho Corasick add-on structure for a trie node.
8015 Used in stclass optimization only */
07be1b83
YO
8016 U32 refcount;
8017 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8018 OP_REFCNT_LOCK;
8019 refcount = --aho->refcount;
8020 OP_REFCNT_UNLOCK;
8021 if ( !refcount ) {
8022 Safefree(aho->states);
8023 Safefree(aho->fail);
8024 aho->trie=NULL; /* not necessary to free this as it is
8025 handled by the 't' case */
8026 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8027 Safefree(r->regstclass);
07be1b83
YO
8028 }
8029 }
8030 break;
a3621e74 8031 case 't':
07be1b83 8032 {
be8e71aa 8033 /* trie structure. */
07be1b83
YO
8034 U32 refcount;
8035 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8036 OP_REFCNT_LOCK;
8037 refcount = --trie->refcount;
8038 OP_REFCNT_UNLOCK;
8039 if ( !refcount ) {
8040 Safefree(trie->charmap);
8041 if (trie->widecharmap)
8042 SvREFCNT_dec((SV*)trie->widecharmap);
8043 Safefree(trie->states);
8044 Safefree(trie->trans);
8045 if (trie->bitmap)
8046 Safefree(trie->bitmap);
8047 if (trie->wordlen)
8048 Safefree(trie->wordlen);
786e8c11
YO
8049 if (trie->jump)
8050 Safefree(trie->jump);
8051 if (trie->nextword)
8052 Safefree(trie->nextword);
a3621e74 8053#ifdef DEBUGGING
be8e71aa
YO
8054 if (RX_DEBUG(r)) {
8055 if (trie->words)
8056 SvREFCNT_dec((SV*)trie->words);
8057 if (trie->revcharmap)
8058 SvREFCNT_dec((SV*)trie->revcharmap);
8059 }
a3621e74 8060#endif
07be1b83 8061 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8062 }
07be1b83
YO
8063 }
8064 break;
c277df42 8065 default:
830247a4 8066 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8067 }
8068 }
8069 Safefree(r->data->what);
8070 Safefree(r->data);
a0d0e21e
LW
8071 }
8072 Safefree(r->startp);
8073 Safefree(r->endp);
8074 Safefree(r);
a687059c 8075}
c277df42 8076
84da74a7
YO
8077#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8078#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8079#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8080#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8081
8082/*
8083 regdupe - duplicate a regexp.
8084
8085 This routine is called by sv.c's re_dup and is expected to clone a
8086 given regexp structure. It is a no-op when not under USE_ITHREADS.
8087 (Originally this *was* re_dup() for change history see sv.c)
8088
8089 See pregfree() above if you change anything here.
8090*/
a3c0e9ca 8091#if defined(USE_ITHREADS)
84da74a7
YO
8092regexp *
8093Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8094{
84da74a7
YO
8095 dVAR;
8096 REGEXP *ret;
8097 int i, len, npar;
8098 struct reg_substr_datum *s;
8099
8100 if (!r)
8101 return (REGEXP *)NULL;
8102
8103 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8104 return ret;
8105
8106 len = r->offsets[0];
8107 npar = r->nparens+1;
8108
8109 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8110 Copy(r->program, ret->program, len+1, regnode);
8111
8112 Newx(ret->startp, npar, I32);
8113 Copy(r->startp, ret->startp, npar, I32);
8114 Newx(ret->endp, npar, I32);
8115 Copy(r->startp, ret->startp, npar, I32);
8116
8117 Newx(ret->substrs, 1, struct reg_substr_data);
8118 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8119 s->min_offset = r->substrs->data[i].min_offset;
8120 s->max_offset = r->substrs->data[i].max_offset;
8121 s->end_shift = r->substrs->data[i].end_shift;
8122 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8123 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8124 }
8125
8126 ret->regstclass = NULL;
8127 if (r->data) {
8128 struct reg_data *d;
8129 const int count = r->data->count;
8130 int i;
8131
8132 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8133 char, struct reg_data);
8134 Newx(d->what, count, U8);
8135
8136 d->count = count;
8137 for (i = 0; i < count; i++) {
8138 d->what[i] = r->data->what[i];
8139 switch (d->what[i]) {
8140 /* legal options are one of: sfpont
8141 see also regcomp.h and pregfree() */
8142 case 's':
81714fb9 8143 case 'S':
84da74a7
YO
8144 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8145 break;
8146 case 'p':
8147 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8148 break;
8149 case 'f':
8150 /* This is cheating. */
8151 Newx(d->data[i], 1, struct regnode_charclass_class);
8152 StructCopy(r->data->data[i], d->data[i],
8153 struct regnode_charclass_class);
8154 ret->regstclass = (regnode*)d->data[i];
8155 break;
8156 case 'o':
8157 /* Compiled op trees are readonly, and can thus be
8158 shared without duplication. */
8159 OP_REFCNT_LOCK;
8160 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8161 OP_REFCNT_UNLOCK;
8162 break;
8163 case 'n':
8164 d->data[i] = r->data->data[i];
8165 break;
8166 case 't':
8167 d->data[i] = r->data->data[i];
8168 OP_REFCNT_LOCK;
8169 ((reg_trie_data*)d->data[i])->refcount++;
8170 OP_REFCNT_UNLOCK;
8171 break;
8172 case 'T':
8173 d->data[i] = r->data->data[i];
8174 OP_REFCNT_LOCK;
8175 ((reg_ac_data*)d->data[i])->refcount++;
8176 OP_REFCNT_UNLOCK;
8177 /* Trie stclasses are readonly and can thus be shared
8178 * without duplication. We free the stclass in pregfree
8179 * when the corresponding reg_ac_data struct is freed.
8180 */
8181 ret->regstclass= r->regstclass;
8182 break;
8183 default:
8184 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8185 }
8186 }
8187
8188 ret->data = d;
8189 }
8190 else
8191 ret->data = NULL;
8192
8193 Newx(ret->offsets, 2*len+1, U32);
8194 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8195
8196 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8197 ret->refcnt = r->refcnt;
8198 ret->minlen = r->minlen;
8199 ret->prelen = r->prelen;
8200 ret->nparens = r->nparens;
8201 ret->lastparen = r->lastparen;
8202 ret->lastcloseparen = r->lastcloseparen;
8203 ret->reganch = r->reganch;
8204
8205 ret->sublen = r->sublen;
8206
f9f4320a 8207 ret->engine = r->engine;
81714fb9
YO
8208
8209 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8210
84da74a7
YO
8211 if (RX_MATCH_COPIED(ret))
8212 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8213 else
8214 ret->subbeg = NULL;
8215#ifdef PERL_OLD_COPY_ON_WRITE
8216 ret->saved_copy = NULL;
8217#endif
8218
8219 ptr_table_store(PL_ptr_table, r, ret);
8220 return ret;
84da74a7 8221}
a3c0e9ca 8222#endif
84da74a7 8223
76234dfb 8224#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8225/*
8226 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8227 */
8228regnode *
864dbfa3 8229Perl_regnext(pTHX_ register regnode *p)
c277df42 8230{
97aff369 8231 dVAR;
c277df42
IZ
8232 register I32 offset;
8233
3280af22 8234 if (p == &PL_regdummy)
c277df42
IZ
8235 return(NULL);
8236
8237 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8238 if (offset == 0)
8239 return(NULL);
8240
c277df42 8241 return(p+offset);
c277df42 8242}
76234dfb 8243#endif
c277df42 8244
01f988be 8245STATIC void
cea2e8a9 8246S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8247{
8248 va_list args;
8249 STRLEN l1 = strlen(pat1);
8250 STRLEN l2 = strlen(pat2);
8251 char buf[512];
06bf62c7 8252 SV *msv;
73d840c0 8253 const char *message;
c277df42
IZ
8254
8255 if (l1 > 510)
8256 l1 = 510;
8257 if (l1 + l2 > 510)
8258 l2 = 510 - l1;
8259 Copy(pat1, buf, l1 , char);
8260 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8261 buf[l1 + l2] = '\n';
8262 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8263#ifdef I_STDARG
8264 /* ANSI variant takes additional second argument */
c277df42 8265 va_start(args, pat2);
8736538c
AS
8266#else
8267 va_start(args);
8268#endif
5a844595 8269 msv = vmess(buf, &args);
c277df42 8270 va_end(args);
cfd0369c 8271 message = SvPV_const(msv,l1);
c277df42
IZ
8272 if (l1 > 512)
8273 l1 = 512;
8274 Copy(message, buf, l1 , char);
197cf9b9 8275 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8276 Perl_croak(aTHX_ "%s", buf);
c277df42 8277}
a0ed51b3
LW
8278
8279/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8280
76234dfb 8281#ifndef PERL_IN_XSUB_RE
a0ed51b3 8282void
864dbfa3 8283Perl_save_re_context(pTHX)
b81d288d 8284{
97aff369 8285 dVAR;
1ade1aa1
NC
8286
8287 struct re_save_state *state;
8288
8289 SAVEVPTR(PL_curcop);
8290 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8291
8292 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8293 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8294 SSPUSHINT(SAVEt_RE_STATE);
8295
46ab3289 8296 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8297
a0ed51b3 8298 PL_reg_start_tmp = 0;
a0ed51b3 8299 PL_reg_start_tmpl = 0;
c445ea15 8300 PL_reg_oldsaved = NULL;
a5db57d6 8301 PL_reg_oldsavedlen = 0;
a5db57d6 8302 PL_reg_maxiter = 0;
a5db57d6 8303 PL_reg_leftiter = 0;
c445ea15 8304 PL_reg_poscache = NULL;
a5db57d6 8305 PL_reg_poscache_size = 0;
1ade1aa1
NC
8306#ifdef PERL_OLD_COPY_ON_WRITE
8307 PL_nrs = NULL;
8308#endif
ada6e8a9 8309
c445ea15
AL
8310 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8311 if (PL_curpm) {
8312 const REGEXP * const rx = PM_GETRE(PL_curpm);
8313 if (rx) {
1df70142 8314 U32 i;
ada6e8a9 8315 for (i = 1; i <= rx->nparens; i++) {
1df70142 8316 char digits[TYPE_CHARS(long)];
d9fad198 8317 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8318 GV *const *const gvp
8319 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8320
b37c2d43
AL
8321 if (gvp) {
8322 GV * const gv = *gvp;
8323 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8324 save_scalar(gv);
49f27e4b 8325 }
ada6e8a9
AMS
8326 }
8327 }
8328 }
a0ed51b3 8329}
76234dfb 8330#endif
51371543 8331
51371543 8332static void
acfe0abc 8333clear_re(pTHX_ void *r)
51371543 8334{
97aff369 8335 dVAR;
51371543
GS
8336 ReREFCNT_dec((regexp *)r);
8337}
ffbc6a93 8338
a28509cc
AL
8339#ifdef DEBUGGING
8340
8341STATIC void
8342S_put_byte(pTHX_ SV *sv, int c)
8343{
8344 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8345 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8346 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8347 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8348 else
8349 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8350}
8351
786e8c11 8352
3dab1dad
YO
8353#define CLEAR_OPTSTART \
8354 if (optstart) STMT_START { \
07be1b83 8355 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8356 optstart=NULL; \
8357 } STMT_END
8358
786e8c11 8359#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8360
b5a2f8d8
NC
8361STATIC const regnode *
8362S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8363 const regnode *last, const regnode *plast,
8364 SV* sv, I32 indent, U32 depth)
a28509cc 8365{
97aff369 8366 dVAR;
786e8c11 8367 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8368 register const regnode *next;
3dab1dad
YO
8369 const regnode *optstart= NULL;
8370 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8371
786e8c11
YO
8372#ifdef DEBUG_DUMPUNTIL
8373 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8374 last ? last-start : 0,plast ? plast-start : 0);
8375#endif
8376
8377 if (plast && plast < last)
8378 last= plast;
8379
8380 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8381 /* While that wasn't END last time... */
8382
8383 NODE_ALIGN(node);
8384 op = OP(node);
8385 if (op == CLOSE)
786e8c11 8386 indent--;
b5a2f8d8 8387 next = regnext((regnode *)node);
07be1b83 8388
a28509cc 8389 /* Where, what. */
8e11feef 8390 if (OP(node) == OPTIMIZED) {
e68ec53f 8391 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8392 optstart = node;
3dab1dad 8393 else
8e11feef 8394 goto after_print;
3dab1dad
YO
8395 } else
8396 CLEAR_OPTSTART;
07be1b83 8397
32fc9b6a 8398 regprop(r, sv, node);
a28509cc 8399 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8400 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8401
8402 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8403 if (next == NULL) /* Next ptr. */
8404 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8405 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8406 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8407 else
8408 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8409
1de06328 8410 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8411 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8412 }
8413
a28509cc
AL
8414 after_print:
8415 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8416 assert(next);
8417 {
8418 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8419 ? regnext((regnode *)next)
8420 : next);
be8e71aa
YO
8421 if (last && nnode > last)
8422 nnode = last;
786e8c11 8423 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8424 }
a28509cc
AL
8425 }
8426 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8427 assert(next);
786e8c11 8428 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8429 }
8430 else if ( PL_regkind[(U8)op] == TRIE ) {
1de06328 8431 const char op = OP(node);
a28509cc 8432 const I32 n = ARG(node);
1de06328
YO
8433 const reg_ac_data * const ac = op>=AHOCORASICK ?
8434 (reg_ac_data *)r->data->data[n] :
8435 NULL;
8436 const reg_trie_data * const trie = op<AHOCORASICK ?
8437 (reg_trie_data*)r->data->data[n] :
8438 ac->trie;
786e8c11 8439 const regnode *nextbranch= NULL;
a28509cc 8440 I32 word_idx;
1de06328 8441 sv_setpvn(sv, "", 0);
786e8c11 8442 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8443 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8444
8445 PerlIO_printf(Perl_debug_log, "%*s%s ",
8446 (int)(2*(indent+3)), "",
8447 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8448 PL_colors[0], PL_colors[1],
8449 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8450 PERL_PV_PRETTY_ELIPSES |
8451 PERL_PV_PRETTY_LTGT
786e8c11
YO
8452 )
8453 : "???"
8454 );
8455 if (trie->jump) {
8456 U16 dist= trie->jump[word_idx+1];
8457 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
8458 if (dist) {
8459 if (!nextbranch)
8460 nextbranch= next - trie->jump[0];
8461 DUMPUNTIL(next - dist, nextbranch);
8462 }
8463 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8464 nextbranch= regnext((regnode *)nextbranch);
8465 } else {
8466 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8467 }
786e8c11
YO
8468 }
8469 if (last && next > last)
8470 node= last;
8471 else
8472 node= next;
a28509cc 8473 }
786e8c11
YO
8474 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8475 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8476 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8477 }
8478 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8479 assert(next);
786e8c11 8480 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8481 }
8482 else if ( op == PLUS || op == STAR) {
786e8c11 8483 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8484 }
8485 else if (op == ANYOF) {
8486 /* arglen 1 + class block */
8487 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8488 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8489 node = NEXTOPER(node);
8490 }
8491 else if (PL_regkind[(U8)op] == EXACT) {
8492 /* Literal string, where present. */
8493 node += NODE_SZ_STR(node) - 1;
8494 node = NEXTOPER(node);
8495 }
8496 else {
8497 node = NEXTOPER(node);
8498 node += regarglen[(U8)op];
8499 }
8500 if (op == CURLYX || op == OPEN)
786e8c11 8501 indent++;
a28509cc 8502 else if (op == WHILEM)
786e8c11 8503 indent--;
a28509cc 8504 }
3dab1dad 8505 CLEAR_OPTSTART;
786e8c11
YO
8506#ifdef DEBUG_DUMPUNTIL
8507 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8508#endif
1de06328 8509 return node;
a28509cc
AL
8510}
8511
8512#endif /* DEBUGGING */
8513
241d1a3b
NC
8514/*
8515 * Local variables:
8516 * c-indentation-style: bsd
8517 * c-basic-offset: 4
8518 * indent-tabs-mode: t
8519 * End:
8520 *
37442d52
RGS
8521 * ex: set ts=8 sts=4 sw=4 noet:
8522 */