This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
C++: add -Wno-used-parameter, and drop ODBM_File
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
786e8c11 174#define FULL_TRIE_STUDY
07be1b83
YO
175#define TRIE_STCLASS
176#endif
1de06328
YO
177
178
179/* About scan_data_t.
180
181 During optimisation we recurse through the regexp program performing
182 various inplace (keyhole style) optimisations. In addition study_chunk
183 and scan_commit populate this data structure with information about
184 what strings MUST appear in the pattern. We look for the longest
185 string that must appear for at a fixed location, and we look for the
186 longest string that may appear at a floating location. So for instance
187 in the pattern:
188
189 /FOO[xX]A.*B[xX]BAR/
190
191 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
192 strings (because they follow a .* construct). study_chunk will identify
193 both FOO and BAR as being the longest fixed and floating strings respectively.
194
195 The strings can be composites, for instance
196
197 /(f)(o)(o)/
198
199 will result in a composite fixed substring 'foo'.
200
201 For each string some basic information is maintained:
202
203 - offset or min_offset
204 This is the position the string must appear at, or not before.
205 It also implicitly (when combined with minlenp) tells us how many
206 character must match before the string we are searching.
207 Likewise when combined with minlenp and the length of the string
208 tells us how many characters must appear after the string we have
209 found.
210
211 - max_offset
212 Only used for floating strings. This is the rightmost point that
213 the string can appear at. Ifset to I32 max it indicates that the
214 string can occur infinitely far to the right.
215
216 - minlenp
217 A pointer to the minimum length of the pattern that the string
218 was found inside. This is important as in the case of positive
219 lookahead or positive lookbehind we can have multiple patterns
220 involved. Consider
221
222 /(?=FOO).*F/
223
224 The minimum length of the pattern overall is 3, the minimum length
225 of the lookahead part is 3, but the minimum length of the part that
226 will actually match is 1. So 'FOO's minimum length is 3, but the
227 minimum length for the F is 1. This is important as the minimum length
228 is used to determine offsets in front of and behind the string being
229 looked for. Since strings can be composites this is the length of the
230 pattern at the time it was commited with a scan_commit. Note that
231 the length is calculated by study_chunk, so that the minimum lengths
232 are not known until the full pattern has been compiled, thus the
233 pointer to the value.
234
235 - lookbehind
236
237 In the case of lookbehind the string being searched for can be
238 offset past the start point of the final matching string.
239 If this value was just blithely removed from the min_offset it would
240 invalidate some of the calculations for how many chars must match
241 before or after (as they are derived from min_offset and minlen and
242 the length of the string being searched for).
243 When the final pattern is compiled and the data is moved from the
244 scan_data_t structure into the regexp structure the information
245 about lookbehind is factored in, with the information that would
246 have been lost precalculated in the end_shift field for the
247 associated string.
248
249 The fields pos_min and pos_delta are used to store the minimum offset
250 and the delta to the maximum offset at the current point in the pattern.
251
252*/
2c2d71f5
JH
253
254typedef struct scan_data_t {
1de06328
YO
255 /*I32 len_min; unused */
256 /*I32 len_delta; unused */
2c2d71f5
JH
257 I32 pos_min;
258 I32 pos_delta;
259 SV *last_found;
1de06328 260 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
261 I32 last_start_min;
262 I32 last_start_max;
1de06328
YO
263 SV **longest; /* Either &l_fixed, or &l_float. */
264 SV *longest_fixed; /* longest fixed string found in pattern */
265 I32 offset_fixed; /* offset where it starts */
266 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
267 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
268 SV *longest_float; /* longest floating string found in pattern */
269 I32 offset_float_min; /* earliest point in string it can appear */
270 I32 offset_float_max; /* latest point in string it can appear */
271 I32 *minlen_float; /* pointer to the minlen relevent to the string */
272 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
273 I32 flags;
274 I32 whilem_c;
cb434fcc 275 I32 *last_closep;
653099ff 276 struct regnode_charclass_class *start_class;
2c2d71f5
JH
277} scan_data_t;
278
a687059c 279/*
e50aee73 280 * Forward declarations for pregcomp()'s friends.
a687059c 281 */
a0d0e21e 282
27da23d5 283static const scan_data_t zero_scan_data =
1de06328 284 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
285
286#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
287#define SF_BEFORE_SEOL 0x0001
288#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
289#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
290#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
291
09b7f37c
CB
292#ifdef NO_UNARY_PLUS
293# define SF_FIX_SHIFT_EOL (0+2)
294# define SF_FL_SHIFT_EOL (0+4)
295#else
296# define SF_FIX_SHIFT_EOL (+2)
297# define SF_FL_SHIFT_EOL (+4)
298#endif
c277df42
IZ
299
300#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
301#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
302
303#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
304#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
305#define SF_IS_INF 0x0040
306#define SF_HAS_PAR 0x0080
307#define SF_IN_PAR 0x0100
308#define SF_HAS_EVAL 0x0200
309#define SCF_DO_SUBSTR 0x0400
653099ff
GS
310#define SCF_DO_STCLASS_AND 0x0800
311#define SCF_DO_STCLASS_OR 0x1000
312#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 313#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 314
786e8c11
YO
315#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
316
07be1b83 317
eb160463 318#define UTF (RExC_utf8 != 0)
e2509266
JH
319#define LOC ((RExC_flags & PMf_LOCALE) != 0)
320#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 321
ffc61ed2 322#define OOB_UNICODE 12345678
93733859 323#define OOB_NAMEDCLASS -1
b8c5462f 324
a0ed51b3
LW
325#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
326#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
327
8615cb43 328
b45f050a
JF
329/* length of regex to show in messages that don't mark a position within */
330#define RegexLengthToShowInErrorMessages 127
331
332/*
333 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
334 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
335 * op/pragma/warn/regcomp.
336 */
7253e4e3
RK
337#define MARKER1 "<-- HERE" /* marker as it appears in the description */
338#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 339
7253e4e3 340#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
341
342/*
343 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
344 * arg. Show regex, up to a maximum length. If it's too long, chop and add
345 * "...".
346 */
ccb2c380 347#define FAIL(msg) STMT_START { \
bfed75c6 348 const char *ellipses = ""; \
ccb2c380
MP
349 IV len = RExC_end - RExC_precomp; \
350 \
351 if (!SIZE_ONLY) \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 if (len > RegexLengthToShowInErrorMessages) { \
354 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
355 len = RegexLengthToShowInErrorMessages - 10; \
356 ellipses = "..."; \
357 } \
358 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
359 msg, (int)len, RExC_precomp, ellipses); \
360} STMT_END
8615cb43 361
b45f050a 362/*
b45f050a
JF
363 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
364 */
ccb2c380 365#define Simple_vFAIL(m) STMT_START { \
a28509cc 366 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
367 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
368 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
b45f050a
JF
370
371/*
372 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
373 */
ccb2c380
MP
374#define vFAIL(m) STMT_START { \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 Simple_vFAIL(m); \
378} STMT_END
b45f050a
JF
379
380/*
381 * Like Simple_vFAIL(), but accepts two arguments.
382 */
ccb2c380 383#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 384 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
385 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
386 (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
b45f050a
JF
388
389/*
390 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
391 */
ccb2c380
MP
392#define vFAIL2(m,a1) STMT_START { \
393 if (!SIZE_ONLY) \
394 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
395 Simple_vFAIL2(m, a1); \
396} STMT_END
b45f050a
JF
397
398
399/*
400 * Like Simple_vFAIL(), but accepts three arguments.
401 */
ccb2c380 402#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 403 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
404 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
b45f050a
JF
407
408/*
409 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
410 */
ccb2c380
MP
411#define vFAIL3(m,a1,a2) STMT_START { \
412 if (!SIZE_ONLY) \
413 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 Simple_vFAIL3(m, a1, a2); \
415} STMT_END
b45f050a
JF
416
417/*
418 * Like Simple_vFAIL(), but accepts four arguments.
419 */
ccb2c380 420#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 421 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
422 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
423 (int)offset, RExC_precomp, RExC_precomp + offset); \
424} STMT_END
b45f050a 425
ccb2c380 426#define vWARN(loc,m) STMT_START { \
a28509cc 427 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
431
432#define vWARNdep(loc,m) STMT_START { \
a28509cc 433 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
434 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
435 "%s" REPORT_LOCATION, \
436 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
438
439
440#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 441 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
442 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
443 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
445
446#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 447 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
448 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
449 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
451
452#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
455 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
461 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
9d1d55b5 463
8615cb43 464
cd439c50 465/* Allow for side effects in s */
ccb2c380
MP
466#define REGC(c,s) STMT_START { \
467 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
468} STMT_END
cd439c50 469
fac92740
MJD
470/* Macros for recording node offsets. 20001227 mjd@plover.com
471 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
472 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
473 * Element 0 holds the number n.
07be1b83 474 * Position is 1 indexed.
fac92740
MJD
475 */
476
ccb2c380
MP
477#define Set_Node_Offset_To_R(node,byte) STMT_START { \
478 if (! SIZE_ONLY) { \
479 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 480 __LINE__, (node), (int)(byte))); \
ccb2c380 481 if((node) < 0) { \
551405c4 482 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
483 } else { \
484 RExC_offsets[2*(node)-1] = (byte); \
485 } \
486 } \
487} STMT_END
488
489#define Set_Node_Offset(node,byte) \
490 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
491#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
492
493#define Set_Node_Length_To_R(node,len) STMT_START { \
494 if (! SIZE_ONLY) { \
495 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 496 __LINE__, (int)(node), (int)(len))); \
ccb2c380 497 if((node) < 0) { \
551405c4 498 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
499 } else { \
500 RExC_offsets[2*(node)] = (len); \
501 } \
502 } \
503} STMT_END
504
505#define Set_Node_Length(node,len) \
506 Set_Node_Length_To_R((node)-RExC_emit_start, len)
507#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
508#define Set_Node_Cur_Length(node) \
509 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
510
511/* Get offsets and lengths */
512#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
513#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
514
07be1b83
YO
515#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
517 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
518} STMT_END
519
520
521#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
522#define EXPERIMENTAL_INPLACESCAN
523#endif
524
1de06328
YO
525#define DEBUG_STUDYDATA(data,depth) \
526DEBUG_OPTIMISE_r(if(data){ \
527 PerlIO_printf(Perl_debug_log, \
528 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
529 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
530 (int)(depth)*2, "", \
531 (IV)((data)->pos_min), \
532 (IV)((data)->pos_delta), \
533 (IV)((data)->flags), \
534 (IV)((data)->whilem_c), \
535 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
536 ); \
537 if ((data)->last_found) \
538 PerlIO_printf(Perl_debug_log, \
539 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
540 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
541 SvPVX_const((data)->last_found), \
542 (IV)((data)->last_end), \
543 (IV)((data)->last_start_min), \
544 (IV)((data)->last_start_max), \
545 ((data)->longest && \
546 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
547 SvPVX_const((data)->longest_fixed), \
548 (IV)((data)->offset_fixed), \
549 ((data)->longest && \
550 (data)->longest==&((data)->longest_float)) ? "*" : "", \
551 SvPVX_const((data)->longest_float), \
552 (IV)((data)->offset_float_min), \
553 (IV)((data)->offset_float_max) \
554 ); \
555 PerlIO_printf(Perl_debug_log,"\n"); \
556});
557
acfe0abc 558static void clear_re(pTHX_ void *r);
4327152a 559
653099ff 560/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 561 Update the longest found anchored substring and the longest found
653099ff
GS
562 floating substrings if needed. */
563
4327152a 564STATIC void
1de06328 565S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 566{
e1ec3a88
AL
567 const STRLEN l = CHR_SVLEN(data->last_found);
568 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 569 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 570
c277df42 571 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 572 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
573 if (*data->longest == data->longest_fixed) {
574 data->offset_fixed = l ? data->last_start_min : data->pos_min;
575 if (data->flags & SF_BEFORE_EOL)
b81d288d 576 data->flags
c277df42
IZ
577 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
578 else
579 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
580 data->minlen_fixed=minlenp;
581 data->lookbehind_fixed=0;
a0ed51b3
LW
582 }
583 else {
c277df42 584 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
585 data->offset_float_max = (l
586 ? data->last_start_max
c277df42 587 : data->pos_min + data->pos_delta);
9051bda5
HS
588 if ((U32)data->offset_float_max > (U32)I32_MAX)
589 data->offset_float_max = I32_MAX;
c277df42 590 if (data->flags & SF_BEFORE_EOL)
b81d288d 591 data->flags
c277df42
IZ
592 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
593 else
594 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
595 data->minlen_float=minlenp;
596 data->lookbehind_float=0;
c277df42
IZ
597 }
598 }
599 SvCUR_set(data->last_found, 0);
0eda9292 600 {
a28509cc 601 SV * const sv = data->last_found;
097eb12c
AL
602 if (SvUTF8(sv) && SvMAGICAL(sv)) {
603 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
604 if (mg)
605 mg->mg_len = 0;
606 }
0eda9292 607 }
c277df42
IZ
608 data->last_end = -1;
609 data->flags &= ~SF_BEFORE_EOL;
1de06328 610 DEBUG_STUDYDATA(data,0);
c277df42
IZ
611}
612
653099ff
GS
613/* Can match anything (initialization) */
614STATIC void
097eb12c 615S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 616{
653099ff 617 ANYOF_CLASS_ZERO(cl);
f8bef550 618 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 619 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
620 if (LOC)
621 cl->flags |= ANYOF_LOCALE;
622}
623
624/* Can match anything (initialization) */
625STATIC int
5f66b61c 626S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
627{
628 int value;
629
aaa51d5e 630 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
631 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
632 return 1;
1aa99e6b
IH
633 if (!(cl->flags & ANYOF_UNICODE_ALL))
634 return 0;
10edeb5d 635 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 636 return 0;
653099ff
GS
637 return 1;
638}
639
640/* Can match anything (initialization) */
641STATIC void
097eb12c 642S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 643{
8ecf7187 644 Zero(cl, 1, struct regnode_charclass_class);
653099ff 645 cl->type = ANYOF;
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647}
648
649STATIC void
097eb12c 650S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
8ecf7187 652 Zero(cl, 1, struct regnode_charclass_class);
653099ff 653 cl->type = ANYOF;
830247a4 654 cl_anything(pRExC_state, cl);
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* 'And' a given class with another one. Can create false positives */
660/* We assume that cl is not inverted */
661STATIC void
5f66b61c 662S_cl_and(struct regnode_charclass_class *cl,
a28509cc 663 const struct regnode_charclass_class *and_with)
653099ff 664{
653099ff
GS
665 if (!(and_with->flags & ANYOF_CLASS)
666 && !(cl->flags & ANYOF_CLASS)
667 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
668 && !(and_with->flags & ANYOF_FOLD)
669 && !(cl->flags & ANYOF_FOLD)) {
670 int i;
671
672 if (and_with->flags & ANYOF_INVERT)
673 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
674 cl->bitmap[i] &= ~and_with->bitmap[i];
675 else
676 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
677 cl->bitmap[i] &= and_with->bitmap[i];
678 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
679 if (!(and_with->flags & ANYOF_EOS))
680 cl->flags &= ~ANYOF_EOS;
1aa99e6b 681
14ebb1a2
JH
682 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
683 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
684 cl->flags &= ~ANYOF_UNICODE_ALL;
685 cl->flags |= ANYOF_UNICODE;
686 ARG_SET(cl, ARG(and_with));
687 }
14ebb1a2
JH
688 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
689 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 690 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
691 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
692 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 693 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
694}
695
696/* 'OR' a given class with another one. Can create false positives */
697/* We assume that cl is not inverted */
698STATIC void
097eb12c 699S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 700{
653099ff
GS
701 if (or_with->flags & ANYOF_INVERT) {
702 /* We do not use
703 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
704 * <= (B1 | !B2) | (CL1 | !CL2)
705 * which is wasteful if CL2 is small, but we ignore CL2:
706 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
707 * XXXX Can we handle case-fold? Unclear:
708 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
709 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
710 */
711 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
712 && !(or_with->flags & ANYOF_FOLD)
713 && !(cl->flags & ANYOF_FOLD) ) {
714 int i;
715
716 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
717 cl->bitmap[i] |= ~or_with->bitmap[i];
718 } /* XXXX: logic is complicated otherwise */
719 else {
830247a4 720 cl_anything(pRExC_state, cl);
653099ff
GS
721 }
722 } else {
723 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
724 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 725 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
726 || (cl->flags & ANYOF_FOLD)) ) {
727 int i;
728
729 /* OR char bitmap and class bitmap separately */
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] |= or_with->bitmap[i];
732 if (or_with->flags & ANYOF_CLASS) {
733 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
734 cl->classflags[i] |= or_with->classflags[i];
735 cl->flags |= ANYOF_CLASS;
736 }
737 }
738 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 739 cl_anything(pRExC_state, cl);
653099ff
GS
740 }
741 }
742 if (or_with->flags & ANYOF_EOS)
743 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
744
745 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
746 ARG(cl) != ARG(or_with)) {
747 cl->flags |= ANYOF_UNICODE_ALL;
748 cl->flags &= ~ANYOF_UNICODE;
749 }
750 if (or_with->flags & ANYOF_UNICODE_ALL) {
751 cl->flags |= ANYOF_UNICODE_ALL;
752 cl->flags &= ~ANYOF_UNICODE;
753 }
653099ff
GS
754}
755
a3621e74
YO
756#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
757#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
758#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
759#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
760
3dab1dad
YO
761
762#ifdef DEBUGGING
07be1b83 763/*
3dab1dad
YO
764 dump_trie(trie)
765 dump_trie_interim_list(trie,next_alloc)
766 dump_trie_interim_table(trie,next_alloc)
767
768 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
769 The _interim_ variants are used for debugging the interim
770 tables that are used to generate the final compressed
771 representation which is what dump_trie expects.
772
3dab1dad
YO
773 Part of the reason for their existance is to provide a form
774 of documentation as to how the different representations function.
07be1b83
YO
775
776*/
3dab1dad
YO
777
778/*
779 dump_trie(trie)
780 Dumps the final compressed table form of the trie to Perl_debug_log.
781 Used for debugging make_trie().
782*/
783
784STATIC void
785S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
786{
787 U32 state;
ab3bbdeb
YO
788 SV *sv=sv_newmortal();
789 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
790 GET_RE_DEBUG_FLAGS_DECL;
791
ab3bbdeb 792
3dab1dad
YO
793 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
794 (int)depth * 2 + 2,"",
795 "Match","Base","Ofs" );
796
797 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 798 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 799 if ( tmp ) {
ab3bbdeb
YO
800 PerlIO_printf( Perl_debug_log, "%*s",
801 colwidth,
ddc5bc0f 802 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
803 PL_colors[0], PL_colors[1],
804 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
805 PERL_PV_ESCAPE_FIRSTCHAR
806 )
807 );
3dab1dad
YO
808 }
809 }
810 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
811 (int)depth * 2 + 2,"");
812
813 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 814 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
815 PerlIO_printf( Perl_debug_log, "\n");
816
786e8c11 817 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 818 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
819
820 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
821
822 if ( trie->states[ state ].wordnum ) {
823 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
824 } else {
825 PerlIO_printf( Perl_debug_log, "%6s", "" );
826 }
827
828 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
829
830 if ( base ) {
831 U32 ofs = 0;
832
833 while( ( base + ofs < trie->uniquecharcount ) ||
834 ( base + ofs - trie->uniquecharcount < trie->lasttrans
835 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
836 ofs++;
837
838 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
839
840 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
841 if ( ( base + ofs >= trie->uniquecharcount ) &&
842 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
843 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
844 {
ab3bbdeb
YO
845 PerlIO_printf( Perl_debug_log, "%*"UVXf,
846 colwidth,
3dab1dad
YO
847 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
848 } else {
ab3bbdeb 849 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
850 }
851 }
852
853 PerlIO_printf( Perl_debug_log, "]");
854
855 }
856 PerlIO_printf( Perl_debug_log, "\n" );
857 }
858}
859/*
860 dump_trie_interim_list(trie,next_alloc)
861 Dumps a fully constructed but uncompressed trie in list form.
862 List tries normally only are used for construction when the number of
863 possible chars (trie->uniquecharcount) is very high.
864 Used for debugging make_trie().
865*/
866STATIC void
867S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
868{
869 U32 state;
ab3bbdeb
YO
870 SV *sv=sv_newmortal();
871 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
872 GET_RE_DEBUG_FLAGS_DECL;
873 /* print out the table precompression. */
ab3bbdeb
YO
874 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
875 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
876 "------:-----+-----------------\n" );
3dab1dad
YO
877
878 for( state=1 ; state < next_alloc ; state ++ ) {
879 U16 charid;
880
ab3bbdeb 881 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
882 (int)depth * 2 + 2,"", (UV)state );
883 if ( ! trie->states[ state ].wordnum ) {
884 PerlIO_printf( Perl_debug_log, "%5s| ","");
885 } else {
886 PerlIO_printf( Perl_debug_log, "W%4x| ",
887 trie->states[ state ].wordnum
888 );
889 }
890 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 891 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
892 if ( tmp ) {
893 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
894 colwidth,
ddc5bc0f 895 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
896 PL_colors[0], PL_colors[1],
897 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
898 PERL_PV_ESCAPE_FIRSTCHAR
899 ) ,
3dab1dad
YO
900 TRIE_LIST_ITEM(state,charid).forid,
901 (UV)TRIE_LIST_ITEM(state,charid).newstate
902 );
903 }
ab3bbdeb
YO
904 }
905 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
906 }
907}
908
909/*
910 dump_trie_interim_table(trie,next_alloc)
911 Dumps a fully constructed but uncompressed trie in table form.
912 This is the normal DFA style state transition table, with a few
913 twists to facilitate compression later.
914 Used for debugging make_trie().
915*/
916STATIC void
917S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
918{
919 U32 state;
920 U16 charid;
ab3bbdeb
YO
921 SV *sv=sv_newmortal();
922 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
923 GET_RE_DEBUG_FLAGS_DECL;
924
925 /*
926 print out the table precompression so that we can do a visual check
927 that they are identical.
928 */
929
930 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
931
932 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 933 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 934 if ( tmp ) {
ab3bbdeb
YO
935 PerlIO_printf( Perl_debug_log, "%*s",
936 colwidth,
ddc5bc0f 937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
941 )
942 );
3dab1dad
YO
943 }
944 }
945
946 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
947
948 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 949 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
950 }
951
952 PerlIO_printf( Perl_debug_log, "\n" );
953
954 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
955
956 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
957 (int)depth * 2 + 2,"",
958 (UV)TRIE_NODENUM( state ) );
959
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
961 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
962 if (v)
963 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
964 else
965 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
966 }
967 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
968 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
969 } else {
970 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
971 trie->states[ TRIE_NODENUM( state ) ].wordnum );
972 }
973 }
07be1b83 974}
3dab1dad
YO
975
976#endif
977
786e8c11
YO
978/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
979 startbranch: the first branch in the whole branch sequence
980 first : start branch of sequence of branch-exact nodes.
981 May be the same as startbranch
982 last : Thing following the last branch.
983 May be the same as tail.
984 tail : item following the branch sequence
985 count : words in the sequence
986 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
987 depth : indent depth
3dab1dad 988
786e8c11 989Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 990
786e8c11
YO
991A trie is an N'ary tree where the branches are determined by digital
992decomposition of the key. IE, at the root node you look up the 1st character and
993follow that branch repeat until you find the end of the branches. Nodes can be
994marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 995
786e8c11 996 /he|she|his|hers/
72f13be8 997
786e8c11
YO
998would convert into the following structure. Numbers represent states, letters
999following numbers represent valid transitions on the letter from that state, if
1000the number is in square brackets it represents an accepting state, otherwise it
1001will be in parenthesis.
07be1b83 1002
786e8c11
YO
1003 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1004 | |
1005 | (2)
1006 | |
1007 (1) +-i->(6)-+-s->[7]
1008 |
1009 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1010
786e8c11
YO
1011 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1012
1013This shows that when matching against the string 'hers' we will begin at state 1
1014read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1015then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1016is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1017single traverse. We store a mapping from accepting to state to which word was
1018matched, and then when we have multiple possibilities we try to complete the
1019rest of the regex in the order in which they occured in the alternation.
1020
1021The only prior NFA like behaviour that would be changed by the TRIE support is
1022the silent ignoring of duplicate alternations which are of the form:
1023
1024 / (DUPE|DUPE) X? (?{ ... }) Y /x
1025
1026Thus EVAL blocks follwing a trie may be called a different number of times with
1027and without the optimisation. With the optimisations dupes will be silently
1028ignored. This inconsistant behaviour of EVAL type nodes is well established as
1029the following demonstrates:
1030
1031 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1032
1033which prints out 'word' three times, but
1034
1035 'words'=~/(word|word|word)(?{ print $1 })S/
1036
1037which doesnt print it out at all. This is due to other optimisations kicking in.
1038
1039Example of what happens on a structural level:
1040
1041The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1042
1043 1: CURLYM[1] {1,32767}(18)
1044 5: BRANCH(8)
1045 6: EXACT <ac>(16)
1046 8: BRANCH(11)
1047 9: EXACT <ad>(16)
1048 11: BRANCH(14)
1049 12: EXACT <ab>(16)
1050 16: SUCCEED(0)
1051 17: NOTHING(18)
1052 18: END(0)
1053
1054This would be optimizable with startbranch=5, first=5, last=16, tail=16
1055and should turn into:
1056
1057 1: CURLYM[1] {1,32767}(18)
1058 5: TRIE(16)
1059 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1060 <ac>
1061 <ad>
1062 <ab>
1063 16: SUCCEED(0)
1064 17: NOTHING(18)
1065 18: END(0)
1066
1067Cases where tail != last would be like /(?foo|bar)baz/:
1068
1069 1: BRANCH(4)
1070 2: EXACT <foo>(8)
1071 4: BRANCH(7)
1072 5: EXACT <bar>(8)
1073 7: TAIL(8)
1074 8: EXACT <baz>(10)
1075 10: END(0)
1076
1077which would be optimizable with startbranch=1, first=1, last=7, tail=8
1078and would end up looking like:
1079
1080 1: TRIE(8)
1081 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1082 <foo>
1083 <bar>
1084 7: TAIL(8)
1085 8: EXACT <baz>(10)
1086 10: END(0)
1087
1088 d = uvuni_to_utf8_flags(d, uv, 0);
1089
1090is the recommended Unicode-aware way of saying
1091
1092 *(d++) = uv;
1093*/
1094
1095#define TRIE_STORE_REVCHAR \
1096 STMT_START { \
1097 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1098 if (UTF) SvUTF8_on(tmp); \
1099 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1100 } STMT_END
1101
1102#define TRIE_READ_CHAR STMT_START { \
1103 wordlen++; \
1104 if ( UTF ) { \
1105 if ( folder ) { \
1106 if ( foldlen > 0 ) { \
1107 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1108 foldlen -= len; \
1109 scan += len; \
1110 len = 0; \
1111 } else { \
1112 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1113 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1114 foldlen -= UNISKIP( uvc ); \
1115 scan = foldbuf + UNISKIP( uvc ); \
1116 } \
1117 } else { \
1118 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1119 } \
1120 } else { \
1121 uvc = (U32)*uc; \
1122 len = 1; \
1123 } \
1124} STMT_END
1125
1126
1127
1128#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1129 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1130 TRIE_LIST_LEN( state ) *= 2; \
1131 Renew( trie->states[ state ].trans.list, \
1132 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1133 } \
1134 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1135 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1136 TRIE_LIST_CUR( state )++; \
1137} STMT_END
07be1b83 1138
786e8c11
YO
1139#define TRIE_LIST_NEW(state) STMT_START { \
1140 Newxz( trie->states[ state ].trans.list, \
1141 4, reg_trie_trans_le ); \
1142 TRIE_LIST_CUR( state ) = 1; \
1143 TRIE_LIST_LEN( state ) = 4; \
1144} STMT_END
07be1b83 1145
786e8c11
YO
1146#define TRIE_HANDLE_WORD(state) STMT_START { \
1147 U16 dupe= trie->states[ state ].wordnum; \
1148 regnode * const noper_next = regnext( noper ); \
1149 \
1150 if (trie->wordlen) \
1151 trie->wordlen[ curword ] = wordlen; \
1152 DEBUG_r({ \
1153 /* store the word for dumping */ \
1154 SV* tmp; \
1155 if (OP(noper) != NOTHING) \
1156 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1157 else \
1158 tmp = newSVpvn( "", 0 ); \
1159 if ( UTF ) SvUTF8_on( tmp ); \
1160 av_push( trie->words, tmp ); \
1161 }); \
1162 \
1163 curword++; \
1164 \
1165 if ( noper_next < tail ) { \
1166 if (!trie->jump) \
1167 Newxz( trie->jump, word_count + 1, U16); \
1168 trie->jump[curword] = (U16)(tail - noper_next); \
1169 if (!jumper) \
1170 jumper = noper_next; \
1171 if (!nextbranch) \
1172 nextbranch= regnext(cur); \
1173 } \
1174 \
1175 if ( dupe ) { \
1176 /* So it's a dupe. This means we need to maintain a */\
1177 /* linked-list from the first to the next. */\
1178 /* we only allocate the nextword buffer when there */\
1179 /* a dupe, so first time we have to do the allocation */\
1180 if (!trie->nextword) \
1181 Newxz( trie->nextword, word_count + 1, U16); \
1182 while ( trie->nextword[dupe] ) \
1183 dupe= trie->nextword[dupe]; \
1184 trie->nextword[dupe]= curword; \
1185 } else { \
1186 /* we haven't inserted this word yet. */ \
1187 trie->states[ state ].wordnum = curword; \
1188 } \
1189} STMT_END
07be1b83 1190
3dab1dad 1191
786e8c11
YO
1192#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1193 ( ( base + charid >= ucharcount \
1194 && base + charid < ubound \
1195 && state == trie->trans[ base - ucharcount + charid ].check \
1196 && trie->trans[ base - ucharcount + charid ].next ) \
1197 ? trie->trans[ base - ucharcount + charid ].next \
1198 : ( state==1 ? special : 0 ) \
1199 )
3dab1dad 1200
786e8c11
YO
1201#define MADE_TRIE 1
1202#define MADE_JUMP_TRIE 2
1203#define MADE_EXACT_TRIE 4
3dab1dad 1204
a3621e74 1205STATIC I32
786e8c11 1206S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1207{
27da23d5 1208 dVAR;
a3621e74
YO
1209 /* first pass, loop through and scan words */
1210 reg_trie_data *trie;
1211 regnode *cur;
9f7f3913 1212 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1213 STRLEN len = 0;
1214 UV uvc = 0;
1215 U16 curword = 0;
1216 U32 next_alloc = 0;
786e8c11
YO
1217 regnode *jumper = NULL;
1218 regnode *nextbranch = NULL;
a3621e74 1219 /* we just use folder as a flag in utf8 */
e1ec3a88 1220 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1221 ? PL_fold
1222 : ( flags == EXACTFL
1223 ? PL_fold_locale
1224 : NULL
1225 )
1226 );
1227
e1ec3a88 1228 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1229 SV *re_trie_maxbuff;
3dab1dad
YO
1230#ifndef DEBUGGING
1231 /* these are only used during construction but are useful during
8e11feef 1232 * debugging so we store them in the struct when debugging.
8e11feef 1233 */
3dab1dad 1234 STRLEN trie_charcount=0;
3dab1dad
YO
1235 AV *trie_revcharmap;
1236#endif
a3621e74 1237 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1238#ifndef DEBUGGING
1239 PERL_UNUSED_ARG(depth);
1240#endif
a3621e74 1241
a02a5408 1242 Newxz( trie, 1, reg_trie_data );
a3621e74 1243 trie->refcount = 1;
3dab1dad 1244 trie->startstate = 1;
786e8c11 1245 trie->wordcount = word_count;
a3621e74 1246 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1247 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1248 if (!(UTF && folder))
1249 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1250 DEBUG_r({
1251 trie->words = newAV();
a3621e74 1252 });
3dab1dad 1253 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1254
0111c4fd 1255 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1256 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1257 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1258 }
3dab1dad
YO
1259 DEBUG_OPTIMISE_r({
1260 PerlIO_printf( Perl_debug_log,
786e8c11 1261 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1262 (int)depth * 2 + 2, "",
1263 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1264 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1265 (int)depth);
3dab1dad 1266 });
a3621e74
YO
1267 /* -- First loop and Setup --
1268
1269 We first traverse the branches and scan each word to determine if it
1270 contains widechars, and how many unique chars there are, this is
1271 important as we have to build a table with at least as many columns as we
1272 have unique chars.
1273
1274 We use an array of integers to represent the character codes 0..255
1275 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1276 native representation of the character value as the key and IV's for the
1277 coded index.
1278
1279 *TODO* If we keep track of how many times each character is used we can
1280 remap the columns so that the table compression later on is more
1281 efficient in terms of memory by ensuring most common value is in the
1282 middle and the least common are on the outside. IMO this would be better
1283 than a most to least common mapping as theres a decent chance the most
1284 common letter will share a node with the least common, meaning the node
1285 will not be compressable. With a middle is most common approach the worst
1286 case is when we have the least common nodes twice.
1287
1288 */
1289
a3621e74 1290 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1291 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1292 const U8 *uc = (U8*)STRING( noper );
a28509cc 1293 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1294 STRLEN foldlen = 0;
1295 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1296 const U8 *scan = (U8*)NULL;
07be1b83 1297 U32 wordlen = 0; /* required init */
3dab1dad 1298 STRLEN chars=0;
a3621e74 1299
3dab1dad
YO
1300 if (OP(noper) == NOTHING) {
1301 trie->minlen= 0;
1302 continue;
1303 }
1304 if (trie->bitmap) {
1305 TRIE_BITMAP_SET(trie,*uc);
1306 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1307 }
a3621e74 1308 for ( ; uc < e ; uc += len ) {
3dab1dad 1309 TRIE_CHARCOUNT(trie)++;
a3621e74 1310 TRIE_READ_CHAR;
3dab1dad 1311 chars++;
a3621e74
YO
1312 if ( uvc < 256 ) {
1313 if ( !trie->charmap[ uvc ] ) {
1314 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1315 if ( folder )
1316 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1317 TRIE_STORE_REVCHAR;
a3621e74
YO
1318 }
1319 } else {
1320 SV** svpp;
1321 if ( !trie->widecharmap )
1322 trie->widecharmap = newHV();
1323
1324 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1325
1326 if ( !svpp )
e4584336 1327 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1328
1329 if ( !SvTRUE( *svpp ) ) {
1330 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1331 TRIE_STORE_REVCHAR;
a3621e74
YO
1332 }
1333 }
1334 }
3dab1dad
YO
1335 if( cur == first ) {
1336 trie->minlen=chars;
1337 trie->maxlen=chars;
1338 } else if (chars < trie->minlen) {
1339 trie->minlen=chars;
1340 } else if (chars > trie->maxlen) {
1341 trie->maxlen=chars;
1342 }
1343
a3621e74
YO
1344 } /* end first pass */
1345 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1346 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1347 (int)depth * 2 + 2,"",
85c3142d 1348 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1349 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1350 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1351 );
786e8c11 1352 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1353
1354 /*
1355 We now know what we are dealing with in terms of unique chars and
1356 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1357 representation using a flat table will take. If it's over a reasonable
1358 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1359 conservative but potentially much slower representation using an array
1360 of lists.
1361
1362 At the end we convert both representations into the same compressed
1363 form that will be used in regexec.c for matching with. The latter
1364 is a form that cannot be used to construct with but has memory
1365 properties similar to the list form and access properties similar
1366 to the table form making it both suitable for fast searches and
1367 small enough that its feasable to store for the duration of a program.
1368
1369 See the comment in the code where the compressed table is produced
1370 inplace from the flat tabe representation for an explanation of how
1371 the compression works.
1372
1373 */
1374
1375
3dab1dad 1376 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1377 /*
1378 Second Pass -- Array Of Lists Representation
1379
1380 Each state will be represented by a list of charid:state records
1381 (reg_trie_trans_le) the first such element holds the CUR and LEN
1382 points of the allocated array. (See defines above).
1383
1384 We build the initial structure using the lists, and then convert
1385 it into the compressed table form which allows faster lookups
1386 (but cant be modified once converted).
a3621e74
YO
1387 */
1388
a3621e74
YO
1389 STRLEN transcount = 1;
1390
3dab1dad 1391 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1392 TRIE_LIST_NEW(1);
1393 next_alloc = 2;
1394
1395 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1396
c445ea15
AL
1397 regnode * const noper = NEXTOPER( cur );
1398 U8 *uc = (U8*)STRING( noper );
1399 const U8 * const e = uc + STR_LEN( noper );
1400 U32 state = 1; /* required init */
1401 U16 charid = 0; /* sanity init */
1402 U8 *scan = (U8*)NULL; /* sanity init */
1403 STRLEN foldlen = 0; /* required init */
07be1b83 1404 U32 wordlen = 0; /* required init */
c445ea15
AL
1405 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1406
3dab1dad 1407 if (OP(noper) != NOTHING) {
786e8c11 1408 for ( ; uc < e ; uc += len ) {
c445ea15 1409
786e8c11 1410 TRIE_READ_CHAR;
c445ea15 1411
786e8c11
YO
1412 if ( uvc < 256 ) {
1413 charid = trie->charmap[ uvc ];
c445ea15 1414 } else {
786e8c11
YO
1415 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1416 if ( !svpp ) {
1417 charid = 0;
1418 } else {
1419 charid=(U16)SvIV( *svpp );
1420 }
c445ea15 1421 }
786e8c11
YO
1422 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1423 if ( charid ) {
a3621e74 1424
786e8c11
YO
1425 U16 check;
1426 U32 newstate = 0;
a3621e74 1427
786e8c11
YO
1428 charid--;
1429 if ( !trie->states[ state ].trans.list ) {
1430 TRIE_LIST_NEW( state );
c445ea15 1431 }
786e8c11
YO
1432 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1433 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1434 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1435 break;
1436 }
1437 }
1438 if ( ! newstate ) {
1439 newstate = next_alloc++;
1440 TRIE_LIST_PUSH( state, charid, newstate );
1441 transcount++;
1442 }
1443 state = newstate;
1444 } else {
1445 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1446 }
a28509cc 1447 }
c445ea15 1448 }
3dab1dad 1449 TRIE_HANDLE_WORD(state);
a3621e74
YO
1450
1451 } /* end second pass */
1452
786e8c11 1453 trie->laststate = next_alloc;
a3621e74
YO
1454 Renew( trie->states, next_alloc, reg_trie_state );
1455
3dab1dad
YO
1456 /* and now dump it out before we compress it */
1457 DEBUG_TRIE_COMPILE_MORE_r(
1458 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1459 );
a3621e74 1460
a02a5408 1461 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1462 {
1463 U32 state;
a3621e74
YO
1464 U32 tp = 0;
1465 U32 zp = 0;
1466
1467
1468 for( state=1 ; state < next_alloc ; state ++ ) {
1469 U32 base=0;
1470
1471 /*
1472 DEBUG_TRIE_COMPILE_MORE_r(
1473 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1474 );
1475 */
1476
1477 if (trie->states[state].trans.list) {
1478 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1479 U16 maxid=minid;
a28509cc 1480 U16 idx;
a3621e74
YO
1481
1482 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1483 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1484 if ( forid < minid ) {
1485 minid=forid;
1486 } else if ( forid > maxid ) {
1487 maxid=forid;
1488 }
a3621e74
YO
1489 }
1490 if ( transcount < tp + maxid - minid + 1) {
1491 transcount *= 2;
1492 Renew( trie->trans, transcount, reg_trie_trans );
1493 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1494 }
1495 base = trie->uniquecharcount + tp - minid;
1496 if ( maxid == minid ) {
1497 U32 set = 0;
1498 for ( ; zp < tp ; zp++ ) {
1499 if ( ! trie->trans[ zp ].next ) {
1500 base = trie->uniquecharcount + zp - minid;
1501 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1502 trie->trans[ zp ].check = state;
1503 set = 1;
1504 break;
1505 }
1506 }
1507 if ( !set ) {
1508 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1509 trie->trans[ tp ].check = state;
1510 tp++;
1511 zp = tp;
1512 }
1513 } else {
1514 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1515 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1516 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1517 trie->trans[ tid ].check = state;
1518 }
1519 tp += ( maxid - minid + 1 );
1520 }
1521 Safefree(trie->states[ state ].trans.list);
1522 }
1523 /*
1524 DEBUG_TRIE_COMPILE_MORE_r(
1525 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1526 );
1527 */
1528 trie->states[ state ].trans.base=base;
1529 }
cc601c31 1530 trie->lasttrans = tp + 1;
a3621e74
YO
1531 }
1532 } else {
1533 /*
1534 Second Pass -- Flat Table Representation.
1535
1536 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1537 We know that we will need Charcount+1 trans at most to store the data
1538 (one row per char at worst case) So we preallocate both structures
1539 assuming worst case.
1540
1541 We then construct the trie using only the .next slots of the entry
1542 structs.
1543
1544 We use the .check field of the first entry of the node temporarily to
1545 make compression both faster and easier by keeping track of how many non
1546 zero fields are in the node.
1547
1548 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1549 transition.
1550
1551 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1552 number representing the first entry of the node, and state as a
1553 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1554 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1555 are 2 entrys per node. eg:
1556
1557 A B A B
1558 1. 2 4 1. 3 7
1559 2. 0 3 3. 0 5
1560 3. 0 0 5. 0 0
1561 4. 0 0 7. 0 0
1562
1563 The table is internally in the right hand, idx form. However as we also
1564 have to deal with the states array which is indexed by nodenum we have to
1565 use TRIE_NODENUM() to convert.
1566
1567 */
1568
3dab1dad
YO
1569
1570 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1571 reg_trie_trans );
3dab1dad 1572 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1573 next_alloc = trie->uniquecharcount + 1;
1574
3dab1dad 1575
a3621e74
YO
1576 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1577
c445ea15 1578 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1579 const U8 *uc = (U8*)STRING( noper );
1580 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1581
1582 U32 state = 1; /* required init */
1583
1584 U16 charid = 0; /* sanity init */
1585 U32 accept_state = 0; /* sanity init */
1586 U8 *scan = (U8*)NULL; /* sanity init */
1587
1588 STRLEN foldlen = 0; /* required init */
07be1b83 1589 U32 wordlen = 0; /* required init */
a3621e74
YO
1590 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1591
3dab1dad 1592 if ( OP(noper) != NOTHING ) {
786e8c11 1593 for ( ; uc < e ; uc += len ) {
a3621e74 1594
786e8c11 1595 TRIE_READ_CHAR;
a3621e74 1596
786e8c11
YO
1597 if ( uvc < 256 ) {
1598 charid = trie->charmap[ uvc ];
1599 } else {
1600 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1601 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1602 }
786e8c11
YO
1603 if ( charid ) {
1604 charid--;
1605 if ( !trie->trans[ state + charid ].next ) {
1606 trie->trans[ state + charid ].next = next_alloc;
1607 trie->trans[ state ].check++;
1608 next_alloc += trie->uniquecharcount;
1609 }
1610 state = trie->trans[ state + charid ].next;
1611 } else {
1612 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1613 }
1614 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1615 }
a3621e74 1616 }
3dab1dad
YO
1617 accept_state = TRIE_NODENUM( state );
1618 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1619
1620 } /* end second pass */
1621
3dab1dad
YO
1622 /* and now dump it out before we compress it */
1623 DEBUG_TRIE_COMPILE_MORE_r(
1624 dump_trie_interim_table(trie,next_alloc,depth+1)
1625 );
a3621e74 1626
a3621e74
YO
1627 {
1628 /*
1629 * Inplace compress the table.*
1630
1631 For sparse data sets the table constructed by the trie algorithm will
1632 be mostly 0/FAIL transitions or to put it another way mostly empty.
1633 (Note that leaf nodes will not contain any transitions.)
1634
1635 This algorithm compresses the tables by eliminating most such
1636 transitions, at the cost of a modest bit of extra work during lookup:
1637
1638 - Each states[] entry contains a .base field which indicates the
1639 index in the state[] array wheres its transition data is stored.
1640
1641 - If .base is 0 there are no valid transitions from that node.
1642
1643 - If .base is nonzero then charid is added to it to find an entry in
1644 the trans array.
1645
1646 -If trans[states[state].base+charid].check!=state then the
1647 transition is taken to be a 0/Fail transition. Thus if there are fail
1648 transitions at the front of the node then the .base offset will point
1649 somewhere inside the previous nodes data (or maybe even into a node
1650 even earlier), but the .check field determines if the transition is
1651 valid.
1652
786e8c11 1653 XXX - wrong maybe?
a3621e74
YO
1654 The following process inplace converts the table to the compressed
1655 table: We first do not compress the root node 1,and mark its all its
1656 .check pointers as 1 and set its .base pointer as 1 as well. This
1657 allows to do a DFA construction from the compressed table later, and
1658 ensures that any .base pointers we calculate later are greater than
1659 0.
1660
1661 - We set 'pos' to indicate the first entry of the second node.
1662
1663 - We then iterate over the columns of the node, finding the first and
1664 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1665 and set the .check pointers accordingly, and advance pos
1666 appropriately and repreat for the next node. Note that when we copy
1667 the next pointers we have to convert them from the original
1668 NODEIDX form to NODENUM form as the former is not valid post
1669 compression.
1670
1671 - If a node has no transitions used we mark its base as 0 and do not
1672 advance the pos pointer.
1673
1674 - If a node only has one transition we use a second pointer into the
1675 structure to fill in allocated fail transitions from other states.
1676 This pointer is independent of the main pointer and scans forward
1677 looking for null transitions that are allocated to a state. When it
1678 finds one it writes the single transition into the "hole". If the
786e8c11 1679 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1680
1681 - Once compressed we can Renew/realloc the structures to release the
1682 excess space.
1683
1684 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1685 specifically Fig 3.47 and the associated pseudocode.
1686
1687 demq
1688 */
a3b680e6 1689 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1690 U32 state, charid;
a3621e74 1691 U32 pos = 0, zp=0;
786e8c11 1692 trie->laststate = laststate;
a3621e74
YO
1693
1694 for ( state = 1 ; state < laststate ; state++ ) {
1695 U8 flag = 0;
a28509cc
AL
1696 const U32 stateidx = TRIE_NODEIDX( state );
1697 const U32 o_used = trie->trans[ stateidx ].check;
1698 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1699 trie->trans[ stateidx ].check = 0;
1700
1701 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1702 if ( flag || trie->trans[ stateidx + charid ].next ) {
1703 if ( trie->trans[ stateidx + charid ].next ) {
1704 if (o_used == 1) {
1705 for ( ; zp < pos ; zp++ ) {
1706 if ( ! trie->trans[ zp ].next ) {
1707 break;
1708 }
1709 }
1710 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1711 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1712 trie->trans[ zp ].check = state;
1713 if ( ++zp > pos ) pos = zp;
1714 break;
1715 }
1716 used--;
1717 }
1718 if ( !flag ) {
1719 flag = 1;
1720 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1721 }
1722 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1723 trie->trans[ pos ].check = state;
1724 pos++;
1725 }
1726 }
1727 }
cc601c31 1728 trie->lasttrans = pos + 1;
a3621e74
YO
1729 Renew( trie->states, laststate + 1, reg_trie_state);
1730 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1731 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1732 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1733 (int)depth * 2 + 2,"",
1734 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1735 (IV)next_alloc,
1736 (IV)pos,
a3621e74
YO
1737 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1738 );
1739
1740 } /* end table compress */
1741 }
cc601c31
YO
1742 /* resize the trans array to remove unused space */
1743 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1744
3dab1dad
YO
1745 /* and now dump out the compressed format */
1746 DEBUG_TRIE_COMPILE_r(
1747 dump_trie(trie,depth+1)
1748 );
07be1b83 1749
3dab1dad
YO
1750 { /* Modify the program and insert the new TRIE node*/
1751 regnode *convert;
1752 U8 nodetype =(U8)(flags & 0xFF);
1753 char *str=NULL;
786e8c11 1754
07be1b83 1755#ifdef DEBUGGING
786e8c11 1756
07be1b83
YO
1757 U32 mjd_offset;
1758 U32 mjd_nodelen;
1759#endif
a3621e74 1760 /*
3dab1dad
YO
1761 This means we convert either the first branch or the first Exact,
1762 depending on whether the thing following (in 'last') is a branch
1763 or not and whther first is the startbranch (ie is it a sub part of
1764 the alternation or is it the whole thing.)
1765 Assuming its a sub part we conver the EXACT otherwise we convert
1766 the whole branch sequence, including the first.
a3621e74 1767 */
3dab1dad
YO
1768 /* Find the node we are going to overwrite */
1769 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1770 /* whole branch chain */
3dab1dad 1771 convert = first;
07be1b83
YO
1772 DEBUG_r({
1773 const regnode *nop = NEXTOPER( convert );
1774 mjd_offset= Node_Offset((nop));
1775 mjd_nodelen= Node_Length((nop));
1776 });
1777 } else {
1778 /* branch sub-chain */
3dab1dad
YO
1779 convert = NEXTOPER( first );
1780 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1781 DEBUG_r({
1782 mjd_offset= Node_Offset((convert));
1783 mjd_nodelen= Node_Length((convert));
1784 });
1785 }
1786 DEBUG_OPTIMISE_r(
1787 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1788 (int)depth * 2 + 2, "",
786e8c11 1789 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1790 );
a3621e74 1791
3dab1dad
YO
1792 /* But first we check to see if there is a common prefix we can
1793 split out as an EXACT and put in front of the TRIE node. */
1794 trie->startstate= 1;
786e8c11 1795 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1796 U32 state;
1797 DEBUG_OPTIMISE_r(
8e11feef
RGS
1798 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1799 (int)depth * 2 + 2, "",
786e8c11 1800 (UV)trie->laststate)
8e11feef 1801 );
786e8c11 1802 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1803 U32 ofs = 0;
8e11feef
RGS
1804 I32 idx = -1;
1805 U32 count = 0;
1806 const U32 base = trie->states[ state ].trans.base;
a3621e74 1807
3dab1dad 1808 if ( trie->states[state].wordnum )
8e11feef 1809 count = 1;
a3621e74 1810
8e11feef 1811 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1812 if ( ( base + ofs >= trie->uniquecharcount ) &&
1813 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1814 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1815 {
3dab1dad 1816 if ( ++count > 1 ) {
8e11feef 1817 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1818 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1819 if ( state == 1 ) break;
3dab1dad
YO
1820 if ( count == 2 ) {
1821 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1822 DEBUG_OPTIMISE_r(
8e11feef
RGS
1823 PerlIO_printf(Perl_debug_log,
1824 "%*sNew Start State=%"UVuf" Class: [",
1825 (int)depth * 2 + 2, "",
786e8c11 1826 (UV)state));
be8e71aa
YO
1827 if (idx >= 0) {
1828 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1829 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1830
3dab1dad 1831 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1832 if ( folder )
1833 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1834 DEBUG_OPTIMISE_r(
07be1b83 1835 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1836 );
8e11feef
RGS
1837 }
1838 }
1839 TRIE_BITMAP_SET(trie,*ch);
1840 if ( folder )
1841 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1842 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1843 }
1844 idx = ofs;
1845 }
3dab1dad
YO
1846 }
1847 if ( count == 1 ) {
1848 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1849 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1850 DEBUG_OPTIMISE_r(
8e11feef
RGS
1851 PerlIO_printf( Perl_debug_log,
1852 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1853 (int)depth * 2 + 2, "",
786e8c11 1854 (UV)state, (UV)idx, ch)
3dab1dad
YO
1855 );
1856 if ( state==1 ) {
1857 OP( convert ) = nodetype;
1858 str=STRING(convert);
1859 STR_LEN(convert)=0;
1860 }
1861 *str++=*ch;
1862 STR_LEN(convert)++;
a3621e74 1863
8e11feef 1864 } else {
f9049ba1 1865#ifdef DEBUGGING
8e11feef
RGS
1866 if (state>1)
1867 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1868#endif
8e11feef
RGS
1869 break;
1870 }
1871 }
3dab1dad 1872 if (str) {
8e11feef 1873 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1874 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1875 trie->startstate = state;
07be1b83
YO
1876 trie->minlen -= (state - 1);
1877 trie->maxlen -= (state - 1);
1878 DEBUG_r({
1879 regnode *fix = convert;
1880 mjd_nodelen++;
1881 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1882 while( ++fix < n ) {
1883 Set_Node_Offset_Length(fix, 0, 0);
1884 }
1885 });
8e11feef
RGS
1886 if (trie->maxlen) {
1887 convert = n;
1888 } else {
3dab1dad
YO
1889 NEXT_OFF(convert) = (U16)(tail - convert);
1890 }
1891 }
1892 }
1893 if ( trie->maxlen ) {
8e11feef
RGS
1894 NEXT_OFF( convert ) = (U16)(tail - convert);
1895 ARG_SET( convert, data_slot );
786e8c11
YO
1896 /* Store the offset to the first unabsorbed branch in
1897 jump[0], which is otherwise unused by the jump logic.
1898 We use this when dumping a trie and during optimisation. */
1899 if (trie->jump)
1900 trie->jump[0] = (U16)(tail - nextbranch);
1901 if (!jumper)
85c3142d 1902 jumper = last;
786e8c11
YO
1903 /* XXXX */
1904 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1905 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1906 {
1907 OP( convert ) = TRIEC;
1908 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1909 Safefree(trie->bitmap);
1910 trie->bitmap= NULL;
1911 } else
1912 OP( convert ) = TRIE;
a3621e74 1913
3dab1dad
YO
1914 /* store the type in the flags */
1915 convert->flags = nodetype;
1916 /* XXX We really should free up the resource in trie now, as we wont use them */
1917 }
a3621e74
YO
1918 /* needed for dumping*/
1919 DEBUG_r({
786e8c11
YO
1920 regnode *optimize = convert
1921 + NODE_STEP_REGNODE
1922 + regarglen[ OP( convert ) ];
07be1b83
YO
1923 regnode *opt = convert;
1924 while (++opt<optimize) {
1925 Set_Node_Offset_Length(opt,0,0);
1926 }
786e8c11
YO
1927 /*
1928 Try to clean up some of the debris left after the
1929 optimisation.
a3621e74 1930 */
786e8c11 1931 while( optimize < jumper ) {
07be1b83 1932 mjd_nodelen += Node_Length((optimize));
a3621e74 1933 OP( optimize ) = OPTIMIZED;
07be1b83 1934 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1935 optimize++;
1936 }
07be1b83 1937 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1938 });
1939 } /* end node insert */
07be1b83 1940#ifndef DEBUGGING
6e8b4190 1941 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1942#endif
786e8c11
YO
1943 return trie->jump
1944 ? MADE_JUMP_TRIE
1945 : trie->startstate>1
1946 ? MADE_EXACT_TRIE
1947 : MADE_TRIE;
1948}
1949
1950STATIC void
1951S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1952{
1953/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1954
1955 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1956 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1957 ISBN 0-201-10088-6
1958
1959 We find the fail state for each state in the trie, this state is the longest proper
1960 suffix of the current states 'word' that is also a proper prefix of another word in our
1961 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1962 the DFA not to have to restart after its tried and failed a word at a given point, it
1963 simply continues as though it had been matching the other word in the first place.
1964 Consider
1965 'abcdgu'=~/abcdefg|cdgu/
1966 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1967 fail, which would bring use to the state representing 'd' in the second word where we would
1968 try 'g' and succeed, prodceding to match 'cdgu'.
1969 */
1970 /* add a fail transition */
1971 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1972 U32 *q;
1973 const U32 ucharcount = trie->uniquecharcount;
1974 const U32 numstates = trie->laststate;
1975 const U32 ubound = trie->lasttrans + ucharcount;
1976 U32 q_read = 0;
1977 U32 q_write = 0;
1978 U32 charid;
1979 U32 base = trie->states[ 1 ].trans.base;
1980 U32 *fail;
1981 reg_ac_data *aho;
1982 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1983 GET_RE_DEBUG_FLAGS_DECL;
1984#ifndef DEBUGGING
1985 PERL_UNUSED_ARG(depth);
1986#endif
1987
1988
1989 ARG_SET( stclass, data_slot );
1990 Newxz( aho, 1, reg_ac_data );
1991 RExC_rx->data->data[ data_slot ] = (void*)aho;
1992 aho->trie=trie;
1993 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1994 (trie->laststate+1)*sizeof(reg_trie_state));
1995 Newxz( q, numstates, U32);
1996 Newxz( aho->fail, numstates, U32 );
1997 aho->refcount = 1;
1998 fail = aho->fail;
1999 /* initialize fail[0..1] to be 1 so that we always have
2000 a valid final fail state */
2001 fail[ 0 ] = fail[ 1 ] = 1;
2002
2003 for ( charid = 0; charid < ucharcount ; charid++ ) {
2004 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2005 if ( newstate ) {
2006 q[ q_write ] = newstate;
2007 /* set to point at the root */
2008 fail[ q[ q_write++ ] ]=1;
2009 }
2010 }
2011 while ( q_read < q_write) {
2012 const U32 cur = q[ q_read++ % numstates ];
2013 base = trie->states[ cur ].trans.base;
2014
2015 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2016 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2017 if (ch_state) {
2018 U32 fail_state = cur;
2019 U32 fail_base;
2020 do {
2021 fail_state = fail[ fail_state ];
2022 fail_base = aho->states[ fail_state ].trans.base;
2023 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2024
2025 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2026 fail[ ch_state ] = fail_state;
2027 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2028 {
2029 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2030 }
2031 q[ q_write++ % numstates] = ch_state;
2032 }
2033 }
2034 }
2035 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2036 when we fail in state 1, this allows us to use the
2037 charclass scan to find a valid start char. This is based on the principle
2038 that theres a good chance the string being searched contains lots of stuff
2039 that cant be a start char.
2040 */
2041 fail[ 0 ] = fail[ 1 ] = 0;
2042 DEBUG_TRIE_COMPILE_r({
2043 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2044 for( q_read=1; q_read<numstates; q_read++ ) {
2045 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2046 }
2047 PerlIO_printf(Perl_debug_log, "\n");
2048 });
2049 Safefree(q);
2050 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2051}
2052
786e8c11 2053
a3621e74 2054/*
5d1c421c
JH
2055 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2056 * These need to be revisited when a newer toolchain becomes available.
2057 */
2058#if defined(__sparc64__) && defined(__GNUC__)
2059# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2060# undef SPARC64_GCC_WORKAROUND
2061# define SPARC64_GCC_WORKAROUND 1
2062# endif
2063#endif
2064
07be1b83
YO
2065#define DEBUG_PEEP(str,scan,depth) \
2066 DEBUG_OPTIMISE_r({ \
2067 SV * const mysv=sv_newmortal(); \
2068 regnode *Next = regnext(scan); \
2069 regprop(RExC_rx, mysv, scan); \
2070 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2071 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2072 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2073 });
2074
1de06328
YO
2075
2076
2077
2078
07be1b83
YO
2079#define JOIN_EXACT(scan,min,flags) \
2080 if (PL_regkind[OP(scan)] == EXACT) \
2081 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2082
be8e71aa 2083STATIC U32
07be1b83
YO
2084S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2085 /* Merge several consecutive EXACTish nodes into one. */
2086 regnode *n = regnext(scan);
2087 U32 stringok = 1;
2088 regnode *next = scan + NODE_SZ_STR(scan);
2089 U32 merged = 0;
2090 U32 stopnow = 0;
2091#ifdef DEBUGGING
2092 regnode *stop = scan;
72f13be8 2093 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2094#else
d47053eb
RGS
2095 PERL_UNUSED_ARG(depth);
2096#endif
2097#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2098 PERL_UNUSED_ARG(flags);
2099 PERL_UNUSED_ARG(val);
07be1b83 2100#endif
07be1b83
YO
2101 DEBUG_PEEP("join",scan,depth);
2102
2103 /* Skip NOTHING, merge EXACT*. */
2104 while (n &&
2105 ( PL_regkind[OP(n)] == NOTHING ||
2106 (stringok && (OP(n) == OP(scan))))
2107 && NEXT_OFF(n)
2108 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2109
2110 if (OP(n) == TAIL || n > next)
2111 stringok = 0;
2112 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2113 DEBUG_PEEP("skip:",n,depth);
2114 NEXT_OFF(scan) += NEXT_OFF(n);
2115 next = n + NODE_STEP_REGNODE;
2116#ifdef DEBUGGING
2117 if (stringok)
2118 stop = n;
2119#endif
2120 n = regnext(n);
2121 }
2122 else if (stringok) {
786e8c11 2123 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2124 regnode * const nnext = regnext(n);
2125
2126 DEBUG_PEEP("merg",n,depth);
2127
2128 merged++;
2129 if (oldl + STR_LEN(n) > U8_MAX)
2130 break;
2131 NEXT_OFF(scan) += NEXT_OFF(n);
2132 STR_LEN(scan) += STR_LEN(n);
2133 next = n + NODE_SZ_STR(n);
2134 /* Now we can overwrite *n : */
2135 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2136#ifdef DEBUGGING
2137 stop = next - 1;
2138#endif
2139 n = nnext;
2140 if (stopnow) break;
2141 }
2142
d47053eb
RGS
2143#ifdef EXPERIMENTAL_INPLACESCAN
2144 if (flags && !NEXT_OFF(n)) {
2145 DEBUG_PEEP("atch", val, depth);
2146 if (reg_off_by_arg[OP(n)]) {
2147 ARG_SET(n, val - n);
2148 }
2149 else {
2150 NEXT_OFF(n) = val - n;
2151 }
2152 stopnow = 1;
2153 }
07be1b83
YO
2154#endif
2155 }
2156
2157 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2158 /*
2159 Two problematic code points in Unicode casefolding of EXACT nodes:
2160
2161 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2162 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2163
2164 which casefold to
2165
2166 Unicode UTF-8
2167
2168 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2169 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2170
2171 This means that in case-insensitive matching (or "loose matching",
2172 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2173 length of the above casefolded versions) can match a target string
2174 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2175 This would rather mess up the minimum length computation.
2176
2177 What we'll do is to look for the tail four bytes, and then peek
2178 at the preceding two bytes to see whether we need to decrease
2179 the minimum length by four (six minus two).
2180
2181 Thanks to the design of UTF-8, there cannot be false matches:
2182 A sequence of valid UTF-8 bytes cannot be a subsequence of
2183 another valid sequence of UTF-8 bytes.
2184
2185 */
2186 char * const s0 = STRING(scan), *s, *t;
2187 char * const s1 = s0 + STR_LEN(scan) - 1;
2188 char * const s2 = s1 - 4;
e294cc5d
JH
2189#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2190 const char t0[] = "\xaf\x49\xaf\x42";
2191#else
07be1b83 2192 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2193#endif
07be1b83
YO
2194 const char * const t1 = t0 + 3;
2195
2196 for (s = s0 + 2;
2197 s < s2 && (t = ninstr(s, s1, t0, t1));
2198 s = t + 4) {
e294cc5d
JH
2199#ifdef EBCDIC
2200 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2201 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2202#else
07be1b83
YO
2203 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2204 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2205#endif
07be1b83
YO
2206 *min -= 4;
2207 }
2208 }
2209
2210#ifdef DEBUGGING
2211 /* Allow dumping */
2212 n = scan + NODE_SZ_STR(scan);
2213 while (n <= stop) {
2214 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2215 OP(n) = OPTIMIZED;
2216 NEXT_OFF(n) = 0;
2217 }
2218 n++;
2219 }
2220#endif
2221 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2222 return stopnow;
2223}
2224
653099ff
GS
2225/* REx optimizer. Converts nodes into quickier variants "in place".
2226 Finds fixed substrings. */
2227
a0288114 2228/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2229 to the position after last scanned or to NULL. */
2230
07be1b83
YO
2231
2232
76e3520e 2233STATIC I32
1de06328
YO
2234S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2235 I32 *minlenp, I32 *deltap,
9a957fbc 2236 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2237 /* scanp: Start here (read-write). */
2238 /* deltap: Write maxlen-minlen here. */
2239 /* last: Stop before this one. */
2240{
97aff369 2241 dVAR;
c277df42
IZ
2242 I32 min = 0, pars = 0, code;
2243 regnode *scan = *scanp, *next;
2244 I32 delta = 0;
2245 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2246 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2247 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2248 scan_data_t data_fake;
653099ff 2249 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2250 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2251 regnode *first_non_open = scan;
2252
a3621e74
YO
2253
2254 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2255#ifdef DEBUGGING
2256 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2257#endif
786e8c11
YO
2258 if ( depth == 0 ) {
2259 while (first_non_open && OP(first_non_open) == OPEN)
2260 first_non_open=regnext(first_non_open);
2261 }
2262
b81d288d 2263
c277df42
IZ
2264 while (scan && OP(scan) != END && scan < last) {
2265 /* Peephole optimizer: */
1de06328 2266 DEBUG_STUDYDATA(data,depth);
07be1b83 2267 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2268 JOIN_EXACT(scan,&min,0);
a3621e74 2269
653099ff
GS
2270 /* Follow the next-chain of the current node and optimize
2271 away all the NOTHINGs from it. */
c277df42 2272 if (OP(scan) != CURLYX) {
a3b680e6 2273 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2274 ? I32_MAX
2275 /* I32 may be smaller than U16 on CRAYs! */
2276 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2277 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2278 int noff;
2279 regnode *n = scan;
b81d288d 2280
c277df42
IZ
2281 /* Skip NOTHING and LONGJMP. */
2282 while ((n = regnext(n))
3dab1dad 2283 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2284 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2285 && off + noff < max)
2286 off += noff;
2287 if (reg_off_by_arg[OP(scan)])
2288 ARG(scan) = off;
b81d288d 2289 else
c277df42
IZ
2290 NEXT_OFF(scan) = off;
2291 }
a3621e74 2292
07be1b83 2293
3dab1dad 2294
653099ff
GS
2295 /* The principal pseudo-switch. Cannot be a switch, since we
2296 look into several different things. */
b81d288d 2297 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2298 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2299 next = regnext(scan);
2300 code = OP(scan);
a3621e74 2301 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2302
2303 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2304 /* NOTE - There is similar code to this block below for handling
2305 TRIE nodes on a re-study. If you change stuff here check there
2306 too. */
c277df42 2307 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2308 struct regnode_charclass_class accum;
d4c19fe8 2309 regnode * const startbranch=scan;
c277df42 2310
653099ff 2311 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2312 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2313 if (flags & SCF_DO_STCLASS)
830247a4 2314 cl_init_zero(pRExC_state, &accum);
a3621e74 2315
c277df42 2316 while (OP(scan) == code) {
830247a4 2317 I32 deltanext, minnext, f = 0, fake;
653099ff 2318 struct regnode_charclass_class this_class;
c277df42
IZ
2319
2320 num++;
2321 data_fake.flags = 0;
b81d288d 2322 if (data) {
2c2d71f5 2323 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2324 data_fake.last_closep = data->last_closep;
2325 }
2326 else
2327 data_fake.last_closep = &fake;
c277df42
IZ
2328 next = regnext(scan);
2329 scan = NEXTOPER(scan);
2330 if (code != BRANCH)
2331 scan = NEXTOPER(scan);
653099ff 2332 if (flags & SCF_DO_STCLASS) {
830247a4 2333 cl_init(pRExC_state, &this_class);
653099ff
GS
2334 data_fake.start_class = &this_class;
2335 f = SCF_DO_STCLASS_AND;
b81d288d 2336 }
e1901655
IZ
2337 if (flags & SCF_WHILEM_VISITED_POS)
2338 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2339
653099ff 2340 /* we suppose the run is continuous, last=next...*/
1de06328 2341 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2342 next, &data_fake, f,depth+1);
b81d288d 2343 if (min1 > minnext)
c277df42
IZ
2344 min1 = minnext;
2345 if (max1 < minnext + deltanext)
2346 max1 = minnext + deltanext;
2347 if (deltanext == I32_MAX)
aca2d497 2348 is_inf = is_inf_internal = 1;
c277df42
IZ
2349 scan = next;
2350 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2351 pars++;
3dab1dad
YO
2352 if (data) {
2353 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2354 data->flags |= SF_HAS_EVAL;
2c2d71f5 2355 data->whilem_c = data_fake.whilem_c;
3dab1dad 2356 }
653099ff 2357 if (flags & SCF_DO_STCLASS)
830247a4 2358 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2359 if (code == SUSPEND)
c277df42
IZ
2360 break;
2361 }
2362 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2363 min1 = 0;
2364 if (flags & SCF_DO_SUBSTR) {
2365 data->pos_min += min1;
2366 data->pos_delta += max1 - min1;
2367 if (max1 != min1 || is_inf)
2368 data->longest = &(data->longest_float);
2369 }
2370 min += min1;
2371 delta += max1 - min1;
653099ff 2372 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2373 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2374 if (min1) {
2375 cl_and(data->start_class, &and_with);
2376 flags &= ~SCF_DO_STCLASS;
2377 }
2378 }
2379 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2380 if (min1) {
2381 cl_and(data->start_class, &accum);
653099ff 2382 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2383 }
2384 else {
b81d288d 2385 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2386 * data->start_class */
2387 StructCopy(data->start_class, &and_with,
2388 struct regnode_charclass_class);
2389 flags &= ~SCF_DO_STCLASS_AND;
2390 StructCopy(&accum, data->start_class,
2391 struct regnode_charclass_class);
2392 flags |= SCF_DO_STCLASS_OR;
2393 data->start_class->flags |= ANYOF_EOS;
2394 }
653099ff 2395 }
a3621e74 2396
786e8c11 2397 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2398 /* demq.
2399
2400 Assuming this was/is a branch we are dealing with: 'scan' now
2401 points at the item that follows the branch sequence, whatever
2402 it is. We now start at the beginning of the sequence and look
2403 for subsequences of
2404
786e8c11
YO
2405 BRANCH->EXACT=>x1
2406 BRANCH->EXACT=>x2
2407 tail
a3621e74
YO
2408
2409 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2410
2411 If we can find such a subseqence we need to turn the first
2412 element into a trie and then add the subsequent branch exact
2413 strings to the trie.
2414
2415 We have two cases
2416
786e8c11 2417 1. patterns where the whole set of branch can be converted.
a3621e74 2418
786e8c11 2419 2. patterns where only a subset can be converted.
a3621e74
YO
2420
2421 In case 1 we can replace the whole set with a single regop
2422 for the trie. In case 2 we need to keep the start and end
2423 branchs so
2424
2425 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2426 becomes BRANCH TRIE; BRANCH X;
2427
786e8c11
YO
2428 There is an additional case, that being where there is a
2429 common prefix, which gets split out into an EXACT like node
2430 preceding the TRIE node.
2431
2432 If x(1..n)==tail then we can do a simple trie, if not we make
2433 a "jump" trie, such that when we match the appropriate word
2434 we "jump" to the appopriate tail node. Essentailly we turn
2435 a nested if into a case structure of sorts.
a3621e74
YO
2436
2437 */
786e8c11 2438
3dab1dad 2439 int made=0;
0111c4fd
RGS
2440 if (!re_trie_maxbuff) {
2441 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2442 if (!SvIOK(re_trie_maxbuff))
2443 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2444 }
786e8c11 2445 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2446 regnode *cur;
2447 regnode *first = (regnode *)NULL;
2448 regnode *last = (regnode *)NULL;
2449 regnode *tail = scan;
2450 U8 optype = 0;
2451 U32 count=0;
2452
2453#ifdef DEBUGGING
c445ea15 2454 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2455#endif
2456 /* var tail is used because there may be a TAIL
2457 regop in the way. Ie, the exacts will point to the
2458 thing following the TAIL, but the last branch will
2459 point at the TAIL. So we advance tail. If we
2460 have nested (?:) we may have to move through several
2461 tails.
2462 */
2463
2464 while ( OP( tail ) == TAIL ) {
2465 /* this is the TAIL generated by (?:) */
2466 tail = regnext( tail );
2467 }
2468
3dab1dad 2469
a3621e74 2470 DEBUG_OPTIMISE_r({
32fc9b6a 2471 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2472 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2473 (int)depth * 2 + 2, "",
2474 "Looking for TRIE'able sequences. Tail node is: ",
2475 SvPV_nolen_const( mysv )
a3621e74
YO
2476 );
2477 });
3dab1dad 2478
a3621e74
YO
2479 /*
2480
2481 step through the branches, cur represents each
2482 branch, noper is the first thing to be matched
2483 as part of that branch and noper_next is the
2484 regnext() of that node. if noper is an EXACT
2485 and noper_next is the same as scan (our current
2486 position in the regex) then the EXACT branch is
2487 a possible optimization target. Once we have
2488 two or more consequetive such branches we can
2489 create a trie of the EXACT's contents and stich
2490 it in place. If the sequence represents all of
2491 the branches we eliminate the whole thing and
2492 replace it with a single TRIE. If it is a
2493 subsequence then we need to stitch it in. This
2494 means the first branch has to remain, and needs
2495 to be repointed at the item on the branch chain
2496 following the last branch optimized. This could
2497 be either a BRANCH, in which case the
2498 subsequence is internal, or it could be the
2499 item following the branch sequence in which
2500 case the subsequence is at the end.
2501
2502 */
2503
2504 /* dont use tail as the end marker for this traverse */
2505 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2506 regnode * const noper = NEXTOPER( cur );
be981c67 2507#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2508 regnode * const noper_next = regnext( noper );
be981c67 2509#endif
a3621e74 2510
a3621e74 2511 DEBUG_OPTIMISE_r({
32fc9b6a 2512 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2513 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2514 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2515
32fc9b6a 2516 regprop(RExC_rx, mysv, noper);
a3621e74 2517 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2518 SvPV_nolen_const(mysv));
a3621e74
YO
2519
2520 if ( noper_next ) {
32fc9b6a 2521 regprop(RExC_rx, mysv, noper_next );
a3621e74 2522 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2523 SvPV_nolen_const(mysv));
a3621e74 2524 }
3dab1dad
YO
2525 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2526 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2527 });
3dab1dad
YO
2528 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2529 : PL_regkind[ OP( noper ) ] == EXACT )
2530 || OP(noper) == NOTHING )
786e8c11
YO
2531#ifdef NOJUMPTRIE
2532 && noper_next == tail
2533#endif
2534 && count < U16_MAX)
a3621e74
YO
2535 {
2536 count++;
3dab1dad
YO
2537 if ( !first || optype == NOTHING ) {
2538 if (!first) first = cur;
a3621e74
YO
2539 optype = OP( noper );
2540 } else {
a3621e74 2541 last = cur;
a3621e74
YO
2542 }
2543 } else {
2544 if ( last ) {
786e8c11
YO
2545 make_trie( pRExC_state,
2546 startbranch, first, cur, tail, count,
2547 optype, depth+1 );
a3621e74 2548 }
3dab1dad 2549 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2550#ifdef NOJUMPTRIE
2551 && noper_next == tail
2552#endif
2553 ){
a3621e74
YO
2554 count = 1;
2555 first = cur;
2556 optype = OP( noper );
2557 } else {
2558 count = 0;
2559 first = NULL;
2560 optype = 0;
2561 }
2562 last = NULL;
2563 }
2564 }
2565 DEBUG_OPTIMISE_r({
32fc9b6a 2566 regprop(RExC_rx, mysv, cur);
a3621e74 2567 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2568 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2569 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2570
2571 });
2572 if ( last ) {
786e8c11 2573 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2574#ifdef TRIE_STUDY_OPT
786e8c11
YO
2575 if ( ((made == MADE_EXACT_TRIE &&
2576 startbranch == first)
2577 || ( first_non_open == first )) &&
2578 depth==0 )
2579 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2580#endif
07be1b83 2581 }
a3621e74 2582 }
3dab1dad
YO
2583
2584 } /* do trie */
786e8c11 2585
a0ed51b3 2586 }
a3621e74 2587 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2588 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2589 } else /* single branch is optimized. */
c277df42
IZ
2590 scan = NEXTOPER(scan);
2591 continue;
a0ed51b3
LW
2592 }
2593 else if (OP(scan) == EXACT) {
cd439c50 2594 I32 l = STR_LEN(scan);
c445ea15 2595 UV uc;
a0ed51b3 2596 if (UTF) {
a3b680e6 2597 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2598 l = utf8_length(s, s + l);
9041c2e3 2599 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2600 } else {
2601 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2602 }
2603 min += l;
c277df42 2604 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2605 /* The code below prefers earlier match for fixed
2606 offset, later match for variable offset. */
2607 if (data->last_end == -1) { /* Update the start info. */
2608 data->last_start_min = data->pos_min;
2609 data->last_start_max = is_inf
b81d288d 2610 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2611 }
cd439c50 2612 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2613 if (UTF)
2614 SvUTF8_on(data->last_found);
0eda9292 2615 {
9a957fbc 2616 SV * const sv = data->last_found;
a28509cc 2617 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2618 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2619 if (mg && mg->mg_len >= 0)
5e43f467
JH
2620 mg->mg_len += utf8_length((U8*)STRING(scan),
2621 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2622 }
c277df42
IZ
2623 data->last_end = data->pos_min + l;
2624 data->pos_min += l; /* As in the first entry. */
2625 data->flags &= ~SF_BEFORE_EOL;
2626 }
653099ff
GS
2627 if (flags & SCF_DO_STCLASS_AND) {
2628 /* Check whether it is compatible with what we know already! */
2629 int compat = 1;
2630
1aa99e6b 2631 if (uc >= 0x100 ||
516a5887 2632 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2633 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2634 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2635 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2636 )
653099ff
GS
2637 compat = 0;
2638 ANYOF_CLASS_ZERO(data->start_class);
2639 ANYOF_BITMAP_ZERO(data->start_class);
2640 if (compat)
1aa99e6b 2641 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2642 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2643 if (uc < 0x100)
2644 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2645 }
2646 else if (flags & SCF_DO_STCLASS_OR) {
2647 /* false positive possible if the class is case-folded */
1aa99e6b 2648 if (uc < 0x100)
9b877dbb
IH
2649 ANYOF_BITMAP_SET(data->start_class, uc);
2650 else
2651 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2652 data->start_class->flags &= ~ANYOF_EOS;
2653 cl_and(data->start_class, &and_with);
2654 }
2655 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2656 }
3dab1dad 2657 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2658 I32 l = STR_LEN(scan);
1aa99e6b 2659 UV uc = *((U8*)STRING(scan));
653099ff
GS
2660
2661 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2662 if (flags & SCF_DO_SUBSTR) {
2663 assert(data);
1de06328 2664 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2665 }
a0ed51b3 2666 if (UTF) {
6136c704 2667 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2668 l = utf8_length(s, s + l);
9041c2e3 2669 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2670 }
2671 min += l;
ecaa9b9c 2672 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2673 data->pos_min += l;
653099ff
GS
2674 if (flags & SCF_DO_STCLASS_AND) {
2675 /* Check whether it is compatible with what we know already! */
2676 int compat = 1;
2677
1aa99e6b 2678 if (uc >= 0x100 ||
516a5887 2679 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2680 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2681 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2682 compat = 0;
2683 ANYOF_CLASS_ZERO(data->start_class);
2684 ANYOF_BITMAP_ZERO(data->start_class);
2685 if (compat) {
1aa99e6b 2686 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2687 data->start_class->flags &= ~ANYOF_EOS;
2688 data->start_class->flags |= ANYOF_FOLD;
2689 if (OP(scan) == EXACTFL)
2690 data->start_class->flags |= ANYOF_LOCALE;
2691 }
2692 }
2693 else if (flags & SCF_DO_STCLASS_OR) {
2694 if (data->start_class->flags & ANYOF_FOLD) {
2695 /* false positive possible if the class is case-folded.
2696 Assume that the locale settings are the same... */
1aa99e6b
IH
2697 if (uc < 0x100)
2698 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2699 data->start_class->flags &= ~ANYOF_EOS;
2700 }
2701 cl_and(data->start_class, &and_with);
2702 }
2703 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2704 }
bfed75c6 2705 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2706 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2707 I32 f = flags, pos_before = 0;
d4c19fe8 2708 regnode * const oscan = scan;
653099ff
GS
2709 struct regnode_charclass_class this_class;
2710 struct regnode_charclass_class *oclass = NULL;
727f22e3 2711 I32 next_is_eval = 0;
653099ff 2712
3dab1dad 2713 switch (PL_regkind[OP(scan)]) {
653099ff 2714 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2715 scan = NEXTOPER(scan);
2716 goto finish;
2717 case PLUS:
653099ff 2718 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2719 next = NEXTOPER(scan);
653099ff 2720 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2721 mincount = 1;
2722 maxcount = REG_INFTY;
c277df42
IZ
2723 next = regnext(scan);
2724 scan = NEXTOPER(scan);
2725 goto do_curly;
2726 }
2727 }
2728 if (flags & SCF_DO_SUBSTR)
2729 data->pos_min++;
2730 min++;
2731 /* Fall through. */
2732 case STAR:
653099ff
GS
2733 if (flags & SCF_DO_STCLASS) {
2734 mincount = 0;
b81d288d 2735 maxcount = REG_INFTY;
653099ff
GS
2736 next = regnext(scan);
2737 scan = NEXTOPER(scan);
2738 goto do_curly;
2739 }
b81d288d 2740 is_inf = is_inf_internal = 1;
c277df42
IZ
2741 scan = regnext(scan);
2742 if (flags & SCF_DO_SUBSTR) {
1de06328 2743 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2744 data->longest = &(data->longest_float);
2745 }
2746 goto optimize_curly_tail;
2747 case CURLY:
b81d288d 2748 mincount = ARG1(scan);
c277df42
IZ
2749 maxcount = ARG2(scan);
2750 next = regnext(scan);
cb434fcc
IZ
2751 if (OP(scan) == CURLYX) {
2752 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2753 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2754 }
c277df42 2755 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2756 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2757 do_curly:
2758 if (flags & SCF_DO_SUBSTR) {
1de06328 2759 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2760 pos_before = data->pos_min;
2761 }
2762 if (data) {
2763 fl = data->flags;
2764 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2765 if (is_inf)
2766 data->flags |= SF_IS_INF;
2767 }
653099ff 2768 if (flags & SCF_DO_STCLASS) {
830247a4 2769 cl_init(pRExC_state, &this_class);
653099ff
GS
2770 oclass = data->start_class;
2771 data->start_class = &this_class;
2772 f |= SCF_DO_STCLASS_AND;
2773 f &= ~SCF_DO_STCLASS_OR;
2774 }
e1901655
IZ
2775 /* These are the cases when once a subexpression
2776 fails at a particular position, it cannot succeed
2777 even after backtracking at the enclosing scope.
b81d288d 2778
e1901655
IZ
2779 XXXX what if minimal match and we are at the
2780 initial run of {n,m}? */
2781 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2782 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2783
c277df42 2784 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2785 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2786 (mincount == 0
2787 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2788
2789 if (flags & SCF_DO_STCLASS)
2790 data->start_class = oclass;
2791 if (mincount == 0 || minnext == 0) {
2792 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2793 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2794 }
2795 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2796 /* Switch to OR mode: cache the old value of
653099ff
GS
2797 * data->start_class */
2798 StructCopy(data->start_class, &and_with,
2799 struct regnode_charclass_class);
2800 flags &= ~SCF_DO_STCLASS_AND;
2801 StructCopy(&this_class, data->start_class,
2802 struct regnode_charclass_class);
2803 flags |= SCF_DO_STCLASS_OR;
2804 data->start_class->flags |= ANYOF_EOS;
2805 }
2806 } else { /* Non-zero len */
2807 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2808 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2809 cl_and(data->start_class, &and_with);
2810 }
2811 else if (flags & SCF_DO_STCLASS_AND)
2812 cl_and(data->start_class, &this_class);
2813 flags &= ~SCF_DO_STCLASS;
2814 }
c277df42
IZ
2815 if (!scan) /* It was not CURLYX, but CURLY. */
2816 scan = next;
041457d9
DM
2817 if ( /* ? quantifier ok, except for (?{ ... }) */
2818 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2819 && (minnext == 0) && (deltanext == 0)
99799961 2820 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2821 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2822 && ckWARN(WARN_REGEXP))
b45f050a 2823 {
830247a4 2824 vWARN(RExC_parse,
b45f050a
JF
2825 "Quantifier unexpected on zero-length expression");
2826 }
2827
c277df42 2828 min += minnext * mincount;
b81d288d 2829 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2830 && (minnext + deltanext) > 0)
2831 || deltanext == I32_MAX);
aca2d497 2832 is_inf |= is_inf_internal;
c277df42
IZ
2833 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2834
2835 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2836 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2837 && data->flags & SF_IN_PAR
2838 && !(data->flags & SF_HAS_EVAL)
2839 && !deltanext && minnext == 1 ) {
2840 /* Try to optimize to CURLYN. */
2841 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2842 regnode * const nxt1 = nxt;
497b47a8
JH
2843#ifdef DEBUGGING
2844 regnode *nxt2;
2845#endif
c277df42
IZ
2846
2847 /* Skip open. */
2848 nxt = regnext(nxt);
bfed75c6 2849 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2850 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2851 && STR_LEN(nxt) == 1))
c277df42 2852 goto nogo;
497b47a8 2853#ifdef DEBUGGING
c277df42 2854 nxt2 = nxt;
497b47a8 2855#endif
c277df42 2856 nxt = regnext(nxt);
b81d288d 2857 if (OP(nxt) != CLOSE)
c277df42
IZ
2858 goto nogo;
2859 /* Now we know that nxt2 is the only contents: */
eb160463 2860 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2861 OP(oscan) = CURLYN;
2862 OP(nxt1) = NOTHING; /* was OPEN. */
2863#ifdef DEBUGGING
2864 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2865 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2866 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2867 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2868 OP(nxt + 1) = OPTIMIZED; /* was count. */
2869 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2870#endif
c277df42 2871 }
c277df42
IZ
2872 nogo:
2873
2874 /* Try optimization CURLYX => CURLYM. */
b81d288d 2875 if ( OP(oscan) == CURLYX && data
c277df42 2876 && !(data->flags & SF_HAS_PAR)
c277df42 2877 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2878 && !deltanext /* atom is fixed width */
2879 && minnext != 0 /* CURLYM can't handle zero width */
2880 ) {
c277df42
IZ
2881 /* XXXX How to optimize if data == 0? */
2882 /* Optimize to a simpler form. */
2883 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2884 regnode *nxt2;
2885
2886 OP(oscan) = CURLYM;
2887 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2888 && (OP(nxt2) != WHILEM))
c277df42
IZ
2889 nxt = nxt2;
2890 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2891 /* Need to optimize away parenths. */
2892 if (data->flags & SF_IN_PAR) {
2893 /* Set the parenth number. */
2894 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2895
b81d288d 2896 if (OP(nxt) != CLOSE)
b45f050a 2897 FAIL("Panic opt close");
eb160463 2898 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2899 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2900 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2901#ifdef DEBUGGING
2902 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2903 OP(nxt + 1) = OPTIMIZED; /* was count. */
2904 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2905 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2906#endif
c277df42
IZ
2907#if 0
2908 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2909 regnode *nnxt = regnext(nxt1);
b81d288d 2910
c277df42
IZ
2911 if (nnxt == nxt) {
2912 if (reg_off_by_arg[OP(nxt1)])
2913 ARG_SET(nxt1, nxt2 - nxt1);
2914 else if (nxt2 - nxt1 < U16_MAX)
2915 NEXT_OFF(nxt1) = nxt2 - nxt1;
2916 else
2917 OP(nxt) = NOTHING; /* Cannot beautify */
2918 }
2919 nxt1 = nnxt;
2920 }
2921#endif
2922 /* Optimize again: */
1de06328 2923 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2924 NULL, 0,depth+1);
a0ed51b3
LW
2925 }
2926 else
c277df42 2927 oscan->flags = 0;
c277df42 2928 }
e1901655
IZ
2929 else if ((OP(oscan) == CURLYX)
2930 && (flags & SCF_WHILEM_VISITED_POS)
2931 /* See the comment on a similar expression above.
2932 However, this time it not a subexpression
2933 we care about, but the expression itself. */
2934 && (maxcount == REG_INFTY)
2935 && data && ++data->whilem_c < 16) {
2936 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2937 /* Find WHILEM (as in regexec.c) */
2938 regnode *nxt = oscan + NEXT_OFF(oscan);
2939
2940 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2941 nxt += ARG(nxt);
eb160463
GS
2942 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2943 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2944 }
b81d288d 2945 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2946 pars++;
2947 if (flags & SCF_DO_SUBSTR) {
c445ea15 2948 SV *last_str = NULL;
c277df42
IZ
2949 int counted = mincount != 0;
2950
2951 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2952#if defined(SPARC64_GCC_WORKAROUND)
2953 I32 b = 0;
2954 STRLEN l = 0;
cfd0369c 2955 const char *s = NULL;
5d1c421c
JH
2956 I32 old = 0;
2957
2958 if (pos_before >= data->last_start_min)
2959 b = pos_before;
2960 else
2961 b = data->last_start_min;
2962
2963 l = 0;
cfd0369c 2964 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2965 old = b - data->last_start_min;
2966
2967#else
b81d288d 2968 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2969 ? pos_before : data->last_start_min;
2970 STRLEN l;
d4c19fe8 2971 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2972 I32 old = b - data->last_start_min;
5d1c421c 2973#endif
a0ed51b3
LW
2974
2975 if (UTF)
2976 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2977
a0ed51b3 2978 l -= old;
c277df42 2979 /* Get the added string: */
79cb57f6 2980 last_str = newSVpvn(s + old, l);
0e933229
IH
2981 if (UTF)
2982 SvUTF8_on(last_str);
c277df42
IZ
2983 if (deltanext == 0 && pos_before == b) {
2984 /* What was added is a constant string */
2985 if (mincount > 1) {
2986 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2987 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2988 SvPVX_const(last_str), l, mincount - 1);
b162af07 2989 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2990 /* Add additional parts. */
b81d288d 2991 SvCUR_set(data->last_found,
c277df42
IZ
2992 SvCUR(data->last_found) - l);
2993 sv_catsv(data->last_found, last_str);
0eda9292
JH
2994 {
2995 SV * sv = data->last_found;
2996 MAGIC *mg =
2997 SvUTF8(sv) && SvMAGICAL(sv) ?
2998 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2999 if (mg && mg->mg_len >= 0)
3000 mg->mg_len += CHR_SVLEN(last_str);
3001 }
c277df42
IZ
3002 data->last_end += l * (mincount - 1);
3003 }
2a8d9689
HS
3004 } else {
3005 /* start offset must point into the last copy */
3006 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3007 data->last_start_max += is_inf ? I32_MAX
3008 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3009 }
3010 }
3011 /* It is counted once already... */
3012 data->pos_min += minnext * (mincount - counted);
3013 data->pos_delta += - counted * deltanext +
3014 (minnext + deltanext) * maxcount - minnext * mincount;
3015 if (mincount != maxcount) {
653099ff
GS
3016 /* Cannot extend fixed substrings found inside
3017 the group. */
1de06328 3018 scan_commit(pRExC_state,data,minlenp);
c277df42 3019 if (mincount && last_str) {
d4c19fe8
AL
3020 SV * const sv = data->last_found;
3021 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3022 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3023
3024 if (mg)
3025 mg->mg_len = -1;
3026 sv_setsv(sv, last_str);
c277df42 3027 data->last_end = data->pos_min;
b81d288d 3028 data->last_start_min =
a0ed51b3 3029 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3030 data->last_start_max = is_inf
3031 ? I32_MAX
c277df42 3032 : data->pos_min + data->pos_delta
a0ed51b3 3033 - CHR_SVLEN(last_str);
c277df42
IZ
3034 }
3035 data->longest = &(data->longest_float);
3036 }
aca2d497 3037 SvREFCNT_dec(last_str);
c277df42 3038 }
405ff068 3039 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3040 data->flags |= SF_HAS_EVAL;
3041 optimize_curly_tail:
c277df42 3042 if (OP(oscan) != CURLYX) {
3dab1dad 3043 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3044 && NEXT_OFF(next))
3045 NEXT_OFF(oscan) += NEXT_OFF(next);
3046 }
c277df42 3047 continue;
653099ff 3048 default: /* REF and CLUMP only? */
c277df42 3049 if (flags & SCF_DO_SUBSTR) {
1de06328 3050 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3051 data->longest = &(data->longest_float);
3052 }
aca2d497 3053 is_inf = is_inf_internal = 1;
653099ff 3054 if (flags & SCF_DO_STCLASS_OR)
830247a4 3055 cl_anything(pRExC_state, data->start_class);
653099ff 3056 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3057 break;
3058 }
a0ed51b3 3059 }
bfed75c6 3060 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3061 int value = 0;
653099ff 3062
c277df42 3063 if (flags & SCF_DO_SUBSTR) {
1de06328 3064 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3065 data->pos_min++;
3066 }
3067 min++;
653099ff
GS
3068 if (flags & SCF_DO_STCLASS) {
3069 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3070
3071 /* Some of the logic below assumes that switching
3072 locale on will only add false positives. */
3dab1dad 3073 switch (PL_regkind[OP(scan)]) {
653099ff 3074 case SANY:
653099ff
GS
3075 default:
3076 do_default:
3077 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3078 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3079 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3080 break;
3081 case REG_ANY:
3082 if (OP(scan) == SANY)
3083 goto do_default;
3084 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3085 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3086 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3087 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3088 }
3089 if (flags & SCF_DO_STCLASS_AND || !value)
3090 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3091 break;
3092 case ANYOF:
3093 if (flags & SCF_DO_STCLASS_AND)
3094 cl_and(data->start_class,
3095 (struct regnode_charclass_class*)scan);
3096 else
830247a4 3097 cl_or(pRExC_state, data->start_class,
653099ff
GS
3098 (struct regnode_charclass_class*)scan);
3099 break;
3100 case ALNUM:
3101 if (flags & SCF_DO_STCLASS_AND) {
3102 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3103 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3104 for (value = 0; value < 256; value++)
3105 if (!isALNUM(value))
3106 ANYOF_BITMAP_CLEAR(data->start_class, value);
3107 }
3108 }
3109 else {
3110 if (data->start_class->flags & ANYOF_LOCALE)
3111 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3112 else {
3113 for (value = 0; value < 256; value++)
3114 if (isALNUM(value))
b81d288d 3115 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3116 }
3117 }
3118 break;
3119 case ALNUML:
3120 if (flags & SCF_DO_STCLASS_AND) {
3121 if (data->start_class->flags & ANYOF_LOCALE)
3122 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3123 }
3124 else {
3125 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3126 data->start_class->flags |= ANYOF_LOCALE;
3127 }
3128 break;
3129 case NALNUM:
3130 if (flags & SCF_DO_STCLASS_AND) {
3131 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3132 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3133 for (value = 0; value < 256; value++)
3134 if (isALNUM(value))
3135 ANYOF_BITMAP_CLEAR(data->start_class, value);
3136 }
3137 }
3138 else {
3139 if (data->start_class->flags & ANYOF_LOCALE)
3140 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3141 else {
3142 for (value = 0; value < 256; value++)
3143 if (!isALNUM(value))
b81d288d 3144 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3145 }
3146 }
3147 break;
3148 case NALNUML:
3149 if (flags & SCF_DO_STCLASS_AND) {
3150 if (data->start_class->flags & ANYOF_LOCALE)
3151 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3152 }
3153 else {
3154 data->start_class->flags |= ANYOF_LOCALE;
3155 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3156 }
3157 break;
3158 case SPACE:
3159 if (flags & SCF_DO_STCLASS_AND) {
3160 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3161 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3162 for (value = 0; value < 256; value++)
3163 if (!isSPACE(value))
3164 ANYOF_BITMAP_CLEAR(data->start_class, value);
3165 }
3166 }
3167 else {
3168 if (data->start_class->flags & ANYOF_LOCALE)
3169 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3170 else {
3171 for (value = 0; value < 256; value++)
3172 if (isSPACE(value))
b81d288d 3173 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3174 }
3175 }
3176 break;
3177 case SPACEL:
3178 if (flags & SCF_DO_STCLASS_AND) {
3179 if (data->start_class->flags & ANYOF_LOCALE)
3180 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3181 }
3182 else {
3183 data->start_class->flags |= ANYOF_LOCALE;
3184 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3185 }
3186 break;
3187 case NSPACE:
3188 if (flags & SCF_DO_STCLASS_AND) {
3189 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3190 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3191 for (value = 0; value < 256; value++)
3192 if (isSPACE(value))
3193 ANYOF_BITMAP_CLEAR(data->start_class, value);
3194 }
3195 }
3196 else {
3197 if (data->start_class->flags & ANYOF_LOCALE)
3198 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3199 else {
3200 for (value = 0; value < 256; value++)
3201 if (!isSPACE(value))
b81d288d 3202 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3203 }
3204 }
3205 break;
3206 case NSPACEL:
3207 if (flags & SCF_DO_STCLASS_AND) {
3208 if (data->start_class->flags & ANYOF_LOCALE) {
3209 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3210 for (value = 0; value < 256; value++)
3211 if (!isSPACE(value))
3212 ANYOF_BITMAP_CLEAR(data->start_class, value);
3213 }
3214 }
3215 else {
3216 data->start_class->flags |= ANYOF_LOCALE;
3217 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3218 }
3219 break;
3220 case DIGIT:
3221 if (flags & SCF_DO_STCLASS_AND) {
3222 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3223 for (value = 0; value < 256; value++)
3224 if (!isDIGIT(value))
3225 ANYOF_BITMAP_CLEAR(data->start_class, value);
3226 }
3227 else {
3228 if (data->start_class->flags & ANYOF_LOCALE)
3229 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3230 else {
3231 for (value = 0; value < 256; value++)
3232 if (isDIGIT(value))
b81d288d 3233 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3234 }
3235 }
3236 break;
3237 case NDIGIT:
3238 if (flags & SCF_DO_STCLASS_AND) {
3239 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3240 for (value = 0; value < 256; value++)
3241 if (isDIGIT(value))
3242 ANYOF_BITMAP_CLEAR(data->start_class, value);
3243 }
3244 else {
3245 if (data->start_class->flags & ANYOF_LOCALE)
3246 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3247 else {
3248 for (value = 0; value < 256; value++)
3249 if (!isDIGIT(value))
b81d288d 3250 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3251 }
3252 }
3253 break;
3254 }
3255 if (flags & SCF_DO_STCLASS_OR)
3256 cl_and(data->start_class, &and_with);
3257 flags &= ~SCF_DO_STCLASS;
3258 }
a0ed51b3 3259 }
3dab1dad 3260 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3261 data->flags |= (OP(scan) == MEOL
3262 ? SF_BEFORE_MEOL
3263 : SF_BEFORE_SEOL);
a0ed51b3 3264 }
3dab1dad 3265 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3266 /* Lookbehind, or need to calculate parens/evals/stclass: */
3267 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3268 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3269 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3270 || OP(scan) == UNLESSM )
3271 {
3272 /* Negative Lookahead/lookbehind
3273 In this case we can't do fixed string optimisation.
3274 */
3275
3276 I32 deltanext, minnext, fake = 0;
3277 regnode *nscan;
3278 struct regnode_charclass_class intrnl;
3279 int f = 0;
3280
3281 data_fake.flags = 0;
3282 if (data) {
3283 data_fake.whilem_c = data->whilem_c;
3284 data_fake.last_closep = data->last_closep;
a0ed51b3 3285 }
1de06328
YO
3286 else
3287 data_fake.last_closep = &fake;
3288 if ( flags & SCF_DO_STCLASS && !scan->flags
3289 && OP(scan) == IFMATCH ) { /* Lookahead */
3290 cl_init(pRExC_state, &intrnl);
3291 data_fake.start_class = &intrnl;
3292 f |= SCF_DO_STCLASS_AND;
c277df42 3293 }
1de06328
YO
3294 if (flags & SCF_WHILEM_VISITED_POS)
3295 f |= SCF_WHILEM_VISITED_POS;
3296 next = regnext(scan);
3297 nscan = NEXTOPER(NEXTOPER(scan));
3298 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3299 if (scan->flags) {
3300 if (deltanext) {
3301 vFAIL("Variable length lookbehind not implemented");
3302 }
3303 else if (minnext > (I32)U8_MAX) {
3304 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3305 }
3306 scan->flags = (U8)minnext;
3307 }
3308 if (data) {
3309 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3310 pars++;
3311 if (data_fake.flags & SF_HAS_EVAL)
3312 data->flags |= SF_HAS_EVAL;
3313 data->whilem_c = data_fake.whilem_c;
3314 }
3315 if (f & SCF_DO_STCLASS_AND) {
3316 const int was = (data->start_class->flags & ANYOF_EOS);
3317
3318 cl_and(data->start_class, &intrnl);
3319 if (was)
3320 data->start_class->flags |= ANYOF_EOS;
3321 }
be8e71aa 3322 }
1de06328
YO
3323#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3324 else {
3325 /* Positive Lookahead/lookbehind
3326 In this case we can do fixed string optimisation,
3327 but we must be careful about it. Note in the case of
3328 lookbehind the positions will be offset by the minimum
3329 length of the pattern, something we won't know about
3330 until after the recurse.
3331 */
3332 I32 deltanext, fake = 0;
3333 regnode *nscan;
3334 struct regnode_charclass_class intrnl;
3335 int f = 0;
3336 /* We use SAVEFREEPV so that when the full compile
3337 is finished perl will clean up the allocated
3338 minlens when its all done. This was we don't
3339 have to worry about freeing them when we know
3340 they wont be used, which would be a pain.
3341 */
3342 I32 *minnextp;
3343 Newx( minnextp, 1, I32 );
3344 SAVEFREEPV(minnextp);
3345
3346 if (data) {
3347 StructCopy(data, &data_fake, scan_data_t);
3348 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3349 f |= SCF_DO_SUBSTR;
3350 if (scan->flags)
3351 scan_commit(pRExC_state, &data_fake,minlenp);
3352 data_fake.last_found=newSVsv(data->last_found);
3353 }
3354 }
3355 else
3356 data_fake.last_closep = &fake;
3357 data_fake.flags = 0;
3358 if (is_inf)
3359 data_fake.flags |= SF_IS_INF;
3360 if ( flags & SCF_DO_STCLASS && !scan->flags
3361 && OP(scan) == IFMATCH ) { /* Lookahead */
3362 cl_init(pRExC_state, &intrnl);
3363 data_fake.start_class = &intrnl;
3364 f |= SCF_DO_STCLASS_AND;
3365 }
3366 if (flags & SCF_WHILEM_VISITED_POS)
3367 f |= SCF_WHILEM_VISITED_POS;
3368 next = regnext(scan);
3369 nscan = NEXTOPER(NEXTOPER(scan));
3370
3371 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3372 if (scan->flags) {
3373 if (deltanext) {
3374 vFAIL("Variable length lookbehind not implemented");
3375 }
3376 else if (*minnextp > (I32)U8_MAX) {
3377 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3378 }
3379 scan->flags = (U8)*minnextp;
3380 }
3381
3382 *minnextp += min;
3383
3384
3385 if (f & SCF_DO_STCLASS_AND) {
3386 const int was = (data->start_class->flags & ANYOF_EOS);
3387
3388 cl_and(data->start_class, &intrnl);
3389 if (was)
3390 data->start_class->flags |= ANYOF_EOS;
3391 }
3392 if (data) {
3393 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3394 pars++;
3395 if (data_fake.flags & SF_HAS_EVAL)
3396 data->flags |= SF_HAS_EVAL;
3397 data->whilem_c = data_fake.whilem_c;
3398 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3399 if (RExC_rx->minlen<*minnextp)
3400 RExC_rx->minlen=*minnextp;
3401 scan_commit(pRExC_state, &data_fake, minnextp);
3402 SvREFCNT_dec(data_fake.last_found);
3403
3404 if ( data_fake.minlen_fixed != minlenp )
3405 {
3406 data->offset_fixed= data_fake.offset_fixed;
3407 data->minlen_fixed= data_fake.minlen_fixed;
3408 data->lookbehind_fixed+= scan->flags;
3409 }
3410 if ( data_fake.minlen_float != minlenp )
3411 {
3412 data->minlen_float= data_fake.minlen_float;
3413 data->offset_float_min=data_fake.offset_float_min;
3414 data->offset_float_max=data_fake.offset_float_max;
3415 data->lookbehind_float+= scan->flags;
3416 }
3417 }
3418 }
3419
653099ff 3420
653099ff 3421 }
1de06328 3422#endif
a0ed51b3
LW
3423 }
3424 else if (OP(scan) == OPEN) {
c277df42 3425 pars++;
a0ed51b3 3426 }
cb434fcc 3427 else if (OP(scan) == CLOSE) {
eb160463 3428 if ((I32)ARG(scan) == is_par) {
cb434fcc 3429 next = regnext(scan);
c277df42 3430
cb434fcc
IZ
3431 if ( next && (OP(next) != WHILEM) && next < last)
3432 is_par = 0; /* Disable optimization */
3433 }
3434 if (data)
3435 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3436 }
3437 else if (OP(scan) == EVAL) {
c277df42
IZ
3438 if (data)
3439 data->flags |= SF_HAS_EVAL;
3440 }
96776eda 3441 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3442 if (flags & SCF_DO_SUBSTR) {
1de06328 3443 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3444 data->longest = &(data->longest_float);
3445 }
3446 is_inf = is_inf_internal = 1;
653099ff 3447 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3448 cl_anything(pRExC_state, data->start_class);
96776eda 3449 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3450 }
786e8c11
YO
3451#ifdef TRIE_STUDY_OPT
3452#ifdef FULL_TRIE_STUDY
3453 else if (PL_regkind[OP(scan)] == TRIE) {
3454 /* NOTE - There is similar code to this block above for handling
3455 BRANCH nodes on the initial study. If you change stuff here
3456 check there too. */
3457 regnode *tail= regnext(scan);
3458 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3459 I32 max1 = 0, min1 = I32_MAX;
3460 struct regnode_charclass_class accum;
3461
3462 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3463 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3464 if (flags & SCF_DO_STCLASS)
3465 cl_init_zero(pRExC_state, &accum);
3466
3467 if (!trie->jump) {
3468 min1= trie->minlen;
3469 max1= trie->maxlen;
3470 } else {
3471 const regnode *nextbranch= NULL;
3472 U32 word;
3473
3474 for ( word=1 ; word <= trie->wordcount ; word++)
3475 {
3476 I32 deltanext=0, minnext=0, f = 0, fake;
3477 struct regnode_charclass_class this_class;
3478
3479 data_fake.flags = 0;
3480 if (data) {
3481 data_fake.whilem_c = data->whilem_c;
3482 data_fake.last_closep = data->last_closep;
3483 }
3484 else
3485 data_fake.last_closep = &fake;
3486
3487 if (flags & SCF_DO_STCLASS) {
3488 cl_init(pRExC_state, &this_class);
3489 data_fake.start_class = &this_class;
3490 f = SCF_DO_STCLASS_AND;
3491 }
3492 if (flags & SCF_WHILEM_VISITED_POS)
3493 f |= SCF_WHILEM_VISITED_POS;
3494
3495 if (trie->jump[word]) {
3496 if (!nextbranch)
3497 nextbranch = tail - trie->jump[0];
3498 scan= tail - trie->jump[word];
3499 /* We go from the jump point to the branch that follows
3500 it. Note this means we need the vestigal unused branches
3501 even though they arent otherwise used.
3502 */
1de06328 3503 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3504 (regnode *)nextbranch, &data_fake, f,depth+1);
3505 }
3506 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3507 nextbranch= regnext((regnode*)nextbranch);
3508
3509 if (min1 > (I32)(minnext + trie->minlen))
3510 min1 = minnext + trie->minlen;
3511 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3512 max1 = minnext + deltanext + trie->maxlen;
3513 if (deltanext == I32_MAX)
3514 is_inf = is_inf_internal = 1;
3515
3516 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3517 pars++;
3518
3519 if (data) {
3520 if (data_fake.flags & SF_HAS_EVAL)
3521 data->flags |= SF_HAS_EVAL;
3522 data->whilem_c = data_fake.whilem_c;
3523 }
3524 if (flags & SCF_DO_STCLASS)
3525 cl_or(pRExC_state, &accum, &this_class);
3526 }
3527 }
3528 if (flags & SCF_DO_SUBSTR) {
3529 data->pos_min += min1;
3530 data->pos_delta += max1 - min1;
3531 if (max1 != min1 || is_inf)
3532 data->longest = &(data->longest_float);
3533 }
3534 min += min1;
3535 delta += max1 - min1;
3536 if (flags & SCF_DO_STCLASS_OR) {
3537 cl_or(pRExC_state, data->start_class, &accum);
3538 if (min1) {
3539 cl_and(data->start_class, &and_with);
3540 flags &= ~SCF_DO_STCLASS;
3541 }
3542 }
3543 else if (flags & SCF_DO_STCLASS_AND) {
3544 if (min1) {
3545 cl_and(data->start_class, &accum);
3546 flags &= ~SCF_DO_STCLASS;
3547 }
3548 else {
3549 /* Switch to OR mode: cache the old value of
3550 * data->start_class */
3551 StructCopy(data->start_class, &and_with,
3552 struct regnode_charclass_class);
3553 flags &= ~SCF_DO_STCLASS_AND;
3554 StructCopy(&accum, data->start_class,
3555 struct regnode_charclass_class);
3556 flags |= SCF_DO_STCLASS_OR;
3557 data->start_class->flags |= ANYOF_EOS;
3558 }
3559 }
3560 scan= tail;
3561 continue;
3562 }
3563#else
3564 else if (PL_regkind[OP(scan)] == TRIE) {
3565 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3566 U8*bang=NULL;
3567
3568 min += trie->minlen;
3569 delta += (trie->maxlen - trie->minlen);
3570 flags &= ~SCF_DO_STCLASS; /* xxx */
3571 if (flags & SCF_DO_SUBSTR) {
1de06328 3572 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3573 data->pos_min += trie->minlen;
3574 data->pos_delta += (trie->maxlen - trie->minlen);
3575 if (trie->maxlen != trie->minlen)
3576 data->longest = &(data->longest_float);
3577 }
3578 if (trie->jump) /* no more substrings -- for now /grr*/
3579 flags &= ~SCF_DO_SUBSTR;
3580 }
3581#endif /* old or new */
3582#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3583 /* Else: zero-length, ignore. */
3584 scan = regnext(scan);
3585 }
3586
3587 finish:
3588 *scanp = scan;
aca2d497 3589 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3590 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3591 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3592 if (is_par > (I32)U8_MAX)
c277df42
IZ
3593 is_par = 0;
3594 if (is_par && pars==1 && data) {
3595 data->flags |= SF_IN_PAR;
3596 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3597 }
3598 else if (pars && data) {
c277df42
IZ
3599 data->flags |= SF_HAS_PAR;
3600 data->flags &= ~SF_IN_PAR;
3601 }
653099ff
GS
3602 if (flags & SCF_DO_STCLASS_OR)
3603 cl_and(data->start_class, &and_with);
786e8c11
YO
3604 if (flags & SCF_TRIE_RESTUDY)
3605 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3606
3607 DEBUG_STUDYDATA(data,depth);
3608
c277df42
IZ
3609 return min;
3610}
3611
76e3520e 3612STATIC I32
5f66b61c 3613S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3614{
830247a4 3615 if (RExC_rx->data) {
b81d288d
AB
3616 Renewc(RExC_rx->data,
3617 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3618 char, struct reg_data);
830247a4
IZ
3619 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3620 RExC_rx->data->count += n;
a0ed51b3
LW
3621 }
3622 else {
a02a5408 3623 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3624 char, struct reg_data);
a02a5408 3625 Newx(RExC_rx->data->what, n, U8);
830247a4 3626 RExC_rx->data->count = n;
c277df42 3627 }
830247a4
IZ
3628 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3629 return RExC_rx->data->count - n;
c277df42
IZ
3630}
3631
76234dfb 3632#ifndef PERL_IN_XSUB_RE
d88dccdf 3633void
864dbfa3 3634Perl_reginitcolors(pTHX)
d88dccdf 3635{
97aff369 3636 dVAR;
1df70142 3637 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3638 if (s) {
1df70142
AL
3639 char *t = savepv(s);
3640 int i = 0;
3641 PL_colors[0] = t;
d88dccdf 3642 while (++i < 6) {
1df70142
AL
3643 t = strchr(t, '\t');
3644 if (t) {
3645 *t = '\0';
3646 PL_colors[i] = ++t;
d88dccdf
IZ
3647 }
3648 else
1df70142 3649 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3650 }
3651 } else {
1df70142 3652 int i = 0;
b81d288d 3653 while (i < 6)
06b5626a 3654 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3655 }
3656 PL_colorset = 1;
3657}
76234dfb 3658#endif
8615cb43 3659
07be1b83 3660
786e8c11
YO
3661#ifdef TRIE_STUDY_OPT
3662#define CHECK_RESTUDY_GOTO \
3663 if ( \
3664 (data.flags & SCF_TRIE_RESTUDY) \
3665 && ! restudied++ \
3666 ) goto reStudy
3667#else
3668#define CHECK_RESTUDY_GOTO
3669#endif
a687059c 3670/*
e50aee73 3671 - pregcomp - compile a regular expression into internal code
a687059c
LW
3672 *
3673 * We can't allocate space until we know how big the compiled form will be,
3674 * but we can't compile it (and thus know how big it is) until we've got a
3675 * place to put the code. So we cheat: we compile it twice, once with code
3676 * generation turned off and size counting turned on, and once "for real".
3677 * This also means that we don't allocate space until we are sure that the
3678 * thing really will compile successfully, and we never have to move the
3679 * code and thus invalidate pointers into it. (Note that it has to be in
3680 * one piece because free() must be able to free it all.) [NB: not true in perl]
3681 *
3682 * Beware that the optimization-preparation code in here knows about some
3683 * of the structure of the compiled regexp. [I'll say.]
3684 */
3685regexp *
864dbfa3 3686Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3687{
97aff369 3688 dVAR;
a0d0e21e 3689 register regexp *r;
c277df42 3690 regnode *scan;
c277df42 3691 regnode *first;
a0d0e21e 3692 I32 flags;
a0d0e21e
LW
3693 I32 minlen = 0;
3694 I32 sawplus = 0;
3695 I32 sawopen = 0;
2c2d71f5 3696 scan_data_t data;
830247a4 3697 RExC_state_t RExC_state;
be8e71aa 3698 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3699#ifdef TRIE_STUDY_OPT
3700 int restudied= 0;
3701 RExC_state_t copyRExC_state;
3702#endif
a0d0e21e 3703
a3621e74
YO
3704 GET_RE_DEBUG_FLAGS_DECL;
3705
a0d0e21e 3706 if (exp == NULL)
c277df42 3707 FAIL("NULL regexp argument");
a0d0e21e 3708
a5961de5 3709 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3710
5cfc7842 3711 RExC_precomp = exp;
a3621e74
YO
3712 DEBUG_r(if (!PL_colorset) reginitcolors());
3713 DEBUG_COMPILE_r({
ab3bbdeb
YO
3714 SV *dsv= sv_newmortal();
3715 RE_PV_QUOTED_DECL(s, RExC_utf8,
3716 dsv, RExC_precomp, (xend - exp), 60);
3717 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3718 PL_colors[4],PL_colors[5],s);
a5961de5 3719 });
e2509266 3720 RExC_flags = pm->op_pmflags;
830247a4 3721 RExC_sawback = 0;
bbce6d69 3722
830247a4
IZ
3723 RExC_seen = 0;
3724 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3725 RExC_seen_evals = 0;
3726 RExC_extralen = 0;
c277df42 3727
bbce6d69 3728 /* First pass: determine size, legality. */
830247a4 3729 RExC_parse = exp;
fac92740 3730 RExC_start = exp;
830247a4
IZ
3731 RExC_end = xend;
3732 RExC_naughty = 0;
3733 RExC_npar = 1;
3734 RExC_size = 0L;
3735 RExC_emit = &PL_regdummy;
3736 RExC_whilem_seen = 0;
85ddcde9
JH
3737#if 0 /* REGC() is (currently) a NOP at the first pass.
3738 * Clever compilers notice this and complain. --jhi */
830247a4 3739 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3740#endif
3dab1dad
YO
3741 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3742 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3743 RExC_precomp = NULL;
a0d0e21e
LW
3744 return(NULL);
3745 }
3dab1dad
YO
3746 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3747 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3748 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3749 DEBUG_PARSE_r({
3750 RExC_lastnum=0;
3751 RExC_lastparse=NULL;
3752 });
c277df42 3753
07be1b83 3754
c277df42
IZ
3755 /* Small enough for pointer-storage convention?
3756 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3757 if (RExC_size >= 0x10000L && RExC_extralen)
3758 RExC_size += RExC_extralen;
c277df42 3759 else
830247a4
IZ
3760 RExC_extralen = 0;
3761 if (RExC_whilem_seen > 15)
3762 RExC_whilem_seen = 15;
a0d0e21e 3763
bbce6d69 3764 /* Allocate space and initialize. */
a02a5408 3765 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3766 char, regexp);
a0d0e21e 3767 if (r == NULL)
b45f050a
JF
3768 FAIL("Regexp out of space");
3769
0f79a09d
GS
3770#ifdef DEBUGGING
3771 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3772 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3773#endif
c277df42 3774 r->refcnt = 1;
bbce6d69 3775 r->prelen = xend - exp;
5cfc7842 3776 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3777 r->subbeg = NULL;
f8c7b90f 3778#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3779 r->saved_copy = NULL;
ed252734 3780#endif
cf93c79d 3781 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3782 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3783 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3784
3785 r->substrs = 0; /* Useful during FAIL. */
3786 r->startp = 0; /* Useful during FAIL. */
3787 r->endp = 0; /* Useful during FAIL. */
3788
a02a5408 3789 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3790 if (r->offsets) {
2af232bd 3791 r->offsets[0] = RExC_size;
fac92740 3792 }
a3621e74 3793 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3794 "%s %"UVuf" bytes for offset annotations.\n",
3795 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3796 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3797
830247a4 3798 RExC_rx = r;
bbce6d69 3799
3800 /* Second pass: emit code. */
e2509266 3801 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3802 RExC_parse = exp;
3803 RExC_end = xend;
3804 RExC_naughty = 0;
3805 RExC_npar = 1;
fac92740 3806 RExC_emit_start = r->program;
830247a4 3807 RExC_emit = r->program;
2cd61cdb 3808 /* Store the count of eval-groups for security checks: */
786e8c11 3809 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3810 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3811 r->data = 0;
3dab1dad 3812 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3813 return(NULL);
07be1b83
YO
3814 /* XXXX To minimize changes to RE engine we always allocate
3815 3-units-long substrs field. */
3816 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3817
07be1b83 3818reStudy:
1de06328 3819 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83
YO
3820 Zero(r->substrs, 1, struct reg_substr_data);
3821 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3822
07be1b83
YO
3823#ifdef TRIE_STUDY_OPT
3824 if ( restudied ) {
3825 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3826 RExC_state=copyRExC_state;
1de06328 3827 if (data.last_found) {
07be1b83 3828 SvREFCNT_dec(data.longest_fixed);
07be1b83 3829 SvREFCNT_dec(data.longest_float);
07be1b83 3830 SvREFCNT_dec(data.last_found);
1de06328 3831 }
07be1b83
YO
3832 } else {
3833 copyRExC_state=RExC_state;
3834 }
3835#endif
a0d0e21e 3836 /* Dig out information for optimizations. */
cf93c79d 3837 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3838 pm->op_pmflags = RExC_flags;
a0ed51b3 3839 if (UTF)
5ff6fc6d 3840 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3841 r->regstclass = NULL;
830247a4 3842 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3843 r->reganch |= ROPT_NAUGHTY;
c277df42 3844 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3845
1de06328
YO
3846 /* testing for BRANCH here tells us whether there is "must appear"
3847 data in the pattern. If there is then we can use it for optimisations */
c277df42 3848 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3849 I32 fake;
c5254dd6 3850 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3851 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3852 int stclass_flag;
07be1b83 3853 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3854
3855 first = scan;
c277df42 3856 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3857 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3858 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3859 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3860 /* for now we can't handle lookbehind IFMATCH*/
3861 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3862 (OP(first) == PLUS) ||
3863 (OP(first) == MINMOD) ||
653099ff 3864 /* An {n,m} with n>0 */
07be1b83
YO
3865 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3866 {
786e8c11 3867
a0d0e21e
LW
3868 if (OP(first) == PLUS)
3869 sawplus = 1;
3870 else
3dab1dad 3871 first += regarglen[OP(first)];
07be1b83
YO
3872 if (OP(first) == IFMATCH) {
3873 first = NEXTOPER(first);
3874 first += EXTRA_STEP_2ARGS;
7c167cea 3875 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3876 first = NEXTOPER(first);
a687059c
LW
3877 }
3878
a0d0e21e
LW
3879 /* Starting-point info. */
3880 again:
786e8c11 3881 DEBUG_PEEP("first:",first,0);
07be1b83 3882 /* Ignore EXACT as we deal with it later. */
3dab1dad 3883 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3884 if (OP(first) == EXACT)
6f207bd3 3885 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3886 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3887 r->regstclass = first;
3888 }
07be1b83 3889#ifdef TRIE_STCLASS
786e8c11 3890 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3891 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3892 {
786e8c11 3893 regnode *trie_op;
07be1b83 3894 /* this can happen only on restudy */
786e8c11
YO
3895 if ( OP(first) == TRIE ) {
3896 struct regnode_1 *trieop;
3897 Newxz(trieop,1,struct regnode_1);
3898 StructCopy(first,trieop,struct regnode_1);
3899 trie_op=(regnode *)trieop;
3900 } else {
3901 struct regnode_charclass *trieop;
3902 Newxz(trieop,1,struct regnode_charclass);
3903 StructCopy(first,trieop,struct regnode_charclass);
3904 trie_op=(regnode *)trieop;
3905 }
1de06328 3906 OP(trie_op)+=2;
786e8c11
YO
3907 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3908 r->regstclass = trie_op;
07be1b83
YO
3909 }
3910#endif
bfed75c6 3911 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3912 r->regstclass = first;
3dab1dad
YO
3913 else if (PL_regkind[OP(first)] == BOUND ||
3914 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3915 r->regstclass = first;
3dab1dad 3916 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3917 r->reganch |= (OP(first) == MBOL
3918 ? ROPT_ANCH_MBOL
3919 : (OP(first) == SBOL
3920 ? ROPT_ANCH_SBOL
3921 : ROPT_ANCH_BOL));
a0d0e21e 3922 first = NEXTOPER(first);
774d564b 3923 goto again;
3924 }
3925 else if (OP(first) == GPOS) {
3926 r->reganch |= ROPT_ANCH_GPOS;
3927 first = NEXTOPER(first);
3928 goto again;
a0d0e21e 3929 }
e09294f4 3930 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3931 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3932 !(r->reganch & ROPT_ANCH) )
3933 {
3934 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3935 const int type =
3936 (OP(NEXTOPER(first)) == REG_ANY)
3937 ? ROPT_ANCH_MBOL
3938 : ROPT_ANCH_SBOL;
cad2e5aa 3939 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3940 first = NEXTOPER(first);
774d564b 3941 goto again;
a0d0e21e 3942 }
b81d288d 3943 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3944 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3945 /* x+ must match at the 1st pos of run of x's */
3946 r->reganch |= ROPT_SKIP;
a0d0e21e 3947
c277df42 3948 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3949#ifdef TRIE_STUDY_OPT
3950 DEBUG_COMPILE_r(
3951 if (!restudied)
3952 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3953 (IV)(first - scan + 1))
3954 );
3955#else
3956 DEBUG_COMPILE_r(
3957 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3958 (IV)(first - scan + 1))
3959 );
3960#endif
3961
3962
a0d0e21e
LW
3963 /*
3964 * If there's something expensive in the r.e., find the
3965 * longest literal string that must appear and make it the
3966 * regmust. Resolve ties in favor of later strings, since
3967 * the regstart check works with the beginning of the r.e.
3968 * and avoiding duplication strengthens checking. Not a
3969 * strong reason, but sufficient in the absence of others.
3970 * [Now we resolve ties in favor of the earlier string if
c277df42 3971 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3972 * earlier string may buy us something the later one won't.]
3973 */
a0d0e21e 3974 minlen = 0;
a687059c 3975
396482e1
GA
3976 data.longest_fixed = newSVpvs("");
3977 data.longest_float = newSVpvs("");
3978 data.last_found = newSVpvs("");
c277df42
IZ
3979 data.longest = &(data.longest_fixed);
3980 first = scan;
653099ff 3981 if (!r->regstclass) {
830247a4 3982 cl_init(pRExC_state, &ch_class);
653099ff
GS
3983 data.start_class = &ch_class;
3984 stclass_flag = SCF_DO_STCLASS_AND;
3985 } else /* XXXX Check for BOUND? */
3986 stclass_flag = 0;
cb434fcc 3987 data.last_closep = &last_close;
653099ff 3988
1de06328 3989 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
a3621e74 3990 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 3991
07be1b83 3992
786e8c11
YO
3993 CHECK_RESTUDY_GOTO;
3994
3995
830247a4 3996 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3997 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3998 && !RExC_seen_zerolen
3999 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4000 r->reganch |= ROPT_CHECK_ALL;
1de06328 4001 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4002 SvREFCNT_dec(data.last_found);
4003
1de06328
YO
4004 /* Note that code very similar to this but for anchored string
4005 follows immediately below, changes may need to be made to both.
4006 Be careful.
4007 */
a0ed51b3 4008 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4009 if (longest_float_length
c277df42
IZ
4010 || (data.flags & SF_FL_BEFORE_EOL
4011 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4012 || (RExC_flags & PMf_MULTILINE))))
4013 {
1182767e 4014 I32 t,ml;
cf93c79d 4015
1de06328 4016 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4017 && data.offset_fixed == data.offset_float_min
4018 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4019 goto remove_float; /* As in (a)+. */
4020
1de06328
YO
4021 /* copy the information about the longest float from the reg_scan_data
4022 over to the program. */
33b8afdf
JH
4023 if (SvUTF8(data.longest_float)) {
4024 r->float_utf8 = data.longest_float;
c445ea15 4025 r->float_substr = NULL;
33b8afdf
JH
4026 } else {
4027 r->float_substr = data.longest_float;
c445ea15 4028 r->float_utf8 = NULL;
33b8afdf 4029 }
1de06328
YO
4030 /* float_end_shift is how many chars that must be matched that
4031 follow this item. We calculate it ahead of time as once the
4032 lookbehind offset is added in we lose the ability to correctly
4033 calculate it.*/
4034 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4035 : (I32)longest_float_length;
1de06328
YO
4036 r->float_end_shift = ml - data.offset_float_min
4037 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4038 + data.lookbehind_float;
4039 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4040 r->float_max_offset = data.offset_float_max;
1182767e 4041 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4042 r->float_max_offset -= data.lookbehind_float;
4043
cf93c79d
IZ
4044 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4045 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4046 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4047 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4048 }
4049 else {
aca2d497 4050 remove_float:
c445ea15 4051 r->float_substr = r->float_utf8 = NULL;
c277df42 4052 SvREFCNT_dec(data.longest_float);
c5254dd6 4053 longest_float_length = 0;
a0d0e21e 4054 }
c277df42 4055
1de06328
YO
4056 /* Note that code very similar to this but for floating string
4057 is immediately above, changes may need to be made to both.
4058 Be careful.
4059 */
a0ed51b3 4060 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4061 if (longest_fixed_length
c277df42
IZ
4062 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4063 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4064 || (RExC_flags & PMf_MULTILINE))))
4065 {
1182767e 4066 I32 t,ml;
cf93c79d 4067
1de06328
YO
4068 /* copy the information about the longest fixed
4069 from the reg_scan_data over to the program. */
33b8afdf
JH
4070 if (SvUTF8(data.longest_fixed)) {
4071 r->anchored_utf8 = data.longest_fixed;
c445ea15 4072 r->anchored_substr = NULL;
33b8afdf
JH
4073 } else {
4074 r->anchored_substr = data.longest_fixed;
c445ea15 4075 r->anchored_utf8 = NULL;
33b8afdf 4076 }
1de06328
YO
4077 /* fixed_end_shift is how many chars that must be matched that
4078 follow this item. We calculate it ahead of time as once the
4079 lookbehind offset is added in we lose the ability to correctly
4080 calculate it.*/
4081 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4082 : (I32)longest_fixed_length;
1de06328
YO
4083 r->anchored_end_shift = ml - data.offset_fixed
4084 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4085 + data.lookbehind_fixed;
4086 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4087
cf93c79d
IZ
4088 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4089 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4090 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4091 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4092 }
4093 else {
c445ea15 4094 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4095 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4096 longest_fixed_length = 0;
a0d0e21e 4097 }
b81d288d 4098 if (r->regstclass
ffc61ed2 4099 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4100 r->regstclass = NULL;
33b8afdf
JH
4101 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4102 && stclass_flag
653099ff 4103 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4104 && !cl_is_anything(data.start_class))
4105 {
1df70142 4106 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4107
a02a5408 4108 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4109 struct regnode_charclass_class);
4110 StructCopy(data.start_class,
830247a4 4111 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4112 struct regnode_charclass_class);
830247a4 4113 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4114 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4115 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4116 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4117 PerlIO_printf(Perl_debug_log,
a0288114 4118 "synthetic stclass \"%s\".\n",
3f7c398e 4119 SvPVX_const(sv));});
653099ff 4120 }
c277df42
IZ
4121
4122 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4123 if (longest_fixed_length > longest_float_length) {
1de06328 4124 r->check_end_shift = r->anchored_end_shift;
c277df42 4125 r->check_substr = r->anchored_substr;
33b8afdf 4126 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4127 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4128 if (r->reganch & ROPT_ANCH_SINGLE)
4129 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4130 }
4131 else {
1de06328 4132 r->check_end_shift = r->float_end_shift;
c277df42 4133 r->check_substr = r->float_substr;
33b8afdf 4134 r->check_utf8 = r->float_utf8;
1de06328
YO
4135 r->check_offset_min = r->float_min_offset;
4136 r->check_offset_max = r->float_max_offset;
a0d0e21e 4137 }
30382c73
IZ
4138 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4139 This should be changed ASAP! */
33b8afdf 4140 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4141 r->reganch |= RE_USE_INTUIT;
33b8afdf 4142 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4143 r->reganch |= RE_INTUIT_TAIL;
4144 }
1de06328
YO
4145 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4146 if ( (STRLEN)minlen < longest_float_length )
4147 minlen= longest_float_length;
4148 if ( (STRLEN)minlen < longest_fixed_length )
4149 minlen= longest_fixed_length;
4150 */
a0ed51b3
LW
4151 }
4152 else {
c277df42
IZ
4153 /* Several toplevels. Best we can is to set minlen. */
4154 I32 fake;
653099ff 4155 struct regnode_charclass_class ch_class;
cb434fcc 4156 I32 last_close = 0;
c277df42 4157
a3621e74 4158 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4159
c277df42 4160 scan = r->program + 1;
830247a4 4161 cl_init(pRExC_state, &ch_class);
653099ff 4162 data.start_class = &ch_class;
cb434fcc 4163 data.last_closep = &last_close;
07be1b83 4164
1de06328 4165 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
07be1b83
YO
4166 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4167
786e8c11 4168 CHECK_RESTUDY_GOTO;
07be1b83 4169
33b8afdf 4170 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4171 = r->float_substr = r->float_utf8 = NULL;
653099ff 4172 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4173 && !cl_is_anything(data.start_class))
4174 {
1df70142 4175 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4176
a02a5408 4177 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4178 struct regnode_charclass_class);
4179 StructCopy(data.start_class,
830247a4 4180 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4181 struct regnode_charclass_class);
830247a4 4182 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4183 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4184 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4185 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4186 PerlIO_printf(Perl_debug_log,
a0288114 4187 "synthetic stclass \"%s\".\n",
3f7c398e 4188 SvPVX_const(sv));});
653099ff 4189 }
a0d0e21e
LW
4190 }
4191
1de06328
YO
4192 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4193 the "real" pattern. */
4194 if (r->minlen < minlen)
4195 r->minlen = minlen;
4196
b81d288d 4197 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4198 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4199 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4200 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4201 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4202 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4203 if (RExC_seen & REG_SEEN_CANY)
4204 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
4205 Newxz(r->startp, RExC_npar, I32);
4206 Newxz(r->endp, RExC_npar, I32);
07be1b83 4207
f2278c82 4208 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4209 DEBUG_DUMP_r({
4210 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4211 regdump(r);
4212 });
8e9a8a48
YO
4213 DEBUG_OFFSETS_r(if (r->offsets) {
4214 const U32 len = r->offsets[0];
4215 U32 i;
4216 GET_RE_DEBUG_FLAGS_DECL;
4217 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4218 for (i = 1; i <= len; i++) {
4219 if (r->offsets[i*2-1] || r->offsets[i*2])
4220 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4221 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4222 }
4223 PerlIO_printf(Perl_debug_log, "\n");
4224 });
a0d0e21e 4225 return(r);
a687059c
LW
4226}
4227
3dab1dad
YO
4228
4229#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4230 int rem=(int)(RExC_end - RExC_parse); \
4231 int cut; \
4232 int num; \
4233 int iscut=0; \
4234 if (rem>10) { \
4235 rem=10; \
4236 iscut=1; \
4237 } \
4238 cut=10-rem; \
4239 if (RExC_lastparse!=RExC_parse) \
4240 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4241 rem, RExC_parse, \
4242 cut + 4, \
4243 iscut ? "..." : "<" \
4244 ); \
4245 else \
4246 PerlIO_printf(Perl_debug_log,"%16s",""); \
4247 \
4248 if (SIZE_ONLY) \
4249 num=RExC_size; \
4250 else \
4251 num=REG_NODE_NUM(RExC_emit); \
4252 if (RExC_lastnum!=num) \
be8e71aa 4253 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4254 else \
be8e71aa
YO
4255 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4256 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4257 (int)((depth*2)), "", \
3dab1dad
YO
4258 (funcname) \
4259 ); \
4260 RExC_lastnum=num; \
4261 RExC_lastparse=RExC_parse; \
4262})
4263
07be1b83
YO
4264
4265
3dab1dad
YO
4266#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4267 DEBUG_PARSE_MSG((funcname)); \
4268 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4269})
a687059c
LW
4270/*
4271 - reg - regular expression, i.e. main body or parenthesized thing
4272 *
4273 * Caller must absorb opening parenthesis.
4274 *
4275 * Combining parenthesis handling with the base level of regular expression
4276 * is a trifle forced, but the need to tie the tails of the branches to what
4277 * follows makes it hard to avoid.
4278 */
07be1b83
YO
4279#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4280#ifdef DEBUGGING
4281#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4282#else
4283#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4284#endif
3dab1dad 4285
76e3520e 4286STATIC regnode *
3dab1dad 4287S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4288 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4289{
27da23d5 4290 dVAR;
c277df42
IZ
4291 register regnode *ret; /* Will be the head of the group. */
4292 register regnode *br;
4293 register regnode *lastbr;
cbbf8932 4294 register regnode *ender = NULL;
a0d0e21e 4295 register I32 parno = 0;
cbbf8932
AL
4296 I32 flags;
4297 const I32 oregflags = RExC_flags;
6136c704
AL
4298 bool have_branch = 0;
4299 bool is_open = 0;
9d1d55b5
JP
4300
4301 /* for (?g), (?gc), and (?o) warnings; warning
4302 about (?c) will warn about (?g) -- japhy */
4303
6136c704
AL
4304#define WASTED_O 0x01
4305#define WASTED_G 0x02
4306#define WASTED_C 0x04
4307#define WASTED_GC (0x02|0x04)
cbbf8932 4308 I32 wastedflags = 0x00;
9d1d55b5 4309
fac92740 4310 char * parse_start = RExC_parse; /* MJD */
a28509cc 4311 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4312
3dab1dad
YO
4313 GET_RE_DEBUG_FLAGS_DECL;
4314 DEBUG_PARSE("reg ");
4315
4316
821b33a5 4317 *flagp = 0; /* Tentatively. */
a0d0e21e 4318
9d1d55b5 4319
a0d0e21e
LW
4320 /* Make an OPEN node, if parenthesized. */
4321 if (paren) {
fac92740 4322 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4323 U32 posflags = 0, negflags = 0;
4324 U32 *flagsp = &posflags;
6136c704 4325 bool is_logical = 0;
a28509cc 4326 const char * const seqstart = RExC_parse;
ca9dfc88 4327
830247a4
IZ
4328 RExC_parse++;
4329 paren = *RExC_parse++;
c277df42 4330 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4331 switch (paren) {
fac92740 4332 case '<': /* (?<...) */
830247a4 4333 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 4334 if (*RExC_parse == '!')
c277df42 4335 paren = ',';
b81d288d 4336 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 4337 goto unknown;
830247a4 4338 RExC_parse++;
fac92740
MJD
4339 case '=': /* (?=...) */
4340 case '!': /* (?!...) */
830247a4 4341 RExC_seen_zerolen++;
fac92740
MJD
4342 case ':': /* (?:...) */
4343 case '>': /* (?>...) */
a0d0e21e 4344 break;
fac92740
MJD
4345 case '$': /* (?$...) */
4346 case '@': /* (?@...) */
8615cb43 4347 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4348 break;
fac92740 4349 case '#': /* (?#...) */
830247a4
IZ
4350 while (*RExC_parse && *RExC_parse != ')')
4351 RExC_parse++;
4352 if (*RExC_parse != ')')
c277df42 4353 FAIL("Sequence (?#... not terminated");
830247a4 4354 nextchar(pRExC_state);
a0d0e21e
LW
4355 *flagp = TRYAGAIN;
4356 return NULL;
fac92740 4357 case 'p': /* (?p...) */
9014280d 4358 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4359 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4360 /* FALL THROUGH*/
fac92740 4361 case '?': /* (??...) */
6136c704 4362 is_logical = 1;
438a3801
YST
4363 if (*RExC_parse != '{')
4364 goto unknown;
830247a4 4365 paren = *RExC_parse++;
0f5d15d6 4366 /* FALL THROUGH */
fac92740 4367 case '{': /* (?{...}) */
c277df42 4368 {
c277df42
IZ
4369 I32 count = 1, n = 0;
4370 char c;
830247a4 4371 char *s = RExC_parse;
c277df42 4372
830247a4
IZ
4373 RExC_seen_zerolen++;
4374 RExC_seen |= REG_SEEN_EVAL;
4375 while (count && (c = *RExC_parse)) {
6136c704
AL
4376 if (c == '\\') {
4377 if (RExC_parse[1])
4378 RExC_parse++;
4379 }
b81d288d 4380 else if (c == '{')
c277df42 4381 count++;
b81d288d 4382 else if (c == '}')
c277df42 4383 count--;
830247a4 4384 RExC_parse++;
c277df42 4385 }
6136c704 4386 if (*RExC_parse != ')') {
b81d288d 4387 RExC_parse = s;
b45f050a
JF
4388 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4389 }
c277df42 4390 if (!SIZE_ONLY) {
f3548bdc 4391 PAD *pad;
6136c704
AL
4392 OP_4tree *sop, *rop;
4393 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4394
569233ed
SB
4395 ENTER;
4396 Perl_save_re_context(aTHX);
f3548bdc 4397 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4398 sop->op_private |= OPpREFCOUNTED;
4399 /* re_dup will OpREFCNT_inc */
4400 OpREFCNT_set(sop, 1);
569233ed 4401 LEAVE;
c277df42 4402
830247a4
IZ
4403 n = add_data(pRExC_state, 3, "nop");
4404 RExC_rx->data->data[n] = (void*)rop;
4405 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4406 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4407 SvREFCNT_dec(sv);
a0ed51b3 4408 }
e24b16f9 4409 else { /* First pass */
830247a4 4410 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4411 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4412 /* No compiled RE interpolated, has runtime
4413 components ===> unsafe. */
4414 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4415 if (PL_tainting && PL_tainted)
cc6b7395 4416 FAIL("Eval-group in insecure regular expression");
54df2634 4417#if PERL_VERSION > 8
923e4eb5 4418 if (IN_PERL_COMPILETIME)
b5c19bd7 4419 PL_cv_has_eval = 1;
54df2634 4420#endif
c277df42 4421 }
b5c19bd7 4422
830247a4 4423 nextchar(pRExC_state);
6136c704 4424 if (is_logical) {
830247a4 4425 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4426 if (!SIZE_ONLY)
4427 ret->flags = 2;
3dab1dad 4428 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4429 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4430 return ret;
4431 }
ccb2c380
MP
4432 ret = reganode(pRExC_state, EVAL, n);
4433 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4434 Set_Node_Offset(ret, parse_start);
4435 return ret;
c277df42 4436 }
fac92740 4437 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4438 {
fac92740 4439 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4440 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4441 || RExC_parse[1] == '<'
830247a4 4442 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4443 I32 flag;
4444
830247a4 4445 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4446 if (!SIZE_ONLY)
4447 ret->flags = 1;
3dab1dad 4448 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4449 goto insert_if;
b81d288d 4450 }
a0ed51b3 4451 }
830247a4 4452 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4453 /* (?(1)...) */
6136c704 4454 char c;
830247a4 4455 parno = atoi(RExC_parse++);
c277df42 4456
830247a4
IZ
4457 while (isDIGIT(*RExC_parse))
4458 RExC_parse++;
fac92740 4459 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4460
830247a4 4461 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4462 vFAIL("Switch condition not recognized");
c277df42 4463 insert_if:
3dab1dad
YO
4464 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4465 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4466 if (br == NULL)
830247a4 4467 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4468 else
3dab1dad 4469 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4470 c = *nextchar(pRExC_state);
d1b80229
IZ
4471 if (flags&HASWIDTH)
4472 *flagp |= HASWIDTH;
c277df42 4473 if (c == '|') {
830247a4 4474 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4475 regbranch(pRExC_state, &flags, 1,depth+1);
4476 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4477 if (flags&HASWIDTH)
4478 *flagp |= HASWIDTH;
830247a4 4479 c = *nextchar(pRExC_state);
a0ed51b3
LW
4480 }
4481 else
c277df42
IZ
4482 lastbr = NULL;
4483 if (c != ')')
8615cb43 4484 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4485 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4486 REGTAIL(pRExC_state, br, ender);
c277df42 4487 if (lastbr) {
3dab1dad
YO
4488 REGTAIL(pRExC_state, lastbr, ender);
4489 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4490 }
4491 else
3dab1dad 4492 REGTAIL(pRExC_state, ret, ender);
c277df42 4493 return ret;
a0ed51b3
LW
4494 }
4495 else {
830247a4 4496 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4497 }
4498 }
1b1626e4 4499 case 0:
830247a4 4500 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4501 vFAIL("Sequence (? incomplete");
1b1626e4 4502 break;
a0d0e21e 4503 default:
830247a4 4504 --RExC_parse;
fac92740 4505 parse_flags: /* (?i) */
830247a4 4506 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4507 /* (?g), (?gc) and (?o) are useless here
4508 and must be globally applied -- japhy */
4509
4510 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4511 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4512 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4513 if (! (wastedflags & wflagbit) ) {
4514 wastedflags |= wflagbit;
4515 vWARN5(
4516 RExC_parse + 1,
4517 "Useless (%s%c) - %suse /%c modifier",
4518 flagsp == &negflags ? "?-" : "?",
4519 *RExC_parse,
4520 flagsp == &negflags ? "don't " : "",
4521 *RExC_parse
4522 );
4523 }
4524 }
4525 }
4526 else if (*RExC_parse == 'c') {
4527 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4528 if (! (wastedflags & WASTED_C) ) {
4529 wastedflags |= WASTED_GC;
9d1d55b5
JP
4530 vWARN3(
4531 RExC_parse + 1,
4532 "Useless (%sc) - %suse /gc modifier",
4533 flagsp == &negflags ? "?-" : "?",
4534 flagsp == &negflags ? "don't " : ""
4535 );
4536 }
4537 }
4538 }
4539 else { pmflag(flagsp, *RExC_parse); }
4540
830247a4 4541 ++RExC_parse;
ca9dfc88 4542 }
830247a4 4543 if (*RExC_parse == '-') {
ca9dfc88 4544 flagsp = &negflags;
9d1d55b5 4545 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4546 ++RExC_parse;
ca9dfc88 4547 goto parse_flags;
48c036b1 4548 }
e2509266
JH
4549 RExC_flags |= posflags;
4550 RExC_flags &= ~negflags;
830247a4
IZ
4551 if (*RExC_parse == ':') {
4552 RExC_parse++;
ca9dfc88
IZ
4553 paren = ':';
4554 break;
4555 }
c277df42 4556 unknown:
830247a4
IZ
4557 if (*RExC_parse != ')') {
4558 RExC_parse++;
4559 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4560 }
830247a4 4561 nextchar(pRExC_state);
a0d0e21e
LW
4562 *flagp = TRYAGAIN;
4563 return NULL;
4564 }
4565 }
fac92740 4566 else { /* (...) */
830247a4
IZ
4567 parno = RExC_npar;
4568 RExC_npar++;
4569 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4570 Set_Node_Length(ret, 1); /* MJD */
4571 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4572 is_open = 1;
a0d0e21e 4573 }
a0ed51b3 4574 }
fac92740 4575 else /* ! paren */
a0d0e21e
LW
4576 ret = NULL;
4577
4578 /* Pick up the branches, linking them together. */
fac92740 4579 parse_start = RExC_parse; /* MJD */
3dab1dad 4580 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4581 /* branch_len = (paren != 0); */
2af232bd 4582
a0d0e21e
LW
4583 if (br == NULL)
4584 return(NULL);
830247a4
IZ
4585 if (*RExC_parse == '|') {
4586 if (!SIZE_ONLY && RExC_extralen) {
4587 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4588 }
fac92740 4589 else { /* MJD */
830247a4 4590 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4591 Set_Node_Length(br, paren != 0);
4592 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4593 }
c277df42
IZ
4594 have_branch = 1;
4595 if (SIZE_ONLY)
830247a4 4596 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4597 }
4598 else if (paren == ':') {
c277df42
IZ
4599 *flagp |= flags&SIMPLE;
4600 }
6136c704 4601 if (is_open) { /* Starts with OPEN. */
3dab1dad 4602 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4603 }
4604 else if (paren != '?') /* Not Conditional */
a0d0e21e 4605 ret = br;
32a0ca98 4606 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4607 lastbr = br;
830247a4
IZ
4608 while (*RExC_parse == '|') {
4609 if (!SIZE_ONLY && RExC_extralen) {
4610 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4611 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4612 }
4613 if (SIZE_ONLY)
830247a4
IZ
4614 RExC_extralen += 2; /* Account for LONGJMP. */
4615 nextchar(pRExC_state);
3dab1dad 4616 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4617
a687059c 4618 if (br == NULL)
a0d0e21e 4619 return(NULL);
3dab1dad 4620 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4621 lastbr = br;
821b33a5
IZ
4622 if (flags&HASWIDTH)
4623 *flagp |= HASWIDTH;
a687059c 4624 *flagp |= flags&SPSTART;
a0d0e21e
LW
4625 }
4626
c277df42
IZ
4627 if (have_branch || paren != ':') {
4628 /* Make a closing node, and hook it on the end. */
4629 switch (paren) {
4630 case ':':
830247a4 4631 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4632 break;
4633 case 1:
830247a4 4634 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4635 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4636 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4637 break;
4638 case '<':
c277df42
IZ
4639 case ',':
4640 case '=':
4641 case '!':
c277df42 4642 *flagp &= ~HASWIDTH;
821b33a5
IZ
4643 /* FALL THROUGH */
4644 case '>':
830247a4 4645 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4646 break;
4647 case 0:
830247a4 4648 ender = reg_node(pRExC_state, END);
c277df42
IZ
4649 break;
4650 }
07be1b83 4651 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4652
9674d46a 4653 if (have_branch && !SIZE_ONLY) {
c277df42 4654 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4655 for (br = ret; br; br = regnext(br)) {
4656 const U8 op = PL_regkind[OP(br)];
4657 if (op == BRANCH) {
07be1b83 4658 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4659 }
4660 else if (op == BRANCHJ) {
07be1b83 4661 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4662 }
c277df42
IZ
4663 }
4664 }
a0d0e21e 4665 }
c277df42
IZ
4666
4667 {
e1ec3a88
AL
4668 const char *p;
4669 static const char parens[] = "=!<,>";
c277df42
IZ
4670
4671 if (paren && (p = strchr(parens, paren))) {
eb160463 4672 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4673 int flag = (p - parens) > 1;
4674
4675 if (paren == '>')
4676 node = SUSPEND, flag = 0;
830247a4 4677 reginsert(pRExC_state, node,ret);
45948336
EP
4678 Set_Node_Cur_Length(ret);
4679 Set_Node_Offset(ret, parse_start + 1);
c277df42 4680 ret->flags = flag;
07be1b83 4681 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4682 }
a0d0e21e
LW
4683 }
4684
4685 /* Check for proper termination. */
ce3e6498 4686 if (paren) {
e2509266 4687 RExC_flags = oregflags;
830247a4
IZ
4688 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4689 RExC_parse = oregcomp_parse;
380a0633 4690 vFAIL("Unmatched (");
ce3e6498 4691 }
a0ed51b3 4692 }
830247a4
IZ
4693 else if (!paren && RExC_parse < RExC_end) {
4694 if (*RExC_parse == ')') {
4695 RExC_parse++;
380a0633 4696 vFAIL("Unmatched )");
a0ed51b3
LW
4697 }
4698 else
b45f050a 4699 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4700 /* NOTREACHED */
4701 }
a687059c 4702
a0d0e21e 4703 return(ret);
a687059c
LW
4704}
4705
4706/*
4707 - regbranch - one alternative of an | operator
4708 *
4709 * Implements the concatenation operator.
4710 */
76e3520e 4711STATIC regnode *
3dab1dad 4712S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4713{
97aff369 4714 dVAR;
c277df42
IZ
4715 register regnode *ret;
4716 register regnode *chain = NULL;
4717 register regnode *latest;
4718 I32 flags = 0, c = 0;
3dab1dad
YO
4719 GET_RE_DEBUG_FLAGS_DECL;
4720 DEBUG_PARSE("brnc");
b81d288d 4721 if (first)
c277df42
IZ
4722 ret = NULL;
4723 else {
b81d288d 4724 if (!SIZE_ONLY && RExC_extralen)
830247a4 4725 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4726 else {
830247a4 4727 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4728 Set_Node_Length(ret, 1);
4729 }
c277df42
IZ
4730 }
4731
b81d288d 4732 if (!first && SIZE_ONLY)
830247a4 4733 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4734
c277df42 4735 *flagp = WORST; /* Tentatively. */
a0d0e21e 4736
830247a4
IZ
4737 RExC_parse--;
4738 nextchar(pRExC_state);
4739 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4740 flags &= ~TRYAGAIN;
3dab1dad 4741 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4742 if (latest == NULL) {
4743 if (flags & TRYAGAIN)
4744 continue;
4745 return(NULL);
a0ed51b3
LW
4746 }
4747 else if (ret == NULL)
c277df42 4748 ret = latest;
a0d0e21e 4749 *flagp |= flags&HASWIDTH;
c277df42 4750 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4751 *flagp |= flags&SPSTART;
4752 else {
830247a4 4753 RExC_naughty++;
3dab1dad 4754 REGTAIL(pRExC_state, chain, latest);
a687059c 4755 }
a0d0e21e 4756 chain = latest;
c277df42
IZ
4757 c++;
4758 }
4759 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4760 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4761 if (ret == NULL)
4762 ret = chain;
4763 }
4764 if (c == 1) {
4765 *flagp |= flags&SIMPLE;
a0d0e21e 4766 }
a687059c 4767
d4c19fe8 4768 return ret;
a687059c
LW
4769}
4770
4771/*
4772 - regpiece - something followed by possible [*+?]
4773 *
4774 * Note that the branching code sequences used for ? and the general cases
4775 * of * and + are somewhat optimized: they use the same NOTHING node as
4776 * both the endmarker for their branch list and the body of the last branch.
4777 * It might seem that this node could be dispensed with entirely, but the
4778 * endmarker role is not redundant.
4779 */
76e3520e 4780STATIC regnode *
3dab1dad 4781S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4782{
97aff369 4783 dVAR;
c277df42 4784 register regnode *ret;
a0d0e21e
LW
4785 register char op;
4786 register char *next;
4787 I32 flags;
1df70142 4788 const char * const origparse = RExC_parse;
a0d0e21e 4789 I32 min;
c277df42 4790 I32 max = REG_INFTY;
fac92740 4791 char *parse_start;
10edeb5d 4792 const char *maxpos = NULL;
3dab1dad
YO
4793 GET_RE_DEBUG_FLAGS_DECL;
4794 DEBUG_PARSE("piec");
a0d0e21e 4795
3dab1dad 4796 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4797 if (ret == NULL) {
4798 if (flags & TRYAGAIN)
4799 *flagp |= TRYAGAIN;
4800 return(NULL);
4801 }
4802
830247a4 4803 op = *RExC_parse;
a0d0e21e 4804
830247a4 4805 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 4806 maxpos = NULL;
fac92740 4807 parse_start = RExC_parse; /* MJD */
830247a4 4808 next = RExC_parse + 1;
a0d0e21e
LW
4809 while (isDIGIT(*next) || *next == ',') {
4810 if (*next == ',') {
4811 if (maxpos)
4812 break;
4813 else
4814 maxpos = next;
a687059c 4815 }
a0d0e21e
LW
4816 next++;
4817 }
4818 if (*next == '}') { /* got one */
4819 if (!maxpos)
4820 maxpos = next;
830247a4
IZ
4821 RExC_parse++;
4822 min = atoi(RExC_parse);
a0d0e21e
LW
4823 if (*maxpos == ',')
4824 maxpos++;
4825 else
830247a4 4826 maxpos = RExC_parse;
a0d0e21e
LW
4827 max = atoi(maxpos);
4828 if (!max && *maxpos != '0')
c277df42
IZ
4829 max = REG_INFTY; /* meaning "infinity" */
4830 else if (max >= REG_INFTY)
8615cb43 4831 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4832 RExC_parse = next;
4833 nextchar(pRExC_state);
a0d0e21e
LW
4834
4835 do_curly:
4836 if ((flags&SIMPLE)) {
830247a4
IZ
4837 RExC_naughty += 2 + RExC_naughty / 2;
4838 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4839 Set_Node_Offset(ret, parse_start+1); /* MJD */
4840 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4841 }
4842 else {
3dab1dad 4843 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4844
4845 w->flags = 0;
3dab1dad 4846 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4847 if (!SIZE_ONLY && RExC_extralen) {
4848 reginsert(pRExC_state, LONGJMP,ret);
4849 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4850 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4851 }
830247a4 4852 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4853 /* MJD hk */
4854 Set_Node_Offset(ret, parse_start+1);
2af232bd 4855 Set_Node_Length(ret,
fac92740 4856 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4857
830247a4 4858 if (!SIZE_ONLY && RExC_extralen)
c277df42 4859 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4860 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4861 if (SIZE_ONLY)
830247a4
IZ
4862 RExC_whilem_seen++, RExC_extralen += 3;
4863 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4864 }
c277df42 4865 ret->flags = 0;
a0d0e21e
LW
4866
4867 if (min > 0)
821b33a5
IZ
4868 *flagp = WORST;
4869 if (max > 0)
4870 *flagp |= HASWIDTH;
a0d0e21e 4871 if (max && max < min)
8615cb43 4872 vFAIL("Can't do {n,m} with n > m");
c277df42 4873 if (!SIZE_ONLY) {
eb160463
GS
4874 ARG1_SET(ret, (U16)min);
4875 ARG2_SET(ret, (U16)max);
a687059c 4876 }
a687059c 4877
a0d0e21e 4878 goto nest_check;
a687059c 4879 }
a0d0e21e 4880 }
a687059c 4881
a0d0e21e
LW
4882 if (!ISMULT1(op)) {
4883 *flagp = flags;
a687059c 4884 return(ret);
a0d0e21e 4885 }
bb20fd44 4886
c277df42 4887#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4888
4889 /* if this is reinstated, don't forget to put this back into perldiag:
4890
4891 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4892
4893 (F) The part of the regexp subject to either the * or + quantifier
4894 could match an empty string. The {#} shows in the regular
4895 expression about where the problem was discovered.
4896
4897 */
4898
bb20fd44 4899 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4900 vFAIL("Regexp *+ operand could be empty");
b81d288d 4901#endif
bb20fd44 4902
fac92740 4903 parse_start = RExC_parse;
830247a4 4904 nextchar(pRExC_state);
a0d0e21e 4905
821b33a5 4906 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4907
4908 if (op == '*' && (flags&SIMPLE)) {
830247a4 4909 reginsert(pRExC_state, STAR, ret);
c277df42 4910 ret->flags = 0;
830247a4 4911 RExC_naughty += 4;
a0d0e21e
LW
4912 }
4913 else if (op == '*') {
4914 min = 0;
4915 goto do_curly;
a0ed51b3
LW
4916 }
4917 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4918 reginsert(pRExC_state, PLUS, ret);
c277df42 4919 ret->flags = 0;
830247a4 4920 RExC_naughty += 3;
a0d0e21e
LW
4921 }
4922 else if (op == '+') {
4923 min = 1;
4924 goto do_curly;
a0ed51b3
LW
4925 }
4926 else if (op == '?') {
a0d0e21e
LW
4927 min = 0; max = 1;
4928 goto do_curly;
4929 }
4930 nest_check:
041457d9 4931 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4932 vWARN3(RExC_parse,
b45f050a 4933 "%.*s matches null string many times",
afd78fd5 4934 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4935 origparse);
a0d0e21e
LW
4936 }
4937
830247a4
IZ
4938 if (*RExC_parse == '?') {
4939 nextchar(pRExC_state);
4940 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4941 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4942 }
830247a4
IZ
4943 if (ISMULT2(RExC_parse)) {
4944 RExC_parse++;
b45f050a
JF
4945 vFAIL("Nested quantifiers");
4946 }
a0d0e21e
LW
4947
4948 return(ret);
a687059c
LW
4949}
4950
4951/*
4952 - regatom - the lowest level
4953 *
4954 * Optimization: gobbles an entire sequence of ordinary characters so that
4955 * it can turn them into a single node, which is smaller to store and
4956 * faster to run. Backslashed characters are exceptions, each becoming a
4957 * separate node; the code is simpler that way and it's not worth fixing.
4958 *
7f6f358c
YO
4959 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4960 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4961 */
76e3520e 4962STATIC regnode *
3dab1dad 4963S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4964{
97aff369 4965 dVAR;
cbbf8932 4966 register regnode *ret = NULL;
a0d0e21e 4967 I32 flags;
45948336 4968 char *parse_start = RExC_parse;
3dab1dad
YO
4969 GET_RE_DEBUG_FLAGS_DECL;
4970 DEBUG_PARSE("atom");
a0d0e21e
LW
4971 *flagp = WORST; /* Tentatively. */
4972
4973tryagain:
830247a4 4974 switch (*RExC_parse) {
a0d0e21e 4975 case '^':
830247a4
IZ
4976 RExC_seen_zerolen++;
4977 nextchar(pRExC_state);
e2509266 4978 if (RExC_flags & PMf_MULTILINE)
830247a4 4979 ret = reg_node(pRExC_state, MBOL);
e2509266 4980 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4981 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4982 else
830247a4 4983 ret = reg_node(pRExC_state, BOL);
fac92740 4984 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4985 break;
4986 case '$':
830247a4 4987 nextchar(pRExC_state);
b81d288d 4988 if (*RExC_parse)
830247a4 4989 RExC_seen_zerolen++;
e2509266 4990 if (RExC_flags & PMf_MULTILINE)
830247a4 4991 ret = reg_node(pRExC_state, MEOL);
e2509266 4992 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4993 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4994 else
830247a4 4995 ret = reg_node(pRExC_state, EOL);
fac92740 4996 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4997 break;
4998 case '.':
830247a4 4999 nextchar(pRExC_state);
e2509266 5000 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5001 ret = reg_node(pRExC_state, SANY);
5002 else
5003 ret = reg_node(pRExC_state, REG_ANY);
5004 *flagp |= HASWIDTH|SIMPLE;
830247a4 5005 RExC_naughty++;
fac92740 5006 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5007 break;
5008 case '[':
b45f050a 5009 {
3dab1dad
YO
5010 char * const oregcomp_parse = ++RExC_parse;
5011 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5012 if (*RExC_parse != ']') {
5013 RExC_parse = oregcomp_parse;
b45f050a
JF
5014 vFAIL("Unmatched [");
5015 }
830247a4 5016 nextchar(pRExC_state);
a0d0e21e 5017 *flagp |= HASWIDTH|SIMPLE;
fac92740 5018 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5019 break;
b45f050a 5020 }
a0d0e21e 5021 case '(':
830247a4 5022 nextchar(pRExC_state);
3dab1dad 5023 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5024 if (ret == NULL) {
bf93d4cc 5025 if (flags & TRYAGAIN) {
830247a4 5026 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5027 /* Make parent create an empty node if needed. */
5028 *flagp |= TRYAGAIN;
5029 return(NULL);
5030 }
a0d0e21e 5031 goto tryagain;
bf93d4cc 5032 }
a0d0e21e
LW
5033 return(NULL);
5034 }
c277df42 5035 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5036 break;
5037 case '|':
5038 case ')':
5039 if (flags & TRYAGAIN) {
5040 *flagp |= TRYAGAIN;
5041 return NULL;
5042 }
b45f050a 5043 vFAIL("Internal urp");
a0d0e21e
LW
5044 /* Supposed to be caught earlier. */
5045 break;
85afd4ae 5046 case '{':
830247a4
IZ
5047 if (!regcurly(RExC_parse)) {
5048 RExC_parse++;
85afd4ae
CS
5049 goto defchar;
5050 }
5051 /* FALL THROUGH */
a0d0e21e
LW
5052 case '?':
5053 case '+':
5054 case '*':
830247a4 5055 RExC_parse++;
b45f050a 5056 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5057 break;
5058 case '\\':
830247a4 5059 switch (*++RExC_parse) {
a0d0e21e 5060 case 'A':
830247a4
IZ
5061 RExC_seen_zerolen++;
5062 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5063 *flagp |= SIMPLE;
830247a4 5064 nextchar(pRExC_state);
fac92740 5065 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5066 break;
5067 case 'G':
830247a4
IZ
5068 ret = reg_node(pRExC_state, GPOS);
5069 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5070 *flagp |= SIMPLE;
830247a4 5071 nextchar(pRExC_state);
fac92740 5072 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5073 break;
5074 case 'Z':
830247a4 5075 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5076 *flagp |= SIMPLE;
a1917ab9 5077 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5078 nextchar(pRExC_state);
a0d0e21e 5079 break;
b85d18e9 5080 case 'z':
830247a4 5081 ret = reg_node(pRExC_state, EOS);
b85d18e9 5082 *flagp |= SIMPLE;
830247a4
IZ
5083 RExC_seen_zerolen++; /* Do not optimize RE away */
5084 nextchar(pRExC_state);
fac92740 5085 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5086 break;
4a2d328f 5087 case 'C':
f33976b4
DB
5088 ret = reg_node(pRExC_state, CANY);
5089 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5090 *flagp |= HASWIDTH|SIMPLE;
830247a4 5091 nextchar(pRExC_state);
fac92740 5092 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5093 break;
5094 case 'X':
830247a4 5095 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5096 *flagp |= HASWIDTH;
830247a4 5097 nextchar(pRExC_state);
fac92740 5098 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5099 break;
a0d0e21e 5100 case 'w':
eb160463 5101 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5102 *flagp |= HASWIDTH|SIMPLE;
830247a4 5103 nextchar(pRExC_state);
fac92740 5104 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5105 break;
5106 case 'W':
eb160463 5107 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5108 *flagp |= HASWIDTH|SIMPLE;
830247a4 5109 nextchar(pRExC_state);
fac92740 5110 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5111 break;
5112 case 'b':
830247a4
IZ
5113 RExC_seen_zerolen++;
5114 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5115 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5116 *flagp |= SIMPLE;
830247a4 5117 nextchar(pRExC_state);
fac92740 5118 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5119 break;
5120 case 'B':
830247a4
IZ
5121 RExC_seen_zerolen++;
5122 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5123 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5124 *flagp |= SIMPLE;
830247a4 5125 nextchar(pRExC_state);
fac92740 5126 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5127 break;
5128 case 's':
eb160463 5129 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5130 *flagp |= HASWIDTH|SIMPLE;
830247a4 5131 nextchar(pRExC_state);
fac92740 5132 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5133 break;
5134 case 'S':
eb160463 5135 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5136 *flagp |= HASWIDTH|SIMPLE;
830247a4 5137 nextchar(pRExC_state);
fac92740 5138 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5139 break;
5140 case 'd':
ffc61ed2 5141 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5142 *flagp |= HASWIDTH|SIMPLE;
830247a4 5143 nextchar(pRExC_state);
fac92740 5144 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5145 break;
5146 case 'D':
ffc61ed2 5147 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5148 *flagp |= HASWIDTH|SIMPLE;
830247a4 5149 nextchar(pRExC_state);
fac92740 5150 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5151 break;
a14b48bc
LW
5152 case 'p':
5153 case 'P':
3568d838 5154 {
3dab1dad 5155 char* const oldregxend = RExC_end;
ccb2c380 5156 char* parse_start = RExC_parse - 2;
a14b48bc 5157
830247a4 5158 if (RExC_parse[1] == '{') {
3568d838 5159 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5160 RExC_end = strchr(RExC_parse, '}');
5161 if (!RExC_end) {
3dab1dad 5162 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5163 RExC_parse += 2;
5164 RExC_end = oldregxend;
0da60cf5 5165 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 5166 }
830247a4 5167 RExC_end++;
a14b48bc 5168 }
af6f566e 5169 else {
830247a4 5170 RExC_end = RExC_parse + 2;
af6f566e
HS
5171 if (RExC_end > oldregxend)
5172 RExC_end = oldregxend;
5173 }
830247a4 5174 RExC_parse--;
a14b48bc 5175
3dab1dad 5176 ret = regclass(pRExC_state,depth+1);
a14b48bc 5177
830247a4
IZ
5178 RExC_end = oldregxend;
5179 RExC_parse--;
ccb2c380
MP
5180
5181 Set_Node_Offset(ret, parse_start + 2);
5182 Set_Node_Cur_Length(ret);
830247a4 5183 nextchar(pRExC_state);
a14b48bc
LW
5184 *flagp |= HASWIDTH|SIMPLE;
5185 }
5186 break;
a0d0e21e
LW
5187 case 'n':
5188 case 'r':
5189 case 't':
5190 case 'f':
5191 case 'e':
5192 case 'a':
5193 case 'x':
5194 case 'c':
5195 case '0':
5196 goto defchar;
5197 case '1': case '2': case '3': case '4':
5198 case '5': case '6': case '7': case '8': case '9':
5199 {
1df70142 5200 const I32 num = atoi(RExC_parse);
a0d0e21e 5201
830247a4 5202 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
5203 goto defchar;
5204 else {
3dab1dad 5205 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
5206 while (isDIGIT(*RExC_parse))
5207 RExC_parse++;
b45f050a 5208
eb160463 5209 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 5210 vFAIL("Reference to nonexistent group");
830247a4 5211 RExC_sawback = 1;
eb160463
GS
5212 ret = reganode(pRExC_state,
5213 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5214 num);
a0d0e21e 5215 *flagp |= HASWIDTH;
2af232bd 5216
fac92740 5217 /* override incorrect value set in reganode MJD */
2af232bd 5218 Set_Node_Offset(ret, parse_start+1);
fac92740 5219 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
5220 RExC_parse--;
5221 nextchar(pRExC_state);
a0d0e21e
LW
5222 }
5223 }
5224 break;
5225 case '\0':
830247a4 5226 if (RExC_parse >= RExC_end)
b45f050a 5227 FAIL("Trailing \\");
a0d0e21e
LW
5228 /* FALL THROUGH */
5229 default:
a0288114 5230 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 5231 back into the quick-grab loop below */
45948336 5232 parse_start--;
a0d0e21e
LW
5233 goto defchar;
5234 }
5235 break;
4633a7c4
LW
5236
5237 case '#':
e2509266 5238 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
5239 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5240 RExC_parse++;
830247a4 5241 if (RExC_parse < RExC_end)
4633a7c4
LW
5242 goto tryagain;
5243 }
5244 /* FALL THROUGH */
5245
a0d0e21e 5246 default: {
ba210ebe 5247 register STRLEN len;
58ae7d3f 5248 register UV ender;
a0d0e21e 5249 register char *p;
3dab1dad 5250 char *s;
80aecb99 5251 STRLEN foldlen;
89ebb4a3 5252 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
5253
5254 parse_start = RExC_parse - 1;
a0d0e21e 5255
830247a4 5256 RExC_parse++;
a0d0e21e
LW
5257
5258 defchar:
58ae7d3f 5259 ender = 0;
eb160463
GS
5260 ret = reg_node(pRExC_state,
5261 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 5262 s = STRING(ret);
830247a4
IZ
5263 for (len = 0, p = RExC_parse - 1;
5264 len < 127 && p < RExC_end;
a0d0e21e
LW
5265 len++)
5266 {
3dab1dad 5267 char * const oldp = p;
5b5a24f7 5268
e2509266 5269 if (RExC_flags & PMf_EXTENDED)
830247a4 5270 p = regwhite(p, RExC_end);
a0d0e21e
LW
5271 switch (*p) {
5272 case '^':
5273 case '$':
5274 case '.':
5275 case '[':
5276 case '(':
5277 case ')':
5278 case '|':
5279 goto loopdone;
5280 case '\\':
5281 switch (*++p) {
5282 case 'A':
1ed8eac0
JF
5283 case 'C':
5284 case 'X':
a0d0e21e
LW
5285 case 'G':
5286 case 'Z':
b85d18e9 5287 case 'z':
a0d0e21e
LW
5288 case 'w':
5289 case 'W':
5290 case 'b':
5291 case 'B':
5292 case 's':
5293 case 'S':
5294 case 'd':
5295 case 'D':
a14b48bc
LW
5296 case 'p':
5297 case 'P':
a0d0e21e
LW
5298 --p;
5299 goto loopdone;
5300 case 'n':
5301 ender = '\n';
5302 p++;
a687059c 5303 break;
a0d0e21e
LW
5304 case 'r':
5305 ender = '\r';
5306 p++;
a687059c 5307 break;
a0d0e21e
LW
5308 case 't':
5309 ender = '\t';
5310 p++;
a687059c 5311 break;
a0d0e21e
LW
5312 case 'f':
5313 ender = '\f';
5314 p++;
a687059c 5315 break;
a0d0e21e 5316 case 'e':
c7f1f016 5317 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 5318 p++;
a687059c 5319 break;
a0d0e21e 5320 case 'a':
c7f1f016 5321 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 5322 p++;
a687059c 5323 break;
a0d0e21e 5324 case 'x':
a0ed51b3 5325 if (*++p == '{') {
1df70142 5326 char* const e = strchr(p, '}');
b81d288d 5327
b45f050a 5328 if (!e) {
830247a4 5329 RExC_parse = p + 1;
b45f050a
JF
5330 vFAIL("Missing right brace on \\x{}");
5331 }
de5f0749 5332 else {
a4c04bdc
NC
5333 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5334 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 5335 STRLEN numlen = e - p - 1;
53305cf1 5336 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
5337 if (ender > 0xff)
5338 RExC_utf8 = 1;
a0ed51b3
LW
5339 p = e + 1;
5340 }
a0ed51b3
LW
5341 }
5342 else {
a4c04bdc 5343 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 5344 STRLEN numlen = 2;
53305cf1 5345 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
5346 p += numlen;
5347 }
a687059c 5348 break;
a0d0e21e
LW
5349 case 'c':
5350 p++;
bbce6d69 5351 ender = UCHARAT(p++);
5352 ender = toCTRL(ender);
a687059c 5353 break;
a0d0e21e
LW
5354 case '0': case '1': case '2': case '3':case '4':
5355 case '5': case '6': case '7': case '8':case '9':
5356 if (*p == '0' ||
830247a4 5357 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 5358 I32 flags = 0;
1df70142 5359 STRLEN numlen = 3;
53305cf1 5360 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
5361 p += numlen;
5362 }
5363 else {
5364 --p;
5365 goto loopdone;
a687059c
LW
5366 }
5367 break;
a0d0e21e 5368 case '\0':
830247a4 5369 if (p >= RExC_end)
b45f050a 5370 FAIL("Trailing \\");
a687059c 5371 /* FALL THROUGH */
a0d0e21e 5372 default:
041457d9 5373 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 5374 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 5375 goto normal_default;
a0d0e21e
LW
5376 }
5377 break;
a687059c 5378 default:
a0ed51b3 5379 normal_default:
fd400ab9 5380 if (UTF8_IS_START(*p) && UTF) {
1df70142 5381 STRLEN numlen;
5e12f4fb 5382 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 5383 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
5384 p += numlen;
5385 }
5386 else
5387 ender = *p++;
a0d0e21e 5388 break;
a687059c 5389 }
e2509266 5390 if (RExC_flags & PMf_EXTENDED)
830247a4 5391 p = regwhite(p, RExC_end);
60a8b682
JH
5392 if (UTF && FOLD) {
5393 /* Prime the casefolded buffer. */
ac7e0132 5394 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 5395 }
a0d0e21e
LW
5396 if (ISMULT2(p)) { /* Back off on ?+*. */
5397 if (len)
5398 p = oldp;
16ea2a2e 5399 else if (UTF) {
80aecb99 5400 if (FOLD) {
60a8b682 5401 /* Emit all the Unicode characters. */
1df70142 5402 STRLEN numlen;
80aecb99
JH
5403 for (foldbuf = tmpbuf;
5404 foldlen;
5405 foldlen -= numlen) {
5406 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5407 if (numlen > 0) {
71207a34 5408 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5409 s += unilen;
5410 len += unilen;
5411 /* In EBCDIC the numlen
5412 * and unilen can differ. */
9dc45d57 5413 foldbuf += numlen;
47654450
JH
5414 if (numlen >= foldlen)
5415 break;
9dc45d57
JH
5416 }
5417 else
5418 break; /* "Can't happen." */
80aecb99
JH
5419 }
5420 }
5421 else {
71207a34 5422 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5423 if (unilen > 0) {
0ebc6274
JH
5424 s += unilen;
5425 len += unilen;
9dc45d57 5426 }
80aecb99 5427 }
a0ed51b3 5428 }
a0d0e21e
LW
5429 else {
5430 len++;
eb160463 5431 REGC((char)ender, s++);
a0d0e21e
LW
5432 }
5433 break;
a687059c 5434 }
16ea2a2e 5435 if (UTF) {
80aecb99 5436 if (FOLD) {
60a8b682 5437 /* Emit all the Unicode characters. */
1df70142 5438 STRLEN numlen;
80aecb99
JH
5439 for (foldbuf = tmpbuf;
5440 foldlen;
5441 foldlen -= numlen) {
5442 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5443 if (numlen > 0) {
71207a34 5444 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5445 len += unilen;
5446 s += unilen;
5447 /* In EBCDIC the numlen
5448 * and unilen can differ. */
9dc45d57 5449 foldbuf += numlen;
47654450
JH
5450 if (numlen >= foldlen)
5451 break;
9dc45d57
JH
5452 }
5453 else
5454 break;
80aecb99
JH
5455 }
5456 }
5457 else {
71207a34 5458 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5459 if (unilen > 0) {
0ebc6274
JH
5460 s += unilen;
5461 len += unilen;
9dc45d57 5462 }
80aecb99
JH
5463 }
5464 len--;
a0ed51b3
LW
5465 }
5466 else
eb160463 5467 REGC((char)ender, s++);
a0d0e21e
LW
5468 }
5469 loopdone:
830247a4 5470 RExC_parse = p - 1;
fac92740 5471 Set_Node_Cur_Length(ret); /* MJD */
830247a4 5472 nextchar(pRExC_state);
793db0cb
JH
5473 {
5474 /* len is STRLEN which is unsigned, need to copy to signed */
5475 IV iv = len;
5476 if (iv < 0)
5477 vFAIL("Internal disaster");
5478 }
a0d0e21e
LW
5479 if (len > 0)
5480 *flagp |= HASWIDTH;
090f7165 5481 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 5482 *flagp |= SIMPLE;
3dab1dad 5483
cd439c50 5484 if (SIZE_ONLY)
830247a4 5485 RExC_size += STR_SZ(len);
3dab1dad
YO
5486 else {
5487 STR_LEN(ret) = len;
830247a4 5488 RExC_emit += STR_SZ(len);
07be1b83 5489 }
3dab1dad 5490 }
a0d0e21e
LW
5491 break;
5492 }
a687059c 5493
60a8b682
JH
5494 /* If the encoding pragma is in effect recode the text of
5495 * any EXACT-kind nodes. */
3dab1dad
YO
5496 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5497 const STRLEN oldlen = STR_LEN(ret);
5498 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
5499
5500 if (RExC_utf8)
5501 SvUTF8_on(sv);
5502 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5503 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5504 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5505
5506 if (SvUTF8(sv))
5507 RExC_utf8 = 1;
5508 if (!SIZE_ONLY) {
a3621e74
YO
5509 GET_RE_DEBUG_FLAGS_DECL;
5510 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5511 (int)oldlen, STRING(ret),
5512 (int)newlen, s));
5513 Copy(s, STRING(ret), newlen, char);
5514 STR_LEN(ret) += newlen - oldlen;
5515 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5516 } else
5517 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5518 }
a72c7584
JH
5519 }
5520
a0d0e21e 5521 return(ret);
a687059c
LW
5522}
5523
873ef191 5524STATIC char *
5f66b61c 5525S_regwhite(char *p, const char *e)
5b5a24f7
CS
5526{
5527 while (p < e) {
5528 if (isSPACE(*p))
5529 ++p;
5530 else if (*p == '#') {
5531 do {
5532 p++;
5533 } while (p < e && *p != '\n');
5534 }
5535 else
5536 break;
5537 }
5538 return p;
5539}
5540
b8c5462f
JH
5541/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5542 Character classes ([:foo:]) can also be negated ([:^foo:]).
5543 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5544 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5545 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5546
5547#define POSIXCC_DONE(c) ((c) == ':')
5548#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5549#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5550
b8c5462f 5551STATIC I32
830247a4 5552S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5553{
97aff369 5554 dVAR;
936ed897 5555 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5556
830247a4 5557 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5558 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5559 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5560 const char c = UCHARAT(RExC_parse);
097eb12c 5561 char* const s = RExC_parse++;
b81d288d 5562
9a86a77b 5563 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5564 RExC_parse++;
5565 if (RExC_parse == RExC_end)
620e46c5 5566 /* Grandfather lone [:, [=, [. */
830247a4 5567 RExC_parse = s;
620e46c5 5568 else {
3dab1dad 5569 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5570 assert(*t == c);
5571
9a86a77b 5572 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5573 const char *posixcc = s + 1;
830247a4 5574 RExC_parse++; /* skip over the ending ] */
3dab1dad 5575
b8c5462f 5576 if (*s == ':') {
1df70142
AL
5577 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5578 const I32 skip = t - posixcc;
80916619
NC
5579
5580 /* Initially switch on the length of the name. */
5581 switch (skip) {
5582 case 4:
3dab1dad
YO
5583 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5584 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5585 break;
80916619
NC
5586 case 5:
5587 /* Names all of length 5. */
5588 /* alnum alpha ascii blank cntrl digit graph lower
5589 print punct space upper */
5590 /* Offset 4 gives the best switch position. */
5591 switch (posixcc[4]) {
5592 case 'a':
3dab1dad
YO
5593 if (memEQ(posixcc, "alph", 4)) /* alpha */
5594 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5595 break;
5596 case 'e':
3dab1dad
YO
5597 if (memEQ(posixcc, "spac", 4)) /* space */
5598 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5599 break;
5600 case 'h':
3dab1dad
YO
5601 if (memEQ(posixcc, "grap", 4)) /* graph */
5602 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5603 break;
5604 case 'i':
3dab1dad
YO
5605 if (memEQ(posixcc, "asci", 4)) /* ascii */
5606 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5607 break;
5608 case 'k':
3dab1dad
YO
5609 if (memEQ(posixcc, "blan", 4)) /* blank */
5610 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5611 break;
5612 case 'l':
3dab1dad
YO
5613 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5614 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5615 break;
5616 case 'm':
3dab1dad
YO
5617 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5618 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5619 break;
5620 case 'r':
3dab1dad
YO
5621 if (memEQ(posixcc, "lowe", 4)) /* lower */
5622 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5623 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5624 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5625 break;
5626 case 't':
3dab1dad
YO
5627 if (memEQ(posixcc, "digi", 4)) /* digit */
5628 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5629 else if (memEQ(posixcc, "prin", 4)) /* print */
5630 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5631 else if (memEQ(posixcc, "punc", 4)) /* punct */
5632 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5633 break;
b8c5462f
JH
5634 }
5635 break;
80916619 5636 case 6:
3dab1dad
YO
5637 if (memEQ(posixcc, "xdigit", 6))
5638 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5639 break;
5640 }
80916619
NC
5641
5642 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5643 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5644 t - s - 1, s + 1);
80916619
NC
5645 assert (posixcc[skip] == ':');
5646 assert (posixcc[skip+1] == ']');
b45f050a 5647 } else if (!SIZE_ONLY) {
b8c5462f 5648 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5649
830247a4 5650 /* adjust RExC_parse so the warning shows after
b45f050a 5651 the class closes */
9a86a77b 5652 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5653 RExC_parse++;
b45f050a
JF
5654 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5655 }
b8c5462f
JH
5656 } else {
5657 /* Maternal grandfather:
5658 * "[:" ending in ":" but not in ":]" */
830247a4 5659 RExC_parse = s;
767d463e 5660 }
620e46c5
JH
5661 }
5662 }
5663
b8c5462f
JH
5664 return namedclass;
5665}
5666
5667STATIC void
830247a4 5668S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5669{
97aff369 5670 dVAR;
3dab1dad 5671 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5672 const char *s = RExC_parse;
5673 const char c = *s++;
b8c5462f 5674
3dab1dad 5675 while (isALNUM(*s))
b8c5462f
JH
5676 s++;
5677 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5678 if (ckWARN(WARN_REGEXP))
5679 vWARN3(s+2,
5680 "POSIX syntax [%c %c] belongs inside character classes",
5681 c, c);
b45f050a
JF
5682
5683 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5684 if (POSIXCC_NOTYET(c)) {
830247a4 5685 /* adjust RExC_parse so the error shows after
b45f050a 5686 the class closes */
9a86a77b 5687 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5688 NOOP;
b45f050a
JF
5689 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5690 }
b8c5462f
JH
5691 }
5692 }
620e46c5
JH
5693}
5694
7f6f358c
YO
5695
5696/*
5697 parse a class specification and produce either an ANYOF node that
5698 matches the pattern. If the pattern matches a single char only and
5699 that char is < 256 then we produce an EXACT node instead.
5700*/
76e3520e 5701STATIC regnode *
3dab1dad 5702S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5703{
97aff369 5704 dVAR;
9ef43ace 5705 register UV value = 0;
9a86a77b 5706 register UV nextvalue;
3568d838 5707 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5708 register IV range = 0;
c277df42 5709 register regnode *ret;
ba210ebe 5710 STRLEN numlen;
ffc61ed2 5711 IV namedclass;
cbbf8932 5712 char *rangebegin = NULL;
936ed897 5713 bool need_class = 0;
c445ea15 5714 SV *listsv = NULL;
ffc61ed2 5715 UV n;
9e55ce06 5716 bool optimize_invert = TRUE;
cbbf8932 5717 AV* unicode_alternate = NULL;
1b2d223b
JH
5718#ifdef EBCDIC
5719 UV literal_endpoint = 0;
5720#endif
7f6f358c 5721 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5722
3dab1dad 5723 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5724 case we need to change the emitted regop to an EXACT. */
07be1b83 5725 const char * orig_parse = RExC_parse;
72f13be8 5726 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
5727#ifndef DEBUGGING
5728 PERL_UNUSED_ARG(depth);
5729#endif
72f13be8 5730
3dab1dad 5731 DEBUG_PARSE("clas");
7f6f358c
YO
5732
5733 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5734 ret = reganode(pRExC_state, ANYOF, 0);
5735
5736 if (!SIZE_ONLY)
5737 ANYOF_FLAGS(ret) = 0;
5738
9a86a77b 5739 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5740 RExC_naughty++;
5741 RExC_parse++;
5742 if (!SIZE_ONLY)
5743 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5744 }
a0d0e21e 5745
73060fc4 5746 if (SIZE_ONLY) {
830247a4 5747 RExC_size += ANYOF_SKIP;
73060fc4
JH
5748 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5749 }
936ed897 5750 else {
830247a4 5751 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5752 if (FOLD)
5753 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5754 if (LOC)
5755 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5756 ANYOF_BITMAP_ZERO(ret);
396482e1 5757 listsv = newSVpvs("# comment\n");
a0d0e21e 5758 }
b8c5462f 5759
9a86a77b
JH
5760 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5761
b938889d 5762 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5763 checkposixcc(pRExC_state);
b8c5462f 5764
f064b6ad
HS
5765 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5766 if (UCHARAT(RExC_parse) == ']')
5767 goto charclassloop;
ffc61ed2 5768
9a86a77b 5769 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5770
5771 charclassloop:
5772
5773 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5774
73b437c8 5775 if (!range)
830247a4 5776 rangebegin = RExC_parse;
ffc61ed2 5777 if (UTF) {
5e12f4fb 5778 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5779 RExC_end - RExC_parse,
9f7f3913 5780 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5781 RExC_parse += numlen;
5782 }
5783 else
5784 value = UCHARAT(RExC_parse++);
7f6f358c 5785
9a86a77b
JH
5786 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5787 if (value == '[' && POSIXCC(nextvalue))
830247a4 5788 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5789 else if (value == '\\') {
ffc61ed2 5790 if (UTF) {
5e12f4fb 5791 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5792 RExC_end - RExC_parse,
9f7f3913 5793 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5794 RExC_parse += numlen;
5795 }
5796 else
5797 value = UCHARAT(RExC_parse++);
470c3474 5798 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5799 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5800 * be a problem later if we want switch on Unicode.
5801 * A similar issue a little bit later when switching on
5802 * namedclass. --jhi */
ffc61ed2 5803 switch ((I32)value) {
b8c5462f
JH
5804 case 'w': namedclass = ANYOF_ALNUM; break;
5805 case 'W': namedclass = ANYOF_NALNUM; break;
5806 case 's': namedclass = ANYOF_SPACE; break;
5807 case 'S': namedclass = ANYOF_NSPACE; break;
5808 case 'd': namedclass = ANYOF_DIGIT; break;
5809 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5810 case 'p':
5811 case 'P':
3dab1dad
YO
5812 {
5813 char *e;
af6f566e 5814 if (RExC_parse >= RExC_end)
2a4859cd 5815 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5816 if (*RExC_parse == '{') {
1df70142 5817 const U8 c = (U8)value;
ffc61ed2
JH
5818 e = strchr(RExC_parse++, '}');
5819 if (!e)
0da60cf5 5820 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5821 while (isSPACE(UCHARAT(RExC_parse)))
5822 RExC_parse++;
5823 if (e == RExC_parse)
0da60cf5 5824 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5825 n = e - RExC_parse;
ab13f0c7
JH
5826 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5827 n--;
ffc61ed2
JH
5828 }
5829 else {
5830 e = RExC_parse;
5831 n = 1;
5832 }
5833 if (!SIZE_ONLY) {
ab13f0c7
JH
5834 if (UCHARAT(RExC_parse) == '^') {
5835 RExC_parse++;
5836 n--;
5837 value = value == 'p' ? 'P' : 'p'; /* toggle */
5838 while (isSPACE(UCHARAT(RExC_parse))) {
5839 RExC_parse++;
5840 n--;
5841 }
5842 }
097eb12c
AL
5843 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5844 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5845 }
5846 RExC_parse = e + 1;
5847 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5848 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5849 }
f81125e2 5850 break;
b8c5462f
JH
5851 case 'n': value = '\n'; break;
5852 case 'r': value = '\r'; break;
5853 case 't': value = '\t'; break;
5854 case 'f': value = '\f'; break;
5855 case 'b': value = '\b'; break;
c7f1f016
NIS
5856 case 'e': value = ASCII_TO_NATIVE('\033');break;
5857 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5858 case 'x':
ffc61ed2 5859 if (*RExC_parse == '{') {
a4c04bdc
NC
5860 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5861 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5862 char * const e = strchr(RExC_parse++, '}');
b81d288d 5863 if (!e)
ffc61ed2 5864 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5865
5866 numlen = e - RExC_parse;
5867 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5868 RExC_parse = e + 1;
5869 }
5870 else {
a4c04bdc 5871 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5872 numlen = 2;
5873 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5874 RExC_parse += numlen;
5875 }
b8c5462f
JH
5876 break;
5877 case 'c':
830247a4 5878 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5879 value = toCTRL(value);
5880 break;
5881 case '0': case '1': case '2': case '3': case '4':
5882 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5883 {
5884 I32 flags = 0;
5885 numlen = 3;
5886 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5887 RExC_parse += numlen;
b8c5462f 5888 break;
53305cf1 5889 }
1028017a 5890 default:
041457d9 5891 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5892 vWARN2(RExC_parse,
5893 "Unrecognized escape \\%c in character class passed through",
5894 (int)value);
1028017a 5895 break;
b8c5462f 5896 }
ffc61ed2 5897 } /* end of \blah */
1b2d223b
JH
5898#ifdef EBCDIC
5899 else
5900 literal_endpoint++;
5901#endif
ffc61ed2
JH
5902
5903 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5904
5905 if (!SIZE_ONLY && !need_class)
936ed897 5906 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5907
936ed897 5908 need_class = 1;
ffc61ed2
JH
5909
5910 /* a bad range like a-\d, a-[:digit:] ? */
5911 if (range) {
73b437c8 5912 if (!SIZE_ONLY) {
afd78fd5 5913 if (ckWARN(WARN_REGEXP)) {
097eb12c 5914 const int w =
afd78fd5
JH
5915 RExC_parse >= rangebegin ?
5916 RExC_parse - rangebegin : 0;
830247a4 5917 vWARN4(RExC_parse,
b45f050a 5918 "False [] range \"%*.*s\"",
097eb12c 5919 w, w, rangebegin);
afd78fd5 5920 }
3568d838
JH
5921 if (prevvalue < 256) {
5922 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5923 ANYOF_BITMAP_SET(ret, '-');
5924 }
5925 else {
5926 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5927 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5928 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5929 }
b8c5462f 5930 }
ffc61ed2
JH
5931
5932 range = 0; /* this was not a true range */
73b437c8 5933 }
ffc61ed2 5934
73b437c8 5935 if (!SIZE_ONLY) {
c49a72a9
NC
5936 const char *what = NULL;
5937 char yesno = 0;
5938
3568d838
JH
5939 if (namedclass > OOB_NAMEDCLASS)
5940 optimize_invert = FALSE;
e2962f66
JH
5941 /* Possible truncation here but in some 64-bit environments
5942 * the compiler gets heartburn about switch on 64-bit values.
5943 * A similar issue a little earlier when switching on value.
98f323fa 5944 * --jhi */
e2962f66 5945 switch ((I32)namedclass) {
73b437c8
JH
5946 case ANYOF_ALNUM:
5947 if (LOC)
936ed897 5948 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5949 else {
5950 for (value = 0; value < 256; value++)
5951 if (isALNUM(value))
936ed897 5952 ANYOF_BITMAP_SET(ret, value);
73b437c8 5953 }
c49a72a9
NC
5954 yesno = '+';
5955 what = "Word";
73b437c8
JH
5956 break;
5957 case ANYOF_NALNUM:
5958 if (LOC)
936ed897 5959 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5960 else {
5961 for (value = 0; value < 256; value++)
5962 if (!isALNUM(value))
936ed897 5963 ANYOF_BITMAP_SET(ret, value);
73b437c8 5964 }
c49a72a9
NC
5965 yesno = '!';
5966 what = "Word";
73b437c8 5967 break;
ffc61ed2 5968 case ANYOF_ALNUMC:
73b437c8 5969 if (LOC)
ffc61ed2 5970 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5971 else {
5972 for (value = 0; value < 256; value++)
ffc61ed2 5973 if (isALNUMC(value))
936ed897 5974 ANYOF_BITMAP_SET(ret, value);
73b437c8 5975 }
c49a72a9
NC
5976 yesno = '+';
5977 what = "Alnum";
73b437c8
JH
5978 break;
5979 case ANYOF_NALNUMC:
5980 if (LOC)
936ed897 5981 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5982 else {
5983 for (value = 0; value < 256; value++)
5984 if (!isALNUMC(value))
936ed897 5985 ANYOF_BITMAP_SET(ret, value);
73b437c8 5986 }
c49a72a9
NC
5987 yesno = '!';
5988 what = "Alnum";
73b437c8
JH
5989 break;
5990 case ANYOF_ALPHA:
5991 if (LOC)
936ed897 5992 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5993 else {
5994 for (value = 0; value < 256; value++)
5995 if (isALPHA(value))
936ed897 5996 ANYOF_BITMAP_SET(ret, value);
73b437c8 5997 }
c49a72a9
NC
5998 yesno = '+';
5999 what = "Alpha";
73b437c8
JH
6000 break;
6001 case ANYOF_NALPHA:
6002 if (LOC)
936ed897 6003 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6004 else {
6005 for (value = 0; value < 256; value++)
6006 if (!isALPHA(value))
936ed897 6007 ANYOF_BITMAP_SET(ret, value);
73b437c8 6008 }
c49a72a9
NC
6009 yesno = '!';
6010 what = "Alpha";
73b437c8
JH
6011 break;
6012 case ANYOF_ASCII:
6013 if (LOC)
936ed897 6014 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6015 else {
c7f1f016 6016#ifndef EBCDIC
1ba5c669
JH
6017 for (value = 0; value < 128; value++)
6018 ANYOF_BITMAP_SET(ret, value);
6019#else /* EBCDIC */
ffbc6a93 6020 for (value = 0; value < 256; value++) {
3a3c4447
JH
6021 if (isASCII(value))
6022 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6023 }
1ba5c669 6024#endif /* EBCDIC */
73b437c8 6025 }
c49a72a9
NC
6026 yesno = '+';
6027 what = "ASCII";
73b437c8
JH
6028 break;
6029 case ANYOF_NASCII:
6030 if (LOC)
936ed897 6031 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6032 else {
c7f1f016 6033#ifndef EBCDIC
1ba5c669
JH
6034 for (value = 128; value < 256; value++)
6035 ANYOF_BITMAP_SET(ret, value);
6036#else /* EBCDIC */
ffbc6a93 6037 for (value = 0; value < 256; value++) {
3a3c4447
JH
6038 if (!isASCII(value))
6039 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6040 }
1ba5c669 6041#endif /* EBCDIC */
73b437c8 6042 }
c49a72a9
NC
6043 yesno = '!';
6044 what = "ASCII";
73b437c8 6045 break;
aaa51d5e
JF
6046 case ANYOF_BLANK:
6047 if (LOC)
6048 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6049 else {
6050 for (value = 0; value < 256; value++)
6051 if (isBLANK(value))
6052 ANYOF_BITMAP_SET(ret, value);
6053 }
c49a72a9
NC
6054 yesno = '+';
6055 what = "Blank";
aaa51d5e
JF
6056 break;
6057 case ANYOF_NBLANK:
6058 if (LOC)
6059 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6060 else {
6061 for (value = 0; value < 256; value++)
6062 if (!isBLANK(value))
6063 ANYOF_BITMAP_SET(ret, value);
6064 }
c49a72a9
NC
6065 yesno = '!';
6066 what = "Blank";
aaa51d5e 6067 break;
73b437c8
JH
6068 case ANYOF_CNTRL:
6069 if (LOC)
936ed897 6070 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6071 else {
6072 for (value = 0; value < 256; value++)
6073 if (isCNTRL(value))
936ed897 6074 ANYOF_BITMAP_SET(ret, value);
73b437c8 6075 }
c49a72a9
NC
6076 yesno = '+';
6077 what = "Cntrl";
73b437c8
JH
6078 break;
6079 case ANYOF_NCNTRL:
6080 if (LOC)
936ed897 6081 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6082 else {
6083 for (value = 0; value < 256; value++)
6084 if (!isCNTRL(value))
936ed897 6085 ANYOF_BITMAP_SET(ret, value);
73b437c8 6086 }
c49a72a9
NC
6087 yesno = '!';
6088 what = "Cntrl";
ffc61ed2
JH
6089 break;
6090 case ANYOF_DIGIT:
6091 if (LOC)
6092 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6093 else {
6094 /* consecutive digits assumed */
6095 for (value = '0'; value <= '9'; value++)
6096 ANYOF_BITMAP_SET(ret, value);
6097 }
c49a72a9
NC
6098 yesno = '+';
6099 what = "Digit";
ffc61ed2
JH
6100 break;
6101 case ANYOF_NDIGIT:
6102 if (LOC)
6103 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6104 else {
6105 /* consecutive digits assumed */
6106 for (value = 0; value < '0'; value++)
6107 ANYOF_BITMAP_SET(ret, value);
6108 for (value = '9' + 1; value < 256; value++)
6109 ANYOF_BITMAP_SET(ret, value);
6110 }
c49a72a9
NC
6111 yesno = '!';
6112 what = "Digit";
73b437c8
JH
6113 break;
6114 case ANYOF_GRAPH:
6115 if (LOC)
936ed897 6116 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
6117 else {
6118 for (value = 0; value < 256; value++)
6119 if (isGRAPH(value))
936ed897 6120 ANYOF_BITMAP_SET(ret, value);
73b437c8 6121 }
c49a72a9
NC
6122 yesno = '+';
6123 what = "Graph";
73b437c8
JH
6124 break;
6125 case ANYOF_NGRAPH:
6126 if (LOC)
936ed897 6127 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
6128 else {
6129 for (value = 0; value < 256; value++)
6130 if (!isGRAPH(value))
936ed897 6131 ANYOF_BITMAP_SET(ret, value);
73b437c8 6132 }
c49a72a9
NC
6133 yesno = '!';
6134 what = "Graph";
73b437c8
JH
6135 break;
6136 case ANYOF_LOWER:
6137 if (LOC)
936ed897 6138 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
6139 else {
6140 for (value = 0; value < 256; value++)
6141 if (isLOWER(value))
936ed897 6142 ANYOF_BITMAP_SET(ret, value);
73b437c8 6143 }
c49a72a9
NC
6144 yesno = '+';
6145 what = "Lower";
73b437c8
JH
6146 break;
6147 case ANYOF_NLOWER:
6148 if (LOC)
936ed897 6149 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
6150 else {
6151 for (value = 0; value < 256; value++)
6152 if (!isLOWER(value))
936ed897 6153 ANYOF_BITMAP_SET(ret, value);
73b437c8 6154 }
c49a72a9
NC
6155 yesno = '!';
6156 what = "Lower";
73b437c8
JH
6157 break;
6158 case ANYOF_PRINT:
6159 if (LOC)
936ed897 6160 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
6161 else {
6162 for (value = 0; value < 256; value++)
6163 if (isPRINT(value))
936ed897 6164 ANYOF_BITMAP_SET(ret, value);
73b437c8 6165 }
c49a72a9
NC
6166 yesno = '+';
6167 what = "Print";
73b437c8
JH
6168 break;
6169 case ANYOF_NPRINT:
6170 if (LOC)
936ed897 6171 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
6172 else {
6173 for (value = 0; value < 256; value++)
6174 if (!isPRINT(value))
936ed897 6175 ANYOF_BITMAP_SET(ret, value);
73b437c8 6176 }
c49a72a9
NC
6177 yesno = '!';
6178 what = "Print";
73b437c8 6179 break;
aaa51d5e
JF
6180 case ANYOF_PSXSPC:
6181 if (LOC)
6182 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6183 else {
6184 for (value = 0; value < 256; value++)
6185 if (isPSXSPC(value))
6186 ANYOF_BITMAP_SET(ret, value);
6187 }
c49a72a9
NC
6188 yesno = '+';
6189 what = "Space";
aaa51d5e
JF
6190 break;
6191 case ANYOF_NPSXSPC:
6192 if (LOC)
6193 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6194 else {
6195 for (value = 0; value < 256; value++)
6196 if (!isPSXSPC(value))
6197 ANYOF_BITMAP_SET(ret, value);
6198 }
c49a72a9
NC
6199 yesno = '!';
6200 what = "Space";
aaa51d5e 6201 break;
73b437c8
JH
6202 case ANYOF_PUNCT:
6203 if (LOC)
936ed897 6204 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
6205 else {
6206 for (value = 0; value < 256; value++)
6207 if (isPUNCT(value))
936ed897 6208 ANYOF_BITMAP_SET(ret, value);
73b437c8 6209 }
c49a72a9
NC
6210 yesno = '+';
6211 what = "Punct";
73b437c8
JH
6212 break;
6213 case ANYOF_NPUNCT:
6214 if (LOC)
936ed897 6215 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
6216 else {
6217 for (value = 0; value < 256; value++)
6218 if (!isPUNCT(value))
936ed897 6219 ANYOF_BITMAP_SET(ret, value);
73b437c8 6220 }
c49a72a9
NC
6221 yesno = '!';
6222 what = "Punct";
ffc61ed2
JH
6223 break;
6224 case ANYOF_SPACE:
6225 if (LOC)
6226 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6227 else {
6228 for (value = 0; value < 256; value++)
6229 if (isSPACE(value))
6230 ANYOF_BITMAP_SET(ret, value);
6231 }
c49a72a9
NC
6232 yesno = '+';
6233 what = "SpacePerl";
ffc61ed2
JH
6234 break;
6235 case ANYOF_NSPACE:
6236 if (LOC)
6237 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6238 else {
6239 for (value = 0; value < 256; value++)
6240 if (!isSPACE(value))
6241 ANYOF_BITMAP_SET(ret, value);
6242 }
c49a72a9
NC
6243 yesno = '!';
6244 what = "SpacePerl";
73b437c8
JH
6245 break;
6246 case ANYOF_UPPER:
6247 if (LOC)
936ed897 6248 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
6249 else {
6250 for (value = 0; value < 256; value++)
6251 if (isUPPER(value))
936ed897 6252 ANYOF_BITMAP_SET(ret, value);
73b437c8 6253 }
c49a72a9
NC
6254 yesno = '+';
6255 what = "Upper";
73b437c8
JH
6256 break;
6257 case ANYOF_NUPPER:
6258 if (LOC)
936ed897 6259 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
6260 else {
6261 for (value = 0; value < 256; value++)
6262 if (!isUPPER(value))
936ed897 6263 ANYOF_BITMAP_SET(ret, value);
73b437c8 6264 }
c49a72a9
NC
6265 yesno = '!';
6266 what = "Upper";
73b437c8
JH
6267 break;
6268 case ANYOF_XDIGIT:
6269 if (LOC)
936ed897 6270 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
6271 else {
6272 for (value = 0; value < 256; value++)
6273 if (isXDIGIT(value))
936ed897 6274 ANYOF_BITMAP_SET(ret, value);
73b437c8 6275 }
c49a72a9
NC
6276 yesno = '+';
6277 what = "XDigit";
73b437c8
JH
6278 break;
6279 case ANYOF_NXDIGIT:
6280 if (LOC)
936ed897 6281 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
6282 else {
6283 for (value = 0; value < 256; value++)
6284 if (!isXDIGIT(value))
936ed897 6285 ANYOF_BITMAP_SET(ret, value);
73b437c8 6286 }
c49a72a9
NC
6287 yesno = '!';
6288 what = "XDigit";
73b437c8 6289 break;
f81125e2
JP
6290 case ANYOF_MAX:
6291 /* this is to handle \p and \P */
6292 break;
73b437c8 6293 default:
b45f050a 6294 vFAIL("Invalid [::] class");
73b437c8 6295 break;
b8c5462f 6296 }
c49a72a9
NC
6297 if (what) {
6298 /* Strings such as "+utf8::isWord\n" */
6299 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6300 }
b8c5462f 6301 if (LOC)
936ed897 6302 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 6303 continue;
a0d0e21e 6304 }
ffc61ed2
JH
6305 } /* end of namedclass \blah */
6306
a0d0e21e 6307 if (range) {
eb160463 6308 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
6309 const int w = RExC_parse - rangebegin;
6310 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 6311 range = 0; /* not a valid range */
73b437c8 6312 }
a0d0e21e
LW
6313 }
6314 else {
3568d838 6315 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
6316 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6317 RExC_parse[1] != ']') {
6318 RExC_parse++;
ffc61ed2
JH
6319
6320 /* a bad range like \w-, [:word:]- ? */
6321 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 6322 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 6323 const int w =
afd78fd5
JH
6324 RExC_parse >= rangebegin ?
6325 RExC_parse - rangebegin : 0;
830247a4 6326 vWARN4(RExC_parse,
b45f050a 6327 "False [] range \"%*.*s\"",
097eb12c 6328 w, w, rangebegin);
afd78fd5 6329 }
73b437c8 6330 if (!SIZE_ONLY)
936ed897 6331 ANYOF_BITMAP_SET(ret, '-');
73b437c8 6332 } else
ffc61ed2
JH
6333 range = 1; /* yeah, it's a range! */
6334 continue; /* but do it the next time */
a0d0e21e 6335 }
a687059c 6336 }
ffc61ed2 6337
93733859 6338 /* now is the next time */
07be1b83 6339 /*stored += (value - prevvalue + 1);*/
ae5c130c 6340 if (!SIZE_ONLY) {
3568d838 6341 if (prevvalue < 256) {
1df70142 6342 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 6343 IV i;
3568d838 6344#ifdef EBCDIC
1b2d223b
JH
6345 /* In EBCDIC [\x89-\x91] should include
6346 * the \x8e but [i-j] should not. */
6347 if (literal_endpoint == 2 &&
6348 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6349 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 6350 {
3568d838
JH
6351 if (isLOWER(prevvalue)) {
6352 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6353 if (isLOWER(i))
6354 ANYOF_BITMAP_SET(ret, i);
6355 } else {
3568d838 6356 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6357 if (isUPPER(i))
6358 ANYOF_BITMAP_SET(ret, i);
6359 }
8ada0baa 6360 }
ffc61ed2 6361 else
8ada0baa 6362#endif
07be1b83
YO
6363 for (i = prevvalue; i <= ceilvalue; i++) {
6364 if (!ANYOF_BITMAP_TEST(ret,i)) {
6365 stored++;
6366 ANYOF_BITMAP_SET(ret, i);
6367 }
6368 }
3568d838 6369 }
a5961de5 6370 if (value > 255 || UTF) {
1df70142
AL
6371 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6372 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 6373 stored+=2; /* can't optimize this class */
ffc61ed2 6374 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 6375 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 6376 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
6377 prevnatvalue, natvalue);
6378 }
6379 else if (prevnatvalue == natvalue) {
6380 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 6381 if (FOLD) {
89ebb4a3 6382 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 6383 STRLEN foldlen;
1df70142 6384 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 6385
e294cc5d
JH
6386#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6387 if (RExC_precomp[0] == ':' &&
6388 RExC_precomp[1] == '[' &&
6389 (f == 0xDF || f == 0x92)) {
6390 f = NATIVE_TO_UNI(f);
6391 }
6392#endif
c840d2a2
JH
6393 /* If folding and foldable and a single
6394 * character, insert also the folded version
6395 * to the charclass. */
9e55ce06 6396 if (f != value) {
e294cc5d
JH
6397#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6398 if ((RExC_precomp[0] == ':' &&
6399 RExC_precomp[1] == '[' &&
6400 (f == 0xA2 &&
6401 (value == 0xFB05 || value == 0xFB06))) ?
6402 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6403 foldlen == (STRLEN)UNISKIP(f) )
6404#else
eb160463 6405 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 6406#endif
9e55ce06
JH
6407 Perl_sv_catpvf(aTHX_ listsv,
6408 "%04"UVxf"\n", f);
6409 else {
6410 /* Any multicharacter foldings
6411 * require the following transform:
6412 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6413 * where E folds into "pq" and F folds
6414 * into "rst", all other characters
6415 * fold to single characters. We save
6416 * away these multicharacter foldings,
6417 * to be later saved as part of the
6418 * additional "s" data. */
6419 SV *sv;
6420
6421 if (!unicode_alternate)
6422 unicode_alternate = newAV();
6423 sv = newSVpvn((char*)foldbuf, foldlen);
6424 SvUTF8_on(sv);
6425 av_push(unicode_alternate, sv);
6426 }
6427 }
254ba52a 6428
60a8b682
JH
6429 /* If folding and the value is one of the Greek
6430 * sigmas insert a few more sigmas to make the
6431 * folding rules of the sigmas to work right.
6432 * Note that not all the possible combinations
6433 * are handled here: some of them are handled
9e55ce06
JH
6434 * by the standard folding rules, and some of
6435 * them (literal or EXACTF cases) are handled
6436 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
6437 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6438 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6439 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 6440 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6441 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6442 }
6443 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6444 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6445 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6446 }
6447 }
ffc61ed2 6448 }
1b2d223b
JH
6449#ifdef EBCDIC
6450 literal_endpoint = 0;
6451#endif
8ada0baa 6452 }
ffc61ed2
JH
6453
6454 range = 0; /* this range (if it was one) is done now */
a0d0e21e 6455 }
ffc61ed2 6456
936ed897 6457 if (need_class) {
4f66b38d 6458 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 6459 if (SIZE_ONLY)
830247a4 6460 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 6461 else
830247a4 6462 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 6463 }
ffc61ed2 6464
7f6f358c
YO
6465
6466 if (SIZE_ONLY)
6467 return ret;
6468 /****** !SIZE_ONLY AFTER HERE *********/
6469
6470 if( stored == 1 && value < 256
6471 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6472 ) {
6473 /* optimize single char class to an EXACT node
6474 but *only* when its not a UTF/high char */
07be1b83
YO
6475 const char * cur_parse= RExC_parse;
6476 RExC_emit = (regnode *)orig_emit;
6477 RExC_parse = (char *)orig_parse;
7f6f358c
YO
6478 ret = reg_node(pRExC_state,
6479 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 6480 RExC_parse = (char *)cur_parse;
7f6f358c
YO
6481 *STRING(ret)= (char)value;
6482 STR_LEN(ret)= 1;
6483 RExC_emit += STR_SZ(1);
6484 return ret;
6485 }
ae5c130c 6486 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 6487 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
6488 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6489 ) {
a0ed51b3 6490 for (value = 0; value < 256; ++value) {
936ed897 6491 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 6492 UV fold = PL_fold[value];
ffc61ed2
JH
6493
6494 if (fold != value)
6495 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
6496 }
6497 }
936ed897 6498 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 6499 }
ffc61ed2 6500
ae5c130c 6501 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 6502 if (optimize_invert &&
ffc61ed2
JH
6503 /* If the only flag is inversion. */
6504 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 6505 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 6506 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 6507 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 6508 }
7f6f358c 6509 {
097eb12c 6510 AV * const av = newAV();
ffc61ed2 6511 SV *rv;
9e55ce06 6512 /* The 0th element stores the character class description
6a0407ee 6513 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
6514 * to initialize the appropriate swash (which gets stored in
6515 * the 1st element), and also useful for dumping the regnode.
6516 * The 2nd element stores the multicharacter foldings,
6a0407ee 6517 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6518 av_store(av, 0, listsv);
6519 av_store(av, 1, NULL);
9e55ce06 6520 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6521 rv = newRV_noinc((SV*)av);
19860706 6522 n = add_data(pRExC_state, 1, "s");
830247a4 6523 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6524 ARG_SET(ret, n);
a0ed51b3 6525 }
a0ed51b3
LW
6526 return ret;
6527}
6528
76e3520e 6529STATIC char*
830247a4 6530S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6531{
097eb12c 6532 char* const retval = RExC_parse++;
a0d0e21e 6533
4633a7c4 6534 for (;;) {
830247a4
IZ
6535 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6536 RExC_parse[2] == '#') {
e994fd66
AE
6537 while (*RExC_parse != ')') {
6538 if (RExC_parse == RExC_end)
6539 FAIL("Sequence (?#... not terminated");
830247a4 6540 RExC_parse++;
e994fd66 6541 }
830247a4 6542 RExC_parse++;
4633a7c4
LW
6543 continue;
6544 }
e2509266 6545 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6546 if (isSPACE(*RExC_parse)) {
6547 RExC_parse++;
748a9306
LW
6548 continue;
6549 }
830247a4 6550 else if (*RExC_parse == '#') {
e994fd66
AE
6551 while (RExC_parse < RExC_end)
6552 if (*RExC_parse++ == '\n') break;
748a9306
LW
6553 continue;
6554 }
748a9306 6555 }
4633a7c4 6556 return retval;
a0d0e21e 6557 }
a687059c
LW
6558}
6559
6560/*
c277df42 6561- reg_node - emit a node
a0d0e21e 6562*/
76e3520e 6563STATIC regnode * /* Location. */
830247a4 6564S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6565{
97aff369 6566 dVAR;
c277df42 6567 register regnode *ptr;
504618e9 6568 regnode * const ret = RExC_emit;
07be1b83 6569 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6570
c277df42 6571 if (SIZE_ONLY) {
830247a4
IZ
6572 SIZE_ALIGN(RExC_size);
6573 RExC_size += 1;
a0d0e21e
LW
6574 return(ret);
6575 }
c277df42 6576 NODE_ALIGN_FILL(ret);
a0d0e21e 6577 ptr = ret;
c277df42 6578 FILL_ADVANCE_NODE(ptr, op);
fac92740 6579 if (RExC_offsets) { /* MJD */
07be1b83 6580 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6581 "reg_node", __LINE__,
6582 reg_name[op],
07be1b83
YO
6583 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6584 ? "Overwriting end of array!\n" : "OK",
6585 (UV)(RExC_emit - RExC_emit_start),
6586 (UV)(RExC_parse - RExC_start),
6587 (UV)RExC_offsets[0]));
ccb2c380 6588 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6589 }
07be1b83 6590
830247a4 6591 RExC_emit = ptr;
a687059c 6592
a0d0e21e 6593 return(ret);
a687059c
LW
6594}
6595
6596/*
a0d0e21e
LW
6597- reganode - emit a node with an argument
6598*/
76e3520e 6599STATIC regnode * /* Location. */
830247a4 6600S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6601{
97aff369 6602 dVAR;
c277df42 6603 register regnode *ptr;
504618e9 6604 regnode * const ret = RExC_emit;
07be1b83 6605 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6606
c277df42 6607 if (SIZE_ONLY) {
830247a4
IZ
6608 SIZE_ALIGN(RExC_size);
6609 RExC_size += 2;
a0d0e21e
LW
6610 return(ret);
6611 }
fe14fcc3 6612
c277df42 6613 NODE_ALIGN_FILL(ret);
a0d0e21e 6614 ptr = ret;
c277df42 6615 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6616 if (RExC_offsets) { /* MJD */
07be1b83 6617 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6618 "reganode",
ccb2c380
MP
6619 __LINE__,
6620 reg_name[op],
07be1b83 6621 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6622 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6623 (UV)(RExC_emit - RExC_emit_start),
6624 (UV)(RExC_parse - RExC_start),
6625 (UV)RExC_offsets[0]));
ccb2c380 6626 Set_Cur_Node_Offset;
fac92740
MJD
6627 }
6628
830247a4 6629 RExC_emit = ptr;
fe14fcc3 6630
a0d0e21e 6631 return(ret);
fe14fcc3
LW
6632}
6633
6634/*
cd439c50 6635- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6636*/
71207a34
AL
6637STATIC STRLEN
6638S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6639{
97aff369 6640 dVAR;
71207a34 6641 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6642}
6643
6644/*
a0d0e21e
LW
6645- reginsert - insert an operator in front of already-emitted operand
6646*
6647* Means relocating the operand.
6648*/
76e3520e 6649STATIC void
830247a4 6650S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6651{
97aff369 6652 dVAR;
c277df42
IZ
6653 register regnode *src;
6654 register regnode *dst;
6655 register regnode *place;
504618e9 6656 const int offset = regarglen[(U8)op];
07be1b83 6657 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6658/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6659
6660 if (SIZE_ONLY) {
830247a4 6661 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6662 return;
6663 }
a687059c 6664
830247a4
IZ
6665 src = RExC_emit;
6666 RExC_emit += NODE_STEP_REGNODE + offset;
6667 dst = RExC_emit;
fac92740 6668 while (src > opnd) {
c277df42 6669 StructCopy(--src, --dst, regnode);
fac92740 6670 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6671 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6672 "reg_insert",
ccb2c380
MP
6673 __LINE__,
6674 reg_name[op],
07be1b83
YO
6675 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6676 ? "Overwriting end of array!\n" : "OK",
6677 (UV)(src - RExC_emit_start),
6678 (UV)(dst - RExC_emit_start),
6679 (UV)RExC_offsets[0]));
ccb2c380
MP
6680 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6681 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6682 }
6683 }
6684
a0d0e21e
LW
6685
6686 place = opnd; /* Op node, where operand used to be. */
fac92740 6687 if (RExC_offsets) { /* MJD */
07be1b83 6688 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6689 "reginsert",
ccb2c380
MP
6690 __LINE__,
6691 reg_name[op],
07be1b83 6692 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6693 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6694 (UV)(place - RExC_emit_start),
6695 (UV)(RExC_parse - RExC_start),
786e8c11 6696 (UV)RExC_offsets[0]));
ccb2c380 6697 Set_Node_Offset(place, RExC_parse);
45948336 6698 Set_Node_Length(place, 1);
fac92740 6699 }
c277df42
IZ
6700 src = NEXTOPER(place);
6701 FILL_ADVANCE_NODE(place, op);
6702 Zero(src, offset, regnode);
a687059c
LW
6703}
6704
6705/*
c277df42 6706- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6707- SEE ALSO: regtail_study
a0d0e21e 6708*/
097eb12c 6709/* TODO: All three parms should be const */
76e3520e 6710STATIC void
3dab1dad 6711S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6712{
97aff369 6713 dVAR;
c277df42 6714 register regnode *scan;
72f13be8 6715 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
6716#ifndef DEBUGGING
6717 PERL_UNUSED_ARG(depth);
6718#endif
a0d0e21e 6719
c277df42 6720 if (SIZE_ONLY)
a0d0e21e
LW
6721 return;
6722
6723 /* Find last node. */
6724 scan = p;
6725 for (;;) {
504618e9 6726 regnode * const temp = regnext(scan);
3dab1dad
YO
6727 DEBUG_PARSE_r({
6728 SV * const mysv=sv_newmortal();
6729 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6730 regprop(RExC_rx, mysv, scan);
6731 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6732 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6733 });
6734 if (temp == NULL)
6735 break;
6736 scan = temp;
6737 }
6738
6739 if (reg_off_by_arg[OP(scan)]) {
6740 ARG_SET(scan, val - scan);
6741 }
6742 else {
6743 NEXT_OFF(scan) = val - scan;
6744 }
6745}
6746
07be1b83 6747#ifdef DEBUGGING
3dab1dad
YO
6748/*
6749- regtail_study - set the next-pointer at the end of a node chain of p to val.
6750- Look for optimizable sequences at the same time.
6751- currently only looks for EXACT chains.
07be1b83
YO
6752
6753This is expermental code. The idea is to use this routine to perform
6754in place optimizations on branches and groups as they are constructed,
6755with the long term intention of removing optimization from study_chunk so
6756that it is purely analytical.
6757
6758Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6759to control which is which.
6760
3dab1dad
YO
6761*/
6762/* TODO: All four parms should be const */
07be1b83 6763
3dab1dad
YO
6764STATIC U8
6765S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6766{
6767 dVAR;
6768 register regnode *scan;
07be1b83
YO
6769 U8 exact = PSEUDO;
6770#ifdef EXPERIMENTAL_INPLACESCAN
6771 I32 min = 0;
6772#endif
6773
3dab1dad
YO
6774 GET_RE_DEBUG_FLAGS_DECL;
6775
07be1b83 6776
3dab1dad
YO
6777 if (SIZE_ONLY)
6778 return exact;
6779
6780 /* Find last node. */
6781
6782 scan = p;
6783 for (;;) {
6784 regnode * const temp = regnext(scan);
07be1b83
YO
6785#ifdef EXPERIMENTAL_INPLACESCAN
6786 if (PL_regkind[OP(scan)] == EXACT)
6787 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6788 return EXACT;
6789#endif
3dab1dad
YO
6790 if ( exact ) {
6791 switch (OP(scan)) {
6792 case EXACT:
6793 case EXACTF:
6794 case EXACTFL:
6795 if( exact == PSEUDO )
6796 exact= OP(scan);
07be1b83
YO
6797 else if ( exact != OP(scan) )
6798 exact= 0;
3dab1dad
YO
6799 case NOTHING:
6800 break;
6801 default:
6802 exact= 0;
6803 }
6804 }
6805 DEBUG_PARSE_r({
6806 SV * const mysv=sv_newmortal();
6807 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6808 regprop(RExC_rx, mysv, scan);
6809 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6810 SvPV_nolen_const(mysv),
6811 reg_name[exact],
6812 REG_NODE_NUM(scan));
6813 });
a0d0e21e
LW
6814 if (temp == NULL)
6815 break;
6816 scan = temp;
6817 }
07be1b83
YO
6818 DEBUG_PARSE_r({
6819 SV * const mysv_val=sv_newmortal();
6820 DEBUG_PARSE_MSG("");
6821 regprop(RExC_rx, mysv_val, val);
6822 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6823 SvPV_nolen_const(mysv_val),
6824 REG_NODE_NUM(val),
6825 val - scan
6826 );
6827 });
c277df42
IZ
6828 if (reg_off_by_arg[OP(scan)]) {
6829 ARG_SET(scan, val - scan);
a0ed51b3
LW
6830 }
6831 else {
c277df42
IZ
6832 NEXT_OFF(scan) = val - scan;
6833 }
3dab1dad
YO
6834
6835 return exact;
a687059c 6836}
07be1b83 6837#endif
a687059c
LW
6838
6839/*
a687059c
LW
6840 - regcurly - a little FSA that accepts {\d+,?\d*}
6841 */
79072805 6842STATIC I32
5f66b61c 6843S_regcurly(register const char *s)
a687059c
LW
6844{
6845 if (*s++ != '{')
6846 return FALSE;
f0fcb552 6847 if (!isDIGIT(*s))
a687059c 6848 return FALSE;
f0fcb552 6849 while (isDIGIT(*s))
a687059c
LW
6850 s++;
6851 if (*s == ',')
6852 s++;
f0fcb552 6853 while (isDIGIT(*s))
a687059c
LW
6854 s++;
6855 if (*s != '}')
6856 return FALSE;
6857 return TRUE;
6858}
6859
a687059c
LW
6860
6861/*
fd181c75 6862 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6863 */
6864void
097eb12c 6865Perl_regdump(pTHX_ const regexp *r)
a687059c 6866{
35ff7856 6867#ifdef DEBUGGING
97aff369 6868 dVAR;
c445ea15 6869 SV * const sv = sv_newmortal();
ab3bbdeb 6870 SV *dsv= sv_newmortal();
a687059c 6871
786e8c11 6872 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
6873
6874 /* Header fields of interest. */
ab3bbdeb
YO
6875 if (r->anchored_substr) {
6876 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6877 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 6878 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6879 "anchored %s%s at %"IVdf" ",
6880 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 6881 (IV)r->anchored_offset);
ab3bbdeb
YO
6882 } else if (r->anchored_utf8) {
6883 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6884 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 6885 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6886 "anchored utf8 %s%s at %"IVdf" ",
6887 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 6888 (IV)r->anchored_offset);
ab3bbdeb
YO
6889 }
6890 if (r->float_substr) {
6891 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6892 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 6893 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6894 "floating %s%s at %"IVdf"..%"UVuf" ",
6895 s, RE_SV_TAIL(r->float_substr),
7b0972df 6896 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
6897 } else if (r->float_utf8) {
6898 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6899 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 6900 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6901 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6902 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 6903 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 6904 }
33b8afdf 6905 if (r->check_substr || r->check_utf8)
b81d288d 6906 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
6907 (const char *)
6908 (r->check_substr == r->float_substr
6909 && r->check_utf8 == r->float_utf8
6910 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
6911 if (r->reganch & ROPT_NOSCAN)
6912 PerlIO_printf(Perl_debug_log, " noscan");
6913 if (r->reganch & ROPT_CHECK_ALL)
6914 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6915 if (r->check_substr || r->check_utf8)
c277df42
IZ
6916 PerlIO_printf(Perl_debug_log, ") ");
6917
46fc3d4c 6918 if (r->regstclass) {
32fc9b6a 6919 regprop(r, sv, r->regstclass);
1de06328 6920 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 6921 }
774d564b 6922 if (r->reganch & ROPT_ANCH) {
6923 PerlIO_printf(Perl_debug_log, "anchored");
6924 if (r->reganch & ROPT_ANCH_BOL)
6925 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6926 if (r->reganch & ROPT_ANCH_MBOL)
6927 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6928 if (r->reganch & ROPT_ANCH_SBOL)
6929 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6930 if (r->reganch & ROPT_ANCH_GPOS)
6931 PerlIO_printf(Perl_debug_log, "(GPOS)");
6932 PerlIO_putc(Perl_debug_log, ' ');
6933 }
c277df42
IZ
6934 if (r->reganch & ROPT_GPOS_SEEN)
6935 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6936 if (r->reganch & ROPT_SKIP)
760ac839 6937 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6938 if (r->reganch & ROPT_IMPLICIT)
760ac839 6939 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6940 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6941 if (r->reganch & ROPT_EVAL_SEEN)
6942 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6943 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6944#else
96a5add6 6945 PERL_UNUSED_CONTEXT;
65e66c80 6946 PERL_UNUSED_ARG(r);
17c3b450 6947#endif /* DEBUGGING */
a687059c
LW
6948}
6949
6950/*
a0d0e21e
LW
6951- regprop - printable representation of opcode
6952*/
46fc3d4c 6953void
32fc9b6a 6954Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6955{
35ff7856 6956#ifdef DEBUGGING
97aff369 6957 dVAR;
9b155405 6958 register int k;
1de06328 6959 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 6960
54dc92de 6961 sv_setpvn(sv, "", 0);
9b155405 6962 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6963 /* It would be nice to FAIL() here, but this may be called from
6964 regexec.c, and it would be hard to supply pRExC_state. */
6965 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6966 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6967
3dab1dad 6968 k = PL_regkind[OP(o)];
9b155405 6969
2a782b5b 6970 if (k == EXACT) {
396482e1 6971 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
6972 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6973 * is a crude hack but it may be the best for now since
6974 * we have no flag "this EXACTish node was UTF-8"
6975 * --jhi */
6976 const char * const s =
ddc5bc0f 6977 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
6978 PL_colors[0], PL_colors[1],
6979 PERL_PV_ESCAPE_UNI_DETECT |
6980 PERL_PV_PRETTY_ELIPSES |
6981 PERL_PV_PRETTY_LTGT
6982 );
6983 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 6984 } else if (k == TRIE) {
3dab1dad 6985 /* print the details of the trie in dumpuntil instead, as
4f639d21 6986 * prog->data isn't available here */
1de06328
YO
6987 const char op = OP(o);
6988 const I32 n = ARG(o);
6989 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
6990 (reg_ac_data *)prog->data->data[n] :
6991 NULL;
6992 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
6993 (reg_trie_data*)prog->data->data[n] :
6994 ac->trie;
6995
6996 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6997 DEBUG_TRIE_COMPILE_r(
6998 Perl_sv_catpvf(aTHX_ sv,
6999 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7000 (UV)trie->startstate,
7001 (IV)trie->laststate-1,
7002 (UV)trie->wordcount,
7003 (UV)trie->minlen,
7004 (UV)trie->maxlen,
7005 (UV)TRIE_CHARCOUNT(trie),
7006 (UV)trie->uniquecharcount
7007 )
7008 );
7009 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7010 int i;
7011 int rangestart = -1;
f46cb337 7012 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7013 Perl_sv_catpvf(aTHX_ sv, "[");
7014 for (i = 0; i <= 256; i++) {
7015 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7016 if (rangestart == -1)
7017 rangestart = i;
7018 } else if (rangestart != -1) {
7019 if (i <= rangestart + 3)
7020 for (; rangestart < i; rangestart++)
7021 put_byte(sv, rangestart);
7022 else {
7023 put_byte(sv, rangestart);
7024 sv_catpvs(sv, "-");
7025 put_byte(sv, i - 1);
7026 }
7027 rangestart = -1;
7028 }
7029 }
7030 Perl_sv_catpvf(aTHX_ sv, "]");
7031 }
7032
a3621e74 7033 } else if (k == CURLY) {
cb434fcc 7034 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7035 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7036 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7037 }
2c2d71f5
JH
7038 else if (k == WHILEM && o->flags) /* Ordinal/of */
7039 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 7040 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 7041 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 7042 else if (k == LOGICAL)
04ebc1ab 7043 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7044 else if (k == ANYOF) {
7045 int i, rangestart = -1;
2d03de9c 7046 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7047
7048 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7049 static const char * const anyofs[] = {
653099ff
GS
7050 "\\w",
7051 "\\W",
7052 "\\s",
7053 "\\S",
7054 "\\d",
7055 "\\D",
7056 "[:alnum:]",
7057 "[:^alnum:]",
7058 "[:alpha:]",
7059 "[:^alpha:]",
7060 "[:ascii:]",
7061 "[:^ascii:]",
7062 "[:ctrl:]",
7063 "[:^ctrl:]",
7064 "[:graph:]",
7065 "[:^graph:]",
7066 "[:lower:]",
7067 "[:^lower:]",
7068 "[:print:]",
7069 "[:^print:]",
7070 "[:punct:]",
7071 "[:^punct:]",
7072 "[:upper:]",
aaa51d5e 7073 "[:^upper:]",
653099ff 7074 "[:xdigit:]",
aaa51d5e
JF
7075 "[:^xdigit:]",
7076 "[:space:]",
7077 "[:^space:]",
7078 "[:blank:]",
7079 "[:^blank:]"
653099ff
GS
7080 };
7081
19860706 7082 if (flags & ANYOF_LOCALE)
396482e1 7083 sv_catpvs(sv, "{loc}");
19860706 7084 if (flags & ANYOF_FOLD)
396482e1 7085 sv_catpvs(sv, "{i}");
653099ff 7086 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 7087 if (flags & ANYOF_INVERT)
396482e1 7088 sv_catpvs(sv, "^");
ffc61ed2
JH
7089 for (i = 0; i <= 256; i++) {
7090 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7091 if (rangestart == -1)
7092 rangestart = i;
7093 } else if (rangestart != -1) {
7094 if (i <= rangestart + 3)
7095 for (; rangestart < i; rangestart++)
653099ff 7096 put_byte(sv, rangestart);
ffc61ed2
JH
7097 else {
7098 put_byte(sv, rangestart);
396482e1 7099 sv_catpvs(sv, "-");
ffc61ed2 7100 put_byte(sv, i - 1);
653099ff 7101 }
ffc61ed2 7102 rangestart = -1;
653099ff 7103 }
847a199f 7104 }
ffc61ed2
JH
7105
7106 if (o->flags & ANYOF_CLASS)
bb7a0f54 7107 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
7108 if (ANYOF_CLASS_TEST(o,i))
7109 sv_catpv(sv, anyofs[i]);
7110
7111 if (flags & ANYOF_UNICODE)
396482e1 7112 sv_catpvs(sv, "{unicode}");
1aa99e6b 7113 else if (flags & ANYOF_UNICODE_ALL)
396482e1 7114 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
7115
7116 {
7117 SV *lv;
32fc9b6a 7118 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 7119
ffc61ed2
JH
7120 if (lv) {
7121 if (sw) {
89ebb4a3 7122 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 7123
ffc61ed2 7124 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 7125 uvchr_to_utf8(s, i);
ffc61ed2 7126
3568d838 7127 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
7128 if (rangestart == -1)
7129 rangestart = i;
7130 } else if (rangestart != -1) {
ffc61ed2
JH
7131 if (i <= rangestart + 3)
7132 for (; rangestart < i; rangestart++) {
2d03de9c
AL
7133 const U8 * const e = uvchr_to_utf8(s,rangestart);
7134 U8 *p;
7135 for(p = s; p < e; p++)
ffc61ed2
JH
7136 put_byte(sv, *p);
7137 }
7138 else {
2d03de9c
AL
7139 const U8 *e = uvchr_to_utf8(s,rangestart);
7140 U8 *p;
7141 for (p = s; p < e; p++)
ffc61ed2 7142 put_byte(sv, *p);
396482e1 7143 sv_catpvs(sv, "-");
2d03de9c
AL
7144 e = uvchr_to_utf8(s, i-1);
7145 for (p = s; p < e; p++)
1df70142 7146 put_byte(sv, *p);
ffc61ed2
JH
7147 }
7148 rangestart = -1;
7149 }
19860706 7150 }
ffc61ed2 7151
396482e1 7152 sv_catpvs(sv, "..."); /* et cetera */
19860706 7153 }
fde631ed 7154
ffc61ed2 7155 {
2e0de35c 7156 char *s = savesvpv(lv);
c445ea15 7157 char * const origs = s;
b81d288d 7158
3dab1dad
YO
7159 while (*s && *s != '\n')
7160 s++;
b81d288d 7161
ffc61ed2 7162 if (*s == '\n') {
2d03de9c 7163 const char * const t = ++s;
ffc61ed2
JH
7164
7165 while (*s) {
7166 if (*s == '\n')
7167 *s = ' ';
7168 s++;
7169 }
7170 if (s[-1] == ' ')
7171 s[-1] = 0;
7172
7173 sv_catpv(sv, t);
fde631ed 7174 }
b81d288d 7175
ffc61ed2 7176 Safefree(origs);
fde631ed
JH
7177 }
7178 }
653099ff 7179 }
ffc61ed2 7180
653099ff
GS
7181 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7182 }
9b155405 7183 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 7184 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 7185#else
96a5add6 7186 PERL_UNUSED_CONTEXT;
65e66c80
SP
7187 PERL_UNUSED_ARG(sv);
7188 PERL_UNUSED_ARG(o);
f9049ba1 7189 PERL_UNUSED_ARG(prog);
17c3b450 7190#endif /* DEBUGGING */
35ff7856 7191}
a687059c 7192
cad2e5aa
JH
7193SV *
7194Perl_re_intuit_string(pTHX_ regexp *prog)
7195{ /* Assume that RE_INTUIT is set */
97aff369 7196 dVAR;
a3621e74 7197 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
7198 PERL_UNUSED_CONTEXT;
7199
a3621e74 7200 DEBUG_COMPILE_r(
cfd0369c 7201 {
2d03de9c 7202 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 7203 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
7204
7205 if (!PL_colorset) reginitcolors();
7206 PerlIO_printf(Perl_debug_log,
a0288114 7207 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
7208 PL_colors[4],
7209 prog->check_substr ? "" : "utf8 ",
7210 PL_colors[5],PL_colors[0],
cad2e5aa
JH
7211 s,
7212 PL_colors[1],
7213 (strlen(s) > 60 ? "..." : ""));
7214 } );
7215
33b8afdf 7216 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
7217}
7218
2b69d0c2 7219void
864dbfa3 7220Perl_pregfree(pTHX_ struct regexp *r)
a687059c 7221{
27da23d5 7222 dVAR;
0df25f3d 7223
fc32ee4a 7224 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 7225
7821416a
IZ
7226 if (!r || (--r->refcnt > 0))
7227 return;
ab3bbdeb 7228 DEBUG_COMPILE_r({
0df25f3d
YO
7229 if (!PL_colorset)
7230 reginitcolors();
ab3bbdeb
YO
7231 if (RX_DEBUG(r)){
7232 SV *dsv= sv_newmortal();
7233 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7234 dsv, r->precomp, r->prelen, 60);
7235 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7236 PL_colors[4],PL_colors[5],s);
7237 }
9e55ce06 7238 });
cad2e5aa 7239
43c5f42d
NC
7240 /* gcov results gave these as non-null 100% of the time, so there's no
7241 optimisation in checking them before calling Safefree */
7242 Safefree(r->precomp);
7243 Safefree(r->offsets); /* 20010421 MJD */
ed252734 7244 RX_MATCH_COPY_FREE(r);
f8c7b90f 7245#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
7246 if (r->saved_copy)
7247 SvREFCNT_dec(r->saved_copy);
7248#endif
a193d654
GS
7249 if (r->substrs) {
7250 if (r->anchored_substr)
7251 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
7252 if (r->anchored_utf8)
7253 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
7254 if (r->float_substr)
7255 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
7256 if (r->float_utf8)
7257 SvREFCNT_dec(r->float_utf8);
2779dcf1 7258 Safefree(r->substrs);
a193d654 7259 }
c277df42
IZ
7260 if (r->data) {
7261 int n = r->data->count;
f3548bdc
DM
7262 PAD* new_comppad = NULL;
7263 PAD* old_comppad;
4026c95a 7264 PADOFFSET refcnt;
dfad63ad 7265
c277df42 7266 while (--n >= 0) {
261faec3 7267 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
7268 switch (r->data->what[n]) {
7269 case 's':
7270 SvREFCNT_dec((SV*)r->data->data[n]);
7271 break;
653099ff
GS
7272 case 'f':
7273 Safefree(r->data->data[n]);
7274 break;
dfad63ad
HS
7275 case 'p':
7276 new_comppad = (AV*)r->data->data[n];
7277 break;
c277df42 7278 case 'o':
dfad63ad 7279 if (new_comppad == NULL)
cea2e8a9 7280 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
7281 PAD_SAVE_LOCAL(old_comppad,
7282 /* Watch out for global destruction's random ordering. */
c445ea15 7283 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 7284 );
b34c0dd4 7285 OP_REFCNT_LOCK;
4026c95a
SH
7286 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
7287 OP_REFCNT_UNLOCK;
7288 if (!refcnt)
9b978d73 7289 op_free((OP_4tree*)r->data->data[n]);
9b978d73 7290
f3548bdc 7291 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
7292 SvREFCNT_dec((SV*)new_comppad);
7293 new_comppad = NULL;
c277df42
IZ
7294 break;
7295 case 'n':
9e55ce06 7296 break;
07be1b83 7297 case 'T':
be8e71aa
YO
7298 { /* Aho Corasick add-on structure for a trie node.
7299 Used in stclass optimization only */
07be1b83
YO
7300 U32 refcount;
7301 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
7302 OP_REFCNT_LOCK;
7303 refcount = --aho->refcount;
7304 OP_REFCNT_UNLOCK;
7305 if ( !refcount ) {
7306 Safefree(aho->states);
7307 Safefree(aho->fail);
7308 aho->trie=NULL; /* not necessary to free this as it is
7309 handled by the 't' case */
7310 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 7311 Safefree(r->regstclass);
07be1b83
YO
7312 }
7313 }
7314 break;
a3621e74 7315 case 't':
07be1b83 7316 {
be8e71aa 7317 /* trie structure. */
07be1b83
YO
7318 U32 refcount;
7319 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
7320 OP_REFCNT_LOCK;
7321 refcount = --trie->refcount;
7322 OP_REFCNT_UNLOCK;
7323 if ( !refcount ) {
7324 Safefree(trie->charmap);
7325 if (trie->widecharmap)
7326 SvREFCNT_dec((SV*)trie->widecharmap);
7327 Safefree(trie->states);
7328 Safefree(trie->trans);
7329 if (trie->bitmap)
7330 Safefree(trie->bitmap);
7331 if (trie->wordlen)
7332 Safefree(trie->wordlen);
786e8c11
YO
7333 if (trie->jump)
7334 Safefree(trie->jump);
7335 if (trie->nextword)
7336 Safefree(trie->nextword);
a3621e74 7337#ifdef DEBUGGING
be8e71aa
YO
7338 if (RX_DEBUG(r)) {
7339 if (trie->words)
7340 SvREFCNT_dec((SV*)trie->words);
7341 if (trie->revcharmap)
7342 SvREFCNT_dec((SV*)trie->revcharmap);
7343 }
a3621e74 7344#endif
07be1b83 7345 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 7346 }
07be1b83
YO
7347 }
7348 break;
c277df42 7349 default:
830247a4 7350 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
7351 }
7352 }
7353 Safefree(r->data->what);
7354 Safefree(r->data);
a0d0e21e
LW
7355 }
7356 Safefree(r->startp);
7357 Safefree(r->endp);
7358 Safefree(r);
a687059c 7359}
c277df42 7360
76234dfb 7361#ifndef PERL_IN_XSUB_RE
c277df42
IZ
7362/*
7363 - regnext - dig the "next" pointer out of a node
c277df42
IZ
7364 */
7365regnode *
864dbfa3 7366Perl_regnext(pTHX_ register regnode *p)
c277df42 7367{
97aff369 7368 dVAR;
c277df42
IZ
7369 register I32 offset;
7370
3280af22 7371 if (p == &PL_regdummy)
c277df42
IZ
7372 return(NULL);
7373
7374 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7375 if (offset == 0)
7376 return(NULL);
7377
c277df42 7378 return(p+offset);
c277df42 7379}
76234dfb 7380#endif
c277df42 7381
01f988be 7382STATIC void
cea2e8a9 7383S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
7384{
7385 va_list args;
7386 STRLEN l1 = strlen(pat1);
7387 STRLEN l2 = strlen(pat2);
7388 char buf[512];
06bf62c7 7389 SV *msv;
73d840c0 7390 const char *message;
c277df42
IZ
7391
7392 if (l1 > 510)
7393 l1 = 510;
7394 if (l1 + l2 > 510)
7395 l2 = 510 - l1;
7396 Copy(pat1, buf, l1 , char);
7397 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
7398 buf[l1 + l2] = '\n';
7399 buf[l1 + l2 + 1] = '\0';
8736538c
AS
7400#ifdef I_STDARG
7401 /* ANSI variant takes additional second argument */
c277df42 7402 va_start(args, pat2);
8736538c
AS
7403#else
7404 va_start(args);
7405#endif
5a844595 7406 msv = vmess(buf, &args);
c277df42 7407 va_end(args);
cfd0369c 7408 message = SvPV_const(msv,l1);
c277df42
IZ
7409 if (l1 > 512)
7410 l1 = 512;
7411 Copy(message, buf, l1 , char);
197cf9b9 7412 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 7413 Perl_croak(aTHX_ "%s", buf);
c277df42 7414}
a0ed51b3
LW
7415
7416/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7417
76234dfb 7418#ifndef PERL_IN_XSUB_RE
a0ed51b3 7419void
864dbfa3 7420Perl_save_re_context(pTHX)
b81d288d 7421{
97aff369 7422 dVAR;
1ade1aa1
NC
7423
7424 struct re_save_state *state;
7425
7426 SAVEVPTR(PL_curcop);
7427 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7428
7429 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7430 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7431 SSPUSHINT(SAVEt_RE_STATE);
7432
46ab3289 7433 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 7434
a0ed51b3 7435 PL_reg_start_tmp = 0;
a0ed51b3 7436 PL_reg_start_tmpl = 0;
c445ea15 7437 PL_reg_oldsaved = NULL;
a5db57d6 7438 PL_reg_oldsavedlen = 0;
a5db57d6 7439 PL_reg_maxiter = 0;
a5db57d6 7440 PL_reg_leftiter = 0;
c445ea15 7441 PL_reg_poscache = NULL;
a5db57d6 7442 PL_reg_poscache_size = 0;
1ade1aa1
NC
7443#ifdef PERL_OLD_COPY_ON_WRITE
7444 PL_nrs = NULL;
7445#endif
ada6e8a9 7446
c445ea15
AL
7447 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7448 if (PL_curpm) {
7449 const REGEXP * const rx = PM_GETRE(PL_curpm);
7450 if (rx) {
1df70142 7451 U32 i;
ada6e8a9 7452 for (i = 1; i <= rx->nparens; i++) {
1df70142 7453 char digits[TYPE_CHARS(long)];
d9fad198 7454 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
7455 GV *const *const gvp
7456 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7457
b37c2d43
AL
7458 if (gvp) {
7459 GV * const gv = *gvp;
7460 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7461 save_scalar(gv);
49f27e4b 7462 }
ada6e8a9
AMS
7463 }
7464 }
7465 }
a0ed51b3 7466}
76234dfb 7467#endif
51371543 7468
51371543 7469static void
acfe0abc 7470clear_re(pTHX_ void *r)
51371543 7471{
97aff369 7472 dVAR;
51371543
GS
7473 ReREFCNT_dec((regexp *)r);
7474}
ffbc6a93 7475
a28509cc
AL
7476#ifdef DEBUGGING
7477
7478STATIC void
7479S_put_byte(pTHX_ SV *sv, int c)
7480{
7481 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7482 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7483 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7484 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7485 else
7486 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7487}
7488
786e8c11 7489
3dab1dad
YO
7490#define CLEAR_OPTSTART \
7491 if (optstart) STMT_START { \
07be1b83 7492 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
7493 optstart=NULL; \
7494 } STMT_END
7495
786e8c11 7496#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 7497
b5a2f8d8
NC
7498STATIC const regnode *
7499S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
7500 const regnode *last, const regnode *plast,
7501 SV* sv, I32 indent, U32 depth)
a28509cc 7502{
97aff369 7503 dVAR;
786e8c11 7504 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 7505 register const regnode *next;
3dab1dad
YO
7506 const regnode *optstart= NULL;
7507 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 7508
786e8c11
YO
7509#ifdef DEBUG_DUMPUNTIL
7510 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7511 last ? last-start : 0,plast ? plast-start : 0);
7512#endif
7513
7514 if (plast && plast < last)
7515 last= plast;
7516
7517 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
7518 /* While that wasn't END last time... */
7519
7520 NODE_ALIGN(node);
7521 op = OP(node);
7522 if (op == CLOSE)
786e8c11 7523 indent--;
b5a2f8d8 7524 next = regnext((regnode *)node);
07be1b83 7525
a28509cc 7526 /* Where, what. */
8e11feef 7527 if (OP(node) == OPTIMIZED) {
e68ec53f 7528 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 7529 optstart = node;
3dab1dad 7530 else
8e11feef 7531 goto after_print;
3dab1dad
YO
7532 } else
7533 CLEAR_OPTSTART;
07be1b83 7534
32fc9b6a 7535 regprop(r, sv, node);
a28509cc 7536 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 7537 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
7538
7539 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
7540 if (next == NULL) /* Next ptr. */
7541 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
7542 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
7543 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
7544 else
7545 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 7546
1de06328 7547 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 7548 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
7549 }
7550
a28509cc
AL
7551 after_print:
7552 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
7553 assert(next);
7554 {
7555 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
7556 ? regnext((regnode *)next)
7557 : next);
be8e71aa
YO
7558 if (last && nnode > last)
7559 nnode = last;
786e8c11 7560 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 7561 }
a28509cc
AL
7562 }
7563 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 7564 assert(next);
786e8c11 7565 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
7566 }
7567 else if ( PL_regkind[(U8)op] == TRIE ) {
1de06328 7568 const char op = OP(node);
a28509cc 7569 const I32 n = ARG(node);
1de06328
YO
7570 const reg_ac_data * const ac = op>=AHOCORASICK ?
7571 (reg_ac_data *)r->data->data[n] :
7572 NULL;
7573 const reg_trie_data * const trie = op<AHOCORASICK ?
7574 (reg_trie_data*)r->data->data[n] :
7575 ac->trie;
786e8c11 7576 const regnode *nextbranch= NULL;
a28509cc 7577 I32 word_idx;
1de06328 7578 sv_setpvn(sv, "", 0);
786e8c11 7579 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 7580 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
7581
7582 PerlIO_printf(Perl_debug_log, "%*s%s ",
7583 (int)(2*(indent+3)), "",
7584 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
7585 PL_colors[0], PL_colors[1],
7586 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7587 PERL_PV_PRETTY_ELIPSES |
7588 PERL_PV_PRETTY_LTGT
786e8c11
YO
7589 )
7590 : "???"
7591 );
7592 if (trie->jump) {
7593 U16 dist= trie->jump[word_idx+1];
7594 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
7595 if (dist) {
7596 if (!nextbranch)
7597 nextbranch= next - trie->jump[0];
7598 DUMPUNTIL(next - dist, nextbranch);
7599 }
7600 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
7601 nextbranch= regnext((regnode *)nextbranch);
7602 } else {
7603 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 7604 }
786e8c11
YO
7605 }
7606 if (last && next > last)
7607 node= last;
7608 else
7609 node= next;
a28509cc 7610 }
786e8c11
YO
7611 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
7612 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
7613 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
7614 }
7615 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7616 assert(next);
786e8c11 7617 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
7618 }
7619 else if ( op == PLUS || op == STAR) {
786e8c11 7620 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
7621 }
7622 else if (op == ANYOF) {
7623 /* arglen 1 + class block */
7624 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7625 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7626 node = NEXTOPER(node);
7627 }
7628 else if (PL_regkind[(U8)op] == EXACT) {
7629 /* Literal string, where present. */
7630 node += NODE_SZ_STR(node) - 1;
7631 node = NEXTOPER(node);
7632 }
7633 else {
7634 node = NEXTOPER(node);
7635 node += regarglen[(U8)op];
7636 }
7637 if (op == CURLYX || op == OPEN)
786e8c11 7638 indent++;
a28509cc 7639 else if (op == WHILEM)
786e8c11 7640 indent--;
a28509cc 7641 }
3dab1dad 7642 CLEAR_OPTSTART;
786e8c11
YO
7643#ifdef DEBUG_DUMPUNTIL
7644 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
7645#endif
1de06328 7646 return node;
a28509cc
AL
7647}
7648
7649#endif /* DEBUGGING */
7650
241d1a3b
NC
7651/*
7652 * Local variables:
7653 * c-indentation-style: bsd
7654 * c-basic-offset: 4
7655 * indent-tabs-mode: t
7656 * End:
7657 *
37442d52
RGS
7658 * ex: set ts=8 sts=4 sw=4 noet:
7659 */