This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
POD fixes in perlce manpage
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
786e8c11 174#define FULL_TRIE_STUDY
07be1b83
YO
175#define TRIE_STCLASS
176#endif
1de06328
YO
177
178
179/* About scan_data_t.
180
181 During optimisation we recurse through the regexp program performing
182 various inplace (keyhole style) optimisations. In addition study_chunk
183 and scan_commit populate this data structure with information about
184 what strings MUST appear in the pattern. We look for the longest
185 string that must appear for at a fixed location, and we look for the
186 longest string that may appear at a floating location. So for instance
187 in the pattern:
188
189 /FOO[xX]A.*B[xX]BAR/
190
191 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
192 strings (because they follow a .* construct). study_chunk will identify
193 both FOO and BAR as being the longest fixed and floating strings respectively.
194
195 The strings can be composites, for instance
196
197 /(f)(o)(o)/
198
199 will result in a composite fixed substring 'foo'.
200
201 For each string some basic information is maintained:
202
203 - offset or min_offset
204 This is the position the string must appear at, or not before.
205 It also implicitly (when combined with minlenp) tells us how many
206 character must match before the string we are searching.
207 Likewise when combined with minlenp and the length of the string
208 tells us how many characters must appear after the string we have
209 found.
210
211 - max_offset
212 Only used for floating strings. This is the rightmost point that
213 the string can appear at. Ifset to I32 max it indicates that the
214 string can occur infinitely far to the right.
215
216 - minlenp
217 A pointer to the minimum length of the pattern that the string
218 was found inside. This is important as in the case of positive
219 lookahead or positive lookbehind we can have multiple patterns
220 involved. Consider
221
222 /(?=FOO).*F/
223
224 The minimum length of the pattern overall is 3, the minimum length
225 of the lookahead part is 3, but the minimum length of the part that
226 will actually match is 1. So 'FOO's minimum length is 3, but the
227 minimum length for the F is 1. This is important as the minimum length
228 is used to determine offsets in front of and behind the string being
229 looked for. Since strings can be composites this is the length of the
230 pattern at the time it was commited with a scan_commit. Note that
231 the length is calculated by study_chunk, so that the minimum lengths
232 are not known until the full pattern has been compiled, thus the
233 pointer to the value.
234
235 - lookbehind
236
237 In the case of lookbehind the string being searched for can be
238 offset past the start point of the final matching string.
239 If this value was just blithely removed from the min_offset it would
240 invalidate some of the calculations for how many chars must match
241 before or after (as they are derived from min_offset and minlen and
242 the length of the string being searched for).
243 When the final pattern is compiled and the data is moved from the
244 scan_data_t structure into the regexp structure the information
245 about lookbehind is factored in, with the information that would
246 have been lost precalculated in the end_shift field for the
247 associated string.
248
249 The fields pos_min and pos_delta are used to store the minimum offset
250 and the delta to the maximum offset at the current point in the pattern.
251
252*/
2c2d71f5
JH
253
254typedef struct scan_data_t {
1de06328
YO
255 /*I32 len_min; unused */
256 /*I32 len_delta; unused */
2c2d71f5
JH
257 I32 pos_min;
258 I32 pos_delta;
259 SV *last_found;
1de06328 260 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
261 I32 last_start_min;
262 I32 last_start_max;
1de06328
YO
263 SV **longest; /* Either &l_fixed, or &l_float. */
264 SV *longest_fixed; /* longest fixed string found in pattern */
265 I32 offset_fixed; /* offset where it starts */
266 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
267 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
268 SV *longest_float; /* longest floating string found in pattern */
269 I32 offset_float_min; /* earliest point in string it can appear */
270 I32 offset_float_max; /* latest point in string it can appear */
271 I32 *minlen_float; /* pointer to the minlen relevent to the string */
272 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
273 I32 flags;
274 I32 whilem_c;
cb434fcc 275 I32 *last_closep;
653099ff 276 struct regnode_charclass_class *start_class;
2c2d71f5
JH
277} scan_data_t;
278
a687059c 279/*
e50aee73 280 * Forward declarations for pregcomp()'s friends.
a687059c 281 */
a0d0e21e 282
27da23d5 283static const scan_data_t zero_scan_data =
1de06328 284 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
285
286#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
287#define SF_BEFORE_SEOL 0x0001
288#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
289#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
290#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
291
09b7f37c
CB
292#ifdef NO_UNARY_PLUS
293# define SF_FIX_SHIFT_EOL (0+2)
294# define SF_FL_SHIFT_EOL (0+4)
295#else
296# define SF_FIX_SHIFT_EOL (+2)
297# define SF_FL_SHIFT_EOL (+4)
298#endif
c277df42
IZ
299
300#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
301#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
302
303#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
304#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
305#define SF_IS_INF 0x0040
306#define SF_HAS_PAR 0x0080
307#define SF_IN_PAR 0x0100
308#define SF_HAS_EVAL 0x0200
309#define SCF_DO_SUBSTR 0x0400
653099ff
GS
310#define SCF_DO_STCLASS_AND 0x0800
311#define SCF_DO_STCLASS_OR 0x1000
312#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 313#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 314
786e8c11
YO
315#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
316
07be1b83 317
eb160463 318#define UTF (RExC_utf8 != 0)
e2509266
JH
319#define LOC ((RExC_flags & PMf_LOCALE) != 0)
320#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 321
ffc61ed2 322#define OOB_UNICODE 12345678
93733859 323#define OOB_NAMEDCLASS -1
b8c5462f 324
a0ed51b3
LW
325#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
326#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
327
8615cb43 328
b45f050a
JF
329/* length of regex to show in messages that don't mark a position within */
330#define RegexLengthToShowInErrorMessages 127
331
332/*
333 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
334 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
335 * op/pragma/warn/regcomp.
336 */
7253e4e3
RK
337#define MARKER1 "<-- HERE" /* marker as it appears in the description */
338#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 339
7253e4e3 340#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
341
342/*
343 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
344 * arg. Show regex, up to a maximum length. If it's too long, chop and add
345 * "...".
346 */
ccb2c380 347#define FAIL(msg) STMT_START { \
bfed75c6 348 const char *ellipses = ""; \
ccb2c380
MP
349 IV len = RExC_end - RExC_precomp; \
350 \
351 if (!SIZE_ONLY) \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 if (len > RegexLengthToShowInErrorMessages) { \
354 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
355 len = RegexLengthToShowInErrorMessages - 10; \
356 ellipses = "..."; \
357 } \
358 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
359 msg, (int)len, RExC_precomp, ellipses); \
360} STMT_END
8615cb43 361
b45f050a 362/*
b45f050a
JF
363 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
364 */
ccb2c380 365#define Simple_vFAIL(m) STMT_START { \
a28509cc 366 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
367 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
368 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
b45f050a
JF
370
371/*
372 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
373 */
ccb2c380
MP
374#define vFAIL(m) STMT_START { \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 Simple_vFAIL(m); \
378} STMT_END
b45f050a
JF
379
380/*
381 * Like Simple_vFAIL(), but accepts two arguments.
382 */
ccb2c380 383#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 384 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
385 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
386 (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
b45f050a
JF
388
389/*
390 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
391 */
ccb2c380
MP
392#define vFAIL2(m,a1) STMT_START { \
393 if (!SIZE_ONLY) \
394 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
395 Simple_vFAIL2(m, a1); \
396} STMT_END
b45f050a
JF
397
398
399/*
400 * Like Simple_vFAIL(), but accepts three arguments.
401 */
ccb2c380 402#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 403 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
404 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
b45f050a
JF
407
408/*
409 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
410 */
ccb2c380
MP
411#define vFAIL3(m,a1,a2) STMT_START { \
412 if (!SIZE_ONLY) \
413 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 Simple_vFAIL3(m, a1, a2); \
415} STMT_END
b45f050a
JF
416
417/*
418 * Like Simple_vFAIL(), but accepts four arguments.
419 */
ccb2c380 420#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 421 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
422 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
423 (int)offset, RExC_precomp, RExC_precomp + offset); \
424} STMT_END
b45f050a 425
ccb2c380 426#define vWARN(loc,m) STMT_START { \
a28509cc 427 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
431
432#define vWARNdep(loc,m) STMT_START { \
a28509cc 433 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
434 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
435 "%s" REPORT_LOCATION, \
436 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
438
439
440#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 441 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
442 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
443 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
445
446#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 447 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
448 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
449 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
451
452#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
455 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
461 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
9d1d55b5 463
8615cb43 464
cd439c50 465/* Allow for side effects in s */
ccb2c380
MP
466#define REGC(c,s) STMT_START { \
467 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
468} STMT_END
cd439c50 469
fac92740
MJD
470/* Macros for recording node offsets. 20001227 mjd@plover.com
471 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
472 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
473 * Element 0 holds the number n.
07be1b83 474 * Position is 1 indexed.
fac92740
MJD
475 */
476
ccb2c380
MP
477#define Set_Node_Offset_To_R(node,byte) STMT_START { \
478 if (! SIZE_ONLY) { \
479 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 480 __LINE__, (node), (int)(byte))); \
ccb2c380 481 if((node) < 0) { \
551405c4 482 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
483 } else { \
484 RExC_offsets[2*(node)-1] = (byte); \
485 } \
486 } \
487} STMT_END
488
489#define Set_Node_Offset(node,byte) \
490 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
491#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
492
493#define Set_Node_Length_To_R(node,len) STMT_START { \
494 if (! SIZE_ONLY) { \
495 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 496 __LINE__, (int)(node), (int)(len))); \
ccb2c380 497 if((node) < 0) { \
551405c4 498 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
499 } else { \
500 RExC_offsets[2*(node)] = (len); \
501 } \
502 } \
503} STMT_END
504
505#define Set_Node_Length(node,len) \
506 Set_Node_Length_To_R((node)-RExC_emit_start, len)
507#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
508#define Set_Node_Cur_Length(node) \
509 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
510
511/* Get offsets and lengths */
512#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
513#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
514
07be1b83
YO
515#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
517 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
518} STMT_END
519
520
521#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
522#define EXPERIMENTAL_INPLACESCAN
523#endif
524
1de06328
YO
525#define DEBUG_STUDYDATA(data,depth) \
526DEBUG_OPTIMISE_r(if(data){ \
527 PerlIO_printf(Perl_debug_log, \
528 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
529 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
530 (int)(depth)*2, "", \
531 (IV)((data)->pos_min), \
532 (IV)((data)->pos_delta), \
533 (IV)((data)->flags), \
534 (IV)((data)->whilem_c), \
535 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
536 ); \
537 if ((data)->last_found) \
538 PerlIO_printf(Perl_debug_log, \
539 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
540 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
541 SvPVX_const((data)->last_found), \
542 (IV)((data)->last_end), \
543 (IV)((data)->last_start_min), \
544 (IV)((data)->last_start_max), \
545 ((data)->longest && \
546 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
547 SvPVX_const((data)->longest_fixed), \
548 (IV)((data)->offset_fixed), \
549 ((data)->longest && \
550 (data)->longest==&((data)->longest_float)) ? "*" : "", \
551 SvPVX_const((data)->longest_float), \
552 (IV)((data)->offset_float_min), \
553 (IV)((data)->offset_float_max) \
554 ); \
555 PerlIO_printf(Perl_debug_log,"\n"); \
556});
557
acfe0abc 558static void clear_re(pTHX_ void *r);
4327152a 559
653099ff 560/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 561 Update the longest found anchored substring and the longest found
653099ff
GS
562 floating substrings if needed. */
563
4327152a 564STATIC void
1de06328 565S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 566{
e1ec3a88
AL
567 const STRLEN l = CHR_SVLEN(data->last_found);
568 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 569 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 570
c277df42 571 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 572 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
573 if (*data->longest == data->longest_fixed) {
574 data->offset_fixed = l ? data->last_start_min : data->pos_min;
575 if (data->flags & SF_BEFORE_EOL)
b81d288d 576 data->flags
c277df42
IZ
577 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
578 else
579 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
580 data->minlen_fixed=minlenp;
581 data->lookbehind_fixed=0;
a0ed51b3
LW
582 }
583 else {
c277df42 584 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
585 data->offset_float_max = (l
586 ? data->last_start_max
c277df42 587 : data->pos_min + data->pos_delta);
9051bda5
HS
588 if ((U32)data->offset_float_max > (U32)I32_MAX)
589 data->offset_float_max = I32_MAX;
c277df42 590 if (data->flags & SF_BEFORE_EOL)
b81d288d 591 data->flags
c277df42
IZ
592 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
593 else
594 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
595 data->minlen_float=minlenp;
596 data->lookbehind_float=0;
c277df42
IZ
597 }
598 }
599 SvCUR_set(data->last_found, 0);
0eda9292 600 {
a28509cc 601 SV * const sv = data->last_found;
097eb12c
AL
602 if (SvUTF8(sv) && SvMAGICAL(sv)) {
603 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
604 if (mg)
605 mg->mg_len = 0;
606 }
0eda9292 607 }
c277df42
IZ
608 data->last_end = -1;
609 data->flags &= ~SF_BEFORE_EOL;
1de06328 610 DEBUG_STUDYDATA(data,0);
c277df42
IZ
611}
612
653099ff
GS
613/* Can match anything (initialization) */
614STATIC void
097eb12c 615S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 616{
653099ff 617 ANYOF_CLASS_ZERO(cl);
f8bef550 618 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 619 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
620 if (LOC)
621 cl->flags |= ANYOF_LOCALE;
622}
623
624/* Can match anything (initialization) */
625STATIC int
5f66b61c 626S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
627{
628 int value;
629
aaa51d5e 630 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
631 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
632 return 1;
1aa99e6b
IH
633 if (!(cl->flags & ANYOF_UNICODE_ALL))
634 return 0;
10edeb5d 635 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 636 return 0;
653099ff
GS
637 return 1;
638}
639
640/* Can match anything (initialization) */
641STATIC void
097eb12c 642S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 643{
8ecf7187 644 Zero(cl, 1, struct regnode_charclass_class);
653099ff 645 cl->type = ANYOF;
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647}
648
649STATIC void
097eb12c 650S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
8ecf7187 652 Zero(cl, 1, struct regnode_charclass_class);
653099ff 653 cl->type = ANYOF;
830247a4 654 cl_anything(pRExC_state, cl);
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* 'And' a given class with another one. Can create false positives */
660/* We assume that cl is not inverted */
661STATIC void
5f66b61c 662S_cl_and(struct regnode_charclass_class *cl,
a28509cc 663 const struct regnode_charclass_class *and_with)
653099ff 664{
653099ff
GS
665 if (!(and_with->flags & ANYOF_CLASS)
666 && !(cl->flags & ANYOF_CLASS)
667 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
668 && !(and_with->flags & ANYOF_FOLD)
669 && !(cl->flags & ANYOF_FOLD)) {
670 int i;
671
672 if (and_with->flags & ANYOF_INVERT)
673 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
674 cl->bitmap[i] &= ~and_with->bitmap[i];
675 else
676 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
677 cl->bitmap[i] &= and_with->bitmap[i];
678 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
679 if (!(and_with->flags & ANYOF_EOS))
680 cl->flags &= ~ANYOF_EOS;
1aa99e6b 681
14ebb1a2
JH
682 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
683 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
684 cl->flags &= ~ANYOF_UNICODE_ALL;
685 cl->flags |= ANYOF_UNICODE;
686 ARG_SET(cl, ARG(and_with));
687 }
14ebb1a2
JH
688 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
689 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 690 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
691 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
692 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 693 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
694}
695
696/* 'OR' a given class with another one. Can create false positives */
697/* We assume that cl is not inverted */
698STATIC void
097eb12c 699S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 700{
653099ff
GS
701 if (or_with->flags & ANYOF_INVERT) {
702 /* We do not use
703 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
704 * <= (B1 | !B2) | (CL1 | !CL2)
705 * which is wasteful if CL2 is small, but we ignore CL2:
706 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
707 * XXXX Can we handle case-fold? Unclear:
708 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
709 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
710 */
711 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
712 && !(or_with->flags & ANYOF_FOLD)
713 && !(cl->flags & ANYOF_FOLD) ) {
714 int i;
715
716 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
717 cl->bitmap[i] |= ~or_with->bitmap[i];
718 } /* XXXX: logic is complicated otherwise */
719 else {
830247a4 720 cl_anything(pRExC_state, cl);
653099ff
GS
721 }
722 } else {
723 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
724 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 725 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
726 || (cl->flags & ANYOF_FOLD)) ) {
727 int i;
728
729 /* OR char bitmap and class bitmap separately */
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] |= or_with->bitmap[i];
732 if (or_with->flags & ANYOF_CLASS) {
733 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
734 cl->classflags[i] |= or_with->classflags[i];
735 cl->flags |= ANYOF_CLASS;
736 }
737 }
738 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 739 cl_anything(pRExC_state, cl);
653099ff
GS
740 }
741 }
742 if (or_with->flags & ANYOF_EOS)
743 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
744
745 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
746 ARG(cl) != ARG(or_with)) {
747 cl->flags |= ANYOF_UNICODE_ALL;
748 cl->flags &= ~ANYOF_UNICODE;
749 }
750 if (or_with->flags & ANYOF_UNICODE_ALL) {
751 cl->flags |= ANYOF_UNICODE_ALL;
752 cl->flags &= ~ANYOF_UNICODE;
753 }
653099ff
GS
754}
755
a3621e74
YO
756#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
757#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
758#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
759#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
760
3dab1dad
YO
761
762#ifdef DEBUGGING
07be1b83 763/*
3dab1dad
YO
764 dump_trie(trie)
765 dump_trie_interim_list(trie,next_alloc)
766 dump_trie_interim_table(trie,next_alloc)
767
768 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
769 The _interim_ variants are used for debugging the interim
770 tables that are used to generate the final compressed
771 representation which is what dump_trie expects.
772
3dab1dad
YO
773 Part of the reason for their existance is to provide a form
774 of documentation as to how the different representations function.
07be1b83
YO
775
776*/
3dab1dad
YO
777
778/*
779 dump_trie(trie)
780 Dumps the final compressed table form of the trie to Perl_debug_log.
781 Used for debugging make_trie().
782*/
783
784STATIC void
785S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
786{
787 U32 state;
ab3bbdeb
YO
788 SV *sv=sv_newmortal();
789 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
790 GET_RE_DEBUG_FLAGS_DECL;
791
ab3bbdeb 792
3dab1dad
YO
793 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
794 (int)depth * 2 + 2,"",
795 "Match","Base","Ofs" );
796
797 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 798 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 799 if ( tmp ) {
ab3bbdeb
YO
800 PerlIO_printf( Perl_debug_log, "%*s",
801 colwidth,
ddc5bc0f 802 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
803 PL_colors[0], PL_colors[1],
804 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
805 PERL_PV_ESCAPE_FIRSTCHAR
806 )
807 );
3dab1dad
YO
808 }
809 }
810 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
811 (int)depth * 2 + 2,"");
812
813 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 814 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
815 PerlIO_printf( Perl_debug_log, "\n");
816
786e8c11 817 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 818 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
819
820 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
821
822 if ( trie->states[ state ].wordnum ) {
823 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
824 } else {
825 PerlIO_printf( Perl_debug_log, "%6s", "" );
826 }
827
828 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
829
830 if ( base ) {
831 U32 ofs = 0;
832
833 while( ( base + ofs < trie->uniquecharcount ) ||
834 ( base + ofs - trie->uniquecharcount < trie->lasttrans
835 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
836 ofs++;
837
838 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
839
840 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
841 if ( ( base + ofs >= trie->uniquecharcount ) &&
842 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
843 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
844 {
ab3bbdeb
YO
845 PerlIO_printf( Perl_debug_log, "%*"UVXf,
846 colwidth,
3dab1dad
YO
847 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
848 } else {
ab3bbdeb 849 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
850 }
851 }
852
853 PerlIO_printf( Perl_debug_log, "]");
854
855 }
856 PerlIO_printf( Perl_debug_log, "\n" );
857 }
858}
859/*
860 dump_trie_interim_list(trie,next_alloc)
861 Dumps a fully constructed but uncompressed trie in list form.
862 List tries normally only are used for construction when the number of
863 possible chars (trie->uniquecharcount) is very high.
864 Used for debugging make_trie().
865*/
866STATIC void
867S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
868{
869 U32 state;
ab3bbdeb
YO
870 SV *sv=sv_newmortal();
871 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
872 GET_RE_DEBUG_FLAGS_DECL;
873 /* print out the table precompression. */
ab3bbdeb
YO
874 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
875 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
876 "------:-----+-----------------\n" );
3dab1dad
YO
877
878 for( state=1 ; state < next_alloc ; state ++ ) {
879 U16 charid;
880
ab3bbdeb 881 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
882 (int)depth * 2 + 2,"", (UV)state );
883 if ( ! trie->states[ state ].wordnum ) {
884 PerlIO_printf( Perl_debug_log, "%5s| ","");
885 } else {
886 PerlIO_printf( Perl_debug_log, "W%4x| ",
887 trie->states[ state ].wordnum
888 );
889 }
890 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 891 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
892 if ( tmp ) {
893 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
894 colwidth,
ddc5bc0f 895 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
896 PL_colors[0], PL_colors[1],
897 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
898 PERL_PV_ESCAPE_FIRSTCHAR
899 ) ,
3dab1dad
YO
900 TRIE_LIST_ITEM(state,charid).forid,
901 (UV)TRIE_LIST_ITEM(state,charid).newstate
902 );
903 }
ab3bbdeb
YO
904 }
905 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
906 }
907}
908
909/*
910 dump_trie_interim_table(trie,next_alloc)
911 Dumps a fully constructed but uncompressed trie in table form.
912 This is the normal DFA style state transition table, with a few
913 twists to facilitate compression later.
914 Used for debugging make_trie().
915*/
916STATIC void
917S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
918{
919 U32 state;
920 U16 charid;
ab3bbdeb
YO
921 SV *sv=sv_newmortal();
922 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
923 GET_RE_DEBUG_FLAGS_DECL;
924
925 /*
926 print out the table precompression so that we can do a visual check
927 that they are identical.
928 */
929
930 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
931
932 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 933 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 934 if ( tmp ) {
ab3bbdeb
YO
935 PerlIO_printf( Perl_debug_log, "%*s",
936 colwidth,
ddc5bc0f 937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
941 )
942 );
3dab1dad
YO
943 }
944 }
945
946 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
947
948 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 949 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
950 }
951
952 PerlIO_printf( Perl_debug_log, "\n" );
953
954 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
955
956 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
957 (int)depth * 2 + 2,"",
958 (UV)TRIE_NODENUM( state ) );
959
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
961 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
962 if (v)
963 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
964 else
965 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
966 }
967 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
968 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
969 } else {
970 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
971 trie->states[ TRIE_NODENUM( state ) ].wordnum );
972 }
973 }
07be1b83 974}
3dab1dad
YO
975
976#endif
977
786e8c11
YO
978/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
979 startbranch: the first branch in the whole branch sequence
980 first : start branch of sequence of branch-exact nodes.
981 May be the same as startbranch
982 last : Thing following the last branch.
983 May be the same as tail.
984 tail : item following the branch sequence
985 count : words in the sequence
986 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
987 depth : indent depth
3dab1dad 988
786e8c11 989Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 990
786e8c11
YO
991A trie is an N'ary tree where the branches are determined by digital
992decomposition of the key. IE, at the root node you look up the 1st character and
993follow that branch repeat until you find the end of the branches. Nodes can be
994marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 995
786e8c11 996 /he|she|his|hers/
72f13be8 997
786e8c11
YO
998would convert into the following structure. Numbers represent states, letters
999following numbers represent valid transitions on the letter from that state, if
1000the number is in square brackets it represents an accepting state, otherwise it
1001will be in parenthesis.
07be1b83 1002
786e8c11
YO
1003 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1004 | |
1005 | (2)
1006 | |
1007 (1) +-i->(6)-+-s->[7]
1008 |
1009 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1010
786e8c11
YO
1011 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1012
1013This shows that when matching against the string 'hers' we will begin at state 1
1014read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1015then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1016is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1017single traverse. We store a mapping from accepting to state to which word was
1018matched, and then when we have multiple possibilities we try to complete the
1019rest of the regex in the order in which they occured in the alternation.
1020
1021The only prior NFA like behaviour that would be changed by the TRIE support is
1022the silent ignoring of duplicate alternations which are of the form:
1023
1024 / (DUPE|DUPE) X? (?{ ... }) Y /x
1025
1026Thus EVAL blocks follwing a trie may be called a different number of times with
1027and without the optimisation. With the optimisations dupes will be silently
1028ignored. This inconsistant behaviour of EVAL type nodes is well established as
1029the following demonstrates:
1030
1031 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1032
1033which prints out 'word' three times, but
1034
1035 'words'=~/(word|word|word)(?{ print $1 })S/
1036
1037which doesnt print it out at all. This is due to other optimisations kicking in.
1038
1039Example of what happens on a structural level:
1040
1041The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1042
1043 1: CURLYM[1] {1,32767}(18)
1044 5: BRANCH(8)
1045 6: EXACT <ac>(16)
1046 8: BRANCH(11)
1047 9: EXACT <ad>(16)
1048 11: BRANCH(14)
1049 12: EXACT <ab>(16)
1050 16: SUCCEED(0)
1051 17: NOTHING(18)
1052 18: END(0)
1053
1054This would be optimizable with startbranch=5, first=5, last=16, tail=16
1055and should turn into:
1056
1057 1: CURLYM[1] {1,32767}(18)
1058 5: TRIE(16)
1059 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1060 <ac>
1061 <ad>
1062 <ab>
1063 16: SUCCEED(0)
1064 17: NOTHING(18)
1065 18: END(0)
1066
1067Cases where tail != last would be like /(?foo|bar)baz/:
1068
1069 1: BRANCH(4)
1070 2: EXACT <foo>(8)
1071 4: BRANCH(7)
1072 5: EXACT <bar>(8)
1073 7: TAIL(8)
1074 8: EXACT <baz>(10)
1075 10: END(0)
1076
1077which would be optimizable with startbranch=1, first=1, last=7, tail=8
1078and would end up looking like:
1079
1080 1: TRIE(8)
1081 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1082 <foo>
1083 <bar>
1084 7: TAIL(8)
1085 8: EXACT <baz>(10)
1086 10: END(0)
1087
1088 d = uvuni_to_utf8_flags(d, uv, 0);
1089
1090is the recommended Unicode-aware way of saying
1091
1092 *(d++) = uv;
1093*/
1094
1095#define TRIE_STORE_REVCHAR \
1096 STMT_START { \
1097 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1098 if (UTF) SvUTF8_on(tmp); \
1099 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1100 } STMT_END
1101
1102#define TRIE_READ_CHAR STMT_START { \
1103 wordlen++; \
1104 if ( UTF ) { \
1105 if ( folder ) { \
1106 if ( foldlen > 0 ) { \
1107 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1108 foldlen -= len; \
1109 scan += len; \
1110 len = 0; \
1111 } else { \
1112 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1113 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1114 foldlen -= UNISKIP( uvc ); \
1115 scan = foldbuf + UNISKIP( uvc ); \
1116 } \
1117 } else { \
1118 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1119 } \
1120 } else { \
1121 uvc = (U32)*uc; \
1122 len = 1; \
1123 } \
1124} STMT_END
1125
1126
1127
1128#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1129 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1130 TRIE_LIST_LEN( state ) *= 2; \
1131 Renew( trie->states[ state ].trans.list, \
1132 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1133 } \
1134 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1135 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1136 TRIE_LIST_CUR( state )++; \
1137} STMT_END
07be1b83 1138
786e8c11
YO
1139#define TRIE_LIST_NEW(state) STMT_START { \
1140 Newxz( trie->states[ state ].trans.list, \
1141 4, reg_trie_trans_le ); \
1142 TRIE_LIST_CUR( state ) = 1; \
1143 TRIE_LIST_LEN( state ) = 4; \
1144} STMT_END
07be1b83 1145
786e8c11
YO
1146#define TRIE_HANDLE_WORD(state) STMT_START { \
1147 U16 dupe= trie->states[ state ].wordnum; \
1148 regnode * const noper_next = regnext( noper ); \
1149 \
1150 if (trie->wordlen) \
1151 trie->wordlen[ curword ] = wordlen; \
1152 DEBUG_r({ \
1153 /* store the word for dumping */ \
1154 SV* tmp; \
1155 if (OP(noper) != NOTHING) \
1156 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1157 else \
1158 tmp = newSVpvn( "", 0 ); \
1159 if ( UTF ) SvUTF8_on( tmp ); \
1160 av_push( trie->words, tmp ); \
1161 }); \
1162 \
1163 curword++; \
1164 \
1165 if ( noper_next < tail ) { \
1166 if (!trie->jump) \
1167 Newxz( trie->jump, word_count + 1, U16); \
1168 trie->jump[curword] = (U16)(tail - noper_next); \
1169 if (!jumper) \
1170 jumper = noper_next; \
1171 if (!nextbranch) \
1172 nextbranch= regnext(cur); \
1173 } \
1174 \
1175 if ( dupe ) { \
1176 /* So it's a dupe. This means we need to maintain a */\
1177 /* linked-list from the first to the next. */\
1178 /* we only allocate the nextword buffer when there */\
1179 /* a dupe, so first time we have to do the allocation */\
1180 if (!trie->nextword) \
1181 Newxz( trie->nextword, word_count + 1, U16); \
1182 while ( trie->nextword[dupe] ) \
1183 dupe= trie->nextword[dupe]; \
1184 trie->nextword[dupe]= curword; \
1185 } else { \
1186 /* we haven't inserted this word yet. */ \
1187 trie->states[ state ].wordnum = curword; \
1188 } \
1189} STMT_END
07be1b83 1190
3dab1dad 1191
786e8c11
YO
1192#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1193 ( ( base + charid >= ucharcount \
1194 && base + charid < ubound \
1195 && state == trie->trans[ base - ucharcount + charid ].check \
1196 && trie->trans[ base - ucharcount + charid ].next ) \
1197 ? trie->trans[ base - ucharcount + charid ].next \
1198 : ( state==1 ? special : 0 ) \
1199 )
3dab1dad 1200
786e8c11
YO
1201#define MADE_TRIE 1
1202#define MADE_JUMP_TRIE 2
1203#define MADE_EXACT_TRIE 4
3dab1dad 1204
a3621e74 1205STATIC I32
786e8c11 1206S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1207{
27da23d5 1208 dVAR;
a3621e74
YO
1209 /* first pass, loop through and scan words */
1210 reg_trie_data *trie;
1211 regnode *cur;
9f7f3913 1212 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1213 STRLEN len = 0;
1214 UV uvc = 0;
1215 U16 curword = 0;
1216 U32 next_alloc = 0;
786e8c11
YO
1217 regnode *jumper = NULL;
1218 regnode *nextbranch = NULL;
a3621e74 1219 /* we just use folder as a flag in utf8 */
e1ec3a88 1220 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1221 ? PL_fold
1222 : ( flags == EXACTFL
1223 ? PL_fold_locale
1224 : NULL
1225 )
1226 );
1227
e1ec3a88 1228 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1229 SV *re_trie_maxbuff;
3dab1dad
YO
1230#ifndef DEBUGGING
1231 /* these are only used during construction but are useful during
8e11feef 1232 * debugging so we store them in the struct when debugging.
8e11feef 1233 */
3dab1dad 1234 STRLEN trie_charcount=0;
3dab1dad
YO
1235 AV *trie_revcharmap;
1236#endif
a3621e74 1237 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1238#ifndef DEBUGGING
1239 PERL_UNUSED_ARG(depth);
1240#endif
a3621e74 1241
a02a5408 1242 Newxz( trie, 1, reg_trie_data );
a3621e74 1243 trie->refcount = 1;
3dab1dad 1244 trie->startstate = 1;
786e8c11 1245 trie->wordcount = word_count;
a3621e74 1246 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1247 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1248 if (!(UTF && folder))
1249 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1250 DEBUG_r({
1251 trie->words = newAV();
a3621e74 1252 });
3dab1dad 1253 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1254
0111c4fd 1255 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1256 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1257 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1258 }
3dab1dad
YO
1259 DEBUG_OPTIMISE_r({
1260 PerlIO_printf( Perl_debug_log,
786e8c11 1261 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1262 (int)depth * 2 + 2, "",
1263 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1264 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1265 (int)depth);
3dab1dad 1266 });
a3621e74
YO
1267 /* -- First loop and Setup --
1268
1269 We first traverse the branches and scan each word to determine if it
1270 contains widechars, and how many unique chars there are, this is
1271 important as we have to build a table with at least as many columns as we
1272 have unique chars.
1273
1274 We use an array of integers to represent the character codes 0..255
1275 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1276 native representation of the character value as the key and IV's for the
1277 coded index.
1278
1279 *TODO* If we keep track of how many times each character is used we can
1280 remap the columns so that the table compression later on is more
1281 efficient in terms of memory by ensuring most common value is in the
1282 middle and the least common are on the outside. IMO this would be better
1283 than a most to least common mapping as theres a decent chance the most
1284 common letter will share a node with the least common, meaning the node
1285 will not be compressable. With a middle is most common approach the worst
1286 case is when we have the least common nodes twice.
1287
1288 */
1289
a3621e74 1290 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1291 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1292 const U8 *uc = (U8*)STRING( noper );
a28509cc 1293 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1294 STRLEN foldlen = 0;
1295 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1296 const U8 *scan = (U8*)NULL;
07be1b83 1297 U32 wordlen = 0; /* required init */
3dab1dad 1298 STRLEN chars=0;
a3621e74 1299
3dab1dad
YO
1300 if (OP(noper) == NOTHING) {
1301 trie->minlen= 0;
1302 continue;
1303 }
1304 if (trie->bitmap) {
1305 TRIE_BITMAP_SET(trie,*uc);
1306 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1307 }
a3621e74 1308 for ( ; uc < e ; uc += len ) {
3dab1dad 1309 TRIE_CHARCOUNT(trie)++;
a3621e74 1310 TRIE_READ_CHAR;
3dab1dad 1311 chars++;
a3621e74
YO
1312 if ( uvc < 256 ) {
1313 if ( !trie->charmap[ uvc ] ) {
1314 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1315 if ( folder )
1316 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1317 TRIE_STORE_REVCHAR;
a3621e74
YO
1318 }
1319 } else {
1320 SV** svpp;
1321 if ( !trie->widecharmap )
1322 trie->widecharmap = newHV();
1323
1324 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1325
1326 if ( !svpp )
e4584336 1327 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1328
1329 if ( !SvTRUE( *svpp ) ) {
1330 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1331 TRIE_STORE_REVCHAR;
a3621e74
YO
1332 }
1333 }
1334 }
3dab1dad
YO
1335 if( cur == first ) {
1336 trie->minlen=chars;
1337 trie->maxlen=chars;
1338 } else if (chars < trie->minlen) {
1339 trie->minlen=chars;
1340 } else if (chars > trie->maxlen) {
1341 trie->maxlen=chars;
1342 }
1343
a3621e74
YO
1344 } /* end first pass */
1345 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1346 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1347 (int)depth * 2 + 2,"",
85c3142d 1348 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1349 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1350 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1351 );
786e8c11 1352 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1353
1354 /*
1355 We now know what we are dealing with in terms of unique chars and
1356 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1357 representation using a flat table will take. If it's over a reasonable
1358 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1359 conservative but potentially much slower representation using an array
1360 of lists.
1361
1362 At the end we convert both representations into the same compressed
1363 form that will be used in regexec.c for matching with. The latter
1364 is a form that cannot be used to construct with but has memory
1365 properties similar to the list form and access properties similar
1366 to the table form making it both suitable for fast searches and
1367 small enough that its feasable to store for the duration of a program.
1368
1369 See the comment in the code where the compressed table is produced
1370 inplace from the flat tabe representation for an explanation of how
1371 the compression works.
1372
1373 */
1374
1375
3dab1dad 1376 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1377 /*
1378 Second Pass -- Array Of Lists Representation
1379
1380 Each state will be represented by a list of charid:state records
1381 (reg_trie_trans_le) the first such element holds the CUR and LEN
1382 points of the allocated array. (See defines above).
1383
1384 We build the initial structure using the lists, and then convert
1385 it into the compressed table form which allows faster lookups
1386 (but cant be modified once converted).
a3621e74
YO
1387 */
1388
a3621e74
YO
1389 STRLEN transcount = 1;
1390
3dab1dad 1391 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1392 TRIE_LIST_NEW(1);
1393 next_alloc = 2;
1394
1395 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1396
c445ea15
AL
1397 regnode * const noper = NEXTOPER( cur );
1398 U8 *uc = (U8*)STRING( noper );
1399 const U8 * const e = uc + STR_LEN( noper );
1400 U32 state = 1; /* required init */
1401 U16 charid = 0; /* sanity init */
1402 U8 *scan = (U8*)NULL; /* sanity init */
1403 STRLEN foldlen = 0; /* required init */
07be1b83 1404 U32 wordlen = 0; /* required init */
c445ea15
AL
1405 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1406
3dab1dad 1407 if (OP(noper) != NOTHING) {
786e8c11 1408 for ( ; uc < e ; uc += len ) {
c445ea15 1409
786e8c11 1410 TRIE_READ_CHAR;
c445ea15 1411
786e8c11
YO
1412 if ( uvc < 256 ) {
1413 charid = trie->charmap[ uvc ];
c445ea15 1414 } else {
786e8c11
YO
1415 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1416 if ( !svpp ) {
1417 charid = 0;
1418 } else {
1419 charid=(U16)SvIV( *svpp );
1420 }
c445ea15 1421 }
786e8c11
YO
1422 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1423 if ( charid ) {
a3621e74 1424
786e8c11
YO
1425 U16 check;
1426 U32 newstate = 0;
a3621e74 1427
786e8c11
YO
1428 charid--;
1429 if ( !trie->states[ state ].trans.list ) {
1430 TRIE_LIST_NEW( state );
c445ea15 1431 }
786e8c11
YO
1432 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1433 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1434 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1435 break;
1436 }
1437 }
1438 if ( ! newstate ) {
1439 newstate = next_alloc++;
1440 TRIE_LIST_PUSH( state, charid, newstate );
1441 transcount++;
1442 }
1443 state = newstate;
1444 } else {
1445 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1446 }
a28509cc 1447 }
c445ea15 1448 }
3dab1dad 1449 TRIE_HANDLE_WORD(state);
a3621e74
YO
1450
1451 } /* end second pass */
1452
786e8c11 1453 trie->laststate = next_alloc;
a3621e74
YO
1454 Renew( trie->states, next_alloc, reg_trie_state );
1455
3dab1dad
YO
1456 /* and now dump it out before we compress it */
1457 DEBUG_TRIE_COMPILE_MORE_r(
1458 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1459 );
a3621e74 1460
a02a5408 1461 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1462 {
1463 U32 state;
a3621e74
YO
1464 U32 tp = 0;
1465 U32 zp = 0;
1466
1467
1468 for( state=1 ; state < next_alloc ; state ++ ) {
1469 U32 base=0;
1470
1471 /*
1472 DEBUG_TRIE_COMPILE_MORE_r(
1473 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1474 );
1475 */
1476
1477 if (trie->states[state].trans.list) {
1478 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1479 U16 maxid=minid;
a28509cc 1480 U16 idx;
a3621e74
YO
1481
1482 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1483 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1484 if ( forid < minid ) {
1485 minid=forid;
1486 } else if ( forid > maxid ) {
1487 maxid=forid;
1488 }
a3621e74
YO
1489 }
1490 if ( transcount < tp + maxid - minid + 1) {
1491 transcount *= 2;
1492 Renew( trie->trans, transcount, reg_trie_trans );
1493 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1494 }
1495 base = trie->uniquecharcount + tp - minid;
1496 if ( maxid == minid ) {
1497 U32 set = 0;
1498 for ( ; zp < tp ; zp++ ) {
1499 if ( ! trie->trans[ zp ].next ) {
1500 base = trie->uniquecharcount + zp - minid;
1501 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1502 trie->trans[ zp ].check = state;
1503 set = 1;
1504 break;
1505 }
1506 }
1507 if ( !set ) {
1508 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1509 trie->trans[ tp ].check = state;
1510 tp++;
1511 zp = tp;
1512 }
1513 } else {
1514 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1515 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1516 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1517 trie->trans[ tid ].check = state;
1518 }
1519 tp += ( maxid - minid + 1 );
1520 }
1521 Safefree(trie->states[ state ].trans.list);
1522 }
1523 /*
1524 DEBUG_TRIE_COMPILE_MORE_r(
1525 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1526 );
1527 */
1528 trie->states[ state ].trans.base=base;
1529 }
cc601c31 1530 trie->lasttrans = tp + 1;
a3621e74
YO
1531 }
1532 } else {
1533 /*
1534 Second Pass -- Flat Table Representation.
1535
1536 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1537 We know that we will need Charcount+1 trans at most to store the data
1538 (one row per char at worst case) So we preallocate both structures
1539 assuming worst case.
1540
1541 We then construct the trie using only the .next slots of the entry
1542 structs.
1543
1544 We use the .check field of the first entry of the node temporarily to
1545 make compression both faster and easier by keeping track of how many non
1546 zero fields are in the node.
1547
1548 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1549 transition.
1550
1551 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1552 number representing the first entry of the node, and state as a
1553 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1554 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1555 are 2 entrys per node. eg:
1556
1557 A B A B
1558 1. 2 4 1. 3 7
1559 2. 0 3 3. 0 5
1560 3. 0 0 5. 0 0
1561 4. 0 0 7. 0 0
1562
1563 The table is internally in the right hand, idx form. However as we also
1564 have to deal with the states array which is indexed by nodenum we have to
1565 use TRIE_NODENUM() to convert.
1566
1567 */
1568
3dab1dad
YO
1569
1570 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1571 reg_trie_trans );
3dab1dad 1572 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1573 next_alloc = trie->uniquecharcount + 1;
1574
3dab1dad 1575
a3621e74
YO
1576 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1577
c445ea15 1578 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1579 const U8 *uc = (U8*)STRING( noper );
1580 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1581
1582 U32 state = 1; /* required init */
1583
1584 U16 charid = 0; /* sanity init */
1585 U32 accept_state = 0; /* sanity init */
1586 U8 *scan = (U8*)NULL; /* sanity init */
1587
1588 STRLEN foldlen = 0; /* required init */
07be1b83 1589 U32 wordlen = 0; /* required init */
a3621e74
YO
1590 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1591
3dab1dad 1592 if ( OP(noper) != NOTHING ) {
786e8c11 1593 for ( ; uc < e ; uc += len ) {
a3621e74 1594
786e8c11 1595 TRIE_READ_CHAR;
a3621e74 1596
786e8c11
YO
1597 if ( uvc < 256 ) {
1598 charid = trie->charmap[ uvc ];
1599 } else {
1600 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1601 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1602 }
786e8c11
YO
1603 if ( charid ) {
1604 charid--;
1605 if ( !trie->trans[ state + charid ].next ) {
1606 trie->trans[ state + charid ].next = next_alloc;
1607 trie->trans[ state ].check++;
1608 next_alloc += trie->uniquecharcount;
1609 }
1610 state = trie->trans[ state + charid ].next;
1611 } else {
1612 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1613 }
1614 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1615 }
a3621e74 1616 }
3dab1dad
YO
1617 accept_state = TRIE_NODENUM( state );
1618 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1619
1620 } /* end second pass */
1621
3dab1dad
YO
1622 /* and now dump it out before we compress it */
1623 DEBUG_TRIE_COMPILE_MORE_r(
1624 dump_trie_interim_table(trie,next_alloc,depth+1)
1625 );
a3621e74 1626
a3621e74
YO
1627 {
1628 /*
1629 * Inplace compress the table.*
1630
1631 For sparse data sets the table constructed by the trie algorithm will
1632 be mostly 0/FAIL transitions or to put it another way mostly empty.
1633 (Note that leaf nodes will not contain any transitions.)
1634
1635 This algorithm compresses the tables by eliminating most such
1636 transitions, at the cost of a modest bit of extra work during lookup:
1637
1638 - Each states[] entry contains a .base field which indicates the
1639 index in the state[] array wheres its transition data is stored.
1640
1641 - If .base is 0 there are no valid transitions from that node.
1642
1643 - If .base is nonzero then charid is added to it to find an entry in
1644 the trans array.
1645
1646 -If trans[states[state].base+charid].check!=state then the
1647 transition is taken to be a 0/Fail transition. Thus if there are fail
1648 transitions at the front of the node then the .base offset will point
1649 somewhere inside the previous nodes data (or maybe even into a node
1650 even earlier), but the .check field determines if the transition is
1651 valid.
1652
786e8c11 1653 XXX - wrong maybe?
a3621e74
YO
1654 The following process inplace converts the table to the compressed
1655 table: We first do not compress the root node 1,and mark its all its
1656 .check pointers as 1 and set its .base pointer as 1 as well. This
1657 allows to do a DFA construction from the compressed table later, and
1658 ensures that any .base pointers we calculate later are greater than
1659 0.
1660
1661 - We set 'pos' to indicate the first entry of the second node.
1662
1663 - We then iterate over the columns of the node, finding the first and
1664 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1665 and set the .check pointers accordingly, and advance pos
1666 appropriately and repreat for the next node. Note that when we copy
1667 the next pointers we have to convert them from the original
1668 NODEIDX form to NODENUM form as the former is not valid post
1669 compression.
1670
1671 - If a node has no transitions used we mark its base as 0 and do not
1672 advance the pos pointer.
1673
1674 - If a node only has one transition we use a second pointer into the
1675 structure to fill in allocated fail transitions from other states.
1676 This pointer is independent of the main pointer and scans forward
1677 looking for null transitions that are allocated to a state. When it
1678 finds one it writes the single transition into the "hole". If the
786e8c11 1679 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1680
1681 - Once compressed we can Renew/realloc the structures to release the
1682 excess space.
1683
1684 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1685 specifically Fig 3.47 and the associated pseudocode.
1686
1687 demq
1688 */
a3b680e6 1689 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1690 U32 state, charid;
a3621e74 1691 U32 pos = 0, zp=0;
786e8c11 1692 trie->laststate = laststate;
a3621e74
YO
1693
1694 for ( state = 1 ; state < laststate ; state++ ) {
1695 U8 flag = 0;
a28509cc
AL
1696 const U32 stateidx = TRIE_NODEIDX( state );
1697 const U32 o_used = trie->trans[ stateidx ].check;
1698 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1699 trie->trans[ stateidx ].check = 0;
1700
1701 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1702 if ( flag || trie->trans[ stateidx + charid ].next ) {
1703 if ( trie->trans[ stateidx + charid ].next ) {
1704 if (o_used == 1) {
1705 for ( ; zp < pos ; zp++ ) {
1706 if ( ! trie->trans[ zp ].next ) {
1707 break;
1708 }
1709 }
1710 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1711 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1712 trie->trans[ zp ].check = state;
1713 if ( ++zp > pos ) pos = zp;
1714 break;
1715 }
1716 used--;
1717 }
1718 if ( !flag ) {
1719 flag = 1;
1720 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1721 }
1722 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1723 trie->trans[ pos ].check = state;
1724 pos++;
1725 }
1726 }
1727 }
cc601c31 1728 trie->lasttrans = pos + 1;
a3621e74
YO
1729 Renew( trie->states, laststate + 1, reg_trie_state);
1730 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1731 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1732 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1733 (int)depth * 2 + 2,"",
1734 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1735 (IV)next_alloc,
1736 (IV)pos,
a3621e74
YO
1737 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1738 );
1739
1740 } /* end table compress */
1741 }
cc601c31
YO
1742 /* resize the trans array to remove unused space */
1743 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1744
3dab1dad
YO
1745 /* and now dump out the compressed format */
1746 DEBUG_TRIE_COMPILE_r(
1747 dump_trie(trie,depth+1)
1748 );
07be1b83 1749
3dab1dad
YO
1750 { /* Modify the program and insert the new TRIE node*/
1751 regnode *convert;
1752 U8 nodetype =(U8)(flags & 0xFF);
1753 char *str=NULL;
786e8c11 1754
07be1b83 1755#ifdef DEBUGGING
786e8c11 1756
07be1b83
YO
1757 U32 mjd_offset;
1758 U32 mjd_nodelen;
1759#endif
a3621e74 1760 /*
3dab1dad
YO
1761 This means we convert either the first branch or the first Exact,
1762 depending on whether the thing following (in 'last') is a branch
1763 or not and whther first is the startbranch (ie is it a sub part of
1764 the alternation or is it the whole thing.)
1765 Assuming its a sub part we conver the EXACT otherwise we convert
1766 the whole branch sequence, including the first.
a3621e74 1767 */
3dab1dad
YO
1768 /* Find the node we are going to overwrite */
1769 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1770 /* whole branch chain */
3dab1dad 1771 convert = first;
07be1b83
YO
1772 DEBUG_r({
1773 const regnode *nop = NEXTOPER( convert );
1774 mjd_offset= Node_Offset((nop));
1775 mjd_nodelen= Node_Length((nop));
1776 });
1777 } else {
1778 /* branch sub-chain */
3dab1dad
YO
1779 convert = NEXTOPER( first );
1780 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1781 DEBUG_r({
1782 mjd_offset= Node_Offset((convert));
1783 mjd_nodelen= Node_Length((convert));
1784 });
1785 }
1786 DEBUG_OPTIMISE_r(
1787 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1788 (int)depth * 2 + 2, "",
786e8c11 1789 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1790 );
a3621e74 1791
3dab1dad
YO
1792 /* But first we check to see if there is a common prefix we can
1793 split out as an EXACT and put in front of the TRIE node. */
1794 trie->startstate= 1;
786e8c11 1795 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1796 U32 state;
1797 DEBUG_OPTIMISE_r(
8e11feef
RGS
1798 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1799 (int)depth * 2 + 2, "",
786e8c11 1800 (UV)trie->laststate)
8e11feef 1801 );
786e8c11 1802 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1803 U32 ofs = 0;
8e11feef
RGS
1804 I32 idx = -1;
1805 U32 count = 0;
1806 const U32 base = trie->states[ state ].trans.base;
a3621e74 1807
3dab1dad 1808 if ( trie->states[state].wordnum )
8e11feef 1809 count = 1;
a3621e74 1810
8e11feef 1811 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1812 if ( ( base + ofs >= trie->uniquecharcount ) &&
1813 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1814 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1815 {
3dab1dad 1816 if ( ++count > 1 ) {
8e11feef 1817 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1818 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1819 if ( state == 1 ) break;
3dab1dad
YO
1820 if ( count == 2 ) {
1821 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1822 DEBUG_OPTIMISE_r(
8e11feef
RGS
1823 PerlIO_printf(Perl_debug_log,
1824 "%*sNew Start State=%"UVuf" Class: [",
1825 (int)depth * 2 + 2, "",
786e8c11 1826 (UV)state));
be8e71aa
YO
1827 if (idx >= 0) {
1828 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1829 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1830
3dab1dad 1831 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1832 if ( folder )
1833 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1834 DEBUG_OPTIMISE_r(
07be1b83 1835 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1836 );
8e11feef
RGS
1837 }
1838 }
1839 TRIE_BITMAP_SET(trie,*ch);
1840 if ( folder )
1841 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1842 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1843 }
1844 idx = ofs;
1845 }
3dab1dad
YO
1846 }
1847 if ( count == 1 ) {
1848 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1849 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1850 DEBUG_OPTIMISE_r(
8e11feef
RGS
1851 PerlIO_printf( Perl_debug_log,
1852 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1853 (int)depth * 2 + 2, "",
786e8c11 1854 (UV)state, (UV)idx, ch)
3dab1dad
YO
1855 );
1856 if ( state==1 ) {
1857 OP( convert ) = nodetype;
1858 str=STRING(convert);
1859 STR_LEN(convert)=0;
1860 }
1861 *str++=*ch;
1862 STR_LEN(convert)++;
a3621e74 1863
8e11feef 1864 } else {
f9049ba1 1865#ifdef DEBUGGING
8e11feef
RGS
1866 if (state>1)
1867 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1868#endif
8e11feef
RGS
1869 break;
1870 }
1871 }
3dab1dad 1872 if (str) {
8e11feef 1873 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1874 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1875 trie->startstate = state;
07be1b83
YO
1876 trie->minlen -= (state - 1);
1877 trie->maxlen -= (state - 1);
1878 DEBUG_r({
1879 regnode *fix = convert;
1880 mjd_nodelen++;
1881 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1882 while( ++fix < n ) {
1883 Set_Node_Offset_Length(fix, 0, 0);
1884 }
1885 });
8e11feef
RGS
1886 if (trie->maxlen) {
1887 convert = n;
1888 } else {
3dab1dad
YO
1889 NEXT_OFF(convert) = (U16)(tail - convert);
1890 }
1891 }
1892 }
1893 if ( trie->maxlen ) {
8e11feef
RGS
1894 NEXT_OFF( convert ) = (U16)(tail - convert);
1895 ARG_SET( convert, data_slot );
786e8c11
YO
1896 /* Store the offset to the first unabsorbed branch in
1897 jump[0], which is otherwise unused by the jump logic.
1898 We use this when dumping a trie and during optimisation. */
1899 if (trie->jump)
1900 trie->jump[0] = (U16)(tail - nextbranch);
1901 if (!jumper)
85c3142d 1902 jumper = last;
786e8c11
YO
1903 /* XXXX */
1904 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1905 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1906 {
1907 OP( convert ) = TRIEC;
1908 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1909 Safefree(trie->bitmap);
1910 trie->bitmap= NULL;
1911 } else
1912 OP( convert ) = TRIE;
a3621e74 1913
3dab1dad
YO
1914 /* store the type in the flags */
1915 convert->flags = nodetype;
1916 /* XXX We really should free up the resource in trie now, as we wont use them */
1917 }
a3621e74
YO
1918 /* needed for dumping*/
1919 DEBUG_r({
786e8c11
YO
1920 regnode *optimize = convert
1921 + NODE_STEP_REGNODE
1922 + regarglen[ OP( convert ) ];
07be1b83
YO
1923 regnode *opt = convert;
1924 while (++opt<optimize) {
1925 Set_Node_Offset_Length(opt,0,0);
1926 }
786e8c11
YO
1927 /*
1928 Try to clean up some of the debris left after the
1929 optimisation.
a3621e74 1930 */
786e8c11 1931 while( optimize < jumper ) {
07be1b83 1932 mjd_nodelen += Node_Length((optimize));
a3621e74 1933 OP( optimize ) = OPTIMIZED;
07be1b83 1934 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1935 optimize++;
1936 }
07be1b83 1937 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1938 });
1939 } /* end node insert */
07be1b83 1940#ifndef DEBUGGING
6e8b4190 1941 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1942#endif
786e8c11
YO
1943 return trie->jump
1944 ? MADE_JUMP_TRIE
1945 : trie->startstate>1
1946 ? MADE_EXACT_TRIE
1947 : MADE_TRIE;
1948}
1949
1950STATIC void
1951S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1952{
1953/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1954
1955 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1956 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1957 ISBN 0-201-10088-6
1958
1959 We find the fail state for each state in the trie, this state is the longest proper
1960 suffix of the current states 'word' that is also a proper prefix of another word in our
1961 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1962 the DFA not to have to restart after its tried and failed a word at a given point, it
1963 simply continues as though it had been matching the other word in the first place.
1964 Consider
1965 'abcdgu'=~/abcdefg|cdgu/
1966 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1967 fail, which would bring use to the state representing 'd' in the second word where we would
1968 try 'g' and succeed, prodceding to match 'cdgu'.
1969 */
1970 /* add a fail transition */
1971 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1972 U32 *q;
1973 const U32 ucharcount = trie->uniquecharcount;
1974 const U32 numstates = trie->laststate;
1975 const U32 ubound = trie->lasttrans + ucharcount;
1976 U32 q_read = 0;
1977 U32 q_write = 0;
1978 U32 charid;
1979 U32 base = trie->states[ 1 ].trans.base;
1980 U32 *fail;
1981 reg_ac_data *aho;
1982 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1983 GET_RE_DEBUG_FLAGS_DECL;
1984#ifndef DEBUGGING
1985 PERL_UNUSED_ARG(depth);
1986#endif
1987
1988
1989 ARG_SET( stclass, data_slot );
1990 Newxz( aho, 1, reg_ac_data );
1991 RExC_rx->data->data[ data_slot ] = (void*)aho;
1992 aho->trie=trie;
1993 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1994 (trie->laststate+1)*sizeof(reg_trie_state));
1995 Newxz( q, numstates, U32);
1996 Newxz( aho->fail, numstates, U32 );
1997 aho->refcount = 1;
1998 fail = aho->fail;
1999 /* initialize fail[0..1] to be 1 so that we always have
2000 a valid final fail state */
2001 fail[ 0 ] = fail[ 1 ] = 1;
2002
2003 for ( charid = 0; charid < ucharcount ; charid++ ) {
2004 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2005 if ( newstate ) {
2006 q[ q_write ] = newstate;
2007 /* set to point at the root */
2008 fail[ q[ q_write++ ] ]=1;
2009 }
2010 }
2011 while ( q_read < q_write) {
2012 const U32 cur = q[ q_read++ % numstates ];
2013 base = trie->states[ cur ].trans.base;
2014
2015 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2016 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2017 if (ch_state) {
2018 U32 fail_state = cur;
2019 U32 fail_base;
2020 do {
2021 fail_state = fail[ fail_state ];
2022 fail_base = aho->states[ fail_state ].trans.base;
2023 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2024
2025 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2026 fail[ ch_state ] = fail_state;
2027 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2028 {
2029 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2030 }
2031 q[ q_write++ % numstates] = ch_state;
2032 }
2033 }
2034 }
2035 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2036 when we fail in state 1, this allows us to use the
2037 charclass scan to find a valid start char. This is based on the principle
2038 that theres a good chance the string being searched contains lots of stuff
2039 that cant be a start char.
2040 */
2041 fail[ 0 ] = fail[ 1 ] = 0;
2042 DEBUG_TRIE_COMPILE_r({
2043 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2044 for( q_read=1; q_read<numstates; q_read++ ) {
2045 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2046 }
2047 PerlIO_printf(Perl_debug_log, "\n");
2048 });
2049 Safefree(q);
2050 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2051}
2052
786e8c11 2053
a3621e74 2054/*
5d1c421c
JH
2055 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2056 * These need to be revisited when a newer toolchain becomes available.
2057 */
2058#if defined(__sparc64__) && defined(__GNUC__)
2059# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2060# undef SPARC64_GCC_WORKAROUND
2061# define SPARC64_GCC_WORKAROUND 1
2062# endif
2063#endif
2064
07be1b83
YO
2065#define DEBUG_PEEP(str,scan,depth) \
2066 DEBUG_OPTIMISE_r({ \
2067 SV * const mysv=sv_newmortal(); \
2068 regnode *Next = regnext(scan); \
2069 regprop(RExC_rx, mysv, scan); \
2070 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2071 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2072 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2073 });
2074
1de06328
YO
2075
2076
2077
2078
07be1b83
YO
2079#define JOIN_EXACT(scan,min,flags) \
2080 if (PL_regkind[OP(scan)] == EXACT) \
2081 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2082
be8e71aa 2083STATIC U32
07be1b83
YO
2084S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2085 /* Merge several consecutive EXACTish nodes into one. */
2086 regnode *n = regnext(scan);
2087 U32 stringok = 1;
2088 regnode *next = scan + NODE_SZ_STR(scan);
2089 U32 merged = 0;
2090 U32 stopnow = 0;
2091#ifdef DEBUGGING
2092 regnode *stop = scan;
72f13be8 2093 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2094#else
d47053eb
RGS
2095 PERL_UNUSED_ARG(depth);
2096#endif
2097#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2098 PERL_UNUSED_ARG(flags);
2099 PERL_UNUSED_ARG(val);
07be1b83 2100#endif
07be1b83
YO
2101 DEBUG_PEEP("join",scan,depth);
2102
2103 /* Skip NOTHING, merge EXACT*. */
2104 while (n &&
2105 ( PL_regkind[OP(n)] == NOTHING ||
2106 (stringok && (OP(n) == OP(scan))))
2107 && NEXT_OFF(n)
2108 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2109
2110 if (OP(n) == TAIL || n > next)
2111 stringok = 0;
2112 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2113 DEBUG_PEEP("skip:",n,depth);
2114 NEXT_OFF(scan) += NEXT_OFF(n);
2115 next = n + NODE_STEP_REGNODE;
2116#ifdef DEBUGGING
2117 if (stringok)
2118 stop = n;
2119#endif
2120 n = regnext(n);
2121 }
2122 else if (stringok) {
786e8c11 2123 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2124 regnode * const nnext = regnext(n);
2125
2126 DEBUG_PEEP("merg",n,depth);
2127
2128 merged++;
2129 if (oldl + STR_LEN(n) > U8_MAX)
2130 break;
2131 NEXT_OFF(scan) += NEXT_OFF(n);
2132 STR_LEN(scan) += STR_LEN(n);
2133 next = n + NODE_SZ_STR(n);
2134 /* Now we can overwrite *n : */
2135 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2136#ifdef DEBUGGING
2137 stop = next - 1;
2138#endif
2139 n = nnext;
2140 if (stopnow) break;
2141 }
2142
d47053eb
RGS
2143#ifdef EXPERIMENTAL_INPLACESCAN
2144 if (flags && !NEXT_OFF(n)) {
2145 DEBUG_PEEP("atch", val, depth);
2146 if (reg_off_by_arg[OP(n)]) {
2147 ARG_SET(n, val - n);
2148 }
2149 else {
2150 NEXT_OFF(n) = val - n;
2151 }
2152 stopnow = 1;
2153 }
07be1b83
YO
2154#endif
2155 }
2156
2157 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2158 /*
2159 Two problematic code points in Unicode casefolding of EXACT nodes:
2160
2161 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2162 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2163
2164 which casefold to
2165
2166 Unicode UTF-8
2167
2168 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2169 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2170
2171 This means that in case-insensitive matching (or "loose matching",
2172 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2173 length of the above casefolded versions) can match a target string
2174 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2175 This would rather mess up the minimum length computation.
2176
2177 What we'll do is to look for the tail four bytes, and then peek
2178 at the preceding two bytes to see whether we need to decrease
2179 the minimum length by four (six minus two).
2180
2181 Thanks to the design of UTF-8, there cannot be false matches:
2182 A sequence of valid UTF-8 bytes cannot be a subsequence of
2183 another valid sequence of UTF-8 bytes.
2184
2185 */
2186 char * const s0 = STRING(scan), *s, *t;
2187 char * const s1 = s0 + STR_LEN(scan) - 1;
2188 char * const s2 = s1 - 4;
e294cc5d
JH
2189#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2190 const char t0[] = "\xaf\x49\xaf\x42";
2191#else
07be1b83 2192 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2193#endif
07be1b83
YO
2194 const char * const t1 = t0 + 3;
2195
2196 for (s = s0 + 2;
2197 s < s2 && (t = ninstr(s, s1, t0, t1));
2198 s = t + 4) {
e294cc5d
JH
2199#ifdef EBCDIC
2200 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2201 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2202#else
07be1b83
YO
2203 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2204 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2205#endif
07be1b83
YO
2206 *min -= 4;
2207 }
2208 }
2209
2210#ifdef DEBUGGING
2211 /* Allow dumping */
2212 n = scan + NODE_SZ_STR(scan);
2213 while (n <= stop) {
2214 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2215 OP(n) = OPTIMIZED;
2216 NEXT_OFF(n) = 0;
2217 }
2218 n++;
2219 }
2220#endif
2221 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2222 return stopnow;
2223}
2224
653099ff
GS
2225/* REx optimizer. Converts nodes into quickier variants "in place".
2226 Finds fixed substrings. */
2227
a0288114 2228/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2229 to the position after last scanned or to NULL. */
2230
07be1b83
YO
2231
2232
76e3520e 2233STATIC I32
1de06328
YO
2234S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2235 I32 *minlenp, I32 *deltap,
9a957fbc 2236 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2237 /* scanp: Start here (read-write). */
2238 /* deltap: Write maxlen-minlen here. */
2239 /* last: Stop before this one. */
2240{
97aff369 2241 dVAR;
c277df42
IZ
2242 I32 min = 0, pars = 0, code;
2243 regnode *scan = *scanp, *next;
2244 I32 delta = 0;
2245 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2246 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2247 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2248 scan_data_t data_fake;
653099ff 2249 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2250 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2251 regnode *first_non_open = scan;
2252
a3621e74
YO
2253
2254 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2255#ifdef DEBUGGING
2256 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2257#endif
786e8c11
YO
2258 if ( depth == 0 ) {
2259 while (first_non_open && OP(first_non_open) == OPEN)
2260 first_non_open=regnext(first_non_open);
2261 }
2262
b81d288d 2263
c277df42
IZ
2264 while (scan && OP(scan) != END && scan < last) {
2265 /* Peephole optimizer: */
1de06328 2266 DEBUG_STUDYDATA(data,depth);
07be1b83 2267 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2268 JOIN_EXACT(scan,&min,0);
a3621e74 2269
653099ff
GS
2270 /* Follow the next-chain of the current node and optimize
2271 away all the NOTHINGs from it. */
c277df42 2272 if (OP(scan) != CURLYX) {
a3b680e6 2273 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2274 ? I32_MAX
2275 /* I32 may be smaller than U16 on CRAYs! */
2276 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2277 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2278 int noff;
2279 regnode *n = scan;
b81d288d 2280
c277df42
IZ
2281 /* Skip NOTHING and LONGJMP. */
2282 while ((n = regnext(n))
3dab1dad 2283 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2284 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2285 && off + noff < max)
2286 off += noff;
2287 if (reg_off_by_arg[OP(scan)])
2288 ARG(scan) = off;
b81d288d 2289 else
c277df42
IZ
2290 NEXT_OFF(scan) = off;
2291 }
a3621e74 2292
07be1b83 2293
3dab1dad 2294
653099ff
GS
2295 /* The principal pseudo-switch. Cannot be a switch, since we
2296 look into several different things. */
b81d288d 2297 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2298 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2299 next = regnext(scan);
2300 code = OP(scan);
a3621e74 2301 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2302
2303 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2304 /* NOTE - There is similar code to this block below for handling
2305 TRIE nodes on a re-study. If you change stuff here check there
2306 too. */
c277df42 2307 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2308 struct regnode_charclass_class accum;
d4c19fe8 2309 regnode * const startbranch=scan;
c277df42 2310
653099ff 2311 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2312 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2313 if (flags & SCF_DO_STCLASS)
830247a4 2314 cl_init_zero(pRExC_state, &accum);
a3621e74 2315
c277df42 2316 while (OP(scan) == code) {
830247a4 2317 I32 deltanext, minnext, f = 0, fake;
653099ff 2318 struct regnode_charclass_class this_class;
c277df42
IZ
2319
2320 num++;
2321 data_fake.flags = 0;
b81d288d 2322 if (data) {
2c2d71f5 2323 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2324 data_fake.last_closep = data->last_closep;
2325 }
2326 else
2327 data_fake.last_closep = &fake;
c277df42
IZ
2328 next = regnext(scan);
2329 scan = NEXTOPER(scan);
2330 if (code != BRANCH)
2331 scan = NEXTOPER(scan);
653099ff 2332 if (flags & SCF_DO_STCLASS) {
830247a4 2333 cl_init(pRExC_state, &this_class);
653099ff
GS
2334 data_fake.start_class = &this_class;
2335 f = SCF_DO_STCLASS_AND;
b81d288d 2336 }
e1901655
IZ
2337 if (flags & SCF_WHILEM_VISITED_POS)
2338 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2339
653099ff 2340 /* we suppose the run is continuous, last=next...*/
1de06328 2341 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2342 next, &data_fake, f,depth+1);
b81d288d 2343 if (min1 > minnext)
c277df42
IZ
2344 min1 = minnext;
2345 if (max1 < minnext + deltanext)
2346 max1 = minnext + deltanext;
2347 if (deltanext == I32_MAX)
aca2d497 2348 is_inf = is_inf_internal = 1;
c277df42
IZ
2349 scan = next;
2350 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2351 pars++;
3dab1dad
YO
2352 if (data) {
2353 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2354 data->flags |= SF_HAS_EVAL;
2c2d71f5 2355 data->whilem_c = data_fake.whilem_c;
3dab1dad 2356 }
653099ff 2357 if (flags & SCF_DO_STCLASS)
830247a4 2358 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2359 if (code == SUSPEND)
c277df42
IZ
2360 break;
2361 }
2362 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2363 min1 = 0;
2364 if (flags & SCF_DO_SUBSTR) {
2365 data->pos_min += min1;
2366 data->pos_delta += max1 - min1;
2367 if (max1 != min1 || is_inf)
2368 data->longest = &(data->longest_float);
2369 }
2370 min += min1;
2371 delta += max1 - min1;
653099ff 2372 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2373 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2374 if (min1) {
2375 cl_and(data->start_class, &and_with);
2376 flags &= ~SCF_DO_STCLASS;
2377 }
2378 }
2379 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2380 if (min1) {
2381 cl_and(data->start_class, &accum);
653099ff 2382 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2383 }
2384 else {
b81d288d 2385 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2386 * data->start_class */
2387 StructCopy(data->start_class, &and_with,
2388 struct regnode_charclass_class);
2389 flags &= ~SCF_DO_STCLASS_AND;
2390 StructCopy(&accum, data->start_class,
2391 struct regnode_charclass_class);
2392 flags |= SCF_DO_STCLASS_OR;
2393 data->start_class->flags |= ANYOF_EOS;
2394 }
653099ff 2395 }
a3621e74 2396
786e8c11 2397 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2398 /* demq.
2399
2400 Assuming this was/is a branch we are dealing with: 'scan' now
2401 points at the item that follows the branch sequence, whatever
2402 it is. We now start at the beginning of the sequence and look
2403 for subsequences of
2404
786e8c11
YO
2405 BRANCH->EXACT=>x1
2406 BRANCH->EXACT=>x2
2407 tail
a3621e74
YO
2408
2409 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2410
2411 If we can find such a subseqence we need to turn the first
2412 element into a trie and then add the subsequent branch exact
2413 strings to the trie.
2414
2415 We have two cases
2416
786e8c11 2417 1. patterns where the whole set of branch can be converted.
a3621e74 2418
786e8c11 2419 2. patterns where only a subset can be converted.
a3621e74
YO
2420
2421 In case 1 we can replace the whole set with a single regop
2422 for the trie. In case 2 we need to keep the start and end
2423 branchs so
2424
2425 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2426 becomes BRANCH TRIE; BRANCH X;
2427
786e8c11
YO
2428 There is an additional case, that being where there is a
2429 common prefix, which gets split out into an EXACT like node
2430 preceding the TRIE node.
2431
2432 If x(1..n)==tail then we can do a simple trie, if not we make
2433 a "jump" trie, such that when we match the appropriate word
2434 we "jump" to the appopriate tail node. Essentailly we turn
2435 a nested if into a case structure of sorts.
a3621e74
YO
2436
2437 */
786e8c11 2438
3dab1dad 2439 int made=0;
0111c4fd
RGS
2440 if (!re_trie_maxbuff) {
2441 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2442 if (!SvIOK(re_trie_maxbuff))
2443 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2444 }
786e8c11 2445 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2446 regnode *cur;
2447 regnode *first = (regnode *)NULL;
2448 regnode *last = (regnode *)NULL;
2449 regnode *tail = scan;
2450 U8 optype = 0;
2451 U32 count=0;
2452
2453#ifdef DEBUGGING
c445ea15 2454 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2455#endif
2456 /* var tail is used because there may be a TAIL
2457 regop in the way. Ie, the exacts will point to the
2458 thing following the TAIL, but the last branch will
2459 point at the TAIL. So we advance tail. If we
2460 have nested (?:) we may have to move through several
2461 tails.
2462 */
2463
2464 while ( OP( tail ) == TAIL ) {
2465 /* this is the TAIL generated by (?:) */
2466 tail = regnext( tail );
2467 }
2468
3dab1dad 2469
a3621e74 2470 DEBUG_OPTIMISE_r({
32fc9b6a 2471 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2472 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2473 (int)depth * 2 + 2, "",
2474 "Looking for TRIE'able sequences. Tail node is: ",
2475 SvPV_nolen_const( mysv )
a3621e74
YO
2476 );
2477 });
3dab1dad 2478
a3621e74
YO
2479 /*
2480
2481 step through the branches, cur represents each
2482 branch, noper is the first thing to be matched
2483 as part of that branch and noper_next is the
2484 regnext() of that node. if noper is an EXACT
2485 and noper_next is the same as scan (our current
2486 position in the regex) then the EXACT branch is
2487 a possible optimization target. Once we have
2488 two or more consequetive such branches we can
2489 create a trie of the EXACT's contents and stich
2490 it in place. If the sequence represents all of
2491 the branches we eliminate the whole thing and
2492 replace it with a single TRIE. If it is a
2493 subsequence then we need to stitch it in. This
2494 means the first branch has to remain, and needs
2495 to be repointed at the item on the branch chain
2496 following the last branch optimized. This could
2497 be either a BRANCH, in which case the
2498 subsequence is internal, or it could be the
2499 item following the branch sequence in which
2500 case the subsequence is at the end.
2501
2502 */
2503
2504 /* dont use tail as the end marker for this traverse */
2505 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2506 regnode * const noper = NEXTOPER( cur );
2507 regnode * const noper_next = regnext( noper );
a3621e74 2508
a3621e74 2509 DEBUG_OPTIMISE_r({
32fc9b6a 2510 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2511 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2512 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2513
32fc9b6a 2514 regprop(RExC_rx, mysv, noper);
a3621e74 2515 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2516 SvPV_nolen_const(mysv));
a3621e74
YO
2517
2518 if ( noper_next ) {
32fc9b6a 2519 regprop(RExC_rx, mysv, noper_next );
a3621e74 2520 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2521 SvPV_nolen_const(mysv));
a3621e74 2522 }
3dab1dad
YO
2523 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2524 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2525 });
3dab1dad
YO
2526 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2527 : PL_regkind[ OP( noper ) ] == EXACT )
2528 || OP(noper) == NOTHING )
786e8c11
YO
2529#ifdef NOJUMPTRIE
2530 && noper_next == tail
2531#endif
2532 && count < U16_MAX)
a3621e74
YO
2533 {
2534 count++;
3dab1dad
YO
2535 if ( !first || optype == NOTHING ) {
2536 if (!first) first = cur;
a3621e74
YO
2537 optype = OP( noper );
2538 } else {
a3621e74 2539 last = cur;
a3621e74
YO
2540 }
2541 } else {
2542 if ( last ) {
786e8c11
YO
2543 make_trie( pRExC_state,
2544 startbranch, first, cur, tail, count,
2545 optype, depth+1 );
a3621e74 2546 }
3dab1dad 2547 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2548#ifdef NOJUMPTRIE
2549 && noper_next == tail
2550#endif
2551 ){
a3621e74
YO
2552 count = 1;
2553 first = cur;
2554 optype = OP( noper );
2555 } else {
2556 count = 0;
2557 first = NULL;
2558 optype = 0;
2559 }
2560 last = NULL;
2561 }
2562 }
2563 DEBUG_OPTIMISE_r({
32fc9b6a 2564 regprop(RExC_rx, mysv, cur);
a3621e74 2565 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2566 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2567 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2568
2569 });
2570 if ( last ) {
786e8c11 2571 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2572#ifdef TRIE_STUDY_OPT
786e8c11
YO
2573 if ( ((made == MADE_EXACT_TRIE &&
2574 startbranch == first)
2575 || ( first_non_open == first )) &&
2576 depth==0 )
2577 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2578#endif
07be1b83 2579 }
a3621e74 2580 }
3dab1dad
YO
2581
2582 } /* do trie */
786e8c11 2583
a0ed51b3 2584 }
a3621e74 2585 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2586 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2587 } else /* single branch is optimized. */
c277df42
IZ
2588 scan = NEXTOPER(scan);
2589 continue;
a0ed51b3
LW
2590 }
2591 else if (OP(scan) == EXACT) {
cd439c50 2592 I32 l = STR_LEN(scan);
c445ea15 2593 UV uc;
a0ed51b3 2594 if (UTF) {
a3b680e6 2595 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2596 l = utf8_length(s, s + l);
9041c2e3 2597 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2598 } else {
2599 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2600 }
2601 min += l;
c277df42 2602 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2603 /* The code below prefers earlier match for fixed
2604 offset, later match for variable offset. */
2605 if (data->last_end == -1) { /* Update the start info. */
2606 data->last_start_min = data->pos_min;
2607 data->last_start_max = is_inf
b81d288d 2608 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2609 }
cd439c50 2610 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2611 if (UTF)
2612 SvUTF8_on(data->last_found);
0eda9292 2613 {
9a957fbc 2614 SV * const sv = data->last_found;
a28509cc 2615 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2616 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2617 if (mg && mg->mg_len >= 0)
5e43f467
JH
2618 mg->mg_len += utf8_length((U8*)STRING(scan),
2619 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2620 }
c277df42
IZ
2621 data->last_end = data->pos_min + l;
2622 data->pos_min += l; /* As in the first entry. */
2623 data->flags &= ~SF_BEFORE_EOL;
2624 }
653099ff
GS
2625 if (flags & SCF_DO_STCLASS_AND) {
2626 /* Check whether it is compatible with what we know already! */
2627 int compat = 1;
2628
1aa99e6b 2629 if (uc >= 0x100 ||
516a5887 2630 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2631 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2632 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2633 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2634 )
653099ff
GS
2635 compat = 0;
2636 ANYOF_CLASS_ZERO(data->start_class);
2637 ANYOF_BITMAP_ZERO(data->start_class);
2638 if (compat)
1aa99e6b 2639 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2640 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2641 if (uc < 0x100)
2642 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2643 }
2644 else if (flags & SCF_DO_STCLASS_OR) {
2645 /* false positive possible if the class is case-folded */
1aa99e6b 2646 if (uc < 0x100)
9b877dbb
IH
2647 ANYOF_BITMAP_SET(data->start_class, uc);
2648 else
2649 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2650 data->start_class->flags &= ~ANYOF_EOS;
2651 cl_and(data->start_class, &and_with);
2652 }
2653 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2654 }
3dab1dad 2655 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2656 I32 l = STR_LEN(scan);
1aa99e6b 2657 UV uc = *((U8*)STRING(scan));
653099ff
GS
2658
2659 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2660 if (flags & SCF_DO_SUBSTR) {
2661 assert(data);
1de06328 2662 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2663 }
a0ed51b3 2664 if (UTF) {
6136c704 2665 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2666 l = utf8_length(s, s + l);
9041c2e3 2667 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2668 }
2669 min += l;
ecaa9b9c 2670 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2671 data->pos_min += l;
653099ff
GS
2672 if (flags & SCF_DO_STCLASS_AND) {
2673 /* Check whether it is compatible with what we know already! */
2674 int compat = 1;
2675
1aa99e6b 2676 if (uc >= 0x100 ||
516a5887 2677 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2678 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2679 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2680 compat = 0;
2681 ANYOF_CLASS_ZERO(data->start_class);
2682 ANYOF_BITMAP_ZERO(data->start_class);
2683 if (compat) {
1aa99e6b 2684 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2685 data->start_class->flags &= ~ANYOF_EOS;
2686 data->start_class->flags |= ANYOF_FOLD;
2687 if (OP(scan) == EXACTFL)
2688 data->start_class->flags |= ANYOF_LOCALE;
2689 }
2690 }
2691 else if (flags & SCF_DO_STCLASS_OR) {
2692 if (data->start_class->flags & ANYOF_FOLD) {
2693 /* false positive possible if the class is case-folded.
2694 Assume that the locale settings are the same... */
1aa99e6b
IH
2695 if (uc < 0x100)
2696 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2697 data->start_class->flags &= ~ANYOF_EOS;
2698 }
2699 cl_and(data->start_class, &and_with);
2700 }
2701 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2702 }
bfed75c6 2703 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2704 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2705 I32 f = flags, pos_before = 0;
d4c19fe8 2706 regnode * const oscan = scan;
653099ff
GS
2707 struct regnode_charclass_class this_class;
2708 struct regnode_charclass_class *oclass = NULL;
727f22e3 2709 I32 next_is_eval = 0;
653099ff 2710
3dab1dad 2711 switch (PL_regkind[OP(scan)]) {
653099ff 2712 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2713 scan = NEXTOPER(scan);
2714 goto finish;
2715 case PLUS:
653099ff 2716 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2717 next = NEXTOPER(scan);
653099ff 2718 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2719 mincount = 1;
2720 maxcount = REG_INFTY;
c277df42
IZ
2721 next = regnext(scan);
2722 scan = NEXTOPER(scan);
2723 goto do_curly;
2724 }
2725 }
2726 if (flags & SCF_DO_SUBSTR)
2727 data->pos_min++;
2728 min++;
2729 /* Fall through. */
2730 case STAR:
653099ff
GS
2731 if (flags & SCF_DO_STCLASS) {
2732 mincount = 0;
b81d288d 2733 maxcount = REG_INFTY;
653099ff
GS
2734 next = regnext(scan);
2735 scan = NEXTOPER(scan);
2736 goto do_curly;
2737 }
b81d288d 2738 is_inf = is_inf_internal = 1;
c277df42
IZ
2739 scan = regnext(scan);
2740 if (flags & SCF_DO_SUBSTR) {
1de06328 2741 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2742 data->longest = &(data->longest_float);
2743 }
2744 goto optimize_curly_tail;
2745 case CURLY:
b81d288d 2746 mincount = ARG1(scan);
c277df42
IZ
2747 maxcount = ARG2(scan);
2748 next = regnext(scan);
cb434fcc
IZ
2749 if (OP(scan) == CURLYX) {
2750 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2751 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2752 }
c277df42 2753 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2754 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2755 do_curly:
2756 if (flags & SCF_DO_SUBSTR) {
1de06328 2757 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2758 pos_before = data->pos_min;
2759 }
2760 if (data) {
2761 fl = data->flags;
2762 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2763 if (is_inf)
2764 data->flags |= SF_IS_INF;
2765 }
653099ff 2766 if (flags & SCF_DO_STCLASS) {
830247a4 2767 cl_init(pRExC_state, &this_class);
653099ff
GS
2768 oclass = data->start_class;
2769 data->start_class = &this_class;
2770 f |= SCF_DO_STCLASS_AND;
2771 f &= ~SCF_DO_STCLASS_OR;
2772 }
e1901655
IZ
2773 /* These are the cases when once a subexpression
2774 fails at a particular position, it cannot succeed
2775 even after backtracking at the enclosing scope.
b81d288d 2776
e1901655
IZ
2777 XXXX what if minimal match and we are at the
2778 initial run of {n,m}? */
2779 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2780 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2781
c277df42 2782 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2783 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2784 (mincount == 0
2785 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2786
2787 if (flags & SCF_DO_STCLASS)
2788 data->start_class = oclass;
2789 if (mincount == 0 || minnext == 0) {
2790 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2791 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2792 }
2793 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2794 /* Switch to OR mode: cache the old value of
653099ff
GS
2795 * data->start_class */
2796 StructCopy(data->start_class, &and_with,
2797 struct regnode_charclass_class);
2798 flags &= ~SCF_DO_STCLASS_AND;
2799 StructCopy(&this_class, data->start_class,
2800 struct regnode_charclass_class);
2801 flags |= SCF_DO_STCLASS_OR;
2802 data->start_class->flags |= ANYOF_EOS;
2803 }
2804 } else { /* Non-zero len */
2805 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2806 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2807 cl_and(data->start_class, &and_with);
2808 }
2809 else if (flags & SCF_DO_STCLASS_AND)
2810 cl_and(data->start_class, &this_class);
2811 flags &= ~SCF_DO_STCLASS;
2812 }
c277df42
IZ
2813 if (!scan) /* It was not CURLYX, but CURLY. */
2814 scan = next;
041457d9
DM
2815 if ( /* ? quantifier ok, except for (?{ ... }) */
2816 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2817 && (minnext == 0) && (deltanext == 0)
99799961 2818 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2819 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2820 && ckWARN(WARN_REGEXP))
b45f050a 2821 {
830247a4 2822 vWARN(RExC_parse,
b45f050a
JF
2823 "Quantifier unexpected on zero-length expression");
2824 }
2825
c277df42 2826 min += minnext * mincount;
b81d288d 2827 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2828 && (minnext + deltanext) > 0)
2829 || deltanext == I32_MAX);
aca2d497 2830 is_inf |= is_inf_internal;
c277df42
IZ
2831 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2832
2833 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2834 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2835 && data->flags & SF_IN_PAR
2836 && !(data->flags & SF_HAS_EVAL)
2837 && !deltanext && minnext == 1 ) {
2838 /* Try to optimize to CURLYN. */
2839 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2840 regnode * const nxt1 = nxt;
497b47a8
JH
2841#ifdef DEBUGGING
2842 regnode *nxt2;
2843#endif
c277df42
IZ
2844
2845 /* Skip open. */
2846 nxt = regnext(nxt);
bfed75c6 2847 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2848 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2849 && STR_LEN(nxt) == 1))
c277df42 2850 goto nogo;
497b47a8 2851#ifdef DEBUGGING
c277df42 2852 nxt2 = nxt;
497b47a8 2853#endif
c277df42 2854 nxt = regnext(nxt);
b81d288d 2855 if (OP(nxt) != CLOSE)
c277df42
IZ
2856 goto nogo;
2857 /* Now we know that nxt2 is the only contents: */
eb160463 2858 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2859 OP(oscan) = CURLYN;
2860 OP(nxt1) = NOTHING; /* was OPEN. */
2861#ifdef DEBUGGING
2862 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2863 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2864 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2865 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2866 OP(nxt + 1) = OPTIMIZED; /* was count. */
2867 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2868#endif
c277df42 2869 }
c277df42
IZ
2870 nogo:
2871
2872 /* Try optimization CURLYX => CURLYM. */
b81d288d 2873 if ( OP(oscan) == CURLYX && data
c277df42 2874 && !(data->flags & SF_HAS_PAR)
c277df42 2875 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2876 && !deltanext /* atom is fixed width */
2877 && minnext != 0 /* CURLYM can't handle zero width */
2878 ) {
c277df42
IZ
2879 /* XXXX How to optimize if data == 0? */
2880 /* Optimize to a simpler form. */
2881 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2882 regnode *nxt2;
2883
2884 OP(oscan) = CURLYM;
2885 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2886 && (OP(nxt2) != WHILEM))
c277df42
IZ
2887 nxt = nxt2;
2888 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2889 /* Need to optimize away parenths. */
2890 if (data->flags & SF_IN_PAR) {
2891 /* Set the parenth number. */
2892 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2893
b81d288d 2894 if (OP(nxt) != CLOSE)
b45f050a 2895 FAIL("Panic opt close");
eb160463 2896 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2897 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2898 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2899#ifdef DEBUGGING
2900 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2901 OP(nxt + 1) = OPTIMIZED; /* was count. */
2902 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2903 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2904#endif
c277df42
IZ
2905#if 0
2906 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2907 regnode *nnxt = regnext(nxt1);
b81d288d 2908
c277df42
IZ
2909 if (nnxt == nxt) {
2910 if (reg_off_by_arg[OP(nxt1)])
2911 ARG_SET(nxt1, nxt2 - nxt1);
2912 else if (nxt2 - nxt1 < U16_MAX)
2913 NEXT_OFF(nxt1) = nxt2 - nxt1;
2914 else
2915 OP(nxt) = NOTHING; /* Cannot beautify */
2916 }
2917 nxt1 = nnxt;
2918 }
2919#endif
2920 /* Optimize again: */
1de06328 2921 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2922 NULL, 0,depth+1);
a0ed51b3
LW
2923 }
2924 else
c277df42 2925 oscan->flags = 0;
c277df42 2926 }
e1901655
IZ
2927 else if ((OP(oscan) == CURLYX)
2928 && (flags & SCF_WHILEM_VISITED_POS)
2929 /* See the comment on a similar expression above.
2930 However, this time it not a subexpression
2931 we care about, but the expression itself. */
2932 && (maxcount == REG_INFTY)
2933 && data && ++data->whilem_c < 16) {
2934 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2935 /* Find WHILEM (as in regexec.c) */
2936 regnode *nxt = oscan + NEXT_OFF(oscan);
2937
2938 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2939 nxt += ARG(nxt);
eb160463
GS
2940 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2941 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2942 }
b81d288d 2943 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2944 pars++;
2945 if (flags & SCF_DO_SUBSTR) {
c445ea15 2946 SV *last_str = NULL;
c277df42
IZ
2947 int counted = mincount != 0;
2948
2949 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2950#if defined(SPARC64_GCC_WORKAROUND)
2951 I32 b = 0;
2952 STRLEN l = 0;
cfd0369c 2953 const char *s = NULL;
5d1c421c
JH
2954 I32 old = 0;
2955
2956 if (pos_before >= data->last_start_min)
2957 b = pos_before;
2958 else
2959 b = data->last_start_min;
2960
2961 l = 0;
cfd0369c 2962 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2963 old = b - data->last_start_min;
2964
2965#else
b81d288d 2966 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2967 ? pos_before : data->last_start_min;
2968 STRLEN l;
d4c19fe8 2969 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2970 I32 old = b - data->last_start_min;
5d1c421c 2971#endif
a0ed51b3
LW
2972
2973 if (UTF)
2974 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2975
a0ed51b3 2976 l -= old;
c277df42 2977 /* Get the added string: */
79cb57f6 2978 last_str = newSVpvn(s + old, l);
0e933229
IH
2979 if (UTF)
2980 SvUTF8_on(last_str);
c277df42
IZ
2981 if (deltanext == 0 && pos_before == b) {
2982 /* What was added is a constant string */
2983 if (mincount > 1) {
2984 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2985 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2986 SvPVX_const(last_str), l, mincount - 1);
b162af07 2987 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2988 /* Add additional parts. */
b81d288d 2989 SvCUR_set(data->last_found,
c277df42
IZ
2990 SvCUR(data->last_found) - l);
2991 sv_catsv(data->last_found, last_str);
0eda9292
JH
2992 {
2993 SV * sv = data->last_found;
2994 MAGIC *mg =
2995 SvUTF8(sv) && SvMAGICAL(sv) ?
2996 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2997 if (mg && mg->mg_len >= 0)
2998 mg->mg_len += CHR_SVLEN(last_str);
2999 }
c277df42
IZ
3000 data->last_end += l * (mincount - 1);
3001 }
2a8d9689
HS
3002 } else {
3003 /* start offset must point into the last copy */
3004 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3005 data->last_start_max += is_inf ? I32_MAX
3006 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3007 }
3008 }
3009 /* It is counted once already... */
3010 data->pos_min += minnext * (mincount - counted);
3011 data->pos_delta += - counted * deltanext +
3012 (minnext + deltanext) * maxcount - minnext * mincount;
3013 if (mincount != maxcount) {
653099ff
GS
3014 /* Cannot extend fixed substrings found inside
3015 the group. */
1de06328 3016 scan_commit(pRExC_state,data,minlenp);
c277df42 3017 if (mincount && last_str) {
d4c19fe8
AL
3018 SV * const sv = data->last_found;
3019 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3020 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3021
3022 if (mg)
3023 mg->mg_len = -1;
3024 sv_setsv(sv, last_str);
c277df42 3025 data->last_end = data->pos_min;
b81d288d 3026 data->last_start_min =
a0ed51b3 3027 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3028 data->last_start_max = is_inf
3029 ? I32_MAX
c277df42 3030 : data->pos_min + data->pos_delta
a0ed51b3 3031 - CHR_SVLEN(last_str);
c277df42
IZ
3032 }
3033 data->longest = &(data->longest_float);
3034 }
aca2d497 3035 SvREFCNT_dec(last_str);
c277df42 3036 }
405ff068 3037 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3038 data->flags |= SF_HAS_EVAL;
3039 optimize_curly_tail:
c277df42 3040 if (OP(oscan) != CURLYX) {
3dab1dad 3041 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3042 && NEXT_OFF(next))
3043 NEXT_OFF(oscan) += NEXT_OFF(next);
3044 }
c277df42 3045 continue;
653099ff 3046 default: /* REF and CLUMP only? */
c277df42 3047 if (flags & SCF_DO_SUBSTR) {
1de06328 3048 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3049 data->longest = &(data->longest_float);
3050 }
aca2d497 3051 is_inf = is_inf_internal = 1;
653099ff 3052 if (flags & SCF_DO_STCLASS_OR)
830247a4 3053 cl_anything(pRExC_state, data->start_class);
653099ff 3054 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3055 break;
3056 }
a0ed51b3 3057 }
bfed75c6 3058 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3059 int value = 0;
653099ff 3060
c277df42 3061 if (flags & SCF_DO_SUBSTR) {
1de06328 3062 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3063 data->pos_min++;
3064 }
3065 min++;
653099ff
GS
3066 if (flags & SCF_DO_STCLASS) {
3067 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3068
3069 /* Some of the logic below assumes that switching
3070 locale on will only add false positives. */
3dab1dad 3071 switch (PL_regkind[OP(scan)]) {
653099ff 3072 case SANY:
653099ff
GS
3073 default:
3074 do_default:
3075 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3076 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3077 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3078 break;
3079 case REG_ANY:
3080 if (OP(scan) == SANY)
3081 goto do_default;
3082 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3083 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3084 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3085 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3086 }
3087 if (flags & SCF_DO_STCLASS_AND || !value)
3088 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3089 break;
3090 case ANYOF:
3091 if (flags & SCF_DO_STCLASS_AND)
3092 cl_and(data->start_class,
3093 (struct regnode_charclass_class*)scan);
3094 else
830247a4 3095 cl_or(pRExC_state, data->start_class,
653099ff
GS
3096 (struct regnode_charclass_class*)scan);
3097 break;
3098 case ALNUM:
3099 if (flags & SCF_DO_STCLASS_AND) {
3100 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3101 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3102 for (value = 0; value < 256; value++)
3103 if (!isALNUM(value))
3104 ANYOF_BITMAP_CLEAR(data->start_class, value);
3105 }
3106 }
3107 else {
3108 if (data->start_class->flags & ANYOF_LOCALE)
3109 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3110 else {
3111 for (value = 0; value < 256; value++)
3112 if (isALNUM(value))
b81d288d 3113 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3114 }
3115 }
3116 break;
3117 case ALNUML:
3118 if (flags & SCF_DO_STCLASS_AND) {
3119 if (data->start_class->flags & ANYOF_LOCALE)
3120 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3121 }
3122 else {
3123 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3124 data->start_class->flags |= ANYOF_LOCALE;
3125 }
3126 break;
3127 case NALNUM:
3128 if (flags & SCF_DO_STCLASS_AND) {
3129 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3130 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3131 for (value = 0; value < 256; value++)
3132 if (isALNUM(value))
3133 ANYOF_BITMAP_CLEAR(data->start_class, value);
3134 }
3135 }
3136 else {
3137 if (data->start_class->flags & ANYOF_LOCALE)
3138 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3139 else {
3140 for (value = 0; value < 256; value++)
3141 if (!isALNUM(value))
b81d288d 3142 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3143 }
3144 }
3145 break;
3146 case NALNUML:
3147 if (flags & SCF_DO_STCLASS_AND) {
3148 if (data->start_class->flags & ANYOF_LOCALE)
3149 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3150 }
3151 else {
3152 data->start_class->flags |= ANYOF_LOCALE;
3153 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3154 }
3155 break;
3156 case SPACE:
3157 if (flags & SCF_DO_STCLASS_AND) {
3158 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3159 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3160 for (value = 0; value < 256; value++)
3161 if (!isSPACE(value))
3162 ANYOF_BITMAP_CLEAR(data->start_class, value);
3163 }
3164 }
3165 else {
3166 if (data->start_class->flags & ANYOF_LOCALE)
3167 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3168 else {
3169 for (value = 0; value < 256; value++)
3170 if (isSPACE(value))
b81d288d 3171 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3172 }
3173 }
3174 break;
3175 case SPACEL:
3176 if (flags & SCF_DO_STCLASS_AND) {
3177 if (data->start_class->flags & ANYOF_LOCALE)
3178 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3179 }
3180 else {
3181 data->start_class->flags |= ANYOF_LOCALE;
3182 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3183 }
3184 break;
3185 case NSPACE:
3186 if (flags & SCF_DO_STCLASS_AND) {
3187 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3188 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3189 for (value = 0; value < 256; value++)
3190 if (isSPACE(value))
3191 ANYOF_BITMAP_CLEAR(data->start_class, value);
3192 }
3193 }
3194 else {
3195 if (data->start_class->flags & ANYOF_LOCALE)
3196 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3197 else {
3198 for (value = 0; value < 256; value++)
3199 if (!isSPACE(value))
b81d288d 3200 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3201 }
3202 }
3203 break;
3204 case NSPACEL:
3205 if (flags & SCF_DO_STCLASS_AND) {
3206 if (data->start_class->flags & ANYOF_LOCALE) {
3207 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3208 for (value = 0; value < 256; value++)
3209 if (!isSPACE(value))
3210 ANYOF_BITMAP_CLEAR(data->start_class, value);
3211 }
3212 }
3213 else {
3214 data->start_class->flags |= ANYOF_LOCALE;
3215 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3216 }
3217 break;
3218 case DIGIT:
3219 if (flags & SCF_DO_STCLASS_AND) {
3220 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3221 for (value = 0; value < 256; value++)
3222 if (!isDIGIT(value))
3223 ANYOF_BITMAP_CLEAR(data->start_class, value);
3224 }
3225 else {
3226 if (data->start_class->flags & ANYOF_LOCALE)
3227 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3228 else {
3229 for (value = 0; value < 256; value++)
3230 if (isDIGIT(value))
b81d288d 3231 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3232 }
3233 }
3234 break;
3235 case NDIGIT:
3236 if (flags & SCF_DO_STCLASS_AND) {
3237 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3238 for (value = 0; value < 256; value++)
3239 if (isDIGIT(value))
3240 ANYOF_BITMAP_CLEAR(data->start_class, value);
3241 }
3242 else {
3243 if (data->start_class->flags & ANYOF_LOCALE)
3244 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3245 else {
3246 for (value = 0; value < 256; value++)
3247 if (!isDIGIT(value))
b81d288d 3248 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3249 }
3250 }
3251 break;
3252 }
3253 if (flags & SCF_DO_STCLASS_OR)
3254 cl_and(data->start_class, &and_with);
3255 flags &= ~SCF_DO_STCLASS;
3256 }
a0ed51b3 3257 }
3dab1dad 3258 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3259 data->flags |= (OP(scan) == MEOL
3260 ? SF_BEFORE_MEOL
3261 : SF_BEFORE_SEOL);
a0ed51b3 3262 }
3dab1dad 3263 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3264 /* Lookbehind, or need to calculate parens/evals/stclass: */
3265 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3266 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3267 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3268 || OP(scan) == UNLESSM )
3269 {
3270 /* Negative Lookahead/lookbehind
3271 In this case we can't do fixed string optimisation.
3272 */
3273
3274 I32 deltanext, minnext, fake = 0;
3275 regnode *nscan;
3276 struct regnode_charclass_class intrnl;
3277 int f = 0;
3278
3279 data_fake.flags = 0;
3280 if (data) {
3281 data_fake.whilem_c = data->whilem_c;
3282 data_fake.last_closep = data->last_closep;
a0ed51b3 3283 }
1de06328
YO
3284 else
3285 data_fake.last_closep = &fake;
3286 if ( flags & SCF_DO_STCLASS && !scan->flags
3287 && OP(scan) == IFMATCH ) { /* Lookahead */
3288 cl_init(pRExC_state, &intrnl);
3289 data_fake.start_class = &intrnl;
3290 f |= SCF_DO_STCLASS_AND;
c277df42 3291 }
1de06328
YO
3292 if (flags & SCF_WHILEM_VISITED_POS)
3293 f |= SCF_WHILEM_VISITED_POS;
3294 next = regnext(scan);
3295 nscan = NEXTOPER(NEXTOPER(scan));
3296 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3297 if (scan->flags) {
3298 if (deltanext) {
3299 vFAIL("Variable length lookbehind not implemented");
3300 }
3301 else if (minnext > (I32)U8_MAX) {
3302 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3303 }
3304 scan->flags = (U8)minnext;
3305 }
3306 if (data) {
3307 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3308 pars++;
3309 if (data_fake.flags & SF_HAS_EVAL)
3310 data->flags |= SF_HAS_EVAL;
3311 data->whilem_c = data_fake.whilem_c;
3312 }
3313 if (f & SCF_DO_STCLASS_AND) {
3314 const int was = (data->start_class->flags & ANYOF_EOS);
3315
3316 cl_and(data->start_class, &intrnl);
3317 if (was)
3318 data->start_class->flags |= ANYOF_EOS;
3319 }
be8e71aa 3320 }
1de06328
YO
3321#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3322 else {
3323 /* Positive Lookahead/lookbehind
3324 In this case we can do fixed string optimisation,
3325 but we must be careful about it. Note in the case of
3326 lookbehind the positions will be offset by the minimum
3327 length of the pattern, something we won't know about
3328 until after the recurse.
3329 */
3330 I32 deltanext, fake = 0;
3331 regnode *nscan;
3332 struct regnode_charclass_class intrnl;
3333 int f = 0;
3334 /* We use SAVEFREEPV so that when the full compile
3335 is finished perl will clean up the allocated
3336 minlens when its all done. This was we don't
3337 have to worry about freeing them when we know
3338 they wont be used, which would be a pain.
3339 */
3340 I32 *minnextp;
3341 Newx( minnextp, 1, I32 );
3342 SAVEFREEPV(minnextp);
3343
3344 if (data) {
3345 StructCopy(data, &data_fake, scan_data_t);
3346 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3347 f |= SCF_DO_SUBSTR;
3348 if (scan->flags)
3349 scan_commit(pRExC_state, &data_fake,minlenp);
3350 data_fake.last_found=newSVsv(data->last_found);
3351 }
3352 }
3353 else
3354 data_fake.last_closep = &fake;
3355 data_fake.flags = 0;
3356 if (is_inf)
3357 data_fake.flags |= SF_IS_INF;
3358 if ( flags & SCF_DO_STCLASS && !scan->flags
3359 && OP(scan) == IFMATCH ) { /* Lookahead */
3360 cl_init(pRExC_state, &intrnl);
3361 data_fake.start_class = &intrnl;
3362 f |= SCF_DO_STCLASS_AND;
3363 }
3364 if (flags & SCF_WHILEM_VISITED_POS)
3365 f |= SCF_WHILEM_VISITED_POS;
3366 next = regnext(scan);
3367 nscan = NEXTOPER(NEXTOPER(scan));
3368
3369 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3370 if (scan->flags) {
3371 if (deltanext) {
3372 vFAIL("Variable length lookbehind not implemented");
3373 }
3374 else if (*minnextp > (I32)U8_MAX) {
3375 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3376 }
3377 scan->flags = (U8)*minnextp;
3378 }
3379
3380 *minnextp += min;
3381
3382
3383 if (f & SCF_DO_STCLASS_AND) {
3384 const int was = (data->start_class->flags & ANYOF_EOS);
3385
3386 cl_and(data->start_class, &intrnl);
3387 if (was)
3388 data->start_class->flags |= ANYOF_EOS;
3389 }
3390 if (data) {
3391 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3392 pars++;
3393 if (data_fake.flags & SF_HAS_EVAL)
3394 data->flags |= SF_HAS_EVAL;
3395 data->whilem_c = data_fake.whilem_c;
3396 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3397 if (RExC_rx->minlen<*minnextp)
3398 RExC_rx->minlen=*minnextp;
3399 scan_commit(pRExC_state, &data_fake, minnextp);
3400 SvREFCNT_dec(data_fake.last_found);
3401
3402 if ( data_fake.minlen_fixed != minlenp )
3403 {
3404 data->offset_fixed= data_fake.offset_fixed;
3405 data->minlen_fixed= data_fake.minlen_fixed;
3406 data->lookbehind_fixed+= scan->flags;
3407 }
3408 if ( data_fake.minlen_float != minlenp )
3409 {
3410 data->minlen_float= data_fake.minlen_float;
3411 data->offset_float_min=data_fake.offset_float_min;
3412 data->offset_float_max=data_fake.offset_float_max;
3413 data->lookbehind_float+= scan->flags;
3414 }
3415 }
3416 }
3417
653099ff 3418
653099ff 3419 }
1de06328 3420#endif
a0ed51b3
LW
3421 }
3422 else if (OP(scan) == OPEN) {
c277df42 3423 pars++;
a0ed51b3 3424 }
cb434fcc 3425 else if (OP(scan) == CLOSE) {
eb160463 3426 if ((I32)ARG(scan) == is_par) {
cb434fcc 3427 next = regnext(scan);
c277df42 3428
cb434fcc
IZ
3429 if ( next && (OP(next) != WHILEM) && next < last)
3430 is_par = 0; /* Disable optimization */
3431 }
3432 if (data)
3433 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3434 }
3435 else if (OP(scan) == EVAL) {
c277df42
IZ
3436 if (data)
3437 data->flags |= SF_HAS_EVAL;
3438 }
96776eda 3439 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3440 if (flags & SCF_DO_SUBSTR) {
1de06328 3441 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3442 data->longest = &(data->longest_float);
3443 }
3444 is_inf = is_inf_internal = 1;
653099ff 3445 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3446 cl_anything(pRExC_state, data->start_class);
96776eda 3447 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3448 }
786e8c11
YO
3449#ifdef TRIE_STUDY_OPT
3450#ifdef FULL_TRIE_STUDY
3451 else if (PL_regkind[OP(scan)] == TRIE) {
3452 /* NOTE - There is similar code to this block above for handling
3453 BRANCH nodes on the initial study. If you change stuff here
3454 check there too. */
3455 regnode *tail= regnext(scan);
3456 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3457 I32 max1 = 0, min1 = I32_MAX;
3458 struct regnode_charclass_class accum;
3459
3460 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3461 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3462 if (flags & SCF_DO_STCLASS)
3463 cl_init_zero(pRExC_state, &accum);
3464
3465 if (!trie->jump) {
3466 min1= trie->minlen;
3467 max1= trie->maxlen;
3468 } else {
3469 const regnode *nextbranch= NULL;
3470 U32 word;
3471
3472 for ( word=1 ; word <= trie->wordcount ; word++)
3473 {
3474 I32 deltanext=0, minnext=0, f = 0, fake;
3475 struct regnode_charclass_class this_class;
3476
3477 data_fake.flags = 0;
3478 if (data) {
3479 data_fake.whilem_c = data->whilem_c;
3480 data_fake.last_closep = data->last_closep;
3481 }
3482 else
3483 data_fake.last_closep = &fake;
3484
3485 if (flags & SCF_DO_STCLASS) {
3486 cl_init(pRExC_state, &this_class);
3487 data_fake.start_class = &this_class;
3488 f = SCF_DO_STCLASS_AND;
3489 }
3490 if (flags & SCF_WHILEM_VISITED_POS)
3491 f |= SCF_WHILEM_VISITED_POS;
3492
3493 if (trie->jump[word]) {
3494 if (!nextbranch)
3495 nextbranch = tail - trie->jump[0];
3496 scan= tail - trie->jump[word];
3497 /* We go from the jump point to the branch that follows
3498 it. Note this means we need the vestigal unused branches
3499 even though they arent otherwise used.
3500 */
1de06328 3501 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3502 (regnode *)nextbranch, &data_fake, f,depth+1);
3503 }
3504 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3505 nextbranch= regnext((regnode*)nextbranch);
3506
3507 if (min1 > (I32)(minnext + trie->minlen))
3508 min1 = minnext + trie->minlen;
3509 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3510 max1 = minnext + deltanext + trie->maxlen;
3511 if (deltanext == I32_MAX)
3512 is_inf = is_inf_internal = 1;
3513
3514 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3515 pars++;
3516
3517 if (data) {
3518 if (data_fake.flags & SF_HAS_EVAL)
3519 data->flags |= SF_HAS_EVAL;
3520 data->whilem_c = data_fake.whilem_c;
3521 }
3522 if (flags & SCF_DO_STCLASS)
3523 cl_or(pRExC_state, &accum, &this_class);
3524 }
3525 }
3526 if (flags & SCF_DO_SUBSTR) {
3527 data->pos_min += min1;
3528 data->pos_delta += max1 - min1;
3529 if (max1 != min1 || is_inf)
3530 data->longest = &(data->longest_float);
3531 }
3532 min += min1;
3533 delta += max1 - min1;
3534 if (flags & SCF_DO_STCLASS_OR) {
3535 cl_or(pRExC_state, data->start_class, &accum);
3536 if (min1) {
3537 cl_and(data->start_class, &and_with);
3538 flags &= ~SCF_DO_STCLASS;
3539 }
3540 }
3541 else if (flags & SCF_DO_STCLASS_AND) {
3542 if (min1) {
3543 cl_and(data->start_class, &accum);
3544 flags &= ~SCF_DO_STCLASS;
3545 }
3546 else {
3547 /* Switch to OR mode: cache the old value of
3548 * data->start_class */
3549 StructCopy(data->start_class, &and_with,
3550 struct regnode_charclass_class);
3551 flags &= ~SCF_DO_STCLASS_AND;
3552 StructCopy(&accum, data->start_class,
3553 struct regnode_charclass_class);
3554 flags |= SCF_DO_STCLASS_OR;
3555 data->start_class->flags |= ANYOF_EOS;
3556 }
3557 }
3558 scan= tail;
3559 continue;
3560 }
3561#else
3562 else if (PL_regkind[OP(scan)] == TRIE) {
3563 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3564 U8*bang=NULL;
3565
3566 min += trie->minlen;
3567 delta += (trie->maxlen - trie->minlen);
3568 flags &= ~SCF_DO_STCLASS; /* xxx */
3569 if (flags & SCF_DO_SUBSTR) {
1de06328 3570 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3571 data->pos_min += trie->minlen;
3572 data->pos_delta += (trie->maxlen - trie->minlen);
3573 if (trie->maxlen != trie->minlen)
3574 data->longest = &(data->longest_float);
3575 }
3576 if (trie->jump) /* no more substrings -- for now /grr*/
3577 flags &= ~SCF_DO_SUBSTR;
3578 }
3579#endif /* old or new */
3580#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3581 /* Else: zero-length, ignore. */
3582 scan = regnext(scan);
3583 }
3584
3585 finish:
3586 *scanp = scan;
aca2d497 3587 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3588 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3589 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3590 if (is_par > (I32)U8_MAX)
c277df42
IZ
3591 is_par = 0;
3592 if (is_par && pars==1 && data) {
3593 data->flags |= SF_IN_PAR;
3594 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3595 }
3596 else if (pars && data) {
c277df42
IZ
3597 data->flags |= SF_HAS_PAR;
3598 data->flags &= ~SF_IN_PAR;
3599 }
653099ff
GS
3600 if (flags & SCF_DO_STCLASS_OR)
3601 cl_and(data->start_class, &and_with);
786e8c11
YO
3602 if (flags & SCF_TRIE_RESTUDY)
3603 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3604
3605 DEBUG_STUDYDATA(data,depth);
3606
c277df42
IZ
3607 return min;
3608}
3609
76e3520e 3610STATIC I32
5f66b61c 3611S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3612{
830247a4 3613 if (RExC_rx->data) {
b81d288d
AB
3614 Renewc(RExC_rx->data,
3615 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3616 char, struct reg_data);
830247a4
IZ
3617 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3618 RExC_rx->data->count += n;
a0ed51b3
LW
3619 }
3620 else {
a02a5408 3621 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3622 char, struct reg_data);
a02a5408 3623 Newx(RExC_rx->data->what, n, U8);
830247a4 3624 RExC_rx->data->count = n;
c277df42 3625 }
830247a4
IZ
3626 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3627 return RExC_rx->data->count - n;
c277df42
IZ
3628}
3629
76234dfb 3630#ifndef PERL_IN_XSUB_RE
d88dccdf 3631void
864dbfa3 3632Perl_reginitcolors(pTHX)
d88dccdf 3633{
97aff369 3634 dVAR;
1df70142 3635 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3636 if (s) {
1df70142
AL
3637 char *t = savepv(s);
3638 int i = 0;
3639 PL_colors[0] = t;
d88dccdf 3640 while (++i < 6) {
1df70142
AL
3641 t = strchr(t, '\t');
3642 if (t) {
3643 *t = '\0';
3644 PL_colors[i] = ++t;
d88dccdf
IZ
3645 }
3646 else
1df70142 3647 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3648 }
3649 } else {
1df70142 3650 int i = 0;
b81d288d 3651 while (i < 6)
06b5626a 3652 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3653 }
3654 PL_colorset = 1;
3655}
76234dfb 3656#endif
8615cb43 3657
07be1b83 3658
786e8c11
YO
3659#ifdef TRIE_STUDY_OPT
3660#define CHECK_RESTUDY_GOTO \
3661 if ( \
3662 (data.flags & SCF_TRIE_RESTUDY) \
3663 && ! restudied++ \
3664 ) goto reStudy
3665#else
3666#define CHECK_RESTUDY_GOTO
3667#endif
a687059c 3668/*
e50aee73 3669 - pregcomp - compile a regular expression into internal code
a687059c
LW
3670 *
3671 * We can't allocate space until we know how big the compiled form will be,
3672 * but we can't compile it (and thus know how big it is) until we've got a
3673 * place to put the code. So we cheat: we compile it twice, once with code
3674 * generation turned off and size counting turned on, and once "for real".
3675 * This also means that we don't allocate space until we are sure that the
3676 * thing really will compile successfully, and we never have to move the
3677 * code and thus invalidate pointers into it. (Note that it has to be in
3678 * one piece because free() must be able to free it all.) [NB: not true in perl]
3679 *
3680 * Beware that the optimization-preparation code in here knows about some
3681 * of the structure of the compiled regexp. [I'll say.]
3682 */
3683regexp *
864dbfa3 3684Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3685{
97aff369 3686 dVAR;
a0d0e21e 3687 register regexp *r;
c277df42 3688 regnode *scan;
c277df42 3689 regnode *first;
a0d0e21e 3690 I32 flags;
a0d0e21e
LW
3691 I32 minlen = 0;
3692 I32 sawplus = 0;
3693 I32 sawopen = 0;
2c2d71f5 3694 scan_data_t data;
830247a4 3695 RExC_state_t RExC_state;
be8e71aa 3696 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3697#ifdef TRIE_STUDY_OPT
3698 int restudied= 0;
3699 RExC_state_t copyRExC_state;
3700#endif
a0d0e21e 3701
a3621e74
YO
3702 GET_RE_DEBUG_FLAGS_DECL;
3703
a0d0e21e 3704 if (exp == NULL)
c277df42 3705 FAIL("NULL regexp argument");
a0d0e21e 3706
a5961de5 3707 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3708
5cfc7842 3709 RExC_precomp = exp;
a3621e74
YO
3710 DEBUG_r(if (!PL_colorset) reginitcolors());
3711 DEBUG_COMPILE_r({
ab3bbdeb
YO
3712 SV *dsv= sv_newmortal();
3713 RE_PV_QUOTED_DECL(s, RExC_utf8,
3714 dsv, RExC_precomp, (xend - exp), 60);
3715 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3716 PL_colors[4],PL_colors[5],s);
a5961de5 3717 });
e2509266 3718 RExC_flags = pm->op_pmflags;
830247a4 3719 RExC_sawback = 0;
bbce6d69 3720
830247a4
IZ
3721 RExC_seen = 0;
3722 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3723 RExC_seen_evals = 0;
3724 RExC_extralen = 0;
c277df42 3725
bbce6d69 3726 /* First pass: determine size, legality. */
830247a4 3727 RExC_parse = exp;
fac92740 3728 RExC_start = exp;
830247a4
IZ
3729 RExC_end = xend;
3730 RExC_naughty = 0;
3731 RExC_npar = 1;
3732 RExC_size = 0L;
3733 RExC_emit = &PL_regdummy;
3734 RExC_whilem_seen = 0;
85ddcde9
JH
3735#if 0 /* REGC() is (currently) a NOP at the first pass.
3736 * Clever compilers notice this and complain. --jhi */
830247a4 3737 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3738#endif
3dab1dad
YO
3739 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3740 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3741 RExC_precomp = NULL;
a0d0e21e
LW
3742 return(NULL);
3743 }
3dab1dad
YO
3744 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3745 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3746 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3747 DEBUG_PARSE_r({
3748 RExC_lastnum=0;
3749 RExC_lastparse=NULL;
3750 });
c277df42 3751
07be1b83 3752
c277df42
IZ
3753 /* Small enough for pointer-storage convention?
3754 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3755 if (RExC_size >= 0x10000L && RExC_extralen)
3756 RExC_size += RExC_extralen;
c277df42 3757 else
830247a4
IZ
3758 RExC_extralen = 0;
3759 if (RExC_whilem_seen > 15)
3760 RExC_whilem_seen = 15;
a0d0e21e 3761
bbce6d69 3762 /* Allocate space and initialize. */
a02a5408 3763 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3764 char, regexp);
a0d0e21e 3765 if (r == NULL)
b45f050a
JF
3766 FAIL("Regexp out of space");
3767
0f79a09d
GS
3768#ifdef DEBUGGING
3769 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3770 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3771#endif
c277df42 3772 r->refcnt = 1;
bbce6d69 3773 r->prelen = xend - exp;
5cfc7842 3774 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3775 r->subbeg = NULL;
f8c7b90f 3776#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3777 r->saved_copy = NULL;
ed252734 3778#endif
cf93c79d 3779 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3780 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3781 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3782
3783 r->substrs = 0; /* Useful during FAIL. */
3784 r->startp = 0; /* Useful during FAIL. */
3785 r->endp = 0; /* Useful during FAIL. */
3786
a02a5408 3787 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3788 if (r->offsets) {
2af232bd 3789 r->offsets[0] = RExC_size;
fac92740 3790 }
a3621e74 3791 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3792 "%s %"UVuf" bytes for offset annotations.\n",
3793 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3794 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3795
830247a4 3796 RExC_rx = r;
bbce6d69 3797
3798 /* Second pass: emit code. */
e2509266 3799 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3800 RExC_parse = exp;
3801 RExC_end = xend;
3802 RExC_naughty = 0;
3803 RExC_npar = 1;
fac92740 3804 RExC_emit_start = r->program;
830247a4 3805 RExC_emit = r->program;
2cd61cdb 3806 /* Store the count of eval-groups for security checks: */
786e8c11 3807 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3808 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3809 r->data = 0;
3dab1dad 3810 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3811 return(NULL);
07be1b83
YO
3812 /* XXXX To minimize changes to RE engine we always allocate
3813 3-units-long substrs field. */
3814 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3815
07be1b83 3816reStudy:
1de06328 3817 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83
YO
3818 Zero(r->substrs, 1, struct reg_substr_data);
3819 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3820
07be1b83
YO
3821#ifdef TRIE_STUDY_OPT
3822 if ( restudied ) {
3823 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3824 RExC_state=copyRExC_state;
1de06328 3825 if (data.last_found) {
07be1b83 3826 SvREFCNT_dec(data.longest_fixed);
07be1b83 3827 SvREFCNT_dec(data.longest_float);
07be1b83 3828 SvREFCNT_dec(data.last_found);
1de06328 3829 }
07be1b83
YO
3830 } else {
3831 copyRExC_state=RExC_state;
3832 }
3833#endif
a0d0e21e 3834 /* Dig out information for optimizations. */
cf93c79d 3835 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3836 pm->op_pmflags = RExC_flags;
a0ed51b3 3837 if (UTF)
5ff6fc6d 3838 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3839 r->regstclass = NULL;
830247a4 3840 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3841 r->reganch |= ROPT_NAUGHTY;
c277df42 3842 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3843
1de06328
YO
3844 /* testing for BRANCH here tells us whether there is "must appear"
3845 data in the pattern. If there is then we can use it for optimisations */
c277df42 3846 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3847 I32 fake;
c5254dd6 3848 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3849 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3850 int stclass_flag;
07be1b83 3851 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3852
3853 first = scan;
c277df42 3854 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3855 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3856 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3857 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3858 /* for now we can't handle lookbehind IFMATCH*/
3859 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3860 (OP(first) == PLUS) ||
3861 (OP(first) == MINMOD) ||
653099ff 3862 /* An {n,m} with n>0 */
07be1b83
YO
3863 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3864 {
786e8c11 3865
a0d0e21e
LW
3866 if (OP(first) == PLUS)
3867 sawplus = 1;
3868 else
3dab1dad 3869 first += regarglen[OP(first)];
07be1b83
YO
3870 if (OP(first) == IFMATCH) {
3871 first = NEXTOPER(first);
3872 first += EXTRA_STEP_2ARGS;
7c167cea 3873 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3874 first = NEXTOPER(first);
a687059c
LW
3875 }
3876
a0d0e21e
LW
3877 /* Starting-point info. */
3878 again:
786e8c11 3879 DEBUG_PEEP("first:",first,0);
07be1b83 3880 /* Ignore EXACT as we deal with it later. */
3dab1dad 3881 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3882 if (OP(first) == EXACT)
6f207bd3 3883 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3884 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3885 r->regstclass = first;
3886 }
07be1b83 3887#ifdef TRIE_STCLASS
786e8c11 3888 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3889 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3890 {
786e8c11 3891 regnode *trie_op;
07be1b83 3892 /* this can happen only on restudy */
786e8c11
YO
3893 if ( OP(first) == TRIE ) {
3894 struct regnode_1 *trieop;
3895 Newxz(trieop,1,struct regnode_1);
3896 StructCopy(first,trieop,struct regnode_1);
3897 trie_op=(regnode *)trieop;
3898 } else {
3899 struct regnode_charclass *trieop;
3900 Newxz(trieop,1,struct regnode_charclass);
3901 StructCopy(first,trieop,struct regnode_charclass);
3902 trie_op=(regnode *)trieop;
3903 }
1de06328 3904 OP(trie_op)+=2;
786e8c11
YO
3905 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3906 r->regstclass = trie_op;
07be1b83
YO
3907 }
3908#endif
bfed75c6 3909 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3910 r->regstclass = first;
3dab1dad
YO
3911 else if (PL_regkind[OP(first)] == BOUND ||
3912 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3913 r->regstclass = first;
3dab1dad 3914 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3915 r->reganch |= (OP(first) == MBOL
3916 ? ROPT_ANCH_MBOL
3917 : (OP(first) == SBOL
3918 ? ROPT_ANCH_SBOL
3919 : ROPT_ANCH_BOL));
a0d0e21e 3920 first = NEXTOPER(first);
774d564b 3921 goto again;
3922 }
3923 else if (OP(first) == GPOS) {
3924 r->reganch |= ROPT_ANCH_GPOS;
3925 first = NEXTOPER(first);
3926 goto again;
a0d0e21e 3927 }
e09294f4 3928 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3929 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3930 !(r->reganch & ROPT_ANCH) )
3931 {
3932 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3933 const int type =
3934 (OP(NEXTOPER(first)) == REG_ANY)
3935 ? ROPT_ANCH_MBOL
3936 : ROPT_ANCH_SBOL;
cad2e5aa 3937 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3938 first = NEXTOPER(first);
774d564b 3939 goto again;
a0d0e21e 3940 }
b81d288d 3941 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3942 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3943 /* x+ must match at the 1st pos of run of x's */
3944 r->reganch |= ROPT_SKIP;
a0d0e21e 3945
c277df42 3946 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3947#ifdef TRIE_STUDY_OPT
3948 DEBUG_COMPILE_r(
3949 if (!restudied)
3950 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3951 (IV)(first - scan + 1))
3952 );
3953#else
3954 DEBUG_COMPILE_r(
3955 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3956 (IV)(first - scan + 1))
3957 );
3958#endif
3959
3960
a0d0e21e
LW
3961 /*
3962 * If there's something expensive in the r.e., find the
3963 * longest literal string that must appear and make it the
3964 * regmust. Resolve ties in favor of later strings, since
3965 * the regstart check works with the beginning of the r.e.
3966 * and avoiding duplication strengthens checking. Not a
3967 * strong reason, but sufficient in the absence of others.
3968 * [Now we resolve ties in favor of the earlier string if
c277df42 3969 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3970 * earlier string may buy us something the later one won't.]
3971 */
a0d0e21e 3972 minlen = 0;
a687059c 3973
396482e1
GA
3974 data.longest_fixed = newSVpvs("");
3975 data.longest_float = newSVpvs("");
3976 data.last_found = newSVpvs("");
c277df42
IZ
3977 data.longest = &(data.longest_fixed);
3978 first = scan;
653099ff 3979 if (!r->regstclass) {
830247a4 3980 cl_init(pRExC_state, &ch_class);
653099ff
GS
3981 data.start_class = &ch_class;
3982 stclass_flag = SCF_DO_STCLASS_AND;
3983 } else /* XXXX Check for BOUND? */
3984 stclass_flag = 0;
cb434fcc 3985 data.last_closep = &last_close;
653099ff 3986
1de06328 3987 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
a3621e74 3988 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 3989
07be1b83 3990
786e8c11
YO
3991 CHECK_RESTUDY_GOTO;
3992
3993
830247a4 3994 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3995 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3996 && !RExC_seen_zerolen
3997 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3998 r->reganch |= ROPT_CHECK_ALL;
1de06328 3999 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4000 SvREFCNT_dec(data.last_found);
4001
1de06328
YO
4002 /* Note that code very similar to this but for anchored string
4003 follows immediately below, changes may need to be made to both.
4004 Be careful.
4005 */
a0ed51b3 4006 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4007 if (longest_float_length
c277df42
IZ
4008 || (data.flags & SF_FL_BEFORE_EOL
4009 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4010 || (RExC_flags & PMf_MULTILINE))))
4011 {
1182767e 4012 I32 t,ml;
cf93c79d 4013
1de06328 4014 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4015 && data.offset_fixed == data.offset_float_min
4016 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4017 goto remove_float; /* As in (a)+. */
4018
1de06328
YO
4019 /* copy the information about the longest float from the reg_scan_data
4020 over to the program. */
33b8afdf
JH
4021 if (SvUTF8(data.longest_float)) {
4022 r->float_utf8 = data.longest_float;
c445ea15 4023 r->float_substr = NULL;
33b8afdf
JH
4024 } else {
4025 r->float_substr = data.longest_float;
c445ea15 4026 r->float_utf8 = NULL;
33b8afdf 4027 }
1de06328
YO
4028 /* float_end_shift is how many chars that must be matched that
4029 follow this item. We calculate it ahead of time as once the
4030 lookbehind offset is added in we lose the ability to correctly
4031 calculate it.*/
4032 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4033 : (I32)longest_float_length;
1de06328
YO
4034 r->float_end_shift = ml - data.offset_float_min
4035 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4036 + data.lookbehind_float;
4037 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4038 r->float_max_offset = data.offset_float_max;
1182767e 4039 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4040 r->float_max_offset -= data.lookbehind_float;
4041
cf93c79d
IZ
4042 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4043 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4044 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4045 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4046 }
4047 else {
aca2d497 4048 remove_float:
c445ea15 4049 r->float_substr = r->float_utf8 = NULL;
c277df42 4050 SvREFCNT_dec(data.longest_float);
c5254dd6 4051 longest_float_length = 0;
a0d0e21e 4052 }
c277df42 4053
1de06328
YO
4054 /* Note that code very similar to this but for floating string
4055 is immediately above, changes may need to be made to both.
4056 Be careful.
4057 */
a0ed51b3 4058 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4059 if (longest_fixed_length
c277df42
IZ
4060 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4061 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4062 || (RExC_flags & PMf_MULTILINE))))
4063 {
1182767e 4064 I32 t,ml;
cf93c79d 4065
1de06328
YO
4066 /* copy the information about the longest fixed
4067 from the reg_scan_data over to the program. */
33b8afdf
JH
4068 if (SvUTF8(data.longest_fixed)) {
4069 r->anchored_utf8 = data.longest_fixed;
c445ea15 4070 r->anchored_substr = NULL;
33b8afdf
JH
4071 } else {
4072 r->anchored_substr = data.longest_fixed;
c445ea15 4073 r->anchored_utf8 = NULL;
33b8afdf 4074 }
1de06328
YO
4075 /* fixed_end_shift is how many chars that must be matched that
4076 follow this item. We calculate it ahead of time as once the
4077 lookbehind offset is added in we lose the ability to correctly
4078 calculate it.*/
4079 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4080 : (I32)longest_fixed_length;
1de06328
YO
4081 r->anchored_end_shift = ml - data.offset_fixed
4082 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4083 + data.lookbehind_fixed;
4084 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4085
cf93c79d
IZ
4086 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4087 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4088 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4089 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4090 }
4091 else {
c445ea15 4092 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4093 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4094 longest_fixed_length = 0;
a0d0e21e 4095 }
b81d288d 4096 if (r->regstclass
ffc61ed2 4097 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4098 r->regstclass = NULL;
33b8afdf
JH
4099 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4100 && stclass_flag
653099ff 4101 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4102 && !cl_is_anything(data.start_class))
4103 {
1df70142 4104 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4105
a02a5408 4106 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4107 struct regnode_charclass_class);
4108 StructCopy(data.start_class,
830247a4 4109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4110 struct regnode_charclass_class);
830247a4 4111 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4113 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4114 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4115 PerlIO_printf(Perl_debug_log,
a0288114 4116 "synthetic stclass \"%s\".\n",
3f7c398e 4117 SvPVX_const(sv));});
653099ff 4118 }
c277df42
IZ
4119
4120 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4121 if (longest_fixed_length > longest_float_length) {
1de06328 4122 r->check_end_shift = r->anchored_end_shift;
c277df42 4123 r->check_substr = r->anchored_substr;
33b8afdf 4124 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4125 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4126 if (r->reganch & ROPT_ANCH_SINGLE)
4127 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4128 }
4129 else {
1de06328 4130 r->check_end_shift = r->float_end_shift;
c277df42 4131 r->check_substr = r->float_substr;
33b8afdf 4132 r->check_utf8 = r->float_utf8;
1de06328
YO
4133 r->check_offset_min = r->float_min_offset;
4134 r->check_offset_max = r->float_max_offset;
a0d0e21e 4135 }
30382c73
IZ
4136 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4137 This should be changed ASAP! */
33b8afdf 4138 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4139 r->reganch |= RE_USE_INTUIT;
33b8afdf 4140 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4141 r->reganch |= RE_INTUIT_TAIL;
4142 }
1de06328
YO
4143 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4144 if ( (STRLEN)minlen < longest_float_length )
4145 minlen= longest_float_length;
4146 if ( (STRLEN)minlen < longest_fixed_length )
4147 minlen= longest_fixed_length;
4148 */
a0ed51b3
LW
4149 }
4150 else {
c277df42
IZ
4151 /* Several toplevels. Best we can is to set minlen. */
4152 I32 fake;
653099ff 4153 struct regnode_charclass_class ch_class;
cb434fcc 4154 I32 last_close = 0;
c277df42 4155
a3621e74 4156 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4157
c277df42 4158 scan = r->program + 1;
830247a4 4159 cl_init(pRExC_state, &ch_class);
653099ff 4160 data.start_class = &ch_class;
cb434fcc 4161 data.last_closep = &last_close;
07be1b83 4162
1de06328 4163 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
07be1b83
YO
4164 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4165
786e8c11 4166 CHECK_RESTUDY_GOTO;
07be1b83 4167
33b8afdf 4168 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4169 = r->float_substr = r->float_utf8 = NULL;
653099ff 4170 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4171 && !cl_is_anything(data.start_class))
4172 {
1df70142 4173 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4174
a02a5408 4175 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4176 struct regnode_charclass_class);
4177 StructCopy(data.start_class,
830247a4 4178 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4179 struct regnode_charclass_class);
830247a4 4180 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4181 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4182 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4183 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4184 PerlIO_printf(Perl_debug_log,
a0288114 4185 "synthetic stclass \"%s\".\n",
3f7c398e 4186 SvPVX_const(sv));});
653099ff 4187 }
a0d0e21e
LW
4188 }
4189
1de06328
YO
4190 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4191 the "real" pattern. */
4192 if (r->minlen < minlen)
4193 r->minlen = minlen;
4194
b81d288d 4195 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4196 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4197 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4198 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4199 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4200 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4201 if (RExC_seen & REG_SEEN_CANY)
4202 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
4203 Newxz(r->startp, RExC_npar, I32);
4204 Newxz(r->endp, RExC_npar, I32);
07be1b83 4205
f2278c82 4206 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4207 DEBUG_DUMP_r({
4208 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4209 regdump(r);
4210 });
8e9a8a48
YO
4211 DEBUG_OFFSETS_r(if (r->offsets) {
4212 const U32 len = r->offsets[0];
4213 U32 i;
4214 GET_RE_DEBUG_FLAGS_DECL;
4215 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4216 for (i = 1; i <= len; i++) {
4217 if (r->offsets[i*2-1] || r->offsets[i*2])
4218 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4219 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4220 }
4221 PerlIO_printf(Perl_debug_log, "\n");
4222 });
a0d0e21e 4223 return(r);
a687059c
LW
4224}
4225
3dab1dad
YO
4226
4227#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4228 int rem=(int)(RExC_end - RExC_parse); \
4229 int cut; \
4230 int num; \
4231 int iscut=0; \
4232 if (rem>10) { \
4233 rem=10; \
4234 iscut=1; \
4235 } \
4236 cut=10-rem; \
4237 if (RExC_lastparse!=RExC_parse) \
4238 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4239 rem, RExC_parse, \
4240 cut + 4, \
4241 iscut ? "..." : "<" \
4242 ); \
4243 else \
4244 PerlIO_printf(Perl_debug_log,"%16s",""); \
4245 \
4246 if (SIZE_ONLY) \
4247 num=RExC_size; \
4248 else \
4249 num=REG_NODE_NUM(RExC_emit); \
4250 if (RExC_lastnum!=num) \
be8e71aa 4251 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4252 else \
be8e71aa
YO
4253 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4254 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4255 (int)((depth*2)), "", \
3dab1dad
YO
4256 (funcname) \
4257 ); \
4258 RExC_lastnum=num; \
4259 RExC_lastparse=RExC_parse; \
4260})
4261
07be1b83
YO
4262
4263
3dab1dad
YO
4264#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4265 DEBUG_PARSE_MSG((funcname)); \
4266 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4267})
a687059c
LW
4268/*
4269 - reg - regular expression, i.e. main body or parenthesized thing
4270 *
4271 * Caller must absorb opening parenthesis.
4272 *
4273 * Combining parenthesis handling with the base level of regular expression
4274 * is a trifle forced, but the need to tie the tails of the branches to what
4275 * follows makes it hard to avoid.
4276 */
07be1b83
YO
4277#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4278#ifdef DEBUGGING
4279#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4280#else
4281#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4282#endif
3dab1dad 4283
76e3520e 4284STATIC regnode *
3dab1dad 4285S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4286 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4287{
27da23d5 4288 dVAR;
c277df42
IZ
4289 register regnode *ret; /* Will be the head of the group. */
4290 register regnode *br;
4291 register regnode *lastbr;
cbbf8932 4292 register regnode *ender = NULL;
a0d0e21e 4293 register I32 parno = 0;
cbbf8932
AL
4294 I32 flags;
4295 const I32 oregflags = RExC_flags;
6136c704
AL
4296 bool have_branch = 0;
4297 bool is_open = 0;
9d1d55b5
JP
4298
4299 /* for (?g), (?gc), and (?o) warnings; warning
4300 about (?c) will warn about (?g) -- japhy */
4301
6136c704
AL
4302#define WASTED_O 0x01
4303#define WASTED_G 0x02
4304#define WASTED_C 0x04
4305#define WASTED_GC (0x02|0x04)
cbbf8932 4306 I32 wastedflags = 0x00;
9d1d55b5 4307
fac92740 4308 char * parse_start = RExC_parse; /* MJD */
a28509cc 4309 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4310
3dab1dad
YO
4311 GET_RE_DEBUG_FLAGS_DECL;
4312 DEBUG_PARSE("reg ");
4313
4314
821b33a5 4315 *flagp = 0; /* Tentatively. */
a0d0e21e 4316
9d1d55b5 4317
a0d0e21e
LW
4318 /* Make an OPEN node, if parenthesized. */
4319 if (paren) {
fac92740 4320 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4321 U32 posflags = 0, negflags = 0;
4322 U32 *flagsp = &posflags;
6136c704 4323 bool is_logical = 0;
a28509cc 4324 const char * const seqstart = RExC_parse;
ca9dfc88 4325
830247a4
IZ
4326 RExC_parse++;
4327 paren = *RExC_parse++;
c277df42 4328 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4329 switch (paren) {
fac92740 4330 case '<': /* (?<...) */
830247a4 4331 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 4332 if (*RExC_parse == '!')
c277df42 4333 paren = ',';
b81d288d 4334 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 4335 goto unknown;
830247a4 4336 RExC_parse++;
fac92740
MJD
4337 case '=': /* (?=...) */
4338 case '!': /* (?!...) */
830247a4 4339 RExC_seen_zerolen++;
fac92740
MJD
4340 case ':': /* (?:...) */
4341 case '>': /* (?>...) */
a0d0e21e 4342 break;
fac92740
MJD
4343 case '$': /* (?$...) */
4344 case '@': /* (?@...) */
8615cb43 4345 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4346 break;
fac92740 4347 case '#': /* (?#...) */
830247a4
IZ
4348 while (*RExC_parse && *RExC_parse != ')')
4349 RExC_parse++;
4350 if (*RExC_parse != ')')
c277df42 4351 FAIL("Sequence (?#... not terminated");
830247a4 4352 nextchar(pRExC_state);
a0d0e21e
LW
4353 *flagp = TRYAGAIN;
4354 return NULL;
fac92740 4355 case 'p': /* (?p...) */
9014280d 4356 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4357 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4358 /* FALL THROUGH*/
fac92740 4359 case '?': /* (??...) */
6136c704 4360 is_logical = 1;
438a3801
YST
4361 if (*RExC_parse != '{')
4362 goto unknown;
830247a4 4363 paren = *RExC_parse++;
0f5d15d6 4364 /* FALL THROUGH */
fac92740 4365 case '{': /* (?{...}) */
c277df42 4366 {
c277df42
IZ
4367 I32 count = 1, n = 0;
4368 char c;
830247a4 4369 char *s = RExC_parse;
c277df42 4370
830247a4
IZ
4371 RExC_seen_zerolen++;
4372 RExC_seen |= REG_SEEN_EVAL;
4373 while (count && (c = *RExC_parse)) {
6136c704
AL
4374 if (c == '\\') {
4375 if (RExC_parse[1])
4376 RExC_parse++;
4377 }
b81d288d 4378 else if (c == '{')
c277df42 4379 count++;
b81d288d 4380 else if (c == '}')
c277df42 4381 count--;
830247a4 4382 RExC_parse++;
c277df42 4383 }
6136c704 4384 if (*RExC_parse != ')') {
b81d288d 4385 RExC_parse = s;
b45f050a
JF
4386 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4387 }
c277df42 4388 if (!SIZE_ONLY) {
f3548bdc 4389 PAD *pad;
6136c704
AL
4390 OP_4tree *sop, *rop;
4391 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4392
569233ed
SB
4393 ENTER;
4394 Perl_save_re_context(aTHX);
f3548bdc 4395 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4396 sop->op_private |= OPpREFCOUNTED;
4397 /* re_dup will OpREFCNT_inc */
4398 OpREFCNT_set(sop, 1);
569233ed 4399 LEAVE;
c277df42 4400
830247a4
IZ
4401 n = add_data(pRExC_state, 3, "nop");
4402 RExC_rx->data->data[n] = (void*)rop;
4403 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4404 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4405 SvREFCNT_dec(sv);
a0ed51b3 4406 }
e24b16f9 4407 else { /* First pass */
830247a4 4408 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4409 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4410 /* No compiled RE interpolated, has runtime
4411 components ===> unsafe. */
4412 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4413 if (PL_tainting && PL_tainted)
cc6b7395 4414 FAIL("Eval-group in insecure regular expression");
54df2634 4415#if PERL_VERSION > 8
923e4eb5 4416 if (IN_PERL_COMPILETIME)
b5c19bd7 4417 PL_cv_has_eval = 1;
54df2634 4418#endif
c277df42 4419 }
b5c19bd7 4420
830247a4 4421 nextchar(pRExC_state);
6136c704 4422 if (is_logical) {
830247a4 4423 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4424 if (!SIZE_ONLY)
4425 ret->flags = 2;
3dab1dad 4426 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4427 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4428 return ret;
4429 }
ccb2c380
MP
4430 ret = reganode(pRExC_state, EVAL, n);
4431 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4432 Set_Node_Offset(ret, parse_start);
4433 return ret;
c277df42 4434 }
fac92740 4435 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4436 {
fac92740 4437 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4438 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4439 || RExC_parse[1] == '<'
830247a4 4440 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4441 I32 flag;
4442
830247a4 4443 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4444 if (!SIZE_ONLY)
4445 ret->flags = 1;
3dab1dad 4446 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4447 goto insert_if;
b81d288d 4448 }
a0ed51b3 4449 }
830247a4 4450 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4451 /* (?(1)...) */
6136c704 4452 char c;
830247a4 4453 parno = atoi(RExC_parse++);
c277df42 4454
830247a4
IZ
4455 while (isDIGIT(*RExC_parse))
4456 RExC_parse++;
fac92740 4457 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4458
830247a4 4459 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4460 vFAIL("Switch condition not recognized");
c277df42 4461 insert_if:
3dab1dad
YO
4462 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4463 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4464 if (br == NULL)
830247a4 4465 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4466 else
3dab1dad 4467 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4468 c = *nextchar(pRExC_state);
d1b80229
IZ
4469 if (flags&HASWIDTH)
4470 *flagp |= HASWIDTH;
c277df42 4471 if (c == '|') {
830247a4 4472 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4473 regbranch(pRExC_state, &flags, 1,depth+1);
4474 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4475 if (flags&HASWIDTH)
4476 *flagp |= HASWIDTH;
830247a4 4477 c = *nextchar(pRExC_state);
a0ed51b3
LW
4478 }
4479 else
c277df42
IZ
4480 lastbr = NULL;
4481 if (c != ')')
8615cb43 4482 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4483 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4484 REGTAIL(pRExC_state, br, ender);
c277df42 4485 if (lastbr) {
3dab1dad
YO
4486 REGTAIL(pRExC_state, lastbr, ender);
4487 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4488 }
4489 else
3dab1dad 4490 REGTAIL(pRExC_state, ret, ender);
c277df42 4491 return ret;
a0ed51b3
LW
4492 }
4493 else {
830247a4 4494 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4495 }
4496 }
1b1626e4 4497 case 0:
830247a4 4498 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4499 vFAIL("Sequence (? incomplete");
1b1626e4 4500 break;
a0d0e21e 4501 default:
830247a4 4502 --RExC_parse;
fac92740 4503 parse_flags: /* (?i) */
830247a4 4504 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4505 /* (?g), (?gc) and (?o) are useless here
4506 and must be globally applied -- japhy */
4507
4508 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4509 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4510 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4511 if (! (wastedflags & wflagbit) ) {
4512 wastedflags |= wflagbit;
4513 vWARN5(
4514 RExC_parse + 1,
4515 "Useless (%s%c) - %suse /%c modifier",
4516 flagsp == &negflags ? "?-" : "?",
4517 *RExC_parse,
4518 flagsp == &negflags ? "don't " : "",
4519 *RExC_parse
4520 );
4521 }
4522 }
4523 }
4524 else if (*RExC_parse == 'c') {
4525 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4526 if (! (wastedflags & WASTED_C) ) {
4527 wastedflags |= WASTED_GC;
9d1d55b5
JP
4528 vWARN3(
4529 RExC_parse + 1,
4530 "Useless (%sc) - %suse /gc modifier",
4531 flagsp == &negflags ? "?-" : "?",
4532 flagsp == &negflags ? "don't " : ""
4533 );
4534 }
4535 }
4536 }
4537 else { pmflag(flagsp, *RExC_parse); }
4538
830247a4 4539 ++RExC_parse;
ca9dfc88 4540 }
830247a4 4541 if (*RExC_parse == '-') {
ca9dfc88 4542 flagsp = &negflags;
9d1d55b5 4543 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4544 ++RExC_parse;
ca9dfc88 4545 goto parse_flags;
48c036b1 4546 }
e2509266
JH
4547 RExC_flags |= posflags;
4548 RExC_flags &= ~negflags;
830247a4
IZ
4549 if (*RExC_parse == ':') {
4550 RExC_parse++;
ca9dfc88
IZ
4551 paren = ':';
4552 break;
4553 }
c277df42 4554 unknown:
830247a4
IZ
4555 if (*RExC_parse != ')') {
4556 RExC_parse++;
4557 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4558 }
830247a4 4559 nextchar(pRExC_state);
a0d0e21e
LW
4560 *flagp = TRYAGAIN;
4561 return NULL;
4562 }
4563 }
fac92740 4564 else { /* (...) */
830247a4
IZ
4565 parno = RExC_npar;
4566 RExC_npar++;
4567 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4568 Set_Node_Length(ret, 1); /* MJD */
4569 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4570 is_open = 1;
a0d0e21e 4571 }
a0ed51b3 4572 }
fac92740 4573 else /* ! paren */
a0d0e21e
LW
4574 ret = NULL;
4575
4576 /* Pick up the branches, linking them together. */
fac92740 4577 parse_start = RExC_parse; /* MJD */
3dab1dad 4578 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4579 /* branch_len = (paren != 0); */
2af232bd 4580
a0d0e21e
LW
4581 if (br == NULL)
4582 return(NULL);
830247a4
IZ
4583 if (*RExC_parse == '|') {
4584 if (!SIZE_ONLY && RExC_extralen) {
4585 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4586 }
fac92740 4587 else { /* MJD */
830247a4 4588 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4589 Set_Node_Length(br, paren != 0);
4590 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4591 }
c277df42
IZ
4592 have_branch = 1;
4593 if (SIZE_ONLY)
830247a4 4594 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4595 }
4596 else if (paren == ':') {
c277df42
IZ
4597 *flagp |= flags&SIMPLE;
4598 }
6136c704 4599 if (is_open) { /* Starts with OPEN. */
3dab1dad 4600 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4601 }
4602 else if (paren != '?') /* Not Conditional */
a0d0e21e 4603 ret = br;
32a0ca98 4604 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4605 lastbr = br;
830247a4
IZ
4606 while (*RExC_parse == '|') {
4607 if (!SIZE_ONLY && RExC_extralen) {
4608 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4609 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4610 }
4611 if (SIZE_ONLY)
830247a4
IZ
4612 RExC_extralen += 2; /* Account for LONGJMP. */
4613 nextchar(pRExC_state);
3dab1dad 4614 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4615
a687059c 4616 if (br == NULL)
a0d0e21e 4617 return(NULL);
3dab1dad 4618 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4619 lastbr = br;
821b33a5
IZ
4620 if (flags&HASWIDTH)
4621 *flagp |= HASWIDTH;
a687059c 4622 *flagp |= flags&SPSTART;
a0d0e21e
LW
4623 }
4624
c277df42
IZ
4625 if (have_branch || paren != ':') {
4626 /* Make a closing node, and hook it on the end. */
4627 switch (paren) {
4628 case ':':
830247a4 4629 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4630 break;
4631 case 1:
830247a4 4632 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4633 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4634 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4635 break;
4636 case '<':
c277df42
IZ
4637 case ',':
4638 case '=':
4639 case '!':
c277df42 4640 *flagp &= ~HASWIDTH;
821b33a5
IZ
4641 /* FALL THROUGH */
4642 case '>':
830247a4 4643 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4644 break;
4645 case 0:
830247a4 4646 ender = reg_node(pRExC_state, END);
c277df42
IZ
4647 break;
4648 }
07be1b83 4649 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4650
9674d46a 4651 if (have_branch && !SIZE_ONLY) {
c277df42 4652 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4653 for (br = ret; br; br = regnext(br)) {
4654 const U8 op = PL_regkind[OP(br)];
4655 if (op == BRANCH) {
07be1b83 4656 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4657 }
4658 else if (op == BRANCHJ) {
07be1b83 4659 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4660 }
c277df42
IZ
4661 }
4662 }
a0d0e21e 4663 }
c277df42
IZ
4664
4665 {
e1ec3a88
AL
4666 const char *p;
4667 static const char parens[] = "=!<,>";
c277df42
IZ
4668
4669 if (paren && (p = strchr(parens, paren))) {
eb160463 4670 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4671 int flag = (p - parens) > 1;
4672
4673 if (paren == '>')
4674 node = SUSPEND, flag = 0;
830247a4 4675 reginsert(pRExC_state, node,ret);
45948336
EP
4676 Set_Node_Cur_Length(ret);
4677 Set_Node_Offset(ret, parse_start + 1);
c277df42 4678 ret->flags = flag;
07be1b83 4679 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4680 }
a0d0e21e
LW
4681 }
4682
4683 /* Check for proper termination. */
ce3e6498 4684 if (paren) {
e2509266 4685 RExC_flags = oregflags;
830247a4
IZ
4686 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4687 RExC_parse = oregcomp_parse;
380a0633 4688 vFAIL("Unmatched (");
ce3e6498 4689 }
a0ed51b3 4690 }
830247a4
IZ
4691 else if (!paren && RExC_parse < RExC_end) {
4692 if (*RExC_parse == ')') {
4693 RExC_parse++;
380a0633 4694 vFAIL("Unmatched )");
a0ed51b3
LW
4695 }
4696 else
b45f050a 4697 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4698 /* NOTREACHED */
4699 }
a687059c 4700
a0d0e21e 4701 return(ret);
a687059c
LW
4702}
4703
4704/*
4705 - regbranch - one alternative of an | operator
4706 *
4707 * Implements the concatenation operator.
4708 */
76e3520e 4709STATIC regnode *
3dab1dad 4710S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4711{
97aff369 4712 dVAR;
c277df42
IZ
4713 register regnode *ret;
4714 register regnode *chain = NULL;
4715 register regnode *latest;
4716 I32 flags = 0, c = 0;
3dab1dad
YO
4717 GET_RE_DEBUG_FLAGS_DECL;
4718 DEBUG_PARSE("brnc");
b81d288d 4719 if (first)
c277df42
IZ
4720 ret = NULL;
4721 else {
b81d288d 4722 if (!SIZE_ONLY && RExC_extralen)
830247a4 4723 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4724 else {
830247a4 4725 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4726 Set_Node_Length(ret, 1);
4727 }
c277df42
IZ
4728 }
4729
b81d288d 4730 if (!first && SIZE_ONLY)
830247a4 4731 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4732
c277df42 4733 *flagp = WORST; /* Tentatively. */
a0d0e21e 4734
830247a4
IZ
4735 RExC_parse--;
4736 nextchar(pRExC_state);
4737 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4738 flags &= ~TRYAGAIN;
3dab1dad 4739 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4740 if (latest == NULL) {
4741 if (flags & TRYAGAIN)
4742 continue;
4743 return(NULL);
a0ed51b3
LW
4744 }
4745 else if (ret == NULL)
c277df42 4746 ret = latest;
a0d0e21e 4747 *flagp |= flags&HASWIDTH;
c277df42 4748 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4749 *flagp |= flags&SPSTART;
4750 else {
830247a4 4751 RExC_naughty++;
3dab1dad 4752 REGTAIL(pRExC_state, chain, latest);
a687059c 4753 }
a0d0e21e 4754 chain = latest;
c277df42
IZ
4755 c++;
4756 }
4757 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4758 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4759 if (ret == NULL)
4760 ret = chain;
4761 }
4762 if (c == 1) {
4763 *flagp |= flags&SIMPLE;
a0d0e21e 4764 }
a687059c 4765
d4c19fe8 4766 return ret;
a687059c
LW
4767}
4768
4769/*
4770 - regpiece - something followed by possible [*+?]
4771 *
4772 * Note that the branching code sequences used for ? and the general cases
4773 * of * and + are somewhat optimized: they use the same NOTHING node as
4774 * both the endmarker for their branch list and the body of the last branch.
4775 * It might seem that this node could be dispensed with entirely, but the
4776 * endmarker role is not redundant.
4777 */
76e3520e 4778STATIC regnode *
3dab1dad 4779S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4780{
97aff369 4781 dVAR;
c277df42 4782 register regnode *ret;
a0d0e21e
LW
4783 register char op;
4784 register char *next;
4785 I32 flags;
1df70142 4786 const char * const origparse = RExC_parse;
a0d0e21e 4787 I32 min;
c277df42 4788 I32 max = REG_INFTY;
fac92740 4789 char *parse_start;
10edeb5d 4790 const char *maxpos = NULL;
3dab1dad
YO
4791 GET_RE_DEBUG_FLAGS_DECL;
4792 DEBUG_PARSE("piec");
a0d0e21e 4793
3dab1dad 4794 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4795 if (ret == NULL) {
4796 if (flags & TRYAGAIN)
4797 *flagp |= TRYAGAIN;
4798 return(NULL);
4799 }
4800
830247a4 4801 op = *RExC_parse;
a0d0e21e 4802
830247a4 4803 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 4804 maxpos = NULL;
fac92740 4805 parse_start = RExC_parse; /* MJD */
830247a4 4806 next = RExC_parse + 1;
a0d0e21e
LW
4807 while (isDIGIT(*next) || *next == ',') {
4808 if (*next == ',') {
4809 if (maxpos)
4810 break;
4811 else
4812 maxpos = next;
a687059c 4813 }
a0d0e21e
LW
4814 next++;
4815 }
4816 if (*next == '}') { /* got one */
4817 if (!maxpos)
4818 maxpos = next;
830247a4
IZ
4819 RExC_parse++;
4820 min = atoi(RExC_parse);
a0d0e21e
LW
4821 if (*maxpos == ',')
4822 maxpos++;
4823 else
830247a4 4824 maxpos = RExC_parse;
a0d0e21e
LW
4825 max = atoi(maxpos);
4826 if (!max && *maxpos != '0')
c277df42
IZ
4827 max = REG_INFTY; /* meaning "infinity" */
4828 else if (max >= REG_INFTY)
8615cb43 4829 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4830 RExC_parse = next;
4831 nextchar(pRExC_state);
a0d0e21e
LW
4832
4833 do_curly:
4834 if ((flags&SIMPLE)) {
830247a4
IZ
4835 RExC_naughty += 2 + RExC_naughty / 2;
4836 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4837 Set_Node_Offset(ret, parse_start+1); /* MJD */
4838 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4839 }
4840 else {
3dab1dad 4841 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4842
4843 w->flags = 0;
3dab1dad 4844 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4845 if (!SIZE_ONLY && RExC_extralen) {
4846 reginsert(pRExC_state, LONGJMP,ret);
4847 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4848 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4849 }
830247a4 4850 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4851 /* MJD hk */
4852 Set_Node_Offset(ret, parse_start+1);
2af232bd 4853 Set_Node_Length(ret,
fac92740 4854 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4855
830247a4 4856 if (!SIZE_ONLY && RExC_extralen)
c277df42 4857 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4858 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4859 if (SIZE_ONLY)
830247a4
IZ
4860 RExC_whilem_seen++, RExC_extralen += 3;
4861 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4862 }
c277df42 4863 ret->flags = 0;
a0d0e21e
LW
4864
4865 if (min > 0)
821b33a5
IZ
4866 *flagp = WORST;
4867 if (max > 0)
4868 *flagp |= HASWIDTH;
a0d0e21e 4869 if (max && max < min)
8615cb43 4870 vFAIL("Can't do {n,m} with n > m");
c277df42 4871 if (!SIZE_ONLY) {
eb160463
GS
4872 ARG1_SET(ret, (U16)min);
4873 ARG2_SET(ret, (U16)max);
a687059c 4874 }
a687059c 4875
a0d0e21e 4876 goto nest_check;
a687059c 4877 }
a0d0e21e 4878 }
a687059c 4879
a0d0e21e
LW
4880 if (!ISMULT1(op)) {
4881 *flagp = flags;
a687059c 4882 return(ret);
a0d0e21e 4883 }
bb20fd44 4884
c277df42 4885#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4886
4887 /* if this is reinstated, don't forget to put this back into perldiag:
4888
4889 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4890
4891 (F) The part of the regexp subject to either the * or + quantifier
4892 could match an empty string. The {#} shows in the regular
4893 expression about where the problem was discovered.
4894
4895 */
4896
bb20fd44 4897 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4898 vFAIL("Regexp *+ operand could be empty");
b81d288d 4899#endif
bb20fd44 4900
fac92740 4901 parse_start = RExC_parse;
830247a4 4902 nextchar(pRExC_state);
a0d0e21e 4903
821b33a5 4904 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4905
4906 if (op == '*' && (flags&SIMPLE)) {
830247a4 4907 reginsert(pRExC_state, STAR, ret);
c277df42 4908 ret->flags = 0;
830247a4 4909 RExC_naughty += 4;
a0d0e21e
LW
4910 }
4911 else if (op == '*') {
4912 min = 0;
4913 goto do_curly;
a0ed51b3
LW
4914 }
4915 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4916 reginsert(pRExC_state, PLUS, ret);
c277df42 4917 ret->flags = 0;
830247a4 4918 RExC_naughty += 3;
a0d0e21e
LW
4919 }
4920 else if (op == '+') {
4921 min = 1;
4922 goto do_curly;
a0ed51b3
LW
4923 }
4924 else if (op == '?') {
a0d0e21e
LW
4925 min = 0; max = 1;
4926 goto do_curly;
4927 }
4928 nest_check:
041457d9 4929 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4930 vWARN3(RExC_parse,
b45f050a 4931 "%.*s matches null string many times",
afd78fd5 4932 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4933 origparse);
a0d0e21e
LW
4934 }
4935
830247a4
IZ
4936 if (*RExC_parse == '?') {
4937 nextchar(pRExC_state);
4938 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4939 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4940 }
830247a4
IZ
4941 if (ISMULT2(RExC_parse)) {
4942 RExC_parse++;
b45f050a
JF
4943 vFAIL("Nested quantifiers");
4944 }
a0d0e21e
LW
4945
4946 return(ret);
a687059c
LW
4947}
4948
4949/*
4950 - regatom - the lowest level
4951 *
4952 * Optimization: gobbles an entire sequence of ordinary characters so that
4953 * it can turn them into a single node, which is smaller to store and
4954 * faster to run. Backslashed characters are exceptions, each becoming a
4955 * separate node; the code is simpler that way and it's not worth fixing.
4956 *
7f6f358c
YO
4957 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4958 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4959 */
76e3520e 4960STATIC regnode *
3dab1dad 4961S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4962{
97aff369 4963 dVAR;
cbbf8932 4964 register regnode *ret = NULL;
a0d0e21e 4965 I32 flags;
45948336 4966 char *parse_start = RExC_parse;
3dab1dad
YO
4967 GET_RE_DEBUG_FLAGS_DECL;
4968 DEBUG_PARSE("atom");
a0d0e21e
LW
4969 *flagp = WORST; /* Tentatively. */
4970
4971tryagain:
830247a4 4972 switch (*RExC_parse) {
a0d0e21e 4973 case '^':
830247a4
IZ
4974 RExC_seen_zerolen++;
4975 nextchar(pRExC_state);
e2509266 4976 if (RExC_flags & PMf_MULTILINE)
830247a4 4977 ret = reg_node(pRExC_state, MBOL);
e2509266 4978 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4979 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4980 else
830247a4 4981 ret = reg_node(pRExC_state, BOL);
fac92740 4982 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4983 break;
4984 case '$':
830247a4 4985 nextchar(pRExC_state);
b81d288d 4986 if (*RExC_parse)
830247a4 4987 RExC_seen_zerolen++;
e2509266 4988 if (RExC_flags & PMf_MULTILINE)
830247a4 4989 ret = reg_node(pRExC_state, MEOL);
e2509266 4990 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4991 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4992 else
830247a4 4993 ret = reg_node(pRExC_state, EOL);
fac92740 4994 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4995 break;
4996 case '.':
830247a4 4997 nextchar(pRExC_state);
e2509266 4998 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4999 ret = reg_node(pRExC_state, SANY);
5000 else
5001 ret = reg_node(pRExC_state, REG_ANY);
5002 *flagp |= HASWIDTH|SIMPLE;
830247a4 5003 RExC_naughty++;
fac92740 5004 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5005 break;
5006 case '[':
b45f050a 5007 {
3dab1dad
YO
5008 char * const oregcomp_parse = ++RExC_parse;
5009 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5010 if (*RExC_parse != ']') {
5011 RExC_parse = oregcomp_parse;
b45f050a
JF
5012 vFAIL("Unmatched [");
5013 }
830247a4 5014 nextchar(pRExC_state);
a0d0e21e 5015 *flagp |= HASWIDTH|SIMPLE;
fac92740 5016 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5017 break;
b45f050a 5018 }
a0d0e21e 5019 case '(':
830247a4 5020 nextchar(pRExC_state);
3dab1dad 5021 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5022 if (ret == NULL) {
bf93d4cc 5023 if (flags & TRYAGAIN) {
830247a4 5024 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5025 /* Make parent create an empty node if needed. */
5026 *flagp |= TRYAGAIN;
5027 return(NULL);
5028 }
a0d0e21e 5029 goto tryagain;
bf93d4cc 5030 }
a0d0e21e
LW
5031 return(NULL);
5032 }
c277df42 5033 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5034 break;
5035 case '|':
5036 case ')':
5037 if (flags & TRYAGAIN) {
5038 *flagp |= TRYAGAIN;
5039 return NULL;
5040 }
b45f050a 5041 vFAIL("Internal urp");
a0d0e21e
LW
5042 /* Supposed to be caught earlier. */
5043 break;
85afd4ae 5044 case '{':
830247a4
IZ
5045 if (!regcurly(RExC_parse)) {
5046 RExC_parse++;
85afd4ae
CS
5047 goto defchar;
5048 }
5049 /* FALL THROUGH */
a0d0e21e
LW
5050 case '?':
5051 case '+':
5052 case '*':
830247a4 5053 RExC_parse++;
b45f050a 5054 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5055 break;
5056 case '\\':
830247a4 5057 switch (*++RExC_parse) {
a0d0e21e 5058 case 'A':
830247a4
IZ
5059 RExC_seen_zerolen++;
5060 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5061 *flagp |= SIMPLE;
830247a4 5062 nextchar(pRExC_state);
fac92740 5063 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5064 break;
5065 case 'G':
830247a4
IZ
5066 ret = reg_node(pRExC_state, GPOS);
5067 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5068 *flagp |= SIMPLE;
830247a4 5069 nextchar(pRExC_state);
fac92740 5070 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5071 break;
5072 case 'Z':
830247a4 5073 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5074 *flagp |= SIMPLE;
a1917ab9 5075 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5076 nextchar(pRExC_state);
a0d0e21e 5077 break;
b85d18e9 5078 case 'z':
830247a4 5079 ret = reg_node(pRExC_state, EOS);
b85d18e9 5080 *flagp |= SIMPLE;
830247a4
IZ
5081 RExC_seen_zerolen++; /* Do not optimize RE away */
5082 nextchar(pRExC_state);
fac92740 5083 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5084 break;
4a2d328f 5085 case 'C':
f33976b4
DB
5086 ret = reg_node(pRExC_state, CANY);
5087 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5088 *flagp |= HASWIDTH|SIMPLE;
830247a4 5089 nextchar(pRExC_state);
fac92740 5090 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5091 break;
5092 case 'X':
830247a4 5093 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5094 *flagp |= HASWIDTH;
830247a4 5095 nextchar(pRExC_state);
fac92740 5096 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5097 break;
a0d0e21e 5098 case 'w':
eb160463 5099 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5100 *flagp |= HASWIDTH|SIMPLE;
830247a4 5101 nextchar(pRExC_state);
fac92740 5102 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5103 break;
5104 case 'W':
eb160463 5105 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5106 *flagp |= HASWIDTH|SIMPLE;
830247a4 5107 nextchar(pRExC_state);
fac92740 5108 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5109 break;
5110 case 'b':
830247a4
IZ
5111 RExC_seen_zerolen++;
5112 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5113 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5114 *flagp |= SIMPLE;
830247a4 5115 nextchar(pRExC_state);
fac92740 5116 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5117 break;
5118 case 'B':
830247a4
IZ
5119 RExC_seen_zerolen++;
5120 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5121 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5122 *flagp |= SIMPLE;
830247a4 5123 nextchar(pRExC_state);
fac92740 5124 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5125 break;
5126 case 's':
eb160463 5127 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5128 *flagp |= HASWIDTH|SIMPLE;
830247a4 5129 nextchar(pRExC_state);
fac92740 5130 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5131 break;
5132 case 'S':
eb160463 5133 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5134 *flagp |= HASWIDTH|SIMPLE;
830247a4 5135 nextchar(pRExC_state);
fac92740 5136 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5137 break;
5138 case 'd':
ffc61ed2 5139 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5140 *flagp |= HASWIDTH|SIMPLE;
830247a4 5141 nextchar(pRExC_state);
fac92740 5142 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5143 break;
5144 case 'D':
ffc61ed2 5145 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5146 *flagp |= HASWIDTH|SIMPLE;
830247a4 5147 nextchar(pRExC_state);
fac92740 5148 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5149 break;
a14b48bc
LW
5150 case 'p':
5151 case 'P':
3568d838 5152 {
3dab1dad 5153 char* const oldregxend = RExC_end;
ccb2c380 5154 char* parse_start = RExC_parse - 2;
a14b48bc 5155
830247a4 5156 if (RExC_parse[1] == '{') {
3568d838 5157 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5158 RExC_end = strchr(RExC_parse, '}');
5159 if (!RExC_end) {
3dab1dad 5160 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5161 RExC_parse += 2;
5162 RExC_end = oldregxend;
0da60cf5 5163 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 5164 }
830247a4 5165 RExC_end++;
a14b48bc 5166 }
af6f566e 5167 else {
830247a4 5168 RExC_end = RExC_parse + 2;
af6f566e
HS
5169 if (RExC_end > oldregxend)
5170 RExC_end = oldregxend;
5171 }
830247a4 5172 RExC_parse--;
a14b48bc 5173
3dab1dad 5174 ret = regclass(pRExC_state,depth+1);
a14b48bc 5175
830247a4
IZ
5176 RExC_end = oldregxend;
5177 RExC_parse--;
ccb2c380
MP
5178
5179 Set_Node_Offset(ret, parse_start + 2);
5180 Set_Node_Cur_Length(ret);
830247a4 5181 nextchar(pRExC_state);
a14b48bc
LW
5182 *flagp |= HASWIDTH|SIMPLE;
5183 }
5184 break;
a0d0e21e
LW
5185 case 'n':
5186 case 'r':
5187 case 't':
5188 case 'f':
5189 case 'e':
5190 case 'a':
5191 case 'x':
5192 case 'c':
5193 case '0':
5194 goto defchar;
5195 case '1': case '2': case '3': case '4':
5196 case '5': case '6': case '7': case '8': case '9':
5197 {
1df70142 5198 const I32 num = atoi(RExC_parse);
a0d0e21e 5199
830247a4 5200 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
5201 goto defchar;
5202 else {
3dab1dad 5203 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
5204 while (isDIGIT(*RExC_parse))
5205 RExC_parse++;
b45f050a 5206
eb160463 5207 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 5208 vFAIL("Reference to nonexistent group");
830247a4 5209 RExC_sawback = 1;
eb160463
GS
5210 ret = reganode(pRExC_state,
5211 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5212 num);
a0d0e21e 5213 *flagp |= HASWIDTH;
2af232bd 5214
fac92740 5215 /* override incorrect value set in reganode MJD */
2af232bd 5216 Set_Node_Offset(ret, parse_start+1);
fac92740 5217 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
5218 RExC_parse--;
5219 nextchar(pRExC_state);
a0d0e21e
LW
5220 }
5221 }
5222 break;
5223 case '\0':
830247a4 5224 if (RExC_parse >= RExC_end)
b45f050a 5225 FAIL("Trailing \\");
a0d0e21e
LW
5226 /* FALL THROUGH */
5227 default:
a0288114 5228 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 5229 back into the quick-grab loop below */
45948336 5230 parse_start--;
a0d0e21e
LW
5231 goto defchar;
5232 }
5233 break;
4633a7c4
LW
5234
5235 case '#':
e2509266 5236 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
5237 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5238 RExC_parse++;
830247a4 5239 if (RExC_parse < RExC_end)
4633a7c4
LW
5240 goto tryagain;
5241 }
5242 /* FALL THROUGH */
5243
a0d0e21e 5244 default: {
ba210ebe 5245 register STRLEN len;
58ae7d3f 5246 register UV ender;
a0d0e21e 5247 register char *p;
3dab1dad 5248 char *s;
80aecb99 5249 STRLEN foldlen;
89ebb4a3 5250 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
5251
5252 parse_start = RExC_parse - 1;
a0d0e21e 5253
830247a4 5254 RExC_parse++;
a0d0e21e
LW
5255
5256 defchar:
58ae7d3f 5257 ender = 0;
eb160463
GS
5258 ret = reg_node(pRExC_state,
5259 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 5260 s = STRING(ret);
830247a4
IZ
5261 for (len = 0, p = RExC_parse - 1;
5262 len < 127 && p < RExC_end;
a0d0e21e
LW
5263 len++)
5264 {
3dab1dad 5265 char * const oldp = p;
5b5a24f7 5266
e2509266 5267 if (RExC_flags & PMf_EXTENDED)
830247a4 5268 p = regwhite(p, RExC_end);
a0d0e21e
LW
5269 switch (*p) {
5270 case '^':
5271 case '$':
5272 case '.':
5273 case '[':
5274 case '(':
5275 case ')':
5276 case '|':
5277 goto loopdone;
5278 case '\\':
5279 switch (*++p) {
5280 case 'A':
1ed8eac0
JF
5281 case 'C':
5282 case 'X':
a0d0e21e
LW
5283 case 'G':
5284 case 'Z':
b85d18e9 5285 case 'z':
a0d0e21e
LW
5286 case 'w':
5287 case 'W':
5288 case 'b':
5289 case 'B':
5290 case 's':
5291 case 'S':
5292 case 'd':
5293 case 'D':
a14b48bc
LW
5294 case 'p':
5295 case 'P':
a0d0e21e
LW
5296 --p;
5297 goto loopdone;
5298 case 'n':
5299 ender = '\n';
5300 p++;
a687059c 5301 break;
a0d0e21e
LW
5302 case 'r':
5303 ender = '\r';
5304 p++;
a687059c 5305 break;
a0d0e21e
LW
5306 case 't':
5307 ender = '\t';
5308 p++;
a687059c 5309 break;
a0d0e21e
LW
5310 case 'f':
5311 ender = '\f';
5312 p++;
a687059c 5313 break;
a0d0e21e 5314 case 'e':
c7f1f016 5315 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 5316 p++;
a687059c 5317 break;
a0d0e21e 5318 case 'a':
c7f1f016 5319 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 5320 p++;
a687059c 5321 break;
a0d0e21e 5322 case 'x':
a0ed51b3 5323 if (*++p == '{') {
1df70142 5324 char* const e = strchr(p, '}');
b81d288d 5325
b45f050a 5326 if (!e) {
830247a4 5327 RExC_parse = p + 1;
b45f050a
JF
5328 vFAIL("Missing right brace on \\x{}");
5329 }
de5f0749 5330 else {
a4c04bdc
NC
5331 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5332 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 5333 STRLEN numlen = e - p - 1;
53305cf1 5334 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
5335 if (ender > 0xff)
5336 RExC_utf8 = 1;
a0ed51b3
LW
5337 p = e + 1;
5338 }
a0ed51b3
LW
5339 }
5340 else {
a4c04bdc 5341 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 5342 STRLEN numlen = 2;
53305cf1 5343 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
5344 p += numlen;
5345 }
a687059c 5346 break;
a0d0e21e
LW
5347 case 'c':
5348 p++;
bbce6d69 5349 ender = UCHARAT(p++);
5350 ender = toCTRL(ender);
a687059c 5351 break;
a0d0e21e
LW
5352 case '0': case '1': case '2': case '3':case '4':
5353 case '5': case '6': case '7': case '8':case '9':
5354 if (*p == '0' ||
830247a4 5355 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 5356 I32 flags = 0;
1df70142 5357 STRLEN numlen = 3;
53305cf1 5358 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
5359 p += numlen;
5360 }
5361 else {
5362 --p;
5363 goto loopdone;
a687059c
LW
5364 }
5365 break;
a0d0e21e 5366 case '\0':
830247a4 5367 if (p >= RExC_end)
b45f050a 5368 FAIL("Trailing \\");
a687059c 5369 /* FALL THROUGH */
a0d0e21e 5370 default:
041457d9 5371 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 5372 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 5373 goto normal_default;
a0d0e21e
LW
5374 }
5375 break;
a687059c 5376 default:
a0ed51b3 5377 normal_default:
fd400ab9 5378 if (UTF8_IS_START(*p) && UTF) {
1df70142 5379 STRLEN numlen;
5e12f4fb 5380 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 5381 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
5382 p += numlen;
5383 }
5384 else
5385 ender = *p++;
a0d0e21e 5386 break;
a687059c 5387 }
e2509266 5388 if (RExC_flags & PMf_EXTENDED)
830247a4 5389 p = regwhite(p, RExC_end);
60a8b682
JH
5390 if (UTF && FOLD) {
5391 /* Prime the casefolded buffer. */
ac7e0132 5392 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 5393 }
a0d0e21e
LW
5394 if (ISMULT2(p)) { /* Back off on ?+*. */
5395 if (len)
5396 p = oldp;
16ea2a2e 5397 else if (UTF) {
80aecb99 5398 if (FOLD) {
60a8b682 5399 /* Emit all the Unicode characters. */
1df70142 5400 STRLEN numlen;
80aecb99
JH
5401 for (foldbuf = tmpbuf;
5402 foldlen;
5403 foldlen -= numlen) {
5404 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5405 if (numlen > 0) {
71207a34 5406 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5407 s += unilen;
5408 len += unilen;
5409 /* In EBCDIC the numlen
5410 * and unilen can differ. */
9dc45d57 5411 foldbuf += numlen;
47654450
JH
5412 if (numlen >= foldlen)
5413 break;
9dc45d57
JH
5414 }
5415 else
5416 break; /* "Can't happen." */
80aecb99
JH
5417 }
5418 }
5419 else {
71207a34 5420 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5421 if (unilen > 0) {
0ebc6274
JH
5422 s += unilen;
5423 len += unilen;
9dc45d57 5424 }
80aecb99 5425 }
a0ed51b3 5426 }
a0d0e21e
LW
5427 else {
5428 len++;
eb160463 5429 REGC((char)ender, s++);
a0d0e21e
LW
5430 }
5431 break;
a687059c 5432 }
16ea2a2e 5433 if (UTF) {
80aecb99 5434 if (FOLD) {
60a8b682 5435 /* Emit all the Unicode characters. */
1df70142 5436 STRLEN numlen;
80aecb99
JH
5437 for (foldbuf = tmpbuf;
5438 foldlen;
5439 foldlen -= numlen) {
5440 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5441 if (numlen > 0) {
71207a34 5442 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5443 len += unilen;
5444 s += unilen;
5445 /* In EBCDIC the numlen
5446 * and unilen can differ. */
9dc45d57 5447 foldbuf += numlen;
47654450
JH
5448 if (numlen >= foldlen)
5449 break;
9dc45d57
JH
5450 }
5451 else
5452 break;
80aecb99
JH
5453 }
5454 }
5455 else {
71207a34 5456 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5457 if (unilen > 0) {
0ebc6274
JH
5458 s += unilen;
5459 len += unilen;
9dc45d57 5460 }
80aecb99
JH
5461 }
5462 len--;
a0ed51b3
LW
5463 }
5464 else
eb160463 5465 REGC((char)ender, s++);
a0d0e21e
LW
5466 }
5467 loopdone:
830247a4 5468 RExC_parse = p - 1;
fac92740 5469 Set_Node_Cur_Length(ret); /* MJD */
830247a4 5470 nextchar(pRExC_state);
793db0cb
JH
5471 {
5472 /* len is STRLEN which is unsigned, need to copy to signed */
5473 IV iv = len;
5474 if (iv < 0)
5475 vFAIL("Internal disaster");
5476 }
a0d0e21e
LW
5477 if (len > 0)
5478 *flagp |= HASWIDTH;
090f7165 5479 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 5480 *flagp |= SIMPLE;
3dab1dad 5481
cd439c50 5482 if (SIZE_ONLY)
830247a4 5483 RExC_size += STR_SZ(len);
3dab1dad
YO
5484 else {
5485 STR_LEN(ret) = len;
830247a4 5486 RExC_emit += STR_SZ(len);
07be1b83 5487 }
3dab1dad 5488 }
a0d0e21e
LW
5489 break;
5490 }
a687059c 5491
60a8b682
JH
5492 /* If the encoding pragma is in effect recode the text of
5493 * any EXACT-kind nodes. */
3dab1dad
YO
5494 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5495 const STRLEN oldlen = STR_LEN(ret);
5496 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
5497
5498 if (RExC_utf8)
5499 SvUTF8_on(sv);
5500 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5501 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5502 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5503
5504 if (SvUTF8(sv))
5505 RExC_utf8 = 1;
5506 if (!SIZE_ONLY) {
a3621e74
YO
5507 GET_RE_DEBUG_FLAGS_DECL;
5508 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5509 (int)oldlen, STRING(ret),
5510 (int)newlen, s));
5511 Copy(s, STRING(ret), newlen, char);
5512 STR_LEN(ret) += newlen - oldlen;
5513 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5514 } else
5515 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5516 }
a72c7584
JH
5517 }
5518
a0d0e21e 5519 return(ret);
a687059c
LW
5520}
5521
873ef191 5522STATIC char *
5f66b61c 5523S_regwhite(char *p, const char *e)
5b5a24f7
CS
5524{
5525 while (p < e) {
5526 if (isSPACE(*p))
5527 ++p;
5528 else if (*p == '#') {
5529 do {
5530 p++;
5531 } while (p < e && *p != '\n');
5532 }
5533 else
5534 break;
5535 }
5536 return p;
5537}
5538
b8c5462f
JH
5539/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5540 Character classes ([:foo:]) can also be negated ([:^foo:]).
5541 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5542 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5543 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5544
5545#define POSIXCC_DONE(c) ((c) == ':')
5546#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5547#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5548
b8c5462f 5549STATIC I32
830247a4 5550S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5551{
97aff369 5552 dVAR;
936ed897 5553 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5554
830247a4 5555 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5556 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5557 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5558 const char c = UCHARAT(RExC_parse);
097eb12c 5559 char* const s = RExC_parse++;
b81d288d 5560
9a86a77b 5561 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5562 RExC_parse++;
5563 if (RExC_parse == RExC_end)
620e46c5 5564 /* Grandfather lone [:, [=, [. */
830247a4 5565 RExC_parse = s;
620e46c5 5566 else {
3dab1dad 5567 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5568 assert(*t == c);
5569
9a86a77b 5570 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5571 const char *posixcc = s + 1;
830247a4 5572 RExC_parse++; /* skip over the ending ] */
3dab1dad 5573
b8c5462f 5574 if (*s == ':') {
1df70142
AL
5575 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5576 const I32 skip = t - posixcc;
80916619
NC
5577
5578 /* Initially switch on the length of the name. */
5579 switch (skip) {
5580 case 4:
3dab1dad
YO
5581 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5582 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5583 break;
80916619
NC
5584 case 5:
5585 /* Names all of length 5. */
5586 /* alnum alpha ascii blank cntrl digit graph lower
5587 print punct space upper */
5588 /* Offset 4 gives the best switch position. */
5589 switch (posixcc[4]) {
5590 case 'a':
3dab1dad
YO
5591 if (memEQ(posixcc, "alph", 4)) /* alpha */
5592 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5593 break;
5594 case 'e':
3dab1dad
YO
5595 if (memEQ(posixcc, "spac", 4)) /* space */
5596 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5597 break;
5598 case 'h':
3dab1dad
YO
5599 if (memEQ(posixcc, "grap", 4)) /* graph */
5600 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5601 break;
5602 case 'i':
3dab1dad
YO
5603 if (memEQ(posixcc, "asci", 4)) /* ascii */
5604 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5605 break;
5606 case 'k':
3dab1dad
YO
5607 if (memEQ(posixcc, "blan", 4)) /* blank */
5608 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5609 break;
5610 case 'l':
3dab1dad
YO
5611 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5612 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5613 break;
5614 case 'm':
3dab1dad
YO
5615 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5616 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5617 break;
5618 case 'r':
3dab1dad
YO
5619 if (memEQ(posixcc, "lowe", 4)) /* lower */
5620 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5621 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5622 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5623 break;
5624 case 't':
3dab1dad
YO
5625 if (memEQ(posixcc, "digi", 4)) /* digit */
5626 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5627 else if (memEQ(posixcc, "prin", 4)) /* print */
5628 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5629 else if (memEQ(posixcc, "punc", 4)) /* punct */
5630 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5631 break;
b8c5462f
JH
5632 }
5633 break;
80916619 5634 case 6:
3dab1dad
YO
5635 if (memEQ(posixcc, "xdigit", 6))
5636 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5637 break;
5638 }
80916619
NC
5639
5640 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5641 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5642 t - s - 1, s + 1);
80916619
NC
5643 assert (posixcc[skip] == ':');
5644 assert (posixcc[skip+1] == ']');
b45f050a 5645 } else if (!SIZE_ONLY) {
b8c5462f 5646 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5647
830247a4 5648 /* adjust RExC_parse so the warning shows after
b45f050a 5649 the class closes */
9a86a77b 5650 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5651 RExC_parse++;
b45f050a
JF
5652 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5653 }
b8c5462f
JH
5654 } else {
5655 /* Maternal grandfather:
5656 * "[:" ending in ":" but not in ":]" */
830247a4 5657 RExC_parse = s;
767d463e 5658 }
620e46c5
JH
5659 }
5660 }
5661
b8c5462f
JH
5662 return namedclass;
5663}
5664
5665STATIC void
830247a4 5666S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5667{
97aff369 5668 dVAR;
3dab1dad 5669 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5670 const char *s = RExC_parse;
5671 const char c = *s++;
b8c5462f 5672
3dab1dad 5673 while (isALNUM(*s))
b8c5462f
JH
5674 s++;
5675 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5676 if (ckWARN(WARN_REGEXP))
5677 vWARN3(s+2,
5678 "POSIX syntax [%c %c] belongs inside character classes",
5679 c, c);
b45f050a
JF
5680
5681 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5682 if (POSIXCC_NOTYET(c)) {
830247a4 5683 /* adjust RExC_parse so the error shows after
b45f050a 5684 the class closes */
9a86a77b 5685 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5686 NOOP;
b45f050a
JF
5687 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5688 }
b8c5462f
JH
5689 }
5690 }
620e46c5
JH
5691}
5692
7f6f358c
YO
5693
5694/*
5695 parse a class specification and produce either an ANYOF node that
5696 matches the pattern. If the pattern matches a single char only and
5697 that char is < 256 then we produce an EXACT node instead.
5698*/
76e3520e 5699STATIC regnode *
3dab1dad 5700S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5701{
97aff369 5702 dVAR;
07be1b83 5703 register UV value;
9a86a77b 5704 register UV nextvalue;
3568d838 5705 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5706 register IV range = 0;
c277df42 5707 register regnode *ret;
ba210ebe 5708 STRLEN numlen;
ffc61ed2 5709 IV namedclass;
cbbf8932 5710 char *rangebegin = NULL;
936ed897 5711 bool need_class = 0;
c445ea15 5712 SV *listsv = NULL;
ffc61ed2 5713 UV n;
9e55ce06 5714 bool optimize_invert = TRUE;
cbbf8932 5715 AV* unicode_alternate = NULL;
1b2d223b
JH
5716#ifdef EBCDIC
5717 UV literal_endpoint = 0;
5718#endif
7f6f358c 5719 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5720
3dab1dad 5721 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5722 case we need to change the emitted regop to an EXACT. */
07be1b83 5723 const char * orig_parse = RExC_parse;
72f13be8 5724 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
5725#ifndef DEBUGGING
5726 PERL_UNUSED_ARG(depth);
5727#endif
72f13be8 5728
3dab1dad 5729 DEBUG_PARSE("clas");
7f6f358c
YO
5730
5731 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5732 ret = reganode(pRExC_state, ANYOF, 0);
5733
5734 if (!SIZE_ONLY)
5735 ANYOF_FLAGS(ret) = 0;
5736
9a86a77b 5737 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5738 RExC_naughty++;
5739 RExC_parse++;
5740 if (!SIZE_ONLY)
5741 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5742 }
a0d0e21e 5743
73060fc4 5744 if (SIZE_ONLY) {
830247a4 5745 RExC_size += ANYOF_SKIP;
73060fc4
JH
5746 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5747 }
936ed897 5748 else {
830247a4 5749 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5750 if (FOLD)
5751 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5752 if (LOC)
5753 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5754 ANYOF_BITMAP_ZERO(ret);
396482e1 5755 listsv = newSVpvs("# comment\n");
a0d0e21e 5756 }
b8c5462f 5757
9a86a77b
JH
5758 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5759
b938889d 5760 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5761 checkposixcc(pRExC_state);
b8c5462f 5762
f064b6ad
HS
5763 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5764 if (UCHARAT(RExC_parse) == ']')
5765 goto charclassloop;
ffc61ed2 5766
9a86a77b 5767 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5768
5769 charclassloop:
5770
5771 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5772
73b437c8 5773 if (!range)
830247a4 5774 rangebegin = RExC_parse;
ffc61ed2 5775 if (UTF) {
5e12f4fb 5776 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5777 RExC_end - RExC_parse,
9f7f3913 5778 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5779 RExC_parse += numlen;
5780 }
5781 else
5782 value = UCHARAT(RExC_parse++);
7f6f358c 5783
9a86a77b
JH
5784 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5785 if (value == '[' && POSIXCC(nextvalue))
830247a4 5786 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5787 else if (value == '\\') {
ffc61ed2 5788 if (UTF) {
5e12f4fb 5789 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5790 RExC_end - RExC_parse,
9f7f3913 5791 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5792 RExC_parse += numlen;
5793 }
5794 else
5795 value = UCHARAT(RExC_parse++);
470c3474 5796 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5797 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5798 * be a problem later if we want switch on Unicode.
5799 * A similar issue a little bit later when switching on
5800 * namedclass. --jhi */
ffc61ed2 5801 switch ((I32)value) {
b8c5462f
JH
5802 case 'w': namedclass = ANYOF_ALNUM; break;
5803 case 'W': namedclass = ANYOF_NALNUM; break;
5804 case 's': namedclass = ANYOF_SPACE; break;
5805 case 'S': namedclass = ANYOF_NSPACE; break;
5806 case 'd': namedclass = ANYOF_DIGIT; break;
5807 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5808 case 'p':
5809 case 'P':
3dab1dad
YO
5810 {
5811 char *e;
af6f566e 5812 if (RExC_parse >= RExC_end)
2a4859cd 5813 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5814 if (*RExC_parse == '{') {
1df70142 5815 const U8 c = (U8)value;
ffc61ed2
JH
5816 e = strchr(RExC_parse++, '}');
5817 if (!e)
0da60cf5 5818 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5819 while (isSPACE(UCHARAT(RExC_parse)))
5820 RExC_parse++;
5821 if (e == RExC_parse)
0da60cf5 5822 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5823 n = e - RExC_parse;
ab13f0c7
JH
5824 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5825 n--;
ffc61ed2
JH
5826 }
5827 else {
5828 e = RExC_parse;
5829 n = 1;
5830 }
5831 if (!SIZE_ONLY) {
ab13f0c7
JH
5832 if (UCHARAT(RExC_parse) == '^') {
5833 RExC_parse++;
5834 n--;
5835 value = value == 'p' ? 'P' : 'p'; /* toggle */
5836 while (isSPACE(UCHARAT(RExC_parse))) {
5837 RExC_parse++;
5838 n--;
5839 }
5840 }
097eb12c
AL
5841 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5842 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5843 }
5844 RExC_parse = e + 1;
5845 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5846 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5847 }
f81125e2 5848 break;
b8c5462f
JH
5849 case 'n': value = '\n'; break;
5850 case 'r': value = '\r'; break;
5851 case 't': value = '\t'; break;
5852 case 'f': value = '\f'; break;
5853 case 'b': value = '\b'; break;
c7f1f016
NIS
5854 case 'e': value = ASCII_TO_NATIVE('\033');break;
5855 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5856 case 'x':
ffc61ed2 5857 if (*RExC_parse == '{') {
a4c04bdc
NC
5858 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5859 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5860 char * const e = strchr(RExC_parse++, '}');
b81d288d 5861 if (!e)
ffc61ed2 5862 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5863
5864 numlen = e - RExC_parse;
5865 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5866 RExC_parse = e + 1;
5867 }
5868 else {
a4c04bdc 5869 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5870 numlen = 2;
5871 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5872 RExC_parse += numlen;
5873 }
b8c5462f
JH
5874 break;
5875 case 'c':
830247a4 5876 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5877 value = toCTRL(value);
5878 break;
5879 case '0': case '1': case '2': case '3': case '4':
5880 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5881 {
5882 I32 flags = 0;
5883 numlen = 3;
5884 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5885 RExC_parse += numlen;
b8c5462f 5886 break;
53305cf1 5887 }
1028017a 5888 default:
041457d9 5889 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5890 vWARN2(RExC_parse,
5891 "Unrecognized escape \\%c in character class passed through",
5892 (int)value);
1028017a 5893 break;
b8c5462f 5894 }
ffc61ed2 5895 } /* end of \blah */
1b2d223b
JH
5896#ifdef EBCDIC
5897 else
5898 literal_endpoint++;
5899#endif
ffc61ed2
JH
5900
5901 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5902
5903 if (!SIZE_ONLY && !need_class)
936ed897 5904 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5905
936ed897 5906 need_class = 1;
ffc61ed2
JH
5907
5908 /* a bad range like a-\d, a-[:digit:] ? */
5909 if (range) {
73b437c8 5910 if (!SIZE_ONLY) {
afd78fd5 5911 if (ckWARN(WARN_REGEXP)) {
097eb12c 5912 const int w =
afd78fd5
JH
5913 RExC_parse >= rangebegin ?
5914 RExC_parse - rangebegin : 0;
830247a4 5915 vWARN4(RExC_parse,
b45f050a 5916 "False [] range \"%*.*s\"",
097eb12c 5917 w, w, rangebegin);
afd78fd5 5918 }
3568d838
JH
5919 if (prevvalue < 256) {
5920 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5921 ANYOF_BITMAP_SET(ret, '-');
5922 }
5923 else {
5924 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5925 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5926 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5927 }
b8c5462f 5928 }
ffc61ed2
JH
5929
5930 range = 0; /* this was not a true range */
73b437c8 5931 }
ffc61ed2 5932
73b437c8 5933 if (!SIZE_ONLY) {
c49a72a9
NC
5934 const char *what = NULL;
5935 char yesno = 0;
5936
3568d838
JH
5937 if (namedclass > OOB_NAMEDCLASS)
5938 optimize_invert = FALSE;
e2962f66
JH
5939 /* Possible truncation here but in some 64-bit environments
5940 * the compiler gets heartburn about switch on 64-bit values.
5941 * A similar issue a little earlier when switching on value.
98f323fa 5942 * --jhi */
e2962f66 5943 switch ((I32)namedclass) {
73b437c8
JH
5944 case ANYOF_ALNUM:
5945 if (LOC)
936ed897 5946 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5947 else {
5948 for (value = 0; value < 256; value++)
5949 if (isALNUM(value))
936ed897 5950 ANYOF_BITMAP_SET(ret, value);
73b437c8 5951 }
c49a72a9
NC
5952 yesno = '+';
5953 what = "Word";
73b437c8
JH
5954 break;
5955 case ANYOF_NALNUM:
5956 if (LOC)
936ed897 5957 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5958 else {
5959 for (value = 0; value < 256; value++)
5960 if (!isALNUM(value))
936ed897 5961 ANYOF_BITMAP_SET(ret, value);
73b437c8 5962 }
c49a72a9
NC
5963 yesno = '!';
5964 what = "Word";
73b437c8 5965 break;
ffc61ed2 5966 case ANYOF_ALNUMC:
73b437c8 5967 if (LOC)
ffc61ed2 5968 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5969 else {
5970 for (value = 0; value < 256; value++)
ffc61ed2 5971 if (isALNUMC(value))
936ed897 5972 ANYOF_BITMAP_SET(ret, value);
73b437c8 5973 }
c49a72a9
NC
5974 yesno = '+';
5975 what = "Alnum";
73b437c8
JH
5976 break;
5977 case ANYOF_NALNUMC:
5978 if (LOC)
936ed897 5979 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5980 else {
5981 for (value = 0; value < 256; value++)
5982 if (!isALNUMC(value))
936ed897 5983 ANYOF_BITMAP_SET(ret, value);
73b437c8 5984 }
c49a72a9
NC
5985 yesno = '!';
5986 what = "Alnum";
73b437c8
JH
5987 break;
5988 case ANYOF_ALPHA:
5989 if (LOC)
936ed897 5990 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5991 else {
5992 for (value = 0; value < 256; value++)
5993 if (isALPHA(value))
936ed897 5994 ANYOF_BITMAP_SET(ret, value);
73b437c8 5995 }
c49a72a9
NC
5996 yesno = '+';
5997 what = "Alpha";
73b437c8
JH
5998 break;
5999 case ANYOF_NALPHA:
6000 if (LOC)
936ed897 6001 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6002 else {
6003 for (value = 0; value < 256; value++)
6004 if (!isALPHA(value))
936ed897 6005 ANYOF_BITMAP_SET(ret, value);
73b437c8 6006 }
c49a72a9
NC
6007 yesno = '!';
6008 what = "Alpha";
73b437c8
JH
6009 break;
6010 case ANYOF_ASCII:
6011 if (LOC)
936ed897 6012 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6013 else {
c7f1f016 6014#ifndef EBCDIC
1ba5c669
JH
6015 for (value = 0; value < 128; value++)
6016 ANYOF_BITMAP_SET(ret, value);
6017#else /* EBCDIC */
ffbc6a93 6018 for (value = 0; value < 256; value++) {
3a3c4447
JH
6019 if (isASCII(value))
6020 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6021 }
1ba5c669 6022#endif /* EBCDIC */
73b437c8 6023 }
c49a72a9
NC
6024 yesno = '+';
6025 what = "ASCII";
73b437c8
JH
6026 break;
6027 case ANYOF_NASCII:
6028 if (LOC)
936ed897 6029 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6030 else {
c7f1f016 6031#ifndef EBCDIC
1ba5c669
JH
6032 for (value = 128; value < 256; value++)
6033 ANYOF_BITMAP_SET(ret, value);
6034#else /* EBCDIC */
ffbc6a93 6035 for (value = 0; value < 256; value++) {
3a3c4447
JH
6036 if (!isASCII(value))
6037 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6038 }
1ba5c669 6039#endif /* EBCDIC */
73b437c8 6040 }
c49a72a9
NC
6041 yesno = '!';
6042 what = "ASCII";
73b437c8 6043 break;
aaa51d5e
JF
6044 case ANYOF_BLANK:
6045 if (LOC)
6046 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6047 else {
6048 for (value = 0; value < 256; value++)
6049 if (isBLANK(value))
6050 ANYOF_BITMAP_SET(ret, value);
6051 }
c49a72a9
NC
6052 yesno = '+';
6053 what = "Blank";
aaa51d5e
JF
6054 break;
6055 case ANYOF_NBLANK:
6056 if (LOC)
6057 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6058 else {
6059 for (value = 0; value < 256; value++)
6060 if (!isBLANK(value))
6061 ANYOF_BITMAP_SET(ret, value);
6062 }
c49a72a9
NC
6063 yesno = '!';
6064 what = "Blank";
aaa51d5e 6065 break;
73b437c8
JH
6066 case ANYOF_CNTRL:
6067 if (LOC)
936ed897 6068 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6069 else {
6070 for (value = 0; value < 256; value++)
6071 if (isCNTRL(value))
936ed897 6072 ANYOF_BITMAP_SET(ret, value);
73b437c8 6073 }
c49a72a9
NC
6074 yesno = '+';
6075 what = "Cntrl";
73b437c8
JH
6076 break;
6077 case ANYOF_NCNTRL:
6078 if (LOC)
936ed897 6079 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6080 else {
6081 for (value = 0; value < 256; value++)
6082 if (!isCNTRL(value))
936ed897 6083 ANYOF_BITMAP_SET(ret, value);
73b437c8 6084 }
c49a72a9
NC
6085 yesno = '!';
6086 what = "Cntrl";
ffc61ed2
JH
6087 break;
6088 case ANYOF_DIGIT:
6089 if (LOC)
6090 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6091 else {
6092 /* consecutive digits assumed */
6093 for (value = '0'; value <= '9'; value++)
6094 ANYOF_BITMAP_SET(ret, value);
6095 }
c49a72a9
NC
6096 yesno = '+';
6097 what = "Digit";
ffc61ed2
JH
6098 break;
6099 case ANYOF_NDIGIT:
6100 if (LOC)
6101 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6102 else {
6103 /* consecutive digits assumed */
6104 for (value = 0; value < '0'; value++)
6105 ANYOF_BITMAP_SET(ret, value);
6106 for (value = '9' + 1; value < 256; value++)
6107 ANYOF_BITMAP_SET(ret, value);
6108 }
c49a72a9
NC
6109 yesno = '!';
6110 what = "Digit";
73b437c8
JH
6111 break;
6112 case ANYOF_GRAPH:
6113 if (LOC)
936ed897 6114 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
6115 else {
6116 for (value = 0; value < 256; value++)
6117 if (isGRAPH(value))
936ed897 6118 ANYOF_BITMAP_SET(ret, value);
73b437c8 6119 }
c49a72a9
NC
6120 yesno = '+';
6121 what = "Graph";
73b437c8
JH
6122 break;
6123 case ANYOF_NGRAPH:
6124 if (LOC)
936ed897 6125 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
6126 else {
6127 for (value = 0; value < 256; value++)
6128 if (!isGRAPH(value))
936ed897 6129 ANYOF_BITMAP_SET(ret, value);
73b437c8 6130 }
c49a72a9
NC
6131 yesno = '!';
6132 what = "Graph";
73b437c8
JH
6133 break;
6134 case ANYOF_LOWER:
6135 if (LOC)
936ed897 6136 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
6137 else {
6138 for (value = 0; value < 256; value++)
6139 if (isLOWER(value))
936ed897 6140 ANYOF_BITMAP_SET(ret, value);
73b437c8 6141 }
c49a72a9
NC
6142 yesno = '+';
6143 what = "Lower";
73b437c8
JH
6144 break;
6145 case ANYOF_NLOWER:
6146 if (LOC)
936ed897 6147 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
6148 else {
6149 for (value = 0; value < 256; value++)
6150 if (!isLOWER(value))
936ed897 6151 ANYOF_BITMAP_SET(ret, value);
73b437c8 6152 }
c49a72a9
NC
6153 yesno = '!';
6154 what = "Lower";
73b437c8
JH
6155 break;
6156 case ANYOF_PRINT:
6157 if (LOC)
936ed897 6158 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
6159 else {
6160 for (value = 0; value < 256; value++)
6161 if (isPRINT(value))
936ed897 6162 ANYOF_BITMAP_SET(ret, value);
73b437c8 6163 }
c49a72a9
NC
6164 yesno = '+';
6165 what = "Print";
73b437c8
JH
6166 break;
6167 case ANYOF_NPRINT:
6168 if (LOC)
936ed897 6169 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
6170 else {
6171 for (value = 0; value < 256; value++)
6172 if (!isPRINT(value))
936ed897 6173 ANYOF_BITMAP_SET(ret, value);
73b437c8 6174 }
c49a72a9
NC
6175 yesno = '!';
6176 what = "Print";
73b437c8 6177 break;
aaa51d5e
JF
6178 case ANYOF_PSXSPC:
6179 if (LOC)
6180 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6181 else {
6182 for (value = 0; value < 256; value++)
6183 if (isPSXSPC(value))
6184 ANYOF_BITMAP_SET(ret, value);
6185 }
c49a72a9
NC
6186 yesno = '+';
6187 what = "Space";
aaa51d5e
JF
6188 break;
6189 case ANYOF_NPSXSPC:
6190 if (LOC)
6191 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6192 else {
6193 for (value = 0; value < 256; value++)
6194 if (!isPSXSPC(value))
6195 ANYOF_BITMAP_SET(ret, value);
6196 }
c49a72a9
NC
6197 yesno = '!';
6198 what = "Space";
aaa51d5e 6199 break;
73b437c8
JH
6200 case ANYOF_PUNCT:
6201 if (LOC)
936ed897 6202 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
6203 else {
6204 for (value = 0; value < 256; value++)
6205 if (isPUNCT(value))
936ed897 6206 ANYOF_BITMAP_SET(ret, value);
73b437c8 6207 }
c49a72a9
NC
6208 yesno = '+';
6209 what = "Punct";
73b437c8
JH
6210 break;
6211 case ANYOF_NPUNCT:
6212 if (LOC)
936ed897 6213 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
6214 else {
6215 for (value = 0; value < 256; value++)
6216 if (!isPUNCT(value))
936ed897 6217 ANYOF_BITMAP_SET(ret, value);
73b437c8 6218 }
c49a72a9
NC
6219 yesno = '!';
6220 what = "Punct";
ffc61ed2
JH
6221 break;
6222 case ANYOF_SPACE:
6223 if (LOC)
6224 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6225 else {
6226 for (value = 0; value < 256; value++)
6227 if (isSPACE(value))
6228 ANYOF_BITMAP_SET(ret, value);
6229 }
c49a72a9
NC
6230 yesno = '+';
6231 what = "SpacePerl";
ffc61ed2
JH
6232 break;
6233 case ANYOF_NSPACE:
6234 if (LOC)
6235 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6236 else {
6237 for (value = 0; value < 256; value++)
6238 if (!isSPACE(value))
6239 ANYOF_BITMAP_SET(ret, value);
6240 }
c49a72a9
NC
6241 yesno = '!';
6242 what = "SpacePerl";
73b437c8
JH
6243 break;
6244 case ANYOF_UPPER:
6245 if (LOC)
936ed897 6246 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
6247 else {
6248 for (value = 0; value < 256; value++)
6249 if (isUPPER(value))
936ed897 6250 ANYOF_BITMAP_SET(ret, value);
73b437c8 6251 }
c49a72a9
NC
6252 yesno = '+';
6253 what = "Upper";
73b437c8
JH
6254 break;
6255 case ANYOF_NUPPER:
6256 if (LOC)
936ed897 6257 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
6258 else {
6259 for (value = 0; value < 256; value++)
6260 if (!isUPPER(value))
936ed897 6261 ANYOF_BITMAP_SET(ret, value);
73b437c8 6262 }
c49a72a9
NC
6263 yesno = '!';
6264 what = "Upper";
73b437c8
JH
6265 break;
6266 case ANYOF_XDIGIT:
6267 if (LOC)
936ed897 6268 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
6269 else {
6270 for (value = 0; value < 256; value++)
6271 if (isXDIGIT(value))
936ed897 6272 ANYOF_BITMAP_SET(ret, value);
73b437c8 6273 }
c49a72a9
NC
6274 yesno = '+';
6275 what = "XDigit";
73b437c8
JH
6276 break;
6277 case ANYOF_NXDIGIT:
6278 if (LOC)
936ed897 6279 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
6280 else {
6281 for (value = 0; value < 256; value++)
6282 if (!isXDIGIT(value))
936ed897 6283 ANYOF_BITMAP_SET(ret, value);
73b437c8 6284 }
c49a72a9
NC
6285 yesno = '!';
6286 what = "XDigit";
73b437c8 6287 break;
f81125e2
JP
6288 case ANYOF_MAX:
6289 /* this is to handle \p and \P */
6290 break;
73b437c8 6291 default:
b45f050a 6292 vFAIL("Invalid [::] class");
73b437c8 6293 break;
b8c5462f 6294 }
c49a72a9
NC
6295 if (what) {
6296 /* Strings such as "+utf8::isWord\n" */
6297 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6298 }
b8c5462f 6299 if (LOC)
936ed897 6300 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 6301 continue;
a0d0e21e 6302 }
ffc61ed2
JH
6303 } /* end of namedclass \blah */
6304
a0d0e21e 6305 if (range) {
eb160463 6306 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
6307 const int w = RExC_parse - rangebegin;
6308 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 6309 range = 0; /* not a valid range */
73b437c8 6310 }
a0d0e21e
LW
6311 }
6312 else {
3568d838 6313 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
6314 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6315 RExC_parse[1] != ']') {
6316 RExC_parse++;
ffc61ed2
JH
6317
6318 /* a bad range like \w-, [:word:]- ? */
6319 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 6320 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 6321 const int w =
afd78fd5
JH
6322 RExC_parse >= rangebegin ?
6323 RExC_parse - rangebegin : 0;
830247a4 6324 vWARN4(RExC_parse,
b45f050a 6325 "False [] range \"%*.*s\"",
097eb12c 6326 w, w, rangebegin);
afd78fd5 6327 }
73b437c8 6328 if (!SIZE_ONLY)
936ed897 6329 ANYOF_BITMAP_SET(ret, '-');
73b437c8 6330 } else
ffc61ed2
JH
6331 range = 1; /* yeah, it's a range! */
6332 continue; /* but do it the next time */
a0d0e21e 6333 }
a687059c 6334 }
ffc61ed2 6335
93733859 6336 /* now is the next time */
07be1b83 6337 /*stored += (value - prevvalue + 1);*/
ae5c130c 6338 if (!SIZE_ONLY) {
3568d838 6339 if (prevvalue < 256) {
1df70142 6340 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 6341 IV i;
3568d838 6342#ifdef EBCDIC
1b2d223b
JH
6343 /* In EBCDIC [\x89-\x91] should include
6344 * the \x8e but [i-j] should not. */
6345 if (literal_endpoint == 2 &&
6346 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6347 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 6348 {
3568d838
JH
6349 if (isLOWER(prevvalue)) {
6350 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6351 if (isLOWER(i))
6352 ANYOF_BITMAP_SET(ret, i);
6353 } else {
3568d838 6354 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6355 if (isUPPER(i))
6356 ANYOF_BITMAP_SET(ret, i);
6357 }
8ada0baa 6358 }
ffc61ed2 6359 else
8ada0baa 6360#endif
07be1b83
YO
6361 for (i = prevvalue; i <= ceilvalue; i++) {
6362 if (!ANYOF_BITMAP_TEST(ret,i)) {
6363 stored++;
6364 ANYOF_BITMAP_SET(ret, i);
6365 }
6366 }
3568d838 6367 }
a5961de5 6368 if (value > 255 || UTF) {
1df70142
AL
6369 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6370 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 6371 stored+=2; /* can't optimize this class */
ffc61ed2 6372 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 6373 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 6374 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
6375 prevnatvalue, natvalue);
6376 }
6377 else if (prevnatvalue == natvalue) {
6378 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 6379 if (FOLD) {
89ebb4a3 6380 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 6381 STRLEN foldlen;
1df70142 6382 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 6383
e294cc5d
JH
6384#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6385 if (RExC_precomp[0] == ':' &&
6386 RExC_precomp[1] == '[' &&
6387 (f == 0xDF || f == 0x92)) {
6388 f = NATIVE_TO_UNI(f);
6389 }
6390#endif
c840d2a2
JH
6391 /* If folding and foldable and a single
6392 * character, insert also the folded version
6393 * to the charclass. */
9e55ce06 6394 if (f != value) {
e294cc5d
JH
6395#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6396 if ((RExC_precomp[0] == ':' &&
6397 RExC_precomp[1] == '[' &&
6398 (f == 0xA2 &&
6399 (value == 0xFB05 || value == 0xFB06))) ?
6400 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6401 foldlen == (STRLEN)UNISKIP(f) )
6402#else
eb160463 6403 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 6404#endif
9e55ce06
JH
6405 Perl_sv_catpvf(aTHX_ listsv,
6406 "%04"UVxf"\n", f);
6407 else {
6408 /* Any multicharacter foldings
6409 * require the following transform:
6410 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6411 * where E folds into "pq" and F folds
6412 * into "rst", all other characters
6413 * fold to single characters. We save
6414 * away these multicharacter foldings,
6415 * to be later saved as part of the
6416 * additional "s" data. */
6417 SV *sv;
6418
6419 if (!unicode_alternate)
6420 unicode_alternate = newAV();
6421 sv = newSVpvn((char*)foldbuf, foldlen);
6422 SvUTF8_on(sv);
6423 av_push(unicode_alternate, sv);
6424 }
6425 }
254ba52a 6426
60a8b682
JH
6427 /* If folding and the value is one of the Greek
6428 * sigmas insert a few more sigmas to make the
6429 * folding rules of the sigmas to work right.
6430 * Note that not all the possible combinations
6431 * are handled here: some of them are handled
9e55ce06
JH
6432 * by the standard folding rules, and some of
6433 * them (literal or EXACTF cases) are handled
6434 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
6435 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6436 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6437 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 6438 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6439 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6440 }
6441 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6442 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6443 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6444 }
6445 }
ffc61ed2 6446 }
1b2d223b
JH
6447#ifdef EBCDIC
6448 literal_endpoint = 0;
6449#endif
8ada0baa 6450 }
ffc61ed2
JH
6451
6452 range = 0; /* this range (if it was one) is done now */
a0d0e21e 6453 }
ffc61ed2 6454
936ed897 6455 if (need_class) {
4f66b38d 6456 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 6457 if (SIZE_ONLY)
830247a4 6458 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 6459 else
830247a4 6460 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 6461 }
ffc61ed2 6462
7f6f358c
YO
6463
6464 if (SIZE_ONLY)
6465 return ret;
6466 /****** !SIZE_ONLY AFTER HERE *********/
6467
6468 if( stored == 1 && value < 256
6469 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6470 ) {
6471 /* optimize single char class to an EXACT node
6472 but *only* when its not a UTF/high char */
07be1b83
YO
6473 const char * cur_parse= RExC_parse;
6474 RExC_emit = (regnode *)orig_emit;
6475 RExC_parse = (char *)orig_parse;
7f6f358c
YO
6476 ret = reg_node(pRExC_state,
6477 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 6478 RExC_parse = (char *)cur_parse;
7f6f358c
YO
6479 *STRING(ret)= (char)value;
6480 STR_LEN(ret)= 1;
6481 RExC_emit += STR_SZ(1);
6482 return ret;
6483 }
ae5c130c 6484 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 6485 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
6486 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6487 ) {
a0ed51b3 6488 for (value = 0; value < 256; ++value) {
936ed897 6489 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 6490 UV fold = PL_fold[value];
ffc61ed2
JH
6491
6492 if (fold != value)
6493 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
6494 }
6495 }
936ed897 6496 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 6497 }
ffc61ed2 6498
ae5c130c 6499 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 6500 if (optimize_invert &&
ffc61ed2
JH
6501 /* If the only flag is inversion. */
6502 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 6503 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 6504 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 6505 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 6506 }
7f6f358c 6507 {
097eb12c 6508 AV * const av = newAV();
ffc61ed2 6509 SV *rv;
9e55ce06 6510 /* The 0th element stores the character class description
6a0407ee 6511 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
6512 * to initialize the appropriate swash (which gets stored in
6513 * the 1st element), and also useful for dumping the regnode.
6514 * The 2nd element stores the multicharacter foldings,
6a0407ee 6515 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6516 av_store(av, 0, listsv);
6517 av_store(av, 1, NULL);
9e55ce06 6518 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6519 rv = newRV_noinc((SV*)av);
19860706 6520 n = add_data(pRExC_state, 1, "s");
830247a4 6521 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6522 ARG_SET(ret, n);
a0ed51b3 6523 }
a0ed51b3
LW
6524 return ret;
6525}
6526
76e3520e 6527STATIC char*
830247a4 6528S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6529{
097eb12c 6530 char* const retval = RExC_parse++;
a0d0e21e 6531
4633a7c4 6532 for (;;) {
830247a4
IZ
6533 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6534 RExC_parse[2] == '#') {
e994fd66
AE
6535 while (*RExC_parse != ')') {
6536 if (RExC_parse == RExC_end)
6537 FAIL("Sequence (?#... not terminated");
830247a4 6538 RExC_parse++;
e994fd66 6539 }
830247a4 6540 RExC_parse++;
4633a7c4
LW
6541 continue;
6542 }
e2509266 6543 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6544 if (isSPACE(*RExC_parse)) {
6545 RExC_parse++;
748a9306
LW
6546 continue;
6547 }
830247a4 6548 else if (*RExC_parse == '#') {
e994fd66
AE
6549 while (RExC_parse < RExC_end)
6550 if (*RExC_parse++ == '\n') break;
748a9306
LW
6551 continue;
6552 }
748a9306 6553 }
4633a7c4 6554 return retval;
a0d0e21e 6555 }
a687059c
LW
6556}
6557
6558/*
c277df42 6559- reg_node - emit a node
a0d0e21e 6560*/
76e3520e 6561STATIC regnode * /* Location. */
830247a4 6562S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6563{
97aff369 6564 dVAR;
c277df42 6565 register regnode *ptr;
504618e9 6566 regnode * const ret = RExC_emit;
07be1b83 6567 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6568
c277df42 6569 if (SIZE_ONLY) {
830247a4
IZ
6570 SIZE_ALIGN(RExC_size);
6571 RExC_size += 1;
a0d0e21e
LW
6572 return(ret);
6573 }
c277df42 6574 NODE_ALIGN_FILL(ret);
a0d0e21e 6575 ptr = ret;
c277df42 6576 FILL_ADVANCE_NODE(ptr, op);
fac92740 6577 if (RExC_offsets) { /* MJD */
07be1b83 6578 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6579 "reg_node", __LINE__,
6580 reg_name[op],
07be1b83
YO
6581 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6582 ? "Overwriting end of array!\n" : "OK",
6583 (UV)(RExC_emit - RExC_emit_start),
6584 (UV)(RExC_parse - RExC_start),
6585 (UV)RExC_offsets[0]));
ccb2c380 6586 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6587 }
07be1b83 6588
830247a4 6589 RExC_emit = ptr;
a687059c 6590
a0d0e21e 6591 return(ret);
a687059c
LW
6592}
6593
6594/*
a0d0e21e
LW
6595- reganode - emit a node with an argument
6596*/
76e3520e 6597STATIC regnode * /* Location. */
830247a4 6598S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6599{
97aff369 6600 dVAR;
c277df42 6601 register regnode *ptr;
504618e9 6602 regnode * const ret = RExC_emit;
07be1b83 6603 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6604
c277df42 6605 if (SIZE_ONLY) {
830247a4
IZ
6606 SIZE_ALIGN(RExC_size);
6607 RExC_size += 2;
a0d0e21e
LW
6608 return(ret);
6609 }
fe14fcc3 6610
c277df42 6611 NODE_ALIGN_FILL(ret);
a0d0e21e 6612 ptr = ret;
c277df42 6613 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6614 if (RExC_offsets) { /* MJD */
07be1b83 6615 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6616 "reganode",
ccb2c380
MP
6617 __LINE__,
6618 reg_name[op],
07be1b83 6619 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6620 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6621 (UV)(RExC_emit - RExC_emit_start),
6622 (UV)(RExC_parse - RExC_start),
6623 (UV)RExC_offsets[0]));
ccb2c380 6624 Set_Cur_Node_Offset;
fac92740
MJD
6625 }
6626
830247a4 6627 RExC_emit = ptr;
fe14fcc3 6628
a0d0e21e 6629 return(ret);
fe14fcc3
LW
6630}
6631
6632/*
cd439c50 6633- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6634*/
71207a34
AL
6635STATIC STRLEN
6636S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6637{
97aff369 6638 dVAR;
71207a34 6639 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6640}
6641
6642/*
a0d0e21e
LW
6643- reginsert - insert an operator in front of already-emitted operand
6644*
6645* Means relocating the operand.
6646*/
76e3520e 6647STATIC void
830247a4 6648S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6649{
97aff369 6650 dVAR;
c277df42
IZ
6651 register regnode *src;
6652 register regnode *dst;
6653 register regnode *place;
504618e9 6654 const int offset = regarglen[(U8)op];
07be1b83 6655 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6656/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6657
6658 if (SIZE_ONLY) {
830247a4 6659 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6660 return;
6661 }
a687059c 6662
830247a4
IZ
6663 src = RExC_emit;
6664 RExC_emit += NODE_STEP_REGNODE + offset;
6665 dst = RExC_emit;
fac92740 6666 while (src > opnd) {
c277df42 6667 StructCopy(--src, --dst, regnode);
fac92740 6668 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6669 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6670 "reg_insert",
ccb2c380
MP
6671 __LINE__,
6672 reg_name[op],
07be1b83
YO
6673 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6674 ? "Overwriting end of array!\n" : "OK",
6675 (UV)(src - RExC_emit_start),
6676 (UV)(dst - RExC_emit_start),
6677 (UV)RExC_offsets[0]));
ccb2c380
MP
6678 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6679 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6680 }
6681 }
6682
a0d0e21e
LW
6683
6684 place = opnd; /* Op node, where operand used to be. */
fac92740 6685 if (RExC_offsets) { /* MJD */
07be1b83 6686 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6687 "reginsert",
ccb2c380
MP
6688 __LINE__,
6689 reg_name[op],
07be1b83 6690 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6691 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6692 (UV)(place - RExC_emit_start),
6693 (UV)(RExC_parse - RExC_start),
786e8c11 6694 (UV)RExC_offsets[0]));
ccb2c380 6695 Set_Node_Offset(place, RExC_parse);
45948336 6696 Set_Node_Length(place, 1);
fac92740 6697 }
c277df42
IZ
6698 src = NEXTOPER(place);
6699 FILL_ADVANCE_NODE(place, op);
6700 Zero(src, offset, regnode);
a687059c
LW
6701}
6702
6703/*
c277df42 6704- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6705- SEE ALSO: regtail_study
a0d0e21e 6706*/
097eb12c 6707/* TODO: All three parms should be const */
76e3520e 6708STATIC void
3dab1dad 6709S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6710{
97aff369 6711 dVAR;
c277df42 6712 register regnode *scan;
72f13be8 6713 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
6714#ifndef DEBUGGING
6715 PERL_UNUSED_ARG(depth);
6716#endif
a0d0e21e 6717
c277df42 6718 if (SIZE_ONLY)
a0d0e21e
LW
6719 return;
6720
6721 /* Find last node. */
6722 scan = p;
6723 for (;;) {
504618e9 6724 regnode * const temp = regnext(scan);
3dab1dad
YO
6725 DEBUG_PARSE_r({
6726 SV * const mysv=sv_newmortal();
6727 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6728 regprop(RExC_rx, mysv, scan);
6729 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6730 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6731 });
6732 if (temp == NULL)
6733 break;
6734 scan = temp;
6735 }
6736
6737 if (reg_off_by_arg[OP(scan)]) {
6738 ARG_SET(scan, val - scan);
6739 }
6740 else {
6741 NEXT_OFF(scan) = val - scan;
6742 }
6743}
6744
07be1b83 6745#ifdef DEBUGGING
3dab1dad
YO
6746/*
6747- regtail_study - set the next-pointer at the end of a node chain of p to val.
6748- Look for optimizable sequences at the same time.
6749- currently only looks for EXACT chains.
07be1b83
YO
6750
6751This is expermental code. The idea is to use this routine to perform
6752in place optimizations on branches and groups as they are constructed,
6753with the long term intention of removing optimization from study_chunk so
6754that it is purely analytical.
6755
6756Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6757to control which is which.
6758
3dab1dad
YO
6759*/
6760/* TODO: All four parms should be const */
07be1b83 6761
3dab1dad
YO
6762STATIC U8
6763S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6764{
6765 dVAR;
6766 register regnode *scan;
07be1b83
YO
6767 U8 exact = PSEUDO;
6768#ifdef EXPERIMENTAL_INPLACESCAN
6769 I32 min = 0;
6770#endif
6771
3dab1dad
YO
6772 GET_RE_DEBUG_FLAGS_DECL;
6773
07be1b83 6774
3dab1dad
YO
6775 if (SIZE_ONLY)
6776 return exact;
6777
6778 /* Find last node. */
6779
6780 scan = p;
6781 for (;;) {
6782 regnode * const temp = regnext(scan);
07be1b83
YO
6783#ifdef EXPERIMENTAL_INPLACESCAN
6784 if (PL_regkind[OP(scan)] == EXACT)
6785 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6786 return EXACT;
6787#endif
3dab1dad
YO
6788 if ( exact ) {
6789 switch (OP(scan)) {
6790 case EXACT:
6791 case EXACTF:
6792 case EXACTFL:
6793 if( exact == PSEUDO )
6794 exact= OP(scan);
07be1b83
YO
6795 else if ( exact != OP(scan) )
6796 exact= 0;
3dab1dad
YO
6797 case NOTHING:
6798 break;
6799 default:
6800 exact= 0;
6801 }
6802 }
6803 DEBUG_PARSE_r({
6804 SV * const mysv=sv_newmortal();
6805 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6806 regprop(RExC_rx, mysv, scan);
6807 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6808 SvPV_nolen_const(mysv),
6809 reg_name[exact],
6810 REG_NODE_NUM(scan));
6811 });
a0d0e21e
LW
6812 if (temp == NULL)
6813 break;
6814 scan = temp;
6815 }
07be1b83
YO
6816 DEBUG_PARSE_r({
6817 SV * const mysv_val=sv_newmortal();
6818 DEBUG_PARSE_MSG("");
6819 regprop(RExC_rx, mysv_val, val);
6820 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6821 SvPV_nolen_const(mysv_val),
6822 REG_NODE_NUM(val),
6823 val - scan
6824 );
6825 });
c277df42
IZ
6826 if (reg_off_by_arg[OP(scan)]) {
6827 ARG_SET(scan, val - scan);
a0ed51b3
LW
6828 }
6829 else {
c277df42
IZ
6830 NEXT_OFF(scan) = val - scan;
6831 }
3dab1dad
YO
6832
6833 return exact;
a687059c 6834}
07be1b83 6835#endif
a687059c
LW
6836
6837/*
a687059c
LW
6838 - regcurly - a little FSA that accepts {\d+,?\d*}
6839 */
79072805 6840STATIC I32
5f66b61c 6841S_regcurly(register const char *s)
a687059c
LW
6842{
6843 if (*s++ != '{')
6844 return FALSE;
f0fcb552 6845 if (!isDIGIT(*s))
a687059c 6846 return FALSE;
f0fcb552 6847 while (isDIGIT(*s))
a687059c
LW
6848 s++;
6849 if (*s == ',')
6850 s++;
f0fcb552 6851 while (isDIGIT(*s))
a687059c
LW
6852 s++;
6853 if (*s != '}')
6854 return FALSE;
6855 return TRUE;
6856}
6857
a687059c
LW
6858
6859/*
fd181c75 6860 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6861 */
6862void
097eb12c 6863Perl_regdump(pTHX_ const regexp *r)
a687059c 6864{
35ff7856 6865#ifdef DEBUGGING
97aff369 6866 dVAR;
c445ea15 6867 SV * const sv = sv_newmortal();
ab3bbdeb 6868 SV *dsv= sv_newmortal();
a687059c 6869
786e8c11 6870 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
6871
6872 /* Header fields of interest. */
ab3bbdeb
YO
6873 if (r->anchored_substr) {
6874 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6875 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 6876 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6877 "anchored %s%s at %"IVdf" ",
6878 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 6879 (IV)r->anchored_offset);
ab3bbdeb
YO
6880 } else if (r->anchored_utf8) {
6881 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6882 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 6883 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6884 "anchored utf8 %s%s at %"IVdf" ",
6885 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 6886 (IV)r->anchored_offset);
ab3bbdeb
YO
6887 }
6888 if (r->float_substr) {
6889 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6890 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 6891 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6892 "floating %s%s at %"IVdf"..%"UVuf" ",
6893 s, RE_SV_TAIL(r->float_substr),
7b0972df 6894 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
6895 } else if (r->float_utf8) {
6896 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6897 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 6898 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6899 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6900 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 6901 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 6902 }
33b8afdf 6903 if (r->check_substr || r->check_utf8)
b81d288d 6904 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
6905 (const char *)
6906 (r->check_substr == r->float_substr
6907 && r->check_utf8 == r->float_utf8
6908 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
6909 if (r->reganch & ROPT_NOSCAN)
6910 PerlIO_printf(Perl_debug_log, " noscan");
6911 if (r->reganch & ROPT_CHECK_ALL)
6912 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6913 if (r->check_substr || r->check_utf8)
c277df42
IZ
6914 PerlIO_printf(Perl_debug_log, ") ");
6915
46fc3d4c 6916 if (r->regstclass) {
32fc9b6a 6917 regprop(r, sv, r->regstclass);
1de06328 6918 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 6919 }
774d564b 6920 if (r->reganch & ROPT_ANCH) {
6921 PerlIO_printf(Perl_debug_log, "anchored");
6922 if (r->reganch & ROPT_ANCH_BOL)
6923 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6924 if (r->reganch & ROPT_ANCH_MBOL)
6925 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6926 if (r->reganch & ROPT_ANCH_SBOL)
6927 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6928 if (r->reganch & ROPT_ANCH_GPOS)
6929 PerlIO_printf(Perl_debug_log, "(GPOS)");
6930 PerlIO_putc(Perl_debug_log, ' ');
6931 }
c277df42
IZ
6932 if (r->reganch & ROPT_GPOS_SEEN)
6933 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6934 if (r->reganch & ROPT_SKIP)
760ac839 6935 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6936 if (r->reganch & ROPT_IMPLICIT)
760ac839 6937 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6938 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6939 if (r->reganch & ROPT_EVAL_SEEN)
6940 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6941 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6942#else
96a5add6 6943 PERL_UNUSED_CONTEXT;
65e66c80 6944 PERL_UNUSED_ARG(r);
17c3b450 6945#endif /* DEBUGGING */
a687059c
LW
6946}
6947
6948/*
a0d0e21e
LW
6949- regprop - printable representation of opcode
6950*/
46fc3d4c 6951void
32fc9b6a 6952Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6953{
35ff7856 6954#ifdef DEBUGGING
97aff369 6955 dVAR;
9b155405 6956 register int k;
1de06328 6957 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 6958
54dc92de 6959 sv_setpvn(sv, "", 0);
9b155405 6960 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6961 /* It would be nice to FAIL() here, but this may be called from
6962 regexec.c, and it would be hard to supply pRExC_state. */
6963 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6964 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6965
3dab1dad 6966 k = PL_regkind[OP(o)];
9b155405 6967
2a782b5b 6968 if (k == EXACT) {
396482e1 6969 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
6970 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6971 * is a crude hack but it may be the best for now since
6972 * we have no flag "this EXACTish node was UTF-8"
6973 * --jhi */
6974 const char * const s =
ddc5bc0f 6975 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
6976 PL_colors[0], PL_colors[1],
6977 PERL_PV_ESCAPE_UNI_DETECT |
6978 PERL_PV_PRETTY_ELIPSES |
6979 PERL_PV_PRETTY_LTGT
6980 );
6981 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 6982 } else if (k == TRIE) {
3dab1dad 6983 /* print the details of the trie in dumpuntil instead, as
4f639d21 6984 * prog->data isn't available here */
1de06328
YO
6985 const char op = OP(o);
6986 const I32 n = ARG(o);
6987 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
6988 (reg_ac_data *)prog->data->data[n] :
6989 NULL;
6990 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
6991 (reg_trie_data*)prog->data->data[n] :
6992 ac->trie;
6993
6994 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6995 DEBUG_TRIE_COMPILE_r(
6996 Perl_sv_catpvf(aTHX_ sv,
6997 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
6998 (UV)trie->startstate,
6999 (IV)trie->laststate-1,
7000 (UV)trie->wordcount,
7001 (UV)trie->minlen,
7002 (UV)trie->maxlen,
7003 (UV)TRIE_CHARCOUNT(trie),
7004 (UV)trie->uniquecharcount
7005 )
7006 );
7007 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7008 int i;
7009 int rangestart = -1;
7010 U8* bitmap = IS_ANYOF_TRIE(op) ? ANYOF_BITMAP(o) : TRIE_BITMAP(trie);
7011 Perl_sv_catpvf(aTHX_ sv, "[");
7012 for (i = 0; i <= 256; i++) {
7013 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7014 if (rangestart == -1)
7015 rangestart = i;
7016 } else if (rangestart != -1) {
7017 if (i <= rangestart + 3)
7018 for (; rangestart < i; rangestart++)
7019 put_byte(sv, rangestart);
7020 else {
7021 put_byte(sv, rangestart);
7022 sv_catpvs(sv, "-");
7023 put_byte(sv, i - 1);
7024 }
7025 rangestart = -1;
7026 }
7027 }
7028 Perl_sv_catpvf(aTHX_ sv, "]");
7029 }
7030
a3621e74 7031 } else if (k == CURLY) {
cb434fcc 7032 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7033 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7034 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7035 }
2c2d71f5
JH
7036 else if (k == WHILEM && o->flags) /* Ordinal/of */
7037 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 7038 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 7039 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 7040 else if (k == LOGICAL)
04ebc1ab 7041 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7042 else if (k == ANYOF) {
7043 int i, rangestart = -1;
2d03de9c 7044 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7045
7046 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7047 static const char * const anyofs[] = {
653099ff
GS
7048 "\\w",
7049 "\\W",
7050 "\\s",
7051 "\\S",
7052 "\\d",
7053 "\\D",
7054 "[:alnum:]",
7055 "[:^alnum:]",
7056 "[:alpha:]",
7057 "[:^alpha:]",
7058 "[:ascii:]",
7059 "[:^ascii:]",
7060 "[:ctrl:]",
7061 "[:^ctrl:]",
7062 "[:graph:]",
7063 "[:^graph:]",
7064 "[:lower:]",
7065 "[:^lower:]",
7066 "[:print:]",
7067 "[:^print:]",
7068 "[:punct:]",
7069 "[:^punct:]",
7070 "[:upper:]",
aaa51d5e 7071 "[:^upper:]",
653099ff 7072 "[:xdigit:]",
aaa51d5e
JF
7073 "[:^xdigit:]",
7074 "[:space:]",
7075 "[:^space:]",
7076 "[:blank:]",
7077 "[:^blank:]"
653099ff
GS
7078 };
7079
19860706 7080 if (flags & ANYOF_LOCALE)
396482e1 7081 sv_catpvs(sv, "{loc}");
19860706 7082 if (flags & ANYOF_FOLD)
396482e1 7083 sv_catpvs(sv, "{i}");
653099ff 7084 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 7085 if (flags & ANYOF_INVERT)
396482e1 7086 sv_catpvs(sv, "^");
ffc61ed2
JH
7087 for (i = 0; i <= 256; i++) {
7088 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7089 if (rangestart == -1)
7090 rangestart = i;
7091 } else if (rangestart != -1) {
7092 if (i <= rangestart + 3)
7093 for (; rangestart < i; rangestart++)
653099ff 7094 put_byte(sv, rangestart);
ffc61ed2
JH
7095 else {
7096 put_byte(sv, rangestart);
396482e1 7097 sv_catpvs(sv, "-");
ffc61ed2 7098 put_byte(sv, i - 1);
653099ff 7099 }
ffc61ed2 7100 rangestart = -1;
653099ff 7101 }
847a199f 7102 }
ffc61ed2
JH
7103
7104 if (o->flags & ANYOF_CLASS)
bb7a0f54 7105 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
7106 if (ANYOF_CLASS_TEST(o,i))
7107 sv_catpv(sv, anyofs[i]);
7108
7109 if (flags & ANYOF_UNICODE)
396482e1 7110 sv_catpvs(sv, "{unicode}");
1aa99e6b 7111 else if (flags & ANYOF_UNICODE_ALL)
396482e1 7112 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
7113
7114 {
7115 SV *lv;
32fc9b6a 7116 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 7117
ffc61ed2
JH
7118 if (lv) {
7119 if (sw) {
89ebb4a3 7120 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 7121
ffc61ed2 7122 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 7123 uvchr_to_utf8(s, i);
ffc61ed2 7124
3568d838 7125 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
7126 if (rangestart == -1)
7127 rangestart = i;
7128 } else if (rangestart != -1) {
ffc61ed2
JH
7129 if (i <= rangestart + 3)
7130 for (; rangestart < i; rangestart++) {
2d03de9c
AL
7131 const U8 * const e = uvchr_to_utf8(s,rangestart);
7132 U8 *p;
7133 for(p = s; p < e; p++)
ffc61ed2
JH
7134 put_byte(sv, *p);
7135 }
7136 else {
2d03de9c
AL
7137 const U8 *e = uvchr_to_utf8(s,rangestart);
7138 U8 *p;
7139 for (p = s; p < e; p++)
ffc61ed2 7140 put_byte(sv, *p);
396482e1 7141 sv_catpvs(sv, "-");
2d03de9c
AL
7142 e = uvchr_to_utf8(s, i-1);
7143 for (p = s; p < e; p++)
1df70142 7144 put_byte(sv, *p);
ffc61ed2
JH
7145 }
7146 rangestart = -1;
7147 }
19860706 7148 }
ffc61ed2 7149
396482e1 7150 sv_catpvs(sv, "..."); /* et cetera */
19860706 7151 }
fde631ed 7152
ffc61ed2 7153 {
2e0de35c 7154 char *s = savesvpv(lv);
c445ea15 7155 char * const origs = s;
b81d288d 7156
3dab1dad
YO
7157 while (*s && *s != '\n')
7158 s++;
b81d288d 7159
ffc61ed2 7160 if (*s == '\n') {
2d03de9c 7161 const char * const t = ++s;
ffc61ed2
JH
7162
7163 while (*s) {
7164 if (*s == '\n')
7165 *s = ' ';
7166 s++;
7167 }
7168 if (s[-1] == ' ')
7169 s[-1] = 0;
7170
7171 sv_catpv(sv, t);
fde631ed 7172 }
b81d288d 7173
ffc61ed2 7174 Safefree(origs);
fde631ed
JH
7175 }
7176 }
653099ff 7177 }
ffc61ed2 7178
653099ff
GS
7179 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7180 }
9b155405 7181 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 7182 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 7183#else
96a5add6 7184 PERL_UNUSED_CONTEXT;
65e66c80
SP
7185 PERL_UNUSED_ARG(sv);
7186 PERL_UNUSED_ARG(o);
f9049ba1 7187 PERL_UNUSED_ARG(prog);
17c3b450 7188#endif /* DEBUGGING */
35ff7856 7189}
a687059c 7190
cad2e5aa
JH
7191SV *
7192Perl_re_intuit_string(pTHX_ regexp *prog)
7193{ /* Assume that RE_INTUIT is set */
97aff369 7194 dVAR;
a3621e74 7195 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
7196 PERL_UNUSED_CONTEXT;
7197
a3621e74 7198 DEBUG_COMPILE_r(
cfd0369c 7199 {
2d03de9c 7200 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 7201 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
7202
7203 if (!PL_colorset) reginitcolors();
7204 PerlIO_printf(Perl_debug_log,
a0288114 7205 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
7206 PL_colors[4],
7207 prog->check_substr ? "" : "utf8 ",
7208 PL_colors[5],PL_colors[0],
cad2e5aa
JH
7209 s,
7210 PL_colors[1],
7211 (strlen(s) > 60 ? "..." : ""));
7212 } );
7213
33b8afdf 7214 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
7215}
7216
2b69d0c2 7217void
864dbfa3 7218Perl_pregfree(pTHX_ struct regexp *r)
a687059c 7219{
27da23d5 7220 dVAR;
0df25f3d 7221
fc32ee4a 7222 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 7223
7821416a
IZ
7224 if (!r || (--r->refcnt > 0))
7225 return;
ab3bbdeb 7226 DEBUG_COMPILE_r({
0df25f3d
YO
7227 if (!PL_colorset)
7228 reginitcolors();
ab3bbdeb
YO
7229 if (RX_DEBUG(r)){
7230 SV *dsv= sv_newmortal();
7231 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7232 dsv, r->precomp, r->prelen, 60);
7233 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7234 PL_colors[4],PL_colors[5],s);
7235 }
9e55ce06 7236 });
cad2e5aa 7237
43c5f42d
NC
7238 /* gcov results gave these as non-null 100% of the time, so there's no
7239 optimisation in checking them before calling Safefree */
7240 Safefree(r->precomp);
7241 Safefree(r->offsets); /* 20010421 MJD */
ed252734 7242 RX_MATCH_COPY_FREE(r);
f8c7b90f 7243#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
7244 if (r->saved_copy)
7245 SvREFCNT_dec(r->saved_copy);
7246#endif
a193d654
GS
7247 if (r->substrs) {
7248 if (r->anchored_substr)
7249 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
7250 if (r->anchored_utf8)
7251 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
7252 if (r->float_substr)
7253 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
7254 if (r->float_utf8)
7255 SvREFCNT_dec(r->float_utf8);
2779dcf1 7256 Safefree(r->substrs);
a193d654 7257 }
c277df42
IZ
7258 if (r->data) {
7259 int n = r->data->count;
f3548bdc
DM
7260 PAD* new_comppad = NULL;
7261 PAD* old_comppad;
4026c95a 7262 PADOFFSET refcnt;
dfad63ad 7263
c277df42 7264 while (--n >= 0) {
261faec3 7265 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
7266 switch (r->data->what[n]) {
7267 case 's':
7268 SvREFCNT_dec((SV*)r->data->data[n]);
7269 break;
653099ff
GS
7270 case 'f':
7271 Safefree(r->data->data[n]);
7272 break;
dfad63ad
HS
7273 case 'p':
7274 new_comppad = (AV*)r->data->data[n];
7275 break;
c277df42 7276 case 'o':
dfad63ad 7277 if (new_comppad == NULL)
cea2e8a9 7278 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
7279 PAD_SAVE_LOCAL(old_comppad,
7280 /* Watch out for global destruction's random ordering. */
c445ea15 7281 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 7282 );
b34c0dd4 7283 OP_REFCNT_LOCK;
4026c95a
SH
7284 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
7285 OP_REFCNT_UNLOCK;
7286 if (!refcnt)
9b978d73 7287 op_free((OP_4tree*)r->data->data[n]);
9b978d73 7288
f3548bdc 7289 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
7290 SvREFCNT_dec((SV*)new_comppad);
7291 new_comppad = NULL;
c277df42
IZ
7292 break;
7293 case 'n':
9e55ce06 7294 break;
07be1b83 7295 case 'T':
be8e71aa
YO
7296 { /* Aho Corasick add-on structure for a trie node.
7297 Used in stclass optimization only */
07be1b83
YO
7298 U32 refcount;
7299 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
7300 OP_REFCNT_LOCK;
7301 refcount = --aho->refcount;
7302 OP_REFCNT_UNLOCK;
7303 if ( !refcount ) {
7304 Safefree(aho->states);
7305 Safefree(aho->fail);
7306 aho->trie=NULL; /* not necessary to free this as it is
7307 handled by the 't' case */
7308 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 7309 Safefree(r->regstclass);
07be1b83
YO
7310 }
7311 }
7312 break;
a3621e74 7313 case 't':
07be1b83 7314 {
be8e71aa 7315 /* trie structure. */
07be1b83
YO
7316 U32 refcount;
7317 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
7318 OP_REFCNT_LOCK;
7319 refcount = --trie->refcount;
7320 OP_REFCNT_UNLOCK;
7321 if ( !refcount ) {
7322 Safefree(trie->charmap);
7323 if (trie->widecharmap)
7324 SvREFCNT_dec((SV*)trie->widecharmap);
7325 Safefree(trie->states);
7326 Safefree(trie->trans);
7327 if (trie->bitmap)
7328 Safefree(trie->bitmap);
7329 if (trie->wordlen)
7330 Safefree(trie->wordlen);
786e8c11
YO
7331 if (trie->jump)
7332 Safefree(trie->jump);
7333 if (trie->nextword)
7334 Safefree(trie->nextword);
a3621e74 7335#ifdef DEBUGGING
be8e71aa
YO
7336 if (RX_DEBUG(r)) {
7337 if (trie->words)
7338 SvREFCNT_dec((SV*)trie->words);
7339 if (trie->revcharmap)
7340 SvREFCNT_dec((SV*)trie->revcharmap);
7341 }
a3621e74 7342#endif
07be1b83 7343 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 7344 }
07be1b83
YO
7345 }
7346 break;
c277df42 7347 default:
830247a4 7348 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
7349 }
7350 }
7351 Safefree(r->data->what);
7352 Safefree(r->data);
a0d0e21e
LW
7353 }
7354 Safefree(r->startp);
7355 Safefree(r->endp);
7356 Safefree(r);
a687059c 7357}
c277df42 7358
76234dfb 7359#ifndef PERL_IN_XSUB_RE
c277df42
IZ
7360/*
7361 - regnext - dig the "next" pointer out of a node
c277df42
IZ
7362 */
7363regnode *
864dbfa3 7364Perl_regnext(pTHX_ register regnode *p)
c277df42 7365{
97aff369 7366 dVAR;
c277df42
IZ
7367 register I32 offset;
7368
3280af22 7369 if (p == &PL_regdummy)
c277df42
IZ
7370 return(NULL);
7371
7372 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7373 if (offset == 0)
7374 return(NULL);
7375
c277df42 7376 return(p+offset);
c277df42 7377}
76234dfb 7378#endif
c277df42 7379
01f988be 7380STATIC void
cea2e8a9 7381S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
7382{
7383 va_list args;
7384 STRLEN l1 = strlen(pat1);
7385 STRLEN l2 = strlen(pat2);
7386 char buf[512];
06bf62c7 7387 SV *msv;
73d840c0 7388 const char *message;
c277df42
IZ
7389
7390 if (l1 > 510)
7391 l1 = 510;
7392 if (l1 + l2 > 510)
7393 l2 = 510 - l1;
7394 Copy(pat1, buf, l1 , char);
7395 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
7396 buf[l1 + l2] = '\n';
7397 buf[l1 + l2 + 1] = '\0';
8736538c
AS
7398#ifdef I_STDARG
7399 /* ANSI variant takes additional second argument */
c277df42 7400 va_start(args, pat2);
8736538c
AS
7401#else
7402 va_start(args);
7403#endif
5a844595 7404 msv = vmess(buf, &args);
c277df42 7405 va_end(args);
cfd0369c 7406 message = SvPV_const(msv,l1);
c277df42
IZ
7407 if (l1 > 512)
7408 l1 = 512;
7409 Copy(message, buf, l1 , char);
197cf9b9 7410 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 7411 Perl_croak(aTHX_ "%s", buf);
c277df42 7412}
a0ed51b3
LW
7413
7414/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7415
76234dfb 7416#ifndef PERL_IN_XSUB_RE
a0ed51b3 7417void
864dbfa3 7418Perl_save_re_context(pTHX)
b81d288d 7419{
97aff369 7420 dVAR;
1ade1aa1
NC
7421
7422 struct re_save_state *state;
7423
7424 SAVEVPTR(PL_curcop);
7425 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7426
7427 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7428 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7429 SSPUSHINT(SAVEt_RE_STATE);
7430
46ab3289 7431 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 7432
a0ed51b3 7433 PL_reg_start_tmp = 0;
a0ed51b3 7434 PL_reg_start_tmpl = 0;
c445ea15 7435 PL_reg_oldsaved = NULL;
a5db57d6 7436 PL_reg_oldsavedlen = 0;
a5db57d6 7437 PL_reg_maxiter = 0;
a5db57d6 7438 PL_reg_leftiter = 0;
c445ea15 7439 PL_reg_poscache = NULL;
a5db57d6 7440 PL_reg_poscache_size = 0;
1ade1aa1
NC
7441#ifdef PERL_OLD_COPY_ON_WRITE
7442 PL_nrs = NULL;
7443#endif
ada6e8a9 7444
c445ea15
AL
7445 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7446 if (PL_curpm) {
7447 const REGEXP * const rx = PM_GETRE(PL_curpm);
7448 if (rx) {
1df70142 7449 U32 i;
ada6e8a9 7450 for (i = 1; i <= rx->nparens; i++) {
1df70142 7451 char digits[TYPE_CHARS(long)];
d9fad198 7452 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
7453 GV *const *const gvp
7454 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7455
b37c2d43
AL
7456 if (gvp) {
7457 GV * const gv = *gvp;
7458 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7459 save_scalar(gv);
49f27e4b 7460 }
ada6e8a9
AMS
7461 }
7462 }
7463 }
a0ed51b3 7464}
76234dfb 7465#endif
51371543 7466
51371543 7467static void
acfe0abc 7468clear_re(pTHX_ void *r)
51371543 7469{
97aff369 7470 dVAR;
51371543
GS
7471 ReREFCNT_dec((regexp *)r);
7472}
ffbc6a93 7473
a28509cc
AL
7474#ifdef DEBUGGING
7475
7476STATIC void
7477S_put_byte(pTHX_ SV *sv, int c)
7478{
7479 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7480 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7481 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7482 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7483 else
7484 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7485}
7486
786e8c11 7487
3dab1dad
YO
7488#define CLEAR_OPTSTART \
7489 if (optstart) STMT_START { \
07be1b83 7490 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
7491 optstart=NULL; \
7492 } STMT_END
7493
786e8c11 7494#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 7495
b5a2f8d8
NC
7496STATIC const regnode *
7497S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
7498 const regnode *last, const regnode *plast,
7499 SV* sv, I32 indent, U32 depth)
a28509cc 7500{
97aff369 7501 dVAR;
786e8c11 7502 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 7503 register const regnode *next;
3dab1dad
YO
7504 const regnode *optstart= NULL;
7505 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 7506
786e8c11
YO
7507#ifdef DEBUG_DUMPUNTIL
7508 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7509 last ? last-start : 0,plast ? plast-start : 0);
7510#endif
7511
7512 if (plast && plast < last)
7513 last= plast;
7514
7515 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
7516 /* While that wasn't END last time... */
7517
7518 NODE_ALIGN(node);
7519 op = OP(node);
7520 if (op == CLOSE)
786e8c11 7521 indent--;
b5a2f8d8 7522 next = regnext((regnode *)node);
07be1b83 7523
a28509cc 7524 /* Where, what. */
8e11feef 7525 if (OP(node) == OPTIMIZED) {
e68ec53f 7526 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 7527 optstart = node;
3dab1dad 7528 else
8e11feef 7529 goto after_print;
3dab1dad
YO
7530 } else
7531 CLEAR_OPTSTART;
07be1b83 7532
32fc9b6a 7533 regprop(r, sv, node);
a28509cc 7534 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 7535 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
7536
7537 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
7538 if (next == NULL) /* Next ptr. */
7539 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
7540 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
7541 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
7542 else
7543 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 7544
1de06328 7545 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 7546 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
7547 }
7548
a28509cc
AL
7549 after_print:
7550 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
7551 assert(next);
7552 {
7553 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
7554 ? regnext((regnode *)next)
7555 : next);
be8e71aa
YO
7556 if (last && nnode > last)
7557 nnode = last;
786e8c11 7558 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 7559 }
a28509cc
AL
7560 }
7561 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 7562 assert(next);
786e8c11 7563 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
7564 }
7565 else if ( PL_regkind[(U8)op] == TRIE ) {
1de06328 7566 const char op = OP(node);
a28509cc 7567 const I32 n = ARG(node);
1de06328
YO
7568 const reg_ac_data * const ac = op>=AHOCORASICK ?
7569 (reg_ac_data *)r->data->data[n] :
7570 NULL;
7571 const reg_trie_data * const trie = op<AHOCORASICK ?
7572 (reg_trie_data*)r->data->data[n] :
7573 ac->trie;
786e8c11 7574 const regnode *nextbranch= NULL;
a28509cc 7575 I32 word_idx;
1de06328 7576 sv_setpvn(sv, "", 0);
786e8c11 7577 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 7578 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
7579
7580 PerlIO_printf(Perl_debug_log, "%*s%s ",
7581 (int)(2*(indent+3)), "",
7582 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
7583 PL_colors[0], PL_colors[1],
7584 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7585 PERL_PV_PRETTY_ELIPSES |
7586 PERL_PV_PRETTY_LTGT
786e8c11
YO
7587 )
7588 : "???"
7589 );
7590 if (trie->jump) {
7591 U16 dist= trie->jump[word_idx+1];
7592 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
7593 if (dist) {
7594 if (!nextbranch)
7595 nextbranch= next - trie->jump[0];
7596 DUMPUNTIL(next - dist, nextbranch);
7597 }
7598 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
7599 nextbranch= regnext((regnode *)nextbranch);
7600 } else {
7601 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 7602 }
786e8c11
YO
7603 }
7604 if (last && next > last)
7605 node= last;
7606 else
7607 node= next;
a28509cc 7608 }
786e8c11
YO
7609 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
7610 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
7611 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
7612 }
7613 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7614 assert(next);
786e8c11 7615 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
7616 }
7617 else if ( op == PLUS || op == STAR) {
786e8c11 7618 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
7619 }
7620 else if (op == ANYOF) {
7621 /* arglen 1 + class block */
7622 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7623 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7624 node = NEXTOPER(node);
7625 }
7626 else if (PL_regkind[(U8)op] == EXACT) {
7627 /* Literal string, where present. */
7628 node += NODE_SZ_STR(node) - 1;
7629 node = NEXTOPER(node);
7630 }
7631 else {
7632 node = NEXTOPER(node);
7633 node += regarglen[(U8)op];
7634 }
7635 if (op == CURLYX || op == OPEN)
786e8c11 7636 indent++;
a28509cc 7637 else if (op == WHILEM)
786e8c11 7638 indent--;
a28509cc 7639 }
3dab1dad 7640 CLEAR_OPTSTART;
786e8c11
YO
7641#ifdef DEBUG_DUMPUNTIL
7642 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
7643#endif
1de06328 7644 return node;
a28509cc
AL
7645}
7646
7647#endif /* DEBUGGING */
7648
241d1a3b
NC
7649/*
7650 * Local variables:
7651 * c-indentation-style: bsd
7652 * c-basic-offset: 4
7653 * indent-tabs-mode: t
7654 * End:
7655 *
37442d52
RGS
7656 * ex: set ts=8 sts=4 sw=4 noet:
7657 */