This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
INSTALL doc fix: calling the test suite using -MTestInit changed since it moved to...
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
d4cce5f1 88#ifdef op
11343788 89#undef op
d4cce5f1 90#endif /* op */
11343788 91
fe14fcc3 92#ifdef MSDOS
7e4e8c89 93# if defined(BUGGY_MSC6)
fe14fcc3 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 95# pragma optimize("a",off)
fe14fcc3 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
97# pragma optimize("w",on )
98# endif /* BUGGY_MSC6 */
fe14fcc3
LW
99#endif /* MSDOS */
100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
830247a4 105typedef struct RExC_state_t {
e2509266 106 U32 flags; /* are we folding, multilining? */
830247a4 107 char *precomp; /* uncompiled string. */
288b8c02 108 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 111 char *start; /* Start of input for compile */
830247a4
IZ
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 115 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 116 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 117 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
120 U32 seen;
121 I32 size; /* Code size. */
c74340f9
YO
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 124 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
40d049e4
YO
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
02daf0ab
YO
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
6bda09f9 135 HV *charnames; /* cache of named sequences */
81714fb9 136 HV *paren_names; /* Paren names */
1f1031fe 137
40d049e4
YO
138 regnode **recurse; /* Recurse regops */
139 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
3dab1dad 144#ifdef DEBUGGING
be8e71aa 145 const char *lastparse;
3dab1dad 146 I32 lastnum;
1f1031fe 147 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
148#define RExC_lastparse (pRExC_state->lastparse)
149#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 150#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 151#endif
830247a4
IZ
152} RExC_state_t;
153
e2509266 154#define RExC_flags (pRExC_state->flags)
830247a4 155#define RExC_precomp (pRExC_state->precomp)
288b8c02 156#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 157#define RExC_rx (pRExC_state->rx)
f8fc2ecf 158#define RExC_rxi (pRExC_state->rxi)
fac92740 159#define RExC_start (pRExC_state->start)
830247a4
IZ
160#define RExC_end (pRExC_state->end)
161#define RExC_parse (pRExC_state->parse)
162#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
163#ifdef RE_TRACK_PATTERN_OFFSETS
164#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165#endif
830247a4 166#define RExC_emit (pRExC_state->emit)
fac92740 167#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 168#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
169#define RExC_naughty (pRExC_state->naughty)
170#define RExC_sawback (pRExC_state->sawback)
171#define RExC_seen (pRExC_state->seen)
172#define RExC_size (pRExC_state->size)
173#define RExC_npar (pRExC_state->npar)
e2e6a0f1 174#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
175#define RExC_extralen (pRExC_state->extralen)
176#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
177#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 178#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 179#define RExC_orig_utf8 (pRExC_state->orig_utf8)
fc8cd66c 180#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3
YO
200#define HASWIDTH 0x01 /* Known to match non-null strings. */
201#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
202#define SPSTART 0x04 /* Starts with * or +. */
203#define TRYAGAIN 0x08 /* Weeded out a declaration. */
204#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 205
3dab1dad
YO
206#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
207
07be1b83
YO
208/* whether trie related optimizations are enabled */
209#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
210#define TRIE_STUDY_OPT
786e8c11 211#define FULL_TRIE_STUDY
07be1b83
YO
212#define TRIE_STCLASS
213#endif
1de06328
YO
214
215
40d049e4
YO
216
217#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
218#define PBITVAL(paren) (1 << ((paren) & 7))
219#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
220#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
221#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222
223
1de06328
YO
224/* About scan_data_t.
225
226 During optimisation we recurse through the regexp program performing
227 various inplace (keyhole style) optimisations. In addition study_chunk
228 and scan_commit populate this data structure with information about
229 what strings MUST appear in the pattern. We look for the longest
230 string that must appear for at a fixed location, and we look for the
231 longest string that may appear at a floating location. So for instance
232 in the pattern:
233
234 /FOO[xX]A.*B[xX]BAR/
235
236 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
237 strings (because they follow a .* construct). study_chunk will identify
238 both FOO and BAR as being the longest fixed and floating strings respectively.
239
240 The strings can be composites, for instance
241
242 /(f)(o)(o)/
243
244 will result in a composite fixed substring 'foo'.
245
246 For each string some basic information is maintained:
247
248 - offset or min_offset
249 This is the position the string must appear at, or not before.
250 It also implicitly (when combined with minlenp) tells us how many
251 character must match before the string we are searching.
252 Likewise when combined with minlenp and the length of the string
253 tells us how many characters must appear after the string we have
254 found.
255
256 - max_offset
257 Only used for floating strings. This is the rightmost point that
258 the string can appear at. Ifset to I32 max it indicates that the
259 string can occur infinitely far to the right.
260
261 - minlenp
262 A pointer to the minimum length of the pattern that the string
263 was found inside. This is important as in the case of positive
264 lookahead or positive lookbehind we can have multiple patterns
265 involved. Consider
266
267 /(?=FOO).*F/
268
269 The minimum length of the pattern overall is 3, the minimum length
270 of the lookahead part is 3, but the minimum length of the part that
271 will actually match is 1. So 'FOO's minimum length is 3, but the
272 minimum length for the F is 1. This is important as the minimum length
273 is used to determine offsets in front of and behind the string being
274 looked for. Since strings can be composites this is the length of the
275 pattern at the time it was commited with a scan_commit. Note that
276 the length is calculated by study_chunk, so that the minimum lengths
277 are not known until the full pattern has been compiled, thus the
278 pointer to the value.
279
280 - lookbehind
281
282 In the case of lookbehind the string being searched for can be
283 offset past the start point of the final matching string.
284 If this value was just blithely removed from the min_offset it would
285 invalidate some of the calculations for how many chars must match
286 before or after (as they are derived from min_offset and minlen and
287 the length of the string being searched for).
288 When the final pattern is compiled and the data is moved from the
289 scan_data_t structure into the regexp structure the information
290 about lookbehind is factored in, with the information that would
291 have been lost precalculated in the end_shift field for the
292 associated string.
293
294 The fields pos_min and pos_delta are used to store the minimum offset
295 and the delta to the maximum offset at the current point in the pattern.
296
297*/
2c2d71f5
JH
298
299typedef struct scan_data_t {
1de06328
YO
300 /*I32 len_min; unused */
301 /*I32 len_delta; unused */
2c2d71f5
JH
302 I32 pos_min;
303 I32 pos_delta;
304 SV *last_found;
1de06328 305 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
306 I32 last_start_min;
307 I32 last_start_max;
1de06328
YO
308 SV **longest; /* Either &l_fixed, or &l_float. */
309 SV *longest_fixed; /* longest fixed string found in pattern */
310 I32 offset_fixed; /* offset where it starts */
311 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
313 SV *longest_float; /* longest floating string found in pattern */
314 I32 offset_float_min; /* earliest point in string it can appear */
315 I32 offset_float_max; /* latest point in string it can appear */
316 I32 *minlen_float; /* pointer to the minlen relevent to the string */
317 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
318 I32 flags;
319 I32 whilem_c;
cb434fcc 320 I32 *last_closep;
653099ff 321 struct regnode_charclass_class *start_class;
2c2d71f5
JH
322} scan_data_t;
323
a687059c 324/*
e50aee73 325 * Forward declarations for pregcomp()'s friends.
a687059c 326 */
a0d0e21e 327
27da23d5 328static const scan_data_t zero_scan_data =
1de06328 329 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
330
331#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
332#define SF_BEFORE_SEOL 0x0001
333#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
334#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
335#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336
09b7f37c
CB
337#ifdef NO_UNARY_PLUS
338# define SF_FIX_SHIFT_EOL (0+2)
339# define SF_FL_SHIFT_EOL (0+4)
340#else
341# define SF_FIX_SHIFT_EOL (+2)
342# define SF_FL_SHIFT_EOL (+4)
343#endif
c277df42
IZ
344
345#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
346#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
347
348#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
349#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
350#define SF_IS_INF 0x0040
351#define SF_HAS_PAR 0x0080
352#define SF_IN_PAR 0x0100
353#define SF_HAS_EVAL 0x0200
354#define SCF_DO_SUBSTR 0x0400
653099ff
GS
355#define SCF_DO_STCLASS_AND 0x0800
356#define SCF_DO_STCLASS_OR 0x1000
357#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 358#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 359
786e8c11 360#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 361#define SCF_SEEN_ACCEPT 0x8000
07be1b83 362
eb160463 363#define UTF (RExC_utf8 != 0)
bbe252da
YO
364#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
365#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 366
ffc61ed2 367#define OOB_UNICODE 12345678
93733859 368#define OOB_NAMEDCLASS -1
b8c5462f 369
a0ed51b3
LW
370#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
371#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372
8615cb43 373
b45f050a
JF
374/* length of regex to show in messages that don't mark a position within */
375#define RegexLengthToShowInErrorMessages 127
376
377/*
378 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
379 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
380 * op/pragma/warn/regcomp.
381 */
7253e4e3
RK
382#define MARKER1 "<-- HERE" /* marker as it appears in the description */
383#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 384
7253e4e3 385#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
386
387/*
388 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
389 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 * "...".
391 */
58e23c8d 392#define _FAIL(code) STMT_START { \
bfed75c6 393 const char *ellipses = ""; \
ccb2c380
MP
394 IV len = RExC_end - RExC_precomp; \
395 \
396 if (!SIZE_ONLY) \
288b8c02 397 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
398 if (len > RegexLengthToShowInErrorMessages) { \
399 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
400 len = RegexLengthToShowInErrorMessages - 10; \
401 ellipses = "..."; \
402 } \
58e23c8d 403 code; \
ccb2c380 404} STMT_END
8615cb43 405
58e23c8d
YO
406#define FAIL(msg) _FAIL( \
407 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
408 msg, (int)len, RExC_precomp, ellipses))
409
410#define FAIL2(msg,arg) _FAIL( \
411 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
412 arg, (int)len, RExC_precomp, ellipses))
413
b45f050a 414/*
b45f050a
JF
415 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
416 */
ccb2c380 417#define Simple_vFAIL(m) STMT_START { \
a28509cc 418 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
419 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
420 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
425 */
ccb2c380
MP
426#define vFAIL(m) STMT_START { \
427 if (!SIZE_ONLY) \
288b8c02 428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
429 Simple_vFAIL(m); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Like Simple_vFAIL(), but accepts two arguments.
434 */
ccb2c380 435#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 436 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
443 */
ccb2c380
MP
444#define vFAIL2(m,a1) STMT_START { \
445 if (!SIZE_ONLY) \
288b8c02 446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
447 Simple_vFAIL2(m, a1); \
448} STMT_END
b45f050a
JF
449
450
451/*
452 * Like Simple_vFAIL(), but accepts three arguments.
453 */
ccb2c380 454#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a
JF
459
460/*
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
462 */
ccb2c380
MP
463#define vFAIL3(m,a1,a2) STMT_START { \
464 if (!SIZE_ONLY) \
288b8c02 465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
466 Simple_vFAIL3(m, a1, a2); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Like Simple_vFAIL(), but accepts four arguments.
471 */
ccb2c380 472#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 473 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
474 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
475 (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
b45f050a 477
668c081a 478#define ckWARNreg(loc,m) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
480 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
482} STMT_END
483
668c081a 484#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
d1d15184 486 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
487 m REPORT_LOCATION, \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
489} STMT_END
490
668c081a 491#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 492 const IV offset = loc - RExC_precomp; \
668c081a 493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
494 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
496
497#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 498 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
499 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
501} STMT_END
502
668c081a
NC
503#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
507} STMT_END
508
ccb2c380 509#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 510 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
511 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
512 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
513} STMT_END
514
668c081a
NC
515#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
516 const IV offset = loc - RExC_precomp; \
517 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
518 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
519} STMT_END
520
ccb2c380 521#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 522 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
523 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
525} STMT_END
9d1d55b5 526
8615cb43 527
cd439c50 528/* Allow for side effects in s */
ccb2c380
MP
529#define REGC(c,s) STMT_START { \
530 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
531} STMT_END
cd439c50 532
fac92740
MJD
533/* Macros for recording node offsets. 20001227 mjd@plover.com
534 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
535 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
536 * Element 0 holds the number n.
07be1b83 537 * Position is 1 indexed.
fac92740 538 */
7122b237
YO
539#ifndef RE_TRACK_PATTERN_OFFSETS
540#define Set_Node_Offset_To_R(node,byte)
541#define Set_Node_Offset(node,byte)
542#define Set_Cur_Node_Offset
543#define Set_Node_Length_To_R(node,len)
544#define Set_Node_Length(node,len)
545#define Set_Node_Cur_Length(node)
546#define Node_Offset(n)
547#define Node_Length(n)
548#define Set_Node_Offset_Length(node,offset,len)
549#define ProgLen(ri) ri->u.proglen
550#define SetProgLen(ri,x) ri->u.proglen = x
551#else
552#define ProgLen(ri) ri->u.offsets[0]
553#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
554#define Set_Node_Offset_To_R(node,byte) STMT_START { \
555 if (! SIZE_ONLY) { \
556 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 557 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 558 if((node) < 0) { \
551405c4 559 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
560 } else { \
561 RExC_offsets[2*(node)-1] = (byte); \
562 } \
563 } \
564} STMT_END
565
566#define Set_Node_Offset(node,byte) \
567 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
568#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
569
570#define Set_Node_Length_To_R(node,len) STMT_START { \
571 if (! SIZE_ONLY) { \
572 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 573 __LINE__, (int)(node), (int)(len))); \
ccb2c380 574 if((node) < 0) { \
551405c4 575 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
576 } else { \
577 RExC_offsets[2*(node)] = (len); \
578 } \
579 } \
580} STMT_END
581
582#define Set_Node_Length(node,len) \
583 Set_Node_Length_To_R((node)-RExC_emit_start, len)
584#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
585#define Set_Node_Cur_Length(node) \
586 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
587
588/* Get offsets and lengths */
589#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
590#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
591
07be1b83
YO
592#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
593 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
594 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
595} STMT_END
7122b237 596#endif
07be1b83
YO
597
598#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
599#define EXPERIMENTAL_INPLACESCAN
7122b237 600#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 601
304ee84b
YO
602#define DEBUG_STUDYDATA(str,data,depth) \
603DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 604 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
605 "%*s" str "Pos:%"IVdf"/%"IVdf \
606 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
607 (int)(depth)*2, "", \
608 (IV)((data)->pos_min), \
609 (IV)((data)->pos_delta), \
304ee84b 610 (UV)((data)->flags), \
1de06328 611 (IV)((data)->whilem_c), \
304ee84b
YO
612 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
613 is_inf ? "INF " : "" \
1de06328
YO
614 ); \
615 if ((data)->last_found) \
616 PerlIO_printf(Perl_debug_log, \
617 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
618 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
619 SvPVX_const((data)->last_found), \
620 (IV)((data)->last_end), \
621 (IV)((data)->last_start_min), \
622 (IV)((data)->last_start_max), \
623 ((data)->longest && \
624 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
625 SvPVX_const((data)->longest_fixed), \
626 (IV)((data)->offset_fixed), \
627 ((data)->longest && \
628 (data)->longest==&((data)->longest_float)) ? "*" : "", \
629 SvPVX_const((data)->longest_float), \
630 (IV)((data)->offset_float_min), \
631 (IV)((data)->offset_float_max) \
632 ); \
633 PerlIO_printf(Perl_debug_log,"\n"); \
634});
635
acfe0abc 636static void clear_re(pTHX_ void *r);
4327152a 637
653099ff 638/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 639 Update the longest found anchored substring and the longest found
653099ff
GS
640 floating substrings if needed. */
641
4327152a 642STATIC void
304ee84b 643S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 644{
e1ec3a88
AL
645 const STRLEN l = CHR_SVLEN(data->last_found);
646 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 647 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 648
7918f24d
NC
649 PERL_ARGS_ASSERT_SCAN_COMMIT;
650
c277df42 651 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 652 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
653 if (*data->longest == data->longest_fixed) {
654 data->offset_fixed = l ? data->last_start_min : data->pos_min;
655 if (data->flags & SF_BEFORE_EOL)
b81d288d 656 data->flags
c277df42
IZ
657 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
658 else
659 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
660 data->minlen_fixed=minlenp;
661 data->lookbehind_fixed=0;
a0ed51b3 662 }
304ee84b 663 else { /* *data->longest == data->longest_float */
c277df42 664 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
665 data->offset_float_max = (l
666 ? data->last_start_max
c277df42 667 : data->pos_min + data->pos_delta);
304ee84b 668 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 669 data->offset_float_max = I32_MAX;
c277df42 670 if (data->flags & SF_BEFORE_EOL)
b81d288d 671 data->flags
c277df42
IZ
672 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
673 else
674 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
675 data->minlen_float=minlenp;
676 data->lookbehind_float=0;
c277df42
IZ
677 }
678 }
679 SvCUR_set(data->last_found, 0);
0eda9292 680 {
a28509cc 681 SV * const sv = data->last_found;
097eb12c
AL
682 if (SvUTF8(sv) && SvMAGICAL(sv)) {
683 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
684 if (mg)
685 mg->mg_len = 0;
686 }
0eda9292 687 }
c277df42
IZ
688 data->last_end = -1;
689 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 690 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
691}
692
653099ff
GS
693/* Can match anything (initialization) */
694STATIC void
097eb12c 695S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 696{
7918f24d
NC
697 PERL_ARGS_ASSERT_CL_ANYTHING;
698
653099ff 699 ANYOF_CLASS_ZERO(cl);
f8bef550 700 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 701 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
702 if (LOC)
703 cl->flags |= ANYOF_LOCALE;
704}
705
706/* Can match anything (initialization) */
707STATIC int
5f66b61c 708S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
709{
710 int value;
711
7918f24d
NC
712 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
713
aaa51d5e 714 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
715 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
716 return 1;
1aa99e6b
IH
717 if (!(cl->flags & ANYOF_UNICODE_ALL))
718 return 0;
10edeb5d 719 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 720 return 0;
653099ff
GS
721 return 1;
722}
723
724/* Can match anything (initialization) */
725STATIC void
097eb12c 726S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 727{
7918f24d
NC
728 PERL_ARGS_ASSERT_CL_INIT;
729
8ecf7187 730 Zero(cl, 1, struct regnode_charclass_class);
653099ff 731 cl->type = ANYOF;
830247a4 732 cl_anything(pRExC_state, cl);
653099ff
GS
733}
734
735STATIC void
097eb12c 736S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT_ZERO;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743 if (LOC)
744 cl->flags |= ANYOF_LOCALE;
745}
746
747/* 'And' a given class with another one. Can create false positives */
748/* We assume that cl is not inverted */
749STATIC void
5f66b61c 750S_cl_and(struct regnode_charclass_class *cl,
a28509cc 751 const struct regnode_charclass_class *and_with)
653099ff 752{
7918f24d 753 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
754
755 assert(and_with->type == ANYOF);
653099ff
GS
756 if (!(and_with->flags & ANYOF_CLASS)
757 && !(cl->flags & ANYOF_CLASS)
758 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
759 && !(and_with->flags & ANYOF_FOLD)
760 && !(cl->flags & ANYOF_FOLD)) {
761 int i;
762
763 if (and_with->flags & ANYOF_INVERT)
764 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
765 cl->bitmap[i] &= ~and_with->bitmap[i];
766 else
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] &= and_with->bitmap[i];
769 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
770 if (!(and_with->flags & ANYOF_EOS))
771 cl->flags &= ~ANYOF_EOS;
1aa99e6b 772
14ebb1a2
JH
773 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
774 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
775 cl->flags &= ~ANYOF_UNICODE_ALL;
776 cl->flags |= ANYOF_UNICODE;
777 ARG_SET(cl, ARG(and_with));
778 }
14ebb1a2
JH
779 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
780 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 781 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
782 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
783 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 784 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
785}
786
787/* 'OR' a given class with another one. Can create false positives */
788/* We assume that cl is not inverted */
789STATIC void
097eb12c 790S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 791{
7918f24d
NC
792 PERL_ARGS_ASSERT_CL_OR;
793
653099ff
GS
794 if (or_with->flags & ANYOF_INVERT) {
795 /* We do not use
796 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
797 * <= (B1 | !B2) | (CL1 | !CL2)
798 * which is wasteful if CL2 is small, but we ignore CL2:
799 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
800 * XXXX Can we handle case-fold? Unclear:
801 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
802 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
803 */
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(or_with->flags & ANYOF_FOLD)
806 && !(cl->flags & ANYOF_FOLD) ) {
807 int i;
808
809 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
810 cl->bitmap[i] |= ~or_with->bitmap[i];
811 } /* XXXX: logic is complicated otherwise */
812 else {
830247a4 813 cl_anything(pRExC_state, cl);
653099ff
GS
814 }
815 } else {
816 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
817 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 818 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
819 || (cl->flags & ANYOF_FOLD)) ) {
820 int i;
821
822 /* OR char bitmap and class bitmap separately */
823 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
824 cl->bitmap[i] |= or_with->bitmap[i];
825 if (or_with->flags & ANYOF_CLASS) {
826 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
827 cl->classflags[i] |= or_with->classflags[i];
828 cl->flags |= ANYOF_CLASS;
829 }
830 }
831 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 832 cl_anything(pRExC_state, cl);
653099ff
GS
833 }
834 }
835 if (or_with->flags & ANYOF_EOS)
836 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
837
838 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
839 ARG(cl) != ARG(or_with)) {
840 cl->flags |= ANYOF_UNICODE_ALL;
841 cl->flags &= ~ANYOF_UNICODE;
842 }
843 if (or_with->flags & ANYOF_UNICODE_ALL) {
844 cl->flags |= ANYOF_UNICODE_ALL;
845 cl->flags &= ~ANYOF_UNICODE;
846 }
653099ff
GS
847}
848
a3621e74
YO
849#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
850#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
851#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
852#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
853
3dab1dad
YO
854
855#ifdef DEBUGGING
07be1b83 856/*
2b8b4781
NC
857 dump_trie(trie,widecharmap,revcharmap)
858 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
859 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
860
861 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
862 The _interim_ variants are used for debugging the interim
863 tables that are used to generate the final compressed
864 representation which is what dump_trie expects.
865
3dab1dad
YO
866 Part of the reason for their existance is to provide a form
867 of documentation as to how the different representations function.
07be1b83
YO
868
869*/
3dab1dad
YO
870
871/*
3dab1dad
YO
872 Dumps the final compressed table form of the trie to Perl_debug_log.
873 Used for debugging make_trie().
874*/
875
876STATIC void
2b8b4781
NC
877S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
878 AV *revcharmap, U32 depth)
3dab1dad
YO
879{
880 U32 state;
ab3bbdeb 881 SV *sv=sv_newmortal();
55eed653 882 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
883 GET_RE_DEBUG_FLAGS_DECL;
884
7918f24d 885 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 886
3dab1dad
YO
887 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
888 (int)depth * 2 + 2,"",
889 "Match","Base","Ofs" );
890
891 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 892 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 893 if ( tmp ) {
ab3bbdeb
YO
894 PerlIO_printf( Perl_debug_log, "%*s",
895 colwidth,
ddc5bc0f 896 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
897 PL_colors[0], PL_colors[1],
898 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
899 PERL_PV_ESCAPE_FIRSTCHAR
900 )
901 );
3dab1dad
YO
902 }
903 }
904 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
905 (int)depth * 2 + 2,"");
906
907 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 908 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
909 PerlIO_printf( Perl_debug_log, "\n");
910
1e2e3d02 911 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 912 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
913
914 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
915
916 if ( trie->states[ state ].wordnum ) {
917 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
918 } else {
919 PerlIO_printf( Perl_debug_log, "%6s", "" );
920 }
921
922 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
923
924 if ( base ) {
925 U32 ofs = 0;
926
927 while( ( base + ofs < trie->uniquecharcount ) ||
928 ( base + ofs - trie->uniquecharcount < trie->lasttrans
929 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
930 ofs++;
931
932 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
933
934 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
935 if ( ( base + ofs >= trie->uniquecharcount ) &&
936 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
937 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
938 {
ab3bbdeb
YO
939 PerlIO_printf( Perl_debug_log, "%*"UVXf,
940 colwidth,
3dab1dad
YO
941 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
942 } else {
ab3bbdeb 943 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
944 }
945 }
946
947 PerlIO_printf( Perl_debug_log, "]");
948
949 }
950 PerlIO_printf( Perl_debug_log, "\n" );
951 }
952}
953/*
3dab1dad
YO
954 Dumps a fully constructed but uncompressed trie in list form.
955 List tries normally only are used for construction when the number of
956 possible chars (trie->uniquecharcount) is very high.
957 Used for debugging make_trie().
958*/
959STATIC void
55eed653 960S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
962 U32 depth)
3dab1dad
YO
963{
964 U32 state;
ab3bbdeb 965 SV *sv=sv_newmortal();
55eed653 966 int colwidth= widecharmap ? 6 : 4;
3dab1dad 967 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
968
969 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
970
3dab1dad 971 /* print out the table precompression. */
ab3bbdeb
YO
972 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
973 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
974 "------:-----+-----------------\n" );
3dab1dad
YO
975
976 for( state=1 ; state < next_alloc ; state ++ ) {
977 U16 charid;
978
ab3bbdeb 979 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
980 (int)depth * 2 + 2,"", (UV)state );
981 if ( ! trie->states[ state ].wordnum ) {
982 PerlIO_printf( Perl_debug_log, "%5s| ","");
983 } else {
984 PerlIO_printf( Perl_debug_log, "W%4x| ",
985 trie->states[ state ].wordnum
986 );
987 }
988 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 989 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
990 if ( tmp ) {
991 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
992 colwidth,
ddc5bc0f 993 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
994 PL_colors[0], PL_colors[1],
995 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
996 PERL_PV_ESCAPE_FIRSTCHAR
997 ) ,
1e2e3d02
YO
998 TRIE_LIST_ITEM(state,charid).forid,
999 (UV)TRIE_LIST_ITEM(state,charid).newstate
1000 );
1001 if (!(charid % 10))
664e119d
RGS
1002 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1003 (int)((depth * 2) + 14), "");
1e2e3d02 1004 }
ab3bbdeb
YO
1005 }
1006 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1007 }
1008}
1009
1010/*
3dab1dad
YO
1011 Dumps a fully constructed but uncompressed trie in table form.
1012 This is the normal DFA style state transition table, with a few
1013 twists to facilitate compression later.
1014 Used for debugging make_trie().
1015*/
1016STATIC void
55eed653 1017S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1018 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1019 U32 depth)
3dab1dad
YO
1020{
1021 U32 state;
1022 U16 charid;
ab3bbdeb 1023 SV *sv=sv_newmortal();
55eed653 1024 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1025 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1026
1027 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1028
1029 /*
1030 print out the table precompression so that we can do a visual check
1031 that they are identical.
1032 */
1033
1034 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1035
1036 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1037 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1038 if ( tmp ) {
ab3bbdeb
YO
1039 PerlIO_printf( Perl_debug_log, "%*s",
1040 colwidth,
ddc5bc0f 1041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1042 PL_colors[0], PL_colors[1],
1043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044 PERL_PV_ESCAPE_FIRSTCHAR
1045 )
1046 );
3dab1dad
YO
1047 }
1048 }
1049
1050 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1051
1052 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1053 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1054 }
1055
1056 PerlIO_printf( Perl_debug_log, "\n" );
1057
1058 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1059
1060 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1061 (int)depth * 2 + 2,"",
1062 (UV)TRIE_NODENUM( state ) );
1063
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1065 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1066 if (v)
1067 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1068 else
1069 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1070 }
1071 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1073 } else {
1074 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1075 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1076 }
1077 }
07be1b83 1078}
3dab1dad
YO
1079
1080#endif
1081
786e8c11
YO
1082/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1083 startbranch: the first branch in the whole branch sequence
1084 first : start branch of sequence of branch-exact nodes.
1085 May be the same as startbranch
1086 last : Thing following the last branch.
1087 May be the same as tail.
1088 tail : item following the branch sequence
1089 count : words in the sequence
1090 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1091 depth : indent depth
3dab1dad 1092
786e8c11 1093Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1094
786e8c11
YO
1095A trie is an N'ary tree where the branches are determined by digital
1096decomposition of the key. IE, at the root node you look up the 1st character and
1097follow that branch repeat until you find the end of the branches. Nodes can be
1098marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1099
786e8c11 1100 /he|she|his|hers/
72f13be8 1101
786e8c11
YO
1102would convert into the following structure. Numbers represent states, letters
1103following numbers represent valid transitions on the letter from that state, if
1104the number is in square brackets it represents an accepting state, otherwise it
1105will be in parenthesis.
07be1b83 1106
786e8c11
YO
1107 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1108 | |
1109 | (2)
1110 | |
1111 (1) +-i->(6)-+-s->[7]
1112 |
1113 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1114
786e8c11
YO
1115 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1116
1117This shows that when matching against the string 'hers' we will begin at state 1
1118read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1119then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1120is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1121single traverse. We store a mapping from accepting to state to which word was
1122matched, and then when we have multiple possibilities we try to complete the
1123rest of the regex in the order in which they occured in the alternation.
1124
1125The only prior NFA like behaviour that would be changed by the TRIE support is
1126the silent ignoring of duplicate alternations which are of the form:
1127
1128 / (DUPE|DUPE) X? (?{ ... }) Y /x
1129
1130Thus EVAL blocks follwing a trie may be called a different number of times with
1131and without the optimisation. With the optimisations dupes will be silently
1132ignored. This inconsistant behaviour of EVAL type nodes is well established as
1133the following demonstrates:
1134
1135 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1136
1137which prints out 'word' three times, but
1138
1139 'words'=~/(word|word|word)(?{ print $1 })S/
1140
1141which doesnt print it out at all. This is due to other optimisations kicking in.
1142
1143Example of what happens on a structural level:
1144
1145The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1146
1147 1: CURLYM[1] {1,32767}(18)
1148 5: BRANCH(8)
1149 6: EXACT <ac>(16)
1150 8: BRANCH(11)
1151 9: EXACT <ad>(16)
1152 11: BRANCH(14)
1153 12: EXACT <ab>(16)
1154 16: SUCCEED(0)
1155 17: NOTHING(18)
1156 18: END(0)
1157
1158This would be optimizable with startbranch=5, first=5, last=16, tail=16
1159and should turn into:
1160
1161 1: CURLYM[1] {1,32767}(18)
1162 5: TRIE(16)
1163 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1164 <ac>
1165 <ad>
1166 <ab>
1167 16: SUCCEED(0)
1168 17: NOTHING(18)
1169 18: END(0)
1170
1171Cases where tail != last would be like /(?foo|bar)baz/:
1172
1173 1: BRANCH(4)
1174 2: EXACT <foo>(8)
1175 4: BRANCH(7)
1176 5: EXACT <bar>(8)
1177 7: TAIL(8)
1178 8: EXACT <baz>(10)
1179 10: END(0)
1180
1181which would be optimizable with startbranch=1, first=1, last=7, tail=8
1182and would end up looking like:
1183
1184 1: TRIE(8)
1185 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1186 <foo>
1187 <bar>
1188 7: TAIL(8)
1189 8: EXACT <baz>(10)
1190 10: END(0)
1191
1192 d = uvuni_to_utf8_flags(d, uv, 0);
1193
1194is the recommended Unicode-aware way of saying
1195
1196 *(d++) = uv;
1197*/
1198
1e2e3d02 1199#define TRIE_STORE_REVCHAR \
786e8c11 1200 STMT_START { \
73031816
NC
1201 if (UTF) { \
1202 SV *zlopp = newSV(2); \
88c9ea1e
CB
1203 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1204 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1205 SvCUR_set(zlopp, kapow - flrbbbbb); \
1206 SvPOK_on(zlopp); \
1207 SvUTF8_on(zlopp); \
1208 av_push(revcharmap, zlopp); \
1209 } else { \
6bdeddd2 1210 char ooooff = (char)uvc; \
73031816
NC
1211 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1212 } \
1213 } STMT_END
786e8c11
YO
1214
1215#define TRIE_READ_CHAR STMT_START { \
1216 wordlen++; \
1217 if ( UTF ) { \
1218 if ( folder ) { \
1219 if ( foldlen > 0 ) { \
1220 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1221 foldlen -= len; \
1222 scan += len; \
1223 len = 0; \
1224 } else { \
1225 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1226 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1227 foldlen -= UNISKIP( uvc ); \
1228 scan = foldbuf + UNISKIP( uvc ); \
1229 } \
1230 } else { \
1231 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1232 } \
1233 } else { \
1234 uvc = (U32)*uc; \
1235 len = 1; \
1236 } \
1237} STMT_END
1238
1239
1240
1241#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1242 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1243 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1244 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1245 } \
1246 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1247 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1248 TRIE_LIST_CUR( state )++; \
1249} STMT_END
07be1b83 1250
786e8c11
YO
1251#define TRIE_LIST_NEW(state) STMT_START { \
1252 Newxz( trie->states[ state ].trans.list, \
1253 4, reg_trie_trans_le ); \
1254 TRIE_LIST_CUR( state ) = 1; \
1255 TRIE_LIST_LEN( state ) = 4; \
1256} STMT_END
07be1b83 1257
786e8c11
YO
1258#define TRIE_HANDLE_WORD(state) STMT_START { \
1259 U16 dupe= trie->states[ state ].wordnum; \
1260 regnode * const noper_next = regnext( noper ); \
1261 \
1262 if (trie->wordlen) \
1263 trie->wordlen[ curword ] = wordlen; \
1264 DEBUG_r({ \
1265 /* store the word for dumping */ \
1266 SV* tmp; \
1267 if (OP(noper) != NOTHING) \
740cce10 1268 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1269 else \
740cce10 1270 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1271 av_push( trie_words, tmp ); \
786e8c11
YO
1272 }); \
1273 \
1274 curword++; \
1275 \
1276 if ( noper_next < tail ) { \
1277 if (!trie->jump) \
c944940b 1278 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1279 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1280 if (!jumper) \
1281 jumper = noper_next; \
1282 if (!nextbranch) \
1283 nextbranch= regnext(cur); \
1284 } \
1285 \
1286 if ( dupe ) { \
1287 /* So it's a dupe. This means we need to maintain a */\
1288 /* linked-list from the first to the next. */\
1289 /* we only allocate the nextword buffer when there */\
1290 /* a dupe, so first time we have to do the allocation */\
1291 if (!trie->nextword) \
c944940b 1292 trie->nextword = (U16 *) \
446bd890 1293 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1294 while ( trie->nextword[dupe] ) \
1295 dupe= trie->nextword[dupe]; \
1296 trie->nextword[dupe]= curword; \
1297 } else { \
1298 /* we haven't inserted this word yet. */ \
1299 trie->states[ state ].wordnum = curword; \
1300 } \
1301} STMT_END
07be1b83 1302
3dab1dad 1303
786e8c11
YO
1304#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1305 ( ( base + charid >= ucharcount \
1306 && base + charid < ubound \
1307 && state == trie->trans[ base - ucharcount + charid ].check \
1308 && trie->trans[ base - ucharcount + charid ].next ) \
1309 ? trie->trans[ base - ucharcount + charid ].next \
1310 : ( state==1 ? special : 0 ) \
1311 )
3dab1dad 1312
786e8c11
YO
1313#define MADE_TRIE 1
1314#define MADE_JUMP_TRIE 2
1315#define MADE_EXACT_TRIE 4
3dab1dad 1316
a3621e74 1317STATIC I32
786e8c11 1318S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1319{
27da23d5 1320 dVAR;
a3621e74
YO
1321 /* first pass, loop through and scan words */
1322 reg_trie_data *trie;
55eed653 1323 HV *widecharmap = NULL;
2b8b4781 1324 AV *revcharmap = newAV();
a3621e74 1325 regnode *cur;
9f7f3913 1326 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1327 STRLEN len = 0;
1328 UV uvc = 0;
1329 U16 curword = 0;
1330 U32 next_alloc = 0;
786e8c11
YO
1331 regnode *jumper = NULL;
1332 regnode *nextbranch = NULL;
7f69552c 1333 regnode *convert = NULL;
a3621e74 1334 /* we just use folder as a flag in utf8 */
e1ec3a88 1335 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1336 ? PL_fold
1337 : ( flags == EXACTFL
1338 ? PL_fold_locale
1339 : NULL
1340 )
1341 );
1342
2b8b4781
NC
1343#ifdef DEBUGGING
1344 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1345 AV *trie_words = NULL;
1346 /* along with revcharmap, this only used during construction but both are
1347 * useful during debugging so we store them in the struct when debugging.
8e11feef 1348 */
2b8b4781
NC
1349#else
1350 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1351 STRLEN trie_charcount=0;
3dab1dad 1352#endif
2b8b4781 1353 SV *re_trie_maxbuff;
a3621e74 1354 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1355
1356 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1357#ifndef DEBUGGING
1358 PERL_UNUSED_ARG(depth);
1359#endif
a3621e74 1360
c944940b 1361 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1362 trie->refcount = 1;
3dab1dad 1363 trie->startstate = 1;
786e8c11 1364 trie->wordcount = word_count;
f8fc2ecf 1365 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1366 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1367 if (!(UTF && folder))
c944940b 1368 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1369 DEBUG_r({
2b8b4781 1370 trie_words = newAV();
a3621e74 1371 });
a3621e74 1372
0111c4fd 1373 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1374 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1375 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1376 }
3dab1dad
YO
1377 DEBUG_OPTIMISE_r({
1378 PerlIO_printf( Perl_debug_log,
786e8c11 1379 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1380 (int)depth * 2 + 2, "",
1381 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1382 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1383 (int)depth);
3dab1dad 1384 });
7f69552c
YO
1385
1386 /* Find the node we are going to overwrite */
1387 if ( first == startbranch && OP( last ) != BRANCH ) {
1388 /* whole branch chain */
1389 convert = first;
1390 } else {
1391 /* branch sub-chain */
1392 convert = NEXTOPER( first );
1393 }
1394
a3621e74
YO
1395 /* -- First loop and Setup --
1396
1397 We first traverse the branches and scan each word to determine if it
1398 contains widechars, and how many unique chars there are, this is
1399 important as we have to build a table with at least as many columns as we
1400 have unique chars.
1401
1402 We use an array of integers to represent the character codes 0..255
38a44b82 1403 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1404 native representation of the character value as the key and IV's for the
1405 coded index.
1406
1407 *TODO* If we keep track of how many times each character is used we can
1408 remap the columns so that the table compression later on is more
1409 efficient in terms of memory by ensuring most common value is in the
1410 middle and the least common are on the outside. IMO this would be better
1411 than a most to least common mapping as theres a decent chance the most
1412 common letter will share a node with the least common, meaning the node
1413 will not be compressable. With a middle is most common approach the worst
1414 case is when we have the least common nodes twice.
1415
1416 */
1417
a3621e74 1418 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1419 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1420 const U8 *uc = (U8*)STRING( noper );
a28509cc 1421 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1422 STRLEN foldlen = 0;
1423 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1424 const U8 *scan = (U8*)NULL;
07be1b83 1425 U32 wordlen = 0; /* required init */
02daf0ab
YO
1426 STRLEN chars = 0;
1427 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1428
3dab1dad
YO
1429 if (OP(noper) == NOTHING) {
1430 trie->minlen= 0;
1431 continue;
1432 }
02daf0ab
YO
1433 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1434 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1435 regardless of encoding */
1436
a3621e74 1437 for ( ; uc < e ; uc += len ) {
3dab1dad 1438 TRIE_CHARCOUNT(trie)++;
a3621e74 1439 TRIE_READ_CHAR;
3dab1dad 1440 chars++;
a3621e74
YO
1441 if ( uvc < 256 ) {
1442 if ( !trie->charmap[ uvc ] ) {
1443 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1444 if ( folder )
1445 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1446 TRIE_STORE_REVCHAR;
a3621e74 1447 }
02daf0ab
YO
1448 if ( set_bit ) {
1449 /* store the codepoint in the bitmap, and if its ascii
1450 also store its folded equivelent. */
1451 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1452
1453 /* store the folded codepoint */
1454 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1455
1456 if ( !UTF ) {
1457 /* store first byte of utf8 representation of
1458 codepoints in the 127 < uvc < 256 range */
1459 if (127 < uvc && uvc < 192) {
1460 TRIE_BITMAP_SET(trie,194);
1461 } else if (191 < uvc ) {
1462 TRIE_BITMAP_SET(trie,195);
1463 /* && uvc < 256 -- we know uvc is < 256 already */
1464 }
1465 }
02daf0ab
YO
1466 set_bit = 0; /* We've done our bit :-) */
1467 }
a3621e74
YO
1468 } else {
1469 SV** svpp;
55eed653
NC
1470 if ( !widecharmap )
1471 widecharmap = newHV();
a3621e74 1472
55eed653 1473 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1474
1475 if ( !svpp )
e4584336 1476 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1477
1478 if ( !SvTRUE( *svpp ) ) {
1479 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1480 TRIE_STORE_REVCHAR;
a3621e74
YO
1481 }
1482 }
1483 }
3dab1dad
YO
1484 if( cur == first ) {
1485 trie->minlen=chars;
1486 trie->maxlen=chars;
1487 } else if (chars < trie->minlen) {
1488 trie->minlen=chars;
1489 } else if (chars > trie->maxlen) {
1490 trie->maxlen=chars;
1491 }
1492
a3621e74
YO
1493 } /* end first pass */
1494 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1495 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1496 (int)depth * 2 + 2,"",
55eed653 1497 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1498 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1499 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1500 );
c944940b 1501 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1502
1503 /*
1504 We now know what we are dealing with in terms of unique chars and
1505 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1506 representation using a flat table will take. If it's over a reasonable
1507 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1508 conservative but potentially much slower representation using an array
1509 of lists.
1510
1511 At the end we convert both representations into the same compressed
1512 form that will be used in regexec.c for matching with. The latter
1513 is a form that cannot be used to construct with but has memory
1514 properties similar to the list form and access properties similar
1515 to the table form making it both suitable for fast searches and
1516 small enough that its feasable to store for the duration of a program.
1517
1518 See the comment in the code where the compressed table is produced
1519 inplace from the flat tabe representation for an explanation of how
1520 the compression works.
1521
1522 */
1523
1524
3dab1dad 1525 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1526 /*
1527 Second Pass -- Array Of Lists Representation
1528
1529 Each state will be represented by a list of charid:state records
1530 (reg_trie_trans_le) the first such element holds the CUR and LEN
1531 points of the allocated array. (See defines above).
1532
1533 We build the initial structure using the lists, and then convert
1534 it into the compressed table form which allows faster lookups
1535 (but cant be modified once converted).
a3621e74
YO
1536 */
1537
a3621e74
YO
1538 STRLEN transcount = 1;
1539
1e2e3d02
YO
1540 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1541 "%*sCompiling trie using list compiler\n",
1542 (int)depth * 2 + 2, ""));
446bd890 1543
c944940b
JH
1544 trie->states = (reg_trie_state *)
1545 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1546 sizeof(reg_trie_state) );
a3621e74
YO
1547 TRIE_LIST_NEW(1);
1548 next_alloc = 2;
1549
1550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1551
c445ea15
AL
1552 regnode * const noper = NEXTOPER( cur );
1553 U8 *uc = (U8*)STRING( noper );
1554 const U8 * const e = uc + STR_LEN( noper );
1555 U32 state = 1; /* required init */
1556 U16 charid = 0; /* sanity init */
1557 U8 *scan = (U8*)NULL; /* sanity init */
1558 STRLEN foldlen = 0; /* required init */
07be1b83 1559 U32 wordlen = 0; /* required init */
c445ea15
AL
1560 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1561
3dab1dad 1562 if (OP(noper) != NOTHING) {
786e8c11 1563 for ( ; uc < e ; uc += len ) {
c445ea15 1564
786e8c11 1565 TRIE_READ_CHAR;
c445ea15 1566
786e8c11
YO
1567 if ( uvc < 256 ) {
1568 charid = trie->charmap[ uvc ];
c445ea15 1569 } else {
55eed653 1570 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1571 if ( !svpp ) {
1572 charid = 0;
1573 } else {
1574 charid=(U16)SvIV( *svpp );
1575 }
c445ea15 1576 }
786e8c11
YO
1577 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1578 if ( charid ) {
a3621e74 1579
786e8c11
YO
1580 U16 check;
1581 U32 newstate = 0;
a3621e74 1582
786e8c11
YO
1583 charid--;
1584 if ( !trie->states[ state ].trans.list ) {
1585 TRIE_LIST_NEW( state );
c445ea15 1586 }
786e8c11
YO
1587 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1588 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1589 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1590 break;
1591 }
1592 }
1593 if ( ! newstate ) {
1594 newstate = next_alloc++;
1595 TRIE_LIST_PUSH( state, charid, newstate );
1596 transcount++;
1597 }
1598 state = newstate;
1599 } else {
1600 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1601 }
a28509cc 1602 }
c445ea15 1603 }
3dab1dad 1604 TRIE_HANDLE_WORD(state);
a3621e74
YO
1605
1606 } /* end second pass */
1607
1e2e3d02
YO
1608 /* next alloc is the NEXT state to be allocated */
1609 trie->statecount = next_alloc;
c944940b
JH
1610 trie->states = (reg_trie_state *)
1611 PerlMemShared_realloc( trie->states,
1612 next_alloc
1613 * sizeof(reg_trie_state) );
a3621e74 1614
3dab1dad 1615 /* and now dump it out before we compress it */
2b8b4781
NC
1616 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1617 revcharmap, next_alloc,
1618 depth+1)
1e2e3d02 1619 );
a3621e74 1620
c944940b
JH
1621 trie->trans = (reg_trie_trans *)
1622 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1623 {
1624 U32 state;
a3621e74
YO
1625 U32 tp = 0;
1626 U32 zp = 0;
1627
1628
1629 for( state=1 ; state < next_alloc ; state ++ ) {
1630 U32 base=0;
1631
1632 /*
1633 DEBUG_TRIE_COMPILE_MORE_r(
1634 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1635 );
1636 */
1637
1638 if (trie->states[state].trans.list) {
1639 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1640 U16 maxid=minid;
a28509cc 1641 U16 idx;
a3621e74
YO
1642
1643 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1644 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1645 if ( forid < minid ) {
1646 minid=forid;
1647 } else if ( forid > maxid ) {
1648 maxid=forid;
1649 }
a3621e74
YO
1650 }
1651 if ( transcount < tp + maxid - minid + 1) {
1652 transcount *= 2;
c944940b
JH
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_realloc( trie->trans,
446bd890
NC
1655 transcount
1656 * sizeof(reg_trie_trans) );
a3621e74
YO
1657 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1658 }
1659 base = trie->uniquecharcount + tp - minid;
1660 if ( maxid == minid ) {
1661 U32 set = 0;
1662 for ( ; zp < tp ; zp++ ) {
1663 if ( ! trie->trans[ zp ].next ) {
1664 base = trie->uniquecharcount + zp - minid;
1665 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1666 trie->trans[ zp ].check = state;
1667 set = 1;
1668 break;
1669 }
1670 }
1671 if ( !set ) {
1672 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1673 trie->trans[ tp ].check = state;
1674 tp++;
1675 zp = tp;
1676 }
1677 } else {
1678 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1679 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1680 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1681 trie->trans[ tid ].check = state;
1682 }
1683 tp += ( maxid - minid + 1 );
1684 }
1685 Safefree(trie->states[ state ].trans.list);
1686 }
1687 /*
1688 DEBUG_TRIE_COMPILE_MORE_r(
1689 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1690 );
1691 */
1692 trie->states[ state ].trans.base=base;
1693 }
cc601c31 1694 trie->lasttrans = tp + 1;
a3621e74
YO
1695 }
1696 } else {
1697 /*
1698 Second Pass -- Flat Table Representation.
1699
1700 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1701 We know that we will need Charcount+1 trans at most to store the data
1702 (one row per char at worst case) So we preallocate both structures
1703 assuming worst case.
1704
1705 We then construct the trie using only the .next slots of the entry
1706 structs.
1707
1708 We use the .check field of the first entry of the node temporarily to
1709 make compression both faster and easier by keeping track of how many non
1710 zero fields are in the node.
1711
1712 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1713 transition.
1714
1715 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1716 number representing the first entry of the node, and state as a
1717 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1718 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1719 are 2 entrys per node. eg:
1720
1721 A B A B
1722 1. 2 4 1. 3 7
1723 2. 0 3 3. 0 5
1724 3. 0 0 5. 0 0
1725 4. 0 0 7. 0 0
1726
1727 The table is internally in the right hand, idx form. However as we also
1728 have to deal with the states array which is indexed by nodenum we have to
1729 use TRIE_NODENUM() to convert.
1730
1731 */
1e2e3d02
YO
1732 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1733 "%*sCompiling trie using table compiler\n",
1734 (int)depth * 2 + 2, ""));
3dab1dad 1735
c944940b
JH
1736 trie->trans = (reg_trie_trans *)
1737 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1738 * trie->uniquecharcount + 1,
1739 sizeof(reg_trie_trans) );
1740 trie->states = (reg_trie_state *)
1741 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1742 sizeof(reg_trie_state) );
a3621e74
YO
1743 next_alloc = trie->uniquecharcount + 1;
1744
3dab1dad 1745
a3621e74
YO
1746 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1747
c445ea15 1748 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1749 const U8 *uc = (U8*)STRING( noper );
1750 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1751
1752 U32 state = 1; /* required init */
1753
1754 U16 charid = 0; /* sanity init */
1755 U32 accept_state = 0; /* sanity init */
1756 U8 *scan = (U8*)NULL; /* sanity init */
1757
1758 STRLEN foldlen = 0; /* required init */
07be1b83 1759 U32 wordlen = 0; /* required init */
a3621e74
YO
1760 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1761
3dab1dad 1762 if ( OP(noper) != NOTHING ) {
786e8c11 1763 for ( ; uc < e ; uc += len ) {
a3621e74 1764
786e8c11 1765 TRIE_READ_CHAR;
a3621e74 1766
786e8c11
YO
1767 if ( uvc < 256 ) {
1768 charid = trie->charmap[ uvc ];
1769 } else {
55eed653 1770 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1771 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1772 }
786e8c11
YO
1773 if ( charid ) {
1774 charid--;
1775 if ( !trie->trans[ state + charid ].next ) {
1776 trie->trans[ state + charid ].next = next_alloc;
1777 trie->trans[ state ].check++;
1778 next_alloc += trie->uniquecharcount;
1779 }
1780 state = trie->trans[ state + charid ].next;
1781 } else {
1782 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1783 }
1784 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1785 }
a3621e74 1786 }
3dab1dad
YO
1787 accept_state = TRIE_NODENUM( state );
1788 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1789
1790 } /* end second pass */
1791
3dab1dad 1792 /* and now dump it out before we compress it */
2b8b4781
NC
1793 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1794 revcharmap,
1795 next_alloc, depth+1));
a3621e74 1796
a3621e74
YO
1797 {
1798 /*
1799 * Inplace compress the table.*
1800
1801 For sparse data sets the table constructed by the trie algorithm will
1802 be mostly 0/FAIL transitions or to put it another way mostly empty.
1803 (Note that leaf nodes will not contain any transitions.)
1804
1805 This algorithm compresses the tables by eliminating most such
1806 transitions, at the cost of a modest bit of extra work during lookup:
1807
1808 - Each states[] entry contains a .base field which indicates the
1809 index in the state[] array wheres its transition data is stored.
1810
1811 - If .base is 0 there are no valid transitions from that node.
1812
1813 - If .base is nonzero then charid is added to it to find an entry in
1814 the trans array.
1815
1816 -If trans[states[state].base+charid].check!=state then the
1817 transition is taken to be a 0/Fail transition. Thus if there are fail
1818 transitions at the front of the node then the .base offset will point
1819 somewhere inside the previous nodes data (or maybe even into a node
1820 even earlier), but the .check field determines if the transition is
1821 valid.
1822
786e8c11 1823 XXX - wrong maybe?
a3621e74
YO
1824 The following process inplace converts the table to the compressed
1825 table: We first do not compress the root node 1,and mark its all its
1826 .check pointers as 1 and set its .base pointer as 1 as well. This
1827 allows to do a DFA construction from the compressed table later, and
1828 ensures that any .base pointers we calculate later are greater than
1829 0.
1830
1831 - We set 'pos' to indicate the first entry of the second node.
1832
1833 - We then iterate over the columns of the node, finding the first and
1834 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1835 and set the .check pointers accordingly, and advance pos
1836 appropriately and repreat for the next node. Note that when we copy
1837 the next pointers we have to convert them from the original
1838 NODEIDX form to NODENUM form as the former is not valid post
1839 compression.
1840
1841 - If a node has no transitions used we mark its base as 0 and do not
1842 advance the pos pointer.
1843
1844 - If a node only has one transition we use a second pointer into the
1845 structure to fill in allocated fail transitions from other states.
1846 This pointer is independent of the main pointer and scans forward
1847 looking for null transitions that are allocated to a state. When it
1848 finds one it writes the single transition into the "hole". If the
786e8c11 1849 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1850
1851 - Once compressed we can Renew/realloc the structures to release the
1852 excess space.
1853
1854 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1855 specifically Fig 3.47 and the associated pseudocode.
1856
1857 demq
1858 */
a3b680e6 1859 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1860 U32 state, charid;
a3621e74 1861 U32 pos = 0, zp=0;
1e2e3d02 1862 trie->statecount = laststate;
a3621e74
YO
1863
1864 for ( state = 1 ; state < laststate ; state++ ) {
1865 U8 flag = 0;
a28509cc
AL
1866 const U32 stateidx = TRIE_NODEIDX( state );
1867 const U32 o_used = trie->trans[ stateidx ].check;
1868 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1869 trie->trans[ stateidx ].check = 0;
1870
1871 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1872 if ( flag || trie->trans[ stateidx + charid ].next ) {
1873 if ( trie->trans[ stateidx + charid ].next ) {
1874 if (o_used == 1) {
1875 for ( ; zp < pos ; zp++ ) {
1876 if ( ! trie->trans[ zp ].next ) {
1877 break;
1878 }
1879 }
1880 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1881 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882 trie->trans[ zp ].check = state;
1883 if ( ++zp > pos ) pos = zp;
1884 break;
1885 }
1886 used--;
1887 }
1888 if ( !flag ) {
1889 flag = 1;
1890 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1891 }
1892 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1893 trie->trans[ pos ].check = state;
1894 pos++;
1895 }
1896 }
1897 }
cc601c31 1898 trie->lasttrans = pos + 1;
c944940b
JH
1899 trie->states = (reg_trie_state *)
1900 PerlMemShared_realloc( trie->states, laststate
1901 * sizeof(reg_trie_state) );
a3621e74 1902 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1903 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1904 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1905 (int)depth * 2 + 2,"",
1906 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1907 (IV)next_alloc,
1908 (IV)pos,
a3621e74
YO
1909 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1910 );
1911
1912 } /* end table compress */
1913 }
1e2e3d02
YO
1914 DEBUG_TRIE_COMPILE_MORE_r(
1915 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1916 (int)depth * 2 + 2, "",
1917 (UV)trie->statecount,
1918 (UV)trie->lasttrans)
1919 );
cc601c31 1920 /* resize the trans array to remove unused space */
c944940b
JH
1921 trie->trans = (reg_trie_trans *)
1922 PerlMemShared_realloc( trie->trans, trie->lasttrans
1923 * sizeof(reg_trie_trans) );
a3621e74 1924
3dab1dad 1925 /* and now dump out the compressed format */
2b8b4781 1926 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1927
3dab1dad 1928 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1929 U8 nodetype =(U8)(flags & 0xFF);
1930 char *str=NULL;
786e8c11 1931
07be1b83 1932#ifdef DEBUGGING
e62cc96a 1933 regnode *optimize = NULL;
7122b237
YO
1934#ifdef RE_TRACK_PATTERN_OFFSETS
1935
b57a0404
JH
1936 U32 mjd_offset = 0;
1937 U32 mjd_nodelen = 0;
7122b237
YO
1938#endif /* RE_TRACK_PATTERN_OFFSETS */
1939#endif /* DEBUGGING */
a3621e74 1940 /*
3dab1dad
YO
1941 This means we convert either the first branch or the first Exact,
1942 depending on whether the thing following (in 'last') is a branch
1943 or not and whther first is the startbranch (ie is it a sub part of
1944 the alternation or is it the whole thing.)
1945 Assuming its a sub part we conver the EXACT otherwise we convert
1946 the whole branch sequence, including the first.
a3621e74 1947 */
3dab1dad 1948 /* Find the node we are going to overwrite */
7f69552c 1949 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1950 /* branch sub-chain */
3dab1dad 1951 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1952#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1953 DEBUG_r({
1954 mjd_offset= Node_Offset((convert));
1955 mjd_nodelen= Node_Length((convert));
1956 });
7122b237 1957#endif
7f69552c 1958 /* whole branch chain */
7122b237
YO
1959 }
1960#ifdef RE_TRACK_PATTERN_OFFSETS
1961 else {
7f69552c
YO
1962 DEBUG_r({
1963 const regnode *nop = NEXTOPER( convert );
1964 mjd_offset= Node_Offset((nop));
1965 mjd_nodelen= Node_Length((nop));
1966 });
07be1b83
YO
1967 }
1968 DEBUG_OPTIMISE_r(
1969 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1970 (int)depth * 2 + 2, "",
786e8c11 1971 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1972 );
7122b237 1973#endif
3dab1dad
YO
1974 /* But first we check to see if there is a common prefix we can
1975 split out as an EXACT and put in front of the TRIE node. */
1976 trie->startstate= 1;
55eed653 1977 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1978 U32 state;
1e2e3d02 1979 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1980 U32 ofs = 0;
8e11feef
RGS
1981 I32 idx = -1;
1982 U32 count = 0;
1983 const U32 base = trie->states[ state ].trans.base;
a3621e74 1984
3dab1dad 1985 if ( trie->states[state].wordnum )
8e11feef 1986 count = 1;
a3621e74 1987
8e11feef 1988 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1989 if ( ( base + ofs >= trie->uniquecharcount ) &&
1990 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1991 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1992 {
3dab1dad 1993 if ( ++count > 1 ) {
2b8b4781 1994 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1995 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1996 if ( state == 1 ) break;
3dab1dad
YO
1997 if ( count == 2 ) {
1998 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1999 DEBUG_OPTIMISE_r(
8e11feef
RGS
2000 PerlIO_printf(Perl_debug_log,
2001 "%*sNew Start State=%"UVuf" Class: [",
2002 (int)depth * 2 + 2, "",
786e8c11 2003 (UV)state));
be8e71aa 2004 if (idx >= 0) {
2b8b4781 2005 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2006 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2007
3dab1dad 2008 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2009 if ( folder )
2010 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2011 DEBUG_OPTIMISE_r(
f1f66076 2012 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2013 );
8e11feef
RGS
2014 }
2015 }
2016 TRIE_BITMAP_SET(trie,*ch);
2017 if ( folder )
2018 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2019 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2020 }
2021 idx = ofs;
2022 }
3dab1dad
YO
2023 }
2024 if ( count == 1 ) {
2b8b4781 2025 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2026 STRLEN len;
2027 char *ch = SvPV( *tmp, len );
de734bd5
A
2028 DEBUG_OPTIMISE_r({
2029 SV *sv=sv_newmortal();
8e11feef
RGS
2030 PerlIO_printf( Perl_debug_log,
2031 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2032 (int)depth * 2 + 2, "",
de734bd5
A
2033 (UV)state, (UV)idx,
2034 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2035 PL_colors[0], PL_colors[1],
2036 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2037 PERL_PV_ESCAPE_FIRSTCHAR
2038 )
2039 );
2040 });
3dab1dad
YO
2041 if ( state==1 ) {
2042 OP( convert ) = nodetype;
2043 str=STRING(convert);
2044 STR_LEN(convert)=0;
2045 }
c490c714
YO
2046 STR_LEN(convert) += len;
2047 while (len--)
de734bd5 2048 *str++ = *ch++;
8e11feef 2049 } else {
f9049ba1 2050#ifdef DEBUGGING
8e11feef
RGS
2051 if (state>1)
2052 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2053#endif
8e11feef
RGS
2054 break;
2055 }
2056 }
3dab1dad 2057 if (str) {
8e11feef 2058 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2059 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2060 trie->startstate = state;
07be1b83
YO
2061 trie->minlen -= (state - 1);
2062 trie->maxlen -= (state - 1);
33809eae
JH
2063#ifdef DEBUGGING
2064 /* At least the UNICOS C compiler choked on this
2065 * being argument to DEBUG_r(), so let's just have
2066 * it right here. */
2067 if (
2068#ifdef PERL_EXT_RE_BUILD
2069 1
2070#else
2071 DEBUG_r_TEST
2072#endif
2073 ) {
2074 regnode *fix = convert;
2075 U32 word = trie->wordcount;
2076 mjd_nodelen++;
2077 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2078 while( ++fix < n ) {
2079 Set_Node_Offset_Length(fix, 0, 0);
2080 }
2081 while (word--) {
2082 SV ** const tmp = av_fetch( trie_words, word, 0 );
2083 if (tmp) {
2084 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2086 else
2087 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2088 }
2089 }
2090 }
2091#endif
8e11feef
RGS
2092 if (trie->maxlen) {
2093 convert = n;
2094 } else {
3dab1dad 2095 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2096 DEBUG_r(optimize= n);
3dab1dad
YO
2097 }
2098 }
2099 }
a5ca303d
YO
2100 if (!jumper)
2101 jumper = last;
3dab1dad 2102 if ( trie->maxlen ) {
8e11feef
RGS
2103 NEXT_OFF( convert ) = (U16)(tail - convert);
2104 ARG_SET( convert, data_slot );
786e8c11
YO
2105 /* Store the offset to the first unabsorbed branch in
2106 jump[0], which is otherwise unused by the jump logic.
2107 We use this when dumping a trie and during optimisation. */
2108 if (trie->jump)
7f69552c 2109 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2110
786e8c11
YO
2111 /* XXXX */
2112 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2113 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2114 {
2115 OP( convert ) = TRIEC;
2116 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2117 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2118 trie->bitmap= NULL;
2119 } else
2120 OP( convert ) = TRIE;
a3621e74 2121
3dab1dad
YO
2122 /* store the type in the flags */
2123 convert->flags = nodetype;
a5ca303d
YO
2124 DEBUG_r({
2125 optimize = convert
2126 + NODE_STEP_REGNODE
2127 + regarglen[ OP( convert ) ];
2128 });
2129 /* XXX We really should free up the resource in trie now,
2130 as we won't use them - (which resources?) dmq */
3dab1dad 2131 }
a3621e74 2132 /* needed for dumping*/
e62cc96a 2133 DEBUG_r(if (optimize) {
07be1b83 2134 regnode *opt = convert;
bcdf7404 2135
e62cc96a 2136 while ( ++opt < optimize) {
07be1b83
YO
2137 Set_Node_Offset_Length(opt,0,0);
2138 }
786e8c11
YO
2139 /*
2140 Try to clean up some of the debris left after the
2141 optimisation.
a3621e74 2142 */
786e8c11 2143 while( optimize < jumper ) {
07be1b83 2144 mjd_nodelen += Node_Length((optimize));
a3621e74 2145 OP( optimize ) = OPTIMIZED;
07be1b83 2146 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2147 optimize++;
2148 }
07be1b83 2149 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2150 });
2151 } /* end node insert */
55eed653 2152 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2153#ifdef DEBUGGING
2154 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2155 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2156#else
2157 SvREFCNT_dec(revcharmap);
07be1b83 2158#endif
786e8c11
YO
2159 return trie->jump
2160 ? MADE_JUMP_TRIE
2161 : trie->startstate>1
2162 ? MADE_EXACT_TRIE
2163 : MADE_TRIE;
2164}
2165
2166STATIC void
2167S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2168{
2169/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2170
2171 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2172 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2173 ISBN 0-201-10088-6
2174
2175 We find the fail state for each state in the trie, this state is the longest proper
2176 suffix of the current states 'word' that is also a proper prefix of another word in our
2177 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2178 the DFA not to have to restart after its tried and failed a word at a given point, it
2179 simply continues as though it had been matching the other word in the first place.
2180 Consider
2181 'abcdgu'=~/abcdefg|cdgu/
2182 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2183 fail, which would bring use to the state representing 'd' in the second word where we would
2184 try 'g' and succeed, prodceding to match 'cdgu'.
2185 */
2186 /* add a fail transition */
3251b653
NC
2187 const U32 trie_offset = ARG(source);
2188 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2189 U32 *q;
2190 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2191 const U32 numstates = trie->statecount;
786e8c11
YO
2192 const U32 ubound = trie->lasttrans + ucharcount;
2193 U32 q_read = 0;
2194 U32 q_write = 0;
2195 U32 charid;
2196 U32 base = trie->states[ 1 ].trans.base;
2197 U32 *fail;
2198 reg_ac_data *aho;
2199 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2200 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2201
2202 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2203#ifndef DEBUGGING
2204 PERL_UNUSED_ARG(depth);
2205#endif
2206
2207
2208 ARG_SET( stclass, data_slot );
c944940b 2209 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2210 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2211 aho->trie=trie_offset;
446bd890
NC
2212 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2213 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2214 Newxz( q, numstates, U32);
c944940b 2215 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2216 aho->refcount = 1;
2217 fail = aho->fail;
2218 /* initialize fail[0..1] to be 1 so that we always have
2219 a valid final fail state */
2220 fail[ 0 ] = fail[ 1 ] = 1;
2221
2222 for ( charid = 0; charid < ucharcount ; charid++ ) {
2223 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2224 if ( newstate ) {
2225 q[ q_write ] = newstate;
2226 /* set to point at the root */
2227 fail[ q[ q_write++ ] ]=1;
2228 }
2229 }
2230 while ( q_read < q_write) {
2231 const U32 cur = q[ q_read++ % numstates ];
2232 base = trie->states[ cur ].trans.base;
2233
2234 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2235 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2236 if (ch_state) {
2237 U32 fail_state = cur;
2238 U32 fail_base;
2239 do {
2240 fail_state = fail[ fail_state ];
2241 fail_base = aho->states[ fail_state ].trans.base;
2242 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2243
2244 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2245 fail[ ch_state ] = fail_state;
2246 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2247 {
2248 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2249 }
2250 q[ q_write++ % numstates] = ch_state;
2251 }
2252 }
2253 }
2254 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2255 when we fail in state 1, this allows us to use the
2256 charclass scan to find a valid start char. This is based on the principle
2257 that theres a good chance the string being searched contains lots of stuff
2258 that cant be a start char.
2259 */
2260 fail[ 0 ] = fail[ 1 ] = 0;
2261 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2262 PerlIO_printf(Perl_debug_log,
2263 "%*sStclass Failtable (%"UVuf" states): 0",
2264 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2265 );
786e8c11
YO
2266 for( q_read=1; q_read<numstates; q_read++ ) {
2267 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2268 }
2269 PerlIO_printf(Perl_debug_log, "\n");
2270 });
2271 Safefree(q);
2272 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2273}
2274
786e8c11 2275
a3621e74 2276/*
5d1c421c
JH
2277 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2278 * These need to be revisited when a newer toolchain becomes available.
2279 */
2280#if defined(__sparc64__) && defined(__GNUC__)
2281# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2282# undef SPARC64_GCC_WORKAROUND
2283# define SPARC64_GCC_WORKAROUND 1
2284# endif
2285#endif
2286
07be1b83 2287#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2288 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2289 SV * const mysv=sv_newmortal(); \
2290 regnode *Next = regnext(scan); \
2291 regprop(RExC_rx, mysv, scan); \
7f69552c 2292 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2293 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2294 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2295 }});
07be1b83 2296
1de06328
YO
2297
2298
2299
2300
07be1b83
YO
2301#define JOIN_EXACT(scan,min,flags) \
2302 if (PL_regkind[OP(scan)] == EXACT) \
2303 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2304
be8e71aa 2305STATIC U32
07be1b83
YO
2306S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2307 /* Merge several consecutive EXACTish nodes into one. */
2308 regnode *n = regnext(scan);
2309 U32 stringok = 1;
2310 regnode *next = scan + NODE_SZ_STR(scan);
2311 U32 merged = 0;
2312 U32 stopnow = 0;
2313#ifdef DEBUGGING
2314 regnode *stop = scan;
72f13be8 2315 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2316#else
d47053eb
RGS
2317 PERL_UNUSED_ARG(depth);
2318#endif
7918f24d
NC
2319
2320 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2321#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2322 PERL_UNUSED_ARG(flags);
2323 PERL_UNUSED_ARG(val);
07be1b83 2324#endif
07be1b83
YO
2325 DEBUG_PEEP("join",scan,depth);
2326
2327 /* Skip NOTHING, merge EXACT*. */
2328 while (n &&
2329 ( PL_regkind[OP(n)] == NOTHING ||
2330 (stringok && (OP(n) == OP(scan))))
2331 && NEXT_OFF(n)
2332 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2333
2334 if (OP(n) == TAIL || n > next)
2335 stringok = 0;
2336 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2337 DEBUG_PEEP("skip:",n,depth);
2338 NEXT_OFF(scan) += NEXT_OFF(n);
2339 next = n + NODE_STEP_REGNODE;
2340#ifdef DEBUGGING
2341 if (stringok)
2342 stop = n;
2343#endif
2344 n = regnext(n);
2345 }
2346 else if (stringok) {
786e8c11 2347 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2348 regnode * const nnext = regnext(n);
2349
2350 DEBUG_PEEP("merg",n,depth);
2351
2352 merged++;
2353 if (oldl + STR_LEN(n) > U8_MAX)
2354 break;
2355 NEXT_OFF(scan) += NEXT_OFF(n);
2356 STR_LEN(scan) += STR_LEN(n);
2357 next = n + NODE_SZ_STR(n);
2358 /* Now we can overwrite *n : */
2359 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2360#ifdef DEBUGGING
2361 stop = next - 1;
2362#endif
2363 n = nnext;
2364 if (stopnow) break;
2365 }
2366
d47053eb
RGS
2367#ifdef EXPERIMENTAL_INPLACESCAN
2368 if (flags && !NEXT_OFF(n)) {
2369 DEBUG_PEEP("atch", val, depth);
2370 if (reg_off_by_arg[OP(n)]) {
2371 ARG_SET(n, val - n);
2372 }
2373 else {
2374 NEXT_OFF(n) = val - n;
2375 }
2376 stopnow = 1;
2377 }
07be1b83
YO
2378#endif
2379 }
2380
2381 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2382 /*
2383 Two problematic code points in Unicode casefolding of EXACT nodes:
2384
2385 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2386 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2387
2388 which casefold to
2389
2390 Unicode UTF-8
2391
2392 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2393 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2394
2395 This means that in case-insensitive matching (or "loose matching",
2396 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2397 length of the above casefolded versions) can match a target string
2398 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2399 This would rather mess up the minimum length computation.
2400
2401 What we'll do is to look for the tail four bytes, and then peek
2402 at the preceding two bytes to see whether we need to decrease
2403 the minimum length by four (six minus two).
2404
2405 Thanks to the design of UTF-8, there cannot be false matches:
2406 A sequence of valid UTF-8 bytes cannot be a subsequence of
2407 another valid sequence of UTF-8 bytes.
2408
2409 */
2410 char * const s0 = STRING(scan), *s, *t;
2411 char * const s1 = s0 + STR_LEN(scan) - 1;
2412 char * const s2 = s1 - 4;
e294cc5d
JH
2413#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2414 const char t0[] = "\xaf\x49\xaf\x42";
2415#else
07be1b83 2416 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2417#endif
07be1b83
YO
2418 const char * const t1 = t0 + 3;
2419
2420 for (s = s0 + 2;
2421 s < s2 && (t = ninstr(s, s1, t0, t1));
2422 s = t + 4) {
e294cc5d
JH
2423#ifdef EBCDIC
2424 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2425 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2426#else
07be1b83
YO
2427 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2428 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2429#endif
07be1b83
YO
2430 *min -= 4;
2431 }
2432 }
2433
2434#ifdef DEBUGGING
2435 /* Allow dumping */
2436 n = scan + NODE_SZ_STR(scan);
2437 while (n <= stop) {
2438 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2439 OP(n) = OPTIMIZED;
2440 NEXT_OFF(n) = 0;
2441 }
2442 n++;
2443 }
2444#endif
2445 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2446 return stopnow;
2447}
2448
653099ff
GS
2449/* REx optimizer. Converts nodes into quickier variants "in place".
2450 Finds fixed substrings. */
2451
a0288114 2452/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2453 to the position after last scanned or to NULL. */
2454
40d049e4
YO
2455#define INIT_AND_WITHP \
2456 assert(!and_withp); \
2457 Newx(and_withp,1,struct regnode_charclass_class); \
2458 SAVEFREEPV(and_withp)
07be1b83 2459
b515a41d
YO
2460/* this is a chain of data about sub patterns we are processing that
2461 need to be handled seperately/specially in study_chunk. Its so
2462 we can simulate recursion without losing state. */
2463struct scan_frame;
2464typedef struct scan_frame {
2465 regnode *last; /* last node to process in this frame */
2466 regnode *next; /* next node to process when last is reached */
2467 struct scan_frame *prev; /*previous frame*/
2468 I32 stop; /* what stopparen do we use */
2469} scan_frame;
2470
304ee84b
YO
2471
2472#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2473
e1d1eefb
YO
2474#define CASE_SYNST_FNC(nAmE) \
2475case nAmE: \
2476 if (flags & SCF_DO_STCLASS_AND) { \
2477 for (value = 0; value < 256; value++) \
2478 if (!is_ ## nAmE ## _cp(value)) \
2479 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2480 } \
2481 else { \
2482 for (value = 0; value < 256; value++) \
2483 if (is_ ## nAmE ## _cp(value)) \
2484 ANYOF_BITMAP_SET(data->start_class, value); \
2485 } \
2486 break; \
2487case N ## nAmE: \
2488 if (flags & SCF_DO_STCLASS_AND) { \
2489 for (value = 0; value < 256; value++) \
2490 if (is_ ## nAmE ## _cp(value)) \
2491 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2492 } \
2493 else { \
2494 for (value = 0; value < 256; value++) \
2495 if (!is_ ## nAmE ## _cp(value)) \
2496 ANYOF_BITMAP_SET(data->start_class, value); \
2497 } \
2498 break
2499
2500
2501
76e3520e 2502STATIC I32
40d049e4 2503S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2504 I32 *minlenp, I32 *deltap,
40d049e4
YO
2505 regnode *last,
2506 scan_data_t *data,
2507 I32 stopparen,
2508 U8* recursed,
2509 struct regnode_charclass_class *and_withp,
2510 U32 flags, U32 depth)
c277df42
IZ
2511 /* scanp: Start here (read-write). */
2512 /* deltap: Write maxlen-minlen here. */
2513 /* last: Stop before this one. */
40d049e4
YO
2514 /* data: string data about the pattern */
2515 /* stopparen: treat close N as END */
2516 /* recursed: which subroutines have we recursed into */
2517 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2518{
97aff369 2519 dVAR;
c277df42
IZ
2520 I32 min = 0, pars = 0, code;
2521 regnode *scan = *scanp, *next;
2522 I32 delta = 0;
2523 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2524 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2525 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2526 scan_data_t data_fake;
a3621e74 2527 SV *re_trie_maxbuff = NULL;
786e8c11 2528 regnode *first_non_open = scan;
e2e6a0f1 2529 I32 stopmin = I32_MAX;
8aa23a47 2530 scan_frame *frame = NULL;
a3621e74 2531 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2532
7918f24d
NC
2533 PERL_ARGS_ASSERT_STUDY_CHUNK;
2534
13a24bad 2535#ifdef DEBUGGING
40d049e4 2536 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2537#endif
40d049e4 2538
786e8c11 2539 if ( depth == 0 ) {
40d049e4 2540 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2541 first_non_open=regnext(first_non_open);
2542 }
2543
b81d288d 2544
8aa23a47
YO
2545 fake_study_recurse:
2546 while ( scan && OP(scan) != END && scan < last ){
2547 /* Peephole optimizer: */
304ee84b 2548 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2549 DEBUG_PEEP("Peep",scan,depth);
2550 JOIN_EXACT(scan,&min,0);
2551
2552 /* Follow the next-chain of the current node and optimize
2553 away all the NOTHINGs from it. */
2554 if (OP(scan) != CURLYX) {
2555 const int max = (reg_off_by_arg[OP(scan)]
2556 ? I32_MAX
2557 /* I32 may be smaller than U16 on CRAYs! */
2558 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2559 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2560 int noff;
2561 regnode *n = scan;
2562
2563 /* Skip NOTHING and LONGJMP. */
2564 while ((n = regnext(n))
2565 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2566 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2567 && off + noff < max)
2568 off += noff;
2569 if (reg_off_by_arg[OP(scan)])
2570 ARG(scan) = off;
2571 else
2572 NEXT_OFF(scan) = off;
2573 }
a3621e74 2574
c277df42 2575
8aa23a47
YO
2576
2577 /* The principal pseudo-switch. Cannot be a switch, since we
2578 look into several different things. */
2579 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2580 || OP(scan) == IFTHEN) {
2581 next = regnext(scan);
2582 code = OP(scan);
2583 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2584
2585 if (OP(next) == code || code == IFTHEN) {
2586 /* NOTE - There is similar code to this block below for handling
2587 TRIE nodes on a re-study. If you change stuff here check there
2588 too. */
2589 I32 max1 = 0, min1 = I32_MAX, num = 0;
2590 struct regnode_charclass_class accum;
2591 regnode * const startbranch=scan;
2592
2593 if (flags & SCF_DO_SUBSTR)
304ee84b 2594 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2595 if (flags & SCF_DO_STCLASS)
2596 cl_init_zero(pRExC_state, &accum);
2597
2598 while (OP(scan) == code) {
2599 I32 deltanext, minnext, f = 0, fake;
2600 struct regnode_charclass_class this_class;
2601
2602 num++;
2603 data_fake.flags = 0;
2604 if (data) {
2605 data_fake.whilem_c = data->whilem_c;
2606 data_fake.last_closep = data->last_closep;
2607 }
2608 else
2609 data_fake.last_closep = &fake;
58e23c8d
YO
2610
2611 data_fake.pos_delta = delta;
8aa23a47
YO
2612 next = regnext(scan);
2613 scan = NEXTOPER(scan);
2614 if (code != BRANCH)
c277df42 2615 scan = NEXTOPER(scan);
8aa23a47
YO
2616 if (flags & SCF_DO_STCLASS) {
2617 cl_init(pRExC_state, &this_class);
2618 data_fake.start_class = &this_class;
2619 f = SCF_DO_STCLASS_AND;
58e23c8d 2620 }
8aa23a47
YO
2621 if (flags & SCF_WHILEM_VISITED_POS)
2622 f |= SCF_WHILEM_VISITED_POS;
2623
2624 /* we suppose the run is continuous, last=next...*/
2625 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2626 next, &data_fake,
2627 stopparen, recursed, NULL, f,depth+1);
2628 if (min1 > minnext)
2629 min1 = minnext;
2630 if (max1 < minnext + deltanext)
2631 max1 = minnext + deltanext;
2632 if (deltanext == I32_MAX)
2633 is_inf = is_inf_internal = 1;
2634 scan = next;
2635 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2636 pars++;
2637 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2638 if ( stopmin > minnext)
2639 stopmin = min + min1;
2640 flags &= ~SCF_DO_SUBSTR;
2641 if (data)
2642 data->flags |= SCF_SEEN_ACCEPT;
2643 }
2644 if (data) {
2645 if (data_fake.flags & SF_HAS_EVAL)
2646 data->flags |= SF_HAS_EVAL;
2647 data->whilem_c = data_fake.whilem_c;
3dab1dad 2648 }
8aa23a47
YO
2649 if (flags & SCF_DO_STCLASS)
2650 cl_or(pRExC_state, &accum, &this_class);
2651 }
2652 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2653 min1 = 0;
2654 if (flags & SCF_DO_SUBSTR) {
2655 data->pos_min += min1;
2656 data->pos_delta += max1 - min1;
2657 if (max1 != min1 || is_inf)
2658 data->longest = &(data->longest_float);
2659 }
2660 min += min1;
2661 delta += max1 - min1;
2662 if (flags & SCF_DO_STCLASS_OR) {
2663 cl_or(pRExC_state, data->start_class, &accum);
2664 if (min1) {
2665 cl_and(data->start_class, and_withp);
2666 flags &= ~SCF_DO_STCLASS;
653099ff 2667 }
8aa23a47
YO
2668 }
2669 else if (flags & SCF_DO_STCLASS_AND) {
2670 if (min1) {
2671 cl_and(data->start_class, &accum);
2672 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2673 }
8aa23a47
YO
2674 else {
2675 /* Switch to OR mode: cache the old value of
2676 * data->start_class */
2677 INIT_AND_WITHP;
2678 StructCopy(data->start_class, and_withp,
2679 struct regnode_charclass_class);
2680 flags &= ~SCF_DO_STCLASS_AND;
2681 StructCopy(&accum, data->start_class,
2682 struct regnode_charclass_class);
2683 flags |= SCF_DO_STCLASS_OR;
2684 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2685 }
8aa23a47 2686 }
a3621e74 2687
8aa23a47
YO
2688 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2689 /* demq.
a3621e74 2690
8aa23a47
YO
2691 Assuming this was/is a branch we are dealing with: 'scan' now
2692 points at the item that follows the branch sequence, whatever
2693 it is. We now start at the beginning of the sequence and look
2694 for subsequences of
a3621e74 2695
8aa23a47
YO
2696 BRANCH->EXACT=>x1
2697 BRANCH->EXACT=>x2
2698 tail
a3621e74 2699
8aa23a47 2700 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2701
8aa23a47
YO
2702 If we can find such a subseqence we need to turn the first
2703 element into a trie and then add the subsequent branch exact
2704 strings to the trie.
a3621e74 2705
8aa23a47 2706 We have two cases
a3621e74 2707
8aa23a47 2708 1. patterns where the whole set of branch can be converted.
a3621e74 2709
8aa23a47 2710 2. patterns where only a subset can be converted.
a3621e74 2711
8aa23a47
YO
2712 In case 1 we can replace the whole set with a single regop
2713 for the trie. In case 2 we need to keep the start and end
2714 branchs so
a3621e74 2715
8aa23a47
YO
2716 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2717 becomes BRANCH TRIE; BRANCH X;
786e8c11 2718
8aa23a47
YO
2719 There is an additional case, that being where there is a
2720 common prefix, which gets split out into an EXACT like node
2721 preceding the TRIE node.
a3621e74 2722
8aa23a47
YO
2723 If x(1..n)==tail then we can do a simple trie, if not we make
2724 a "jump" trie, such that when we match the appropriate word
2725 we "jump" to the appopriate tail node. Essentailly we turn
2726 a nested if into a case structure of sorts.
b515a41d 2727
8aa23a47
YO
2728 */
2729
2730 int made=0;
2731 if (!re_trie_maxbuff) {
2732 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2733 if (!SvIOK(re_trie_maxbuff))
2734 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2735 }
2736 if ( SvIV(re_trie_maxbuff)>=0 ) {
2737 regnode *cur;
2738 regnode *first = (regnode *)NULL;
2739 regnode *last = (regnode *)NULL;
2740 regnode *tail = scan;
2741 U8 optype = 0;
2742 U32 count=0;
a3621e74
YO
2743
2744#ifdef DEBUGGING
8aa23a47 2745 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2746#endif
8aa23a47
YO
2747 /* var tail is used because there may be a TAIL
2748 regop in the way. Ie, the exacts will point to the
2749 thing following the TAIL, but the last branch will
2750 point at the TAIL. So we advance tail. If we
2751 have nested (?:) we may have to move through several
2752 tails.
2753 */
2754
2755 while ( OP( tail ) == TAIL ) {
2756 /* this is the TAIL generated by (?:) */
2757 tail = regnext( tail );
2758 }
a3621e74 2759
8aa23a47
YO
2760
2761 DEBUG_OPTIMISE_r({
2762 regprop(RExC_rx, mysv, tail );
2763 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2764 (int)depth * 2 + 2, "",
2765 "Looking for TRIE'able sequences. Tail node is: ",
2766 SvPV_nolen_const( mysv )
2767 );
2768 });
2769
2770 /*
2771
2772 step through the branches, cur represents each
2773 branch, noper is the first thing to be matched
2774 as part of that branch and noper_next is the
2775 regnext() of that node. if noper is an EXACT
2776 and noper_next is the same as scan (our current
2777 position in the regex) then the EXACT branch is
2778 a possible optimization target. Once we have
2779 two or more consequetive such branches we can
2780 create a trie of the EXACT's contents and stich
2781 it in place. If the sequence represents all of
2782 the branches we eliminate the whole thing and
2783 replace it with a single TRIE. If it is a
2784 subsequence then we need to stitch it in. This
2785 means the first branch has to remain, and needs
2786 to be repointed at the item on the branch chain
2787 following the last branch optimized. This could
2788 be either a BRANCH, in which case the
2789 subsequence is internal, or it could be the
2790 item following the branch sequence in which
2791 case the subsequence is at the end.
2792
2793 */
2794
2795 /* dont use tail as the end marker for this traverse */
2796 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2797 regnode * const noper = NEXTOPER( cur );
b515a41d 2798#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2799 regnode * const noper_next = regnext( noper );
b515a41d
YO
2800#endif
2801
8aa23a47
YO
2802 DEBUG_OPTIMISE_r({
2803 regprop(RExC_rx, mysv, cur);
2804 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2805 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2806
2807 regprop(RExC_rx, mysv, noper);
2808 PerlIO_printf( Perl_debug_log, " -> %s",
2809 SvPV_nolen_const(mysv));
2810
2811 if ( noper_next ) {
2812 regprop(RExC_rx, mysv, noper_next );
2813 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2814 SvPV_nolen_const(mysv));
2815 }
2816 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2817 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2818 });
2819 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2820 : PL_regkind[ OP( noper ) ] == EXACT )
2821 || OP(noper) == NOTHING )
786e8c11 2822#ifdef NOJUMPTRIE
8aa23a47 2823 && noper_next == tail
786e8c11 2824#endif
8aa23a47
YO
2825 && count < U16_MAX)
2826 {
2827 count++;
2828 if ( !first || optype == NOTHING ) {
2829 if (!first) first = cur;
2830 optype = OP( noper );
2831 } else {
2832 last = cur;
2833 }
2834 } else {
a0a388a1 2835/*
0abd0d78
YO
2836 Currently we do not believe that the trie logic can
2837 handle case insensitive matching properly when the
2838 pattern is not unicode (thus forcing unicode semantics).
2839
2840 If/when this is fixed the following define can be swapped
2841 in below to fully enable trie logic.
2842
a0a388a1 2843#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2844
2845*/
2846#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2847
a0a388a1 2848 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2849 make_trie( pRExC_state,
2850 startbranch, first, cur, tail, count,
2851 optype, depth+1 );
2852 }
2853 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2854#ifdef NOJUMPTRIE
8aa23a47 2855 && noper_next == tail
786e8c11 2856#endif
8aa23a47
YO
2857 ){
2858 count = 1;
2859 first = cur;
2860 optype = OP( noper );
2861 } else {
2862 count = 0;
2863 first = NULL;
2864 optype = 0;
2865 }
2866 last = NULL;
2867 }
2868 }
2869 DEBUG_OPTIMISE_r({
2870 regprop(RExC_rx, mysv, cur);
2871 PerlIO_printf( Perl_debug_log,
2872 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2873 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2874
2875 });
a0a388a1
YO
2876
2877 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2878 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2879#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2880 if ( ((made == MADE_EXACT_TRIE &&
2881 startbranch == first)
2882 || ( first_non_open == first )) &&
2883 depth==0 ) {
2884 flags |= SCF_TRIE_RESTUDY;
2885 if ( startbranch == first
2886 && scan == tail )
2887 {
2888 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2889 }
2890 }
3dab1dad 2891#endif
8aa23a47
YO
2892 }
2893 }
2894
2895 } /* do trie */
2896
653099ff 2897 }
8aa23a47
YO
2898 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2899 scan = NEXTOPER(NEXTOPER(scan));
2900 } else /* single branch is optimized. */
2901 scan = NEXTOPER(scan);
2902 continue;
2903 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2904 scan_frame *newframe = NULL;
2905 I32 paren;
2906 regnode *start;
2907 regnode *end;
2908
2909 if (OP(scan) != SUSPEND) {
2910 /* set the pointer */
2911 if (OP(scan) == GOSUB) {
2912 paren = ARG(scan);
2913 RExC_recurse[ARG2L(scan)] = scan;
2914 start = RExC_open_parens[paren-1];
2915 end = RExC_close_parens[paren-1];
2916 } else {
2917 paren = 0;
f8fc2ecf 2918 start = RExC_rxi->program + 1;
8aa23a47
YO
2919 end = RExC_opend;
2920 }
2921 if (!recursed) {
2922 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2923 SAVEFREEPV(recursed);
2924 }
2925 if (!PAREN_TEST(recursed,paren+1)) {
2926 PAREN_SET(recursed,paren+1);
2927 Newx(newframe,1,scan_frame);
2928 } else {
2929 if (flags & SCF_DO_SUBSTR) {
304ee84b 2930 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2931 data->longest = &(data->longest_float);
2932 }
2933 is_inf = is_inf_internal = 1;
2934 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2935 cl_anything(pRExC_state, data->start_class);
2936 flags &= ~SCF_DO_STCLASS;
2937 }
2938 } else {
2939 Newx(newframe,1,scan_frame);
2940 paren = stopparen;
2941 start = scan+2;
2942 end = regnext(scan);
2943 }
2944 if (newframe) {
2945 assert(start);
2946 assert(end);
2947 SAVEFREEPV(newframe);
2948 newframe->next = regnext(scan);
2949 newframe->last = last;
2950 newframe->stop = stopparen;
2951 newframe->prev = frame;
2952
2953 frame = newframe;
2954 scan = start;
2955 stopparen = paren;
2956 last = end;
2957
2958 continue;
2959 }
2960 }
2961 else if (OP(scan) == EXACT) {
2962 I32 l = STR_LEN(scan);
2963 UV uc;
2964 if (UTF) {
2965 const U8 * const s = (U8*)STRING(scan);
2966 l = utf8_length(s, s + l);
2967 uc = utf8_to_uvchr(s, NULL);
2968 } else {
2969 uc = *((U8*)STRING(scan));
2970 }
2971 min += l;
2972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2973 /* The code below prefers earlier match for fixed
2974 offset, later match for variable offset. */
2975 if (data->last_end == -1) { /* Update the start info. */
2976 data->last_start_min = data->pos_min;
2977 data->last_start_max = is_inf
2978 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2979 }
8aa23a47
YO
2980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2981 if (UTF)
2982 SvUTF8_on(data->last_found);
2983 {
2984 SV * const sv = data->last_found;
2985 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2986 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2987 if (mg && mg->mg_len >= 0)
2988 mg->mg_len += utf8_length((U8*)STRING(scan),
2989 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2990 }
8aa23a47
YO
2991 data->last_end = data->pos_min + l;
2992 data->pos_min += l; /* As in the first entry. */
2993 data->flags &= ~SF_BEFORE_EOL;
2994 }
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 /* Check whether it is compatible with what we know already! */
2997 int compat = 1;
2998
2999 if (uc >= 0x100 ||
3000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3002 && (!(data->start_class->flags & ANYOF_FOLD)
3003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3004 )
3005 compat = 0;
3006 ANYOF_CLASS_ZERO(data->start_class);
3007 ANYOF_BITMAP_ZERO(data->start_class);
3008 if (compat)
3009 ANYOF_BITMAP_SET(data->start_class, uc);
3010 data->start_class->flags &= ~ANYOF_EOS;
3011 if (uc < 0x100)
3012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3013 }
3014 else if (flags & SCF_DO_STCLASS_OR) {
3015 /* false positive possible if the class is case-folded */
3016 if (uc < 0x100)
3017 ANYOF_BITMAP_SET(data->start_class, uc);
3018 else
3019 data->start_class->flags |= ANYOF_UNICODE_ALL;
3020 data->start_class->flags &= ~ANYOF_EOS;
3021 cl_and(data->start_class, and_withp);
3022 }
3023 flags &= ~SCF_DO_STCLASS;
3024 }
3025 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3026 I32 l = STR_LEN(scan);
3027 UV uc = *((U8*)STRING(scan));
3028
3029 /* Search for fixed substrings supports EXACT only. */
3030 if (flags & SCF_DO_SUBSTR) {
3031 assert(data);
304ee84b 3032 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3033 }
3034 if (UTF) {
3035 const U8 * const s = (U8 *)STRING(scan);
3036 l = utf8_length(s, s + l);
3037 uc = utf8_to_uvchr(s, NULL);
3038 }
3039 min += l;
3040 if (flags & SCF_DO_SUBSTR)
3041 data->pos_min += l;
3042 if (flags & SCF_DO_STCLASS_AND) {
3043 /* Check whether it is compatible with what we know already! */
3044 int compat = 1;
3045
3046 if (uc >= 0x100 ||
3047 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3048 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3049 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3050 compat = 0;
3051 ANYOF_CLASS_ZERO(data->start_class);
3052 ANYOF_BITMAP_ZERO(data->start_class);
3053 if (compat) {
3054 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3055 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3056 data->start_class->flags |= ANYOF_FOLD;
3057 if (OP(scan) == EXACTFL)
3058 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3059 }
8aa23a47
YO
3060 }
3061 else if (flags & SCF_DO_STCLASS_OR) {
3062 if (data->start_class->flags & ANYOF_FOLD) {
3063 /* false positive possible if the class is case-folded.
3064 Assume that the locale settings are the same... */
1aa99e6b
IH
3065 if (uc < 0x100)
3066 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3067 data->start_class->flags &= ~ANYOF_EOS;
3068 }
8aa23a47 3069 cl_and(data->start_class, and_withp);
653099ff 3070 }
8aa23a47
YO
3071 flags &= ~SCF_DO_STCLASS;
3072 }
3073 else if (strchr((const char*)PL_varies,OP(scan))) {
3074 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3075 I32 f = flags, pos_before = 0;
3076 regnode * const oscan = scan;
3077 struct regnode_charclass_class this_class;
3078 struct regnode_charclass_class *oclass = NULL;
3079 I32 next_is_eval = 0;
3080
3081 switch (PL_regkind[OP(scan)]) {
3082 case WHILEM: /* End of (?:...)* . */
3083 scan = NEXTOPER(scan);
3084 goto finish;
3085 case PLUS:
3086 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3087 next = NEXTOPER(scan);
3088 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3089 mincount = 1;
3090 maxcount = REG_INFTY;
3091 next = regnext(scan);
3092 scan = NEXTOPER(scan);
3093 goto do_curly;
3094 }
3095 }
3096 if (flags & SCF_DO_SUBSTR)
3097 data->pos_min++;
3098 min++;
3099 /* Fall through. */
3100 case STAR:
3101 if (flags & SCF_DO_STCLASS) {
3102 mincount = 0;
3103 maxcount = REG_INFTY;
3104 next = regnext(scan);
3105 scan = NEXTOPER(scan);
3106 goto do_curly;
3107 }
3108 is_inf = is_inf_internal = 1;
3109 scan = regnext(scan);
c277df42 3110 if (flags & SCF_DO_SUBSTR) {
304ee84b 3111 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3112 data->longest = &(data->longest_float);
c277df42 3113 }
8aa23a47
YO
3114 goto optimize_curly_tail;
3115 case CURLY:
3116 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3117 && (scan->flags == stopparen))
3118 {
3119 mincount = 1;
3120 maxcount = 1;
3121 } else {
3122 mincount = ARG1(scan);
3123 maxcount = ARG2(scan);
653099ff 3124 }
8aa23a47
YO
3125 next = regnext(scan);
3126 if (OP(scan) == CURLYX) {
3127 I32 lp = (data ? *(data->last_closep) : 0);
3128 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3129 }
8aa23a47
YO
3130 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3131 next_is_eval = (OP(scan) == EVAL);
3132 do_curly:
3133 if (flags & SCF_DO_SUBSTR) {
304ee84b 3134 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3135 pos_before = data->pos_min;
b45f050a 3136 }
8aa23a47
YO
3137 if (data) {
3138 fl = data->flags;
3139 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3140 if (is_inf)
3141 data->flags |= SF_IS_INF;
3142 }
3143 if (flags & SCF_DO_STCLASS) {
3144 cl_init(pRExC_state, &this_class);
3145 oclass = data->start_class;
3146 data->start_class = &this_class;
3147 f |= SCF_DO_STCLASS_AND;
3148 f &= ~SCF_DO_STCLASS_OR;
3149 }
3150 /* These are the cases when once a subexpression
3151 fails at a particular position, it cannot succeed
3152 even after backtracking at the enclosing scope.
3153
3154 XXXX what if minimal match and we are at the
3155 initial run of {n,m}? */
3156 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3157 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3158
8aa23a47
YO
3159 /* This will finish on WHILEM, setting scan, or on NULL: */
3160 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3161 last, data, stopparen, recursed, NULL,
3162 (mincount == 0
3163 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3164
8aa23a47
YO
3165 if (flags & SCF_DO_STCLASS)
3166 data->start_class = oclass;
3167 if (mincount == 0 || minnext == 0) {
3168 if (flags & SCF_DO_STCLASS_OR) {
3169 cl_or(pRExC_state, data->start_class, &this_class);
3170 }
3171 else if (flags & SCF_DO_STCLASS_AND) {
3172 /* Switch to OR mode: cache the old value of
3173 * data->start_class */
3174 INIT_AND_WITHP;
3175 StructCopy(data->start_class, and_withp,
3176 struct regnode_charclass_class);
3177 flags &= ~SCF_DO_STCLASS_AND;
3178 StructCopy(&this_class, data->start_class,
3179 struct regnode_charclass_class);
3180 flags |= SCF_DO_STCLASS_OR;
3181 data->start_class->flags |= ANYOF_EOS;
3182 }
3183 } else { /* Non-zero len */
3184 if (flags & SCF_DO_STCLASS_OR) {
3185 cl_or(pRExC_state, data->start_class, &this_class);
3186 cl_and(data->start_class, and_withp);
3187 }
3188 else if (flags & SCF_DO_STCLASS_AND)
3189 cl_and(data->start_class, &this_class);
3190 flags &= ~SCF_DO_STCLASS;
3191 }
3192 if (!scan) /* It was not CURLYX, but CURLY. */
3193 scan = next;
3194 if ( /* ? quantifier ok, except for (?{ ... }) */
3195 (next_is_eval || !(mincount == 0 && maxcount == 1))
3196 && (minnext == 0) && (deltanext == 0)
3197 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3198 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3199 {
668c081a
NC
3200 ckWARNreg(RExC_parse,
3201 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3202 }
3203
3204 min += minnext * mincount;
3205 is_inf_internal |= ((maxcount == REG_INFTY
3206 && (minnext + deltanext) > 0)
3207 || deltanext == I32_MAX);
3208 is_inf |= is_inf_internal;
3209 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3210
3211 /* Try powerful optimization CURLYX => CURLYN. */
3212 if ( OP(oscan) == CURLYX && data
3213 && data->flags & SF_IN_PAR
3214 && !(data->flags & SF_HAS_EVAL)
3215 && !deltanext && minnext == 1 ) {
3216 /* Try to optimize to CURLYN. */
3217 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3218 regnode * const nxt1 = nxt;
497b47a8 3219#ifdef DEBUGGING
8aa23a47 3220 regnode *nxt2;
497b47a8 3221#endif
c277df42 3222
8aa23a47
YO
3223 /* Skip open. */
3224 nxt = regnext(nxt);
3225 if (!strchr((const char*)PL_simple,OP(nxt))
3226 && !(PL_regkind[OP(nxt)] == EXACT
3227 && STR_LEN(nxt) == 1))
3228 goto nogo;
497b47a8 3229#ifdef DEBUGGING
8aa23a47 3230 nxt2 = nxt;
497b47a8 3231#endif
8aa23a47
YO
3232 nxt = regnext(nxt);
3233 if (OP(nxt) != CLOSE)
3234 goto nogo;
3235 if (RExC_open_parens) {
3236 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3237 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3238 }
3239 /* Now we know that nxt2 is the only contents: */
3240 oscan->flags = (U8)ARG(nxt);
3241 OP(oscan) = CURLYN;
3242 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3243
c277df42 3244#ifdef DEBUGGING
8aa23a47
YO
3245 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3246 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3247 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3248 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3249 OP(nxt + 1) = OPTIMIZED; /* was count. */
3250 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3251#endif
8aa23a47
YO
3252 }
3253 nogo:
3254
3255 /* Try optimization CURLYX => CURLYM. */
3256 if ( OP(oscan) == CURLYX && data
3257 && !(data->flags & SF_HAS_PAR)
3258 && !(data->flags & SF_HAS_EVAL)
3259 && !deltanext /* atom is fixed width */
3260 && minnext != 0 /* CURLYM can't handle zero width */
3261 ) {
3262 /* XXXX How to optimize if data == 0? */
3263 /* Optimize to a simpler form. */
3264 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3265 regnode *nxt2;
3266
3267 OP(oscan) = CURLYM;
3268 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3269 && (OP(nxt2) != WHILEM))
3270 nxt = nxt2;
3271 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3272 /* Need to optimize away parenths. */
3273 if (data->flags & SF_IN_PAR) {
3274 /* Set the parenth number. */
3275 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3276
3277 if (OP(nxt) != CLOSE)
3278 FAIL("Panic opt close");
3279 oscan->flags = (U8)ARG(nxt);
3280 if (RExC_open_parens) {
3281 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3282 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3283 }
8aa23a47
YO
3284 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3285 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3286
c277df42 3287#ifdef DEBUGGING
8aa23a47
YO
3288 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3289 OP(nxt + 1) = OPTIMIZED; /* was count. */
3290 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3291 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3292#endif
c277df42 3293#if 0
8aa23a47
YO
3294 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3295 regnode *nnxt = regnext(nxt1);
3296
3297 if (nnxt == nxt) {
3298 if (reg_off_by_arg[OP(nxt1)])
3299 ARG_SET(nxt1, nxt2 - nxt1);
3300 else if (nxt2 - nxt1 < U16_MAX)
3301 NEXT_OFF(nxt1) = nxt2 - nxt1;
3302 else
3303 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3304 }
8aa23a47 3305 nxt1 = nnxt;
c277df42 3306 }
5d1c421c 3307#endif
8aa23a47
YO
3308 /* Optimize again: */
3309 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3310 NULL, stopparen, recursed, NULL, 0,depth+1);
3311 }
3312 else
3313 oscan->flags = 0;
3314 }
3315 else if ((OP(oscan) == CURLYX)
3316 && (flags & SCF_WHILEM_VISITED_POS)
3317 /* See the comment on a similar expression above.
3318 However, this time it not a subexpression
3319 we care about, but the expression itself. */
3320 && (maxcount == REG_INFTY)
3321 && data && ++data->whilem_c < 16) {
3322 /* This stays as CURLYX, we can put the count/of pair. */
3323 /* Find WHILEM (as in regexec.c) */
3324 regnode *nxt = oscan + NEXT_OFF(oscan);
3325
3326 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3327 nxt += ARG(nxt);
3328 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3329 | (RExC_whilem_seen << 4)); /* On WHILEM */
3330 }
3331 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3332 pars++;
3333 if (flags & SCF_DO_SUBSTR) {
3334 SV *last_str = NULL;
3335 int counted = mincount != 0;
a0ed51b3 3336
8aa23a47
YO
3337 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3338#if defined(SPARC64_GCC_WORKAROUND)
3339 I32 b = 0;
3340 STRLEN l = 0;
3341 const char *s = NULL;
3342 I32 old = 0;
b515a41d 3343
8aa23a47
YO
3344 if (pos_before >= data->last_start_min)
3345 b = pos_before;
3346 else
3347 b = data->last_start_min;
b515a41d 3348
8aa23a47
YO
3349 l = 0;
3350 s = SvPV_const(data->last_found, l);
3351 old = b - data->last_start_min;
3352
3353#else
3354 I32 b = pos_before >= data->last_start_min
3355 ? pos_before : data->last_start_min;
3356 STRLEN l;
3357 const char * const s = SvPV_const(data->last_found, l);
3358 I32 old = b - data->last_start_min;
3359#endif
3360
3361 if (UTF)
3362 old = utf8_hop((U8*)s, old) - (U8*)s;
3363
3364 l -= old;
3365 /* Get the added string: */
740cce10 3366 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3367 if (deltanext == 0 && pos_before == b) {
3368 /* What was added is a constant string */
3369 if (mincount > 1) {
3370 SvGROW(last_str, (mincount * l) + 1);
3371 repeatcpy(SvPVX(last_str) + l,
3372 SvPVX_const(last_str), l, mincount - 1);
3373 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3374 /* Add additional parts. */
3375 SvCUR_set(data->last_found,
3376 SvCUR(data->last_found) - l);
3377 sv_catsv(data->last_found, last_str);
3378 {
3379 SV * sv = data->last_found;
3380 MAGIC *mg =
3381 SvUTF8(sv) && SvMAGICAL(sv) ?
3382 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3383 if (mg && mg->mg_len >= 0)
bd94e887 3384 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3385 }
8aa23a47 3386 data->last_end += l * (mincount - 1);
b515a41d 3387 }
8aa23a47
YO
3388 } else {
3389 /* start offset must point into the last copy */
3390 data->last_start_min += minnext * (mincount - 1);
3391 data->last_start_max += is_inf ? I32_MAX
3392 : (maxcount - 1) * (minnext + data->pos_delta);
3393 }
c277df42 3394 }
8aa23a47
YO
3395 /* It is counted once already... */
3396 data->pos_min += minnext * (mincount - counted);
3397 data->pos_delta += - counted * deltanext +
3398 (minnext + deltanext) * maxcount - minnext * mincount;
3399 if (mincount != maxcount) {
3400 /* Cannot extend fixed substrings found inside
3401 the group. */
304ee84b 3402 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3403 if (mincount && last_str) {
3404 SV * const sv = data->last_found;
3405 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3406 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3407
3408 if (mg)
3409 mg->mg_len = -1;
3410 sv_setsv(sv, last_str);
3411 data->last_end = data->pos_min;
3412 data->last_start_min =
3413 data->pos_min - CHR_SVLEN(last_str);
3414 data->last_start_max = is_inf
3415 ? I32_MAX
3416 : data->pos_min + data->pos_delta
3417 - CHR_SVLEN(last_str);
3418 }
3419 data->longest = &(data->longest_float);
3420 }
3421 SvREFCNT_dec(last_str);
c277df42 3422 }
8aa23a47
YO
3423 if (data && (fl & SF_HAS_EVAL))
3424 data->flags |= SF_HAS_EVAL;
3425 optimize_curly_tail:
3426 if (OP(oscan) != CURLYX) {
3427 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3428 && NEXT_OFF(next))
3429 NEXT_OFF(oscan) += NEXT_OFF(next);
3430 }
3431 continue;
3432 default: /* REF and CLUMP only? */
3433 if (flags & SCF_DO_SUBSTR) {
304ee84b 3434 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3435 data->longest = &(data->longest_float);
3436 }
3437 is_inf = is_inf_internal = 1;
3438 if (flags & SCF_DO_STCLASS_OR)
3439 cl_anything(pRExC_state, data->start_class);
3440 flags &= ~SCF_DO_STCLASS;
3441 break;
c277df42 3442 }
8aa23a47 3443 }
e1d1eefb
YO
3444 else if (OP(scan) == LNBREAK) {
3445 if (flags & SCF_DO_STCLASS) {
3446 int value = 0;
3447 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3448 if (flags & SCF_DO_STCLASS_AND) {
3449 for (value = 0; value < 256; value++)
e64b1bd1 3450 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3451 ANYOF_BITMAP_CLEAR(data->start_class, value);
3452 }
3453 else {
3454 for (value = 0; value < 256; value++)
e64b1bd1 3455 if (is_VERTWS_cp(value))
e1d1eefb
YO
3456 ANYOF_BITMAP_SET(data->start_class, value);
3457 }
3458 if (flags & SCF_DO_STCLASS_OR)
3459 cl_and(data->start_class, and_withp);
3460 flags &= ~SCF_DO_STCLASS;
3461 }
3462 min += 1;
f9a79580 3463 delta += 1;
e1d1eefb
YO
3464 if (flags & SCF_DO_SUBSTR) {
3465 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3466 data->pos_min += 1;
f9a79580 3467 data->pos_delta += 1;
e1d1eefb
YO
3468 data->longest = &(data->longest_float);
3469 }
3470
3471 }
f9a79580
RGS
3472 else if (OP(scan) == FOLDCHAR) {
3473 int d = ARG(scan)==0xDF ? 1 : 2;
3474 flags &= ~SCF_DO_STCLASS;
3475 min += 1;
3476 delta += d;
3477 if (flags & SCF_DO_SUBSTR) {
3478 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3479 data->pos_min += 1;
3480 data->pos_delta += d;
3481 data->longest = &(data->longest_float);
3482 }
3483 }
8aa23a47
YO
3484 else if (strchr((const char*)PL_simple,OP(scan))) {
3485 int value = 0;
653099ff 3486
8aa23a47 3487 if (flags & SCF_DO_SUBSTR) {
304ee84b 3488 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3489 data->pos_min++;
3490 }
3491 min++;
3492 if (flags & SCF_DO_STCLASS) {
3493 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3494
8aa23a47
YO
3495 /* Some of the logic below assumes that switching
3496 locale on will only add false positives. */
3497 switch (PL_regkind[OP(scan)]) {
3498 case SANY:
3499 default:
3500 do_default:
3501 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3502 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3503 cl_anything(pRExC_state, data->start_class);
3504 break;
3505 case REG_ANY:
3506 if (OP(scan) == SANY)
3507 goto do_default;
3508 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3509 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3510 || (data->start_class->flags & ANYOF_CLASS));
3511 cl_anything(pRExC_state, data->start_class);
653099ff 3512 }
8aa23a47
YO
3513 if (flags & SCF_DO_STCLASS_AND || !value)
3514 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3515 break;
3516 case ANYOF:
3517 if (flags & SCF_DO_STCLASS_AND)
3518 cl_and(data->start_class,
3519 (struct regnode_charclass_class*)scan);
653099ff 3520 else
8aa23a47
YO
3521 cl_or(pRExC_state, data->start_class,
3522 (struct regnode_charclass_class*)scan);
3523 break;
3524 case ALNUM:
3525 if (flags & SCF_DO_STCLASS_AND) {
3526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3528 for (value = 0; value < 256; value++)
3529 if (!isALNUM(value))
3530 ANYOF_BITMAP_CLEAR(data->start_class, value);
3531 }
653099ff 3532 }
8aa23a47
YO
3533 else {
3534 if (data->start_class->flags & ANYOF_LOCALE)
3535 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3536 else {
3537 for (value = 0; value < 256; value++)
3538 if (isALNUM(value))
3539 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3540 }
8aa23a47
YO
3541 }
3542 break;
3543 case ALNUML:
3544 if (flags & SCF_DO_STCLASS_AND) {
3545 if (data->start_class->flags & ANYOF_LOCALE)
3546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3547 }
3548 else {
3549 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3550 data->start_class->flags |= ANYOF_LOCALE;
3551 }
3552 break;
3553 case NALNUM:
3554 if (flags & SCF_DO_STCLASS_AND) {
3555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3557 for (value = 0; value < 256; value++)
3558 if (isALNUM(value))
3559 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3560 }
3561 }
8aa23a47
YO
3562 else {
3563 if (data->start_class->flags & ANYOF_LOCALE)
3564 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3565 else {
3566 for (value = 0; value < 256; value++)
3567 if (!isALNUM(value))
3568 ANYOF_BITMAP_SET(data->start_class, value);
3569 }
653099ff 3570 }
8aa23a47
YO
3571 break;
3572 case NALNUML:
3573 if (flags & SCF_DO_STCLASS_AND) {
3574 if (data->start_class->flags & ANYOF_LOCALE)
3575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3576 }
8aa23a47
YO
3577 else {
3578 data->start_class->flags |= ANYOF_LOCALE;
3579 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3580 }
3581 break;
3582 case SPACE:
3583 if (flags & SCF_DO_STCLASS_AND) {
3584 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3585 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3586 for (value = 0; value < 256; value++)
3587 if (!isSPACE(value))
3588 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3589 }
3590 }
8aa23a47
YO
3591 else {
3592 if (data->start_class->flags & ANYOF_LOCALE)
3593 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3594 else {
3595 for (value = 0; value < 256; value++)
3596 if (isSPACE(value))
3597 ANYOF_BITMAP_SET(data->start_class, value);
3598 }
653099ff 3599 }
8aa23a47
YO
3600 break;
3601 case SPACEL:
3602 if (flags & SCF_DO_STCLASS_AND) {
3603 if (data->start_class->flags & ANYOF_LOCALE)
3604 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3605 }
3606 else {
3607 data->start_class->flags |= ANYOF_LOCALE;
3608 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3609 }
3610 break;
3611 case NSPACE:
3612 if (flags & SCF_DO_STCLASS_AND) {
3613 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3614 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3615 for (value = 0; value < 256; value++)
3616 if (isSPACE(value))
3617 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3618 }
8aa23a47
YO
3619 }
3620 else {
3621 if (data->start_class->flags & ANYOF_LOCALE)
3622 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3623 else {
3624 for (value = 0; value < 256; value++)
3625 if (!isSPACE(value))
3626 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3627 }
3628 }
8aa23a47
YO
3629 break;
3630 case NSPACEL:
3631 if (flags & SCF_DO_STCLASS_AND) {
3632 if (data->start_class->flags & ANYOF_LOCALE) {
3633 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3634 for (value = 0; value < 256; value++)
3635 if (!isSPACE(value))
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3637 }
653099ff 3638 }
8aa23a47
YO
3639 else {
3640 data->start_class->flags |= ANYOF_LOCALE;
3641 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3642 }
3643 break;
3644 case DIGIT:
3645 if (flags & SCF_DO_STCLASS_AND) {
3646 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3647 for (value = 0; value < 256; value++)
3648 if (!isDIGIT(value))
3649 ANYOF_BITMAP_CLEAR(data->start_class, value);
3650 }
3651 else {
3652 if (data->start_class->flags & ANYOF_LOCALE)
3653 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3654 else {
3655 for (value = 0; value < 256; value++)
3656 if (isDIGIT(value))
3657 ANYOF_BITMAP_SET(data->start_class, value);
3658 }
3659 }
3660 break;
3661 case NDIGIT:
3662 if (flags & SCF_DO_STCLASS_AND) {
3663 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3664 for (value = 0; value < 256; value++)
3665 if (isDIGIT(value))
3666 ANYOF_BITMAP_CLEAR(data->start_class, value);
3667 }
3668 else {
3669 if (data->start_class->flags & ANYOF_LOCALE)
3670 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3671 else {
3672 for (value = 0; value < 256; value++)
3673 if (!isDIGIT(value))
3674 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3675 }
3676 }
8aa23a47 3677 break;
e1d1eefb
YO
3678 CASE_SYNST_FNC(VERTWS);
3679 CASE_SYNST_FNC(HORIZWS);
3680
8aa23a47
YO
3681 }
3682 if (flags & SCF_DO_STCLASS_OR)
3683 cl_and(data->start_class, and_withp);
3684 flags &= ~SCF_DO_STCLASS;
3685 }
3686 }
3687 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3688 data->flags |= (OP(scan) == MEOL
3689 ? SF_BEFORE_MEOL
3690 : SF_BEFORE_SEOL);
3691 }
3692 else if ( PL_regkind[OP(scan)] == BRANCHJ
3693 /* Lookbehind, or need to calculate parens/evals/stclass: */
3694 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3695 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3696 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3697 || OP(scan) == UNLESSM )
3698 {
3699 /* Negative Lookahead/lookbehind
3700 In this case we can't do fixed string optimisation.
3701 */
1de06328 3702
8aa23a47
YO
3703 I32 deltanext, minnext, fake = 0;
3704 regnode *nscan;
3705 struct regnode_charclass_class intrnl;
3706 int f = 0;
1de06328 3707
8aa23a47
YO
3708 data_fake.flags = 0;
3709 if (data) {
3710 data_fake.whilem_c = data->whilem_c;
3711 data_fake.last_closep = data->last_closep;
c277df42 3712 }
8aa23a47
YO
3713 else
3714 data_fake.last_closep = &fake;
58e23c8d 3715 data_fake.pos_delta = delta;
8aa23a47
YO
3716 if ( flags & SCF_DO_STCLASS && !scan->flags
3717 && OP(scan) == IFMATCH ) { /* Lookahead */
3718 cl_init(pRExC_state, &intrnl);
3719 data_fake.start_class = &intrnl;
3720 f |= SCF_DO_STCLASS_AND;
3721 }
3722 if (flags & SCF_WHILEM_VISITED_POS)
3723 f |= SCF_WHILEM_VISITED_POS;
3724 next = regnext(scan);
3725 nscan = NEXTOPER(NEXTOPER(scan));
3726 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3727 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3728 if (scan->flags) {
3729 if (deltanext) {
58e23c8d 3730 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3731 }
3732 else if (minnext > (I32)U8_MAX) {
58e23c8d 3733 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3734 }
3735 scan->flags = (U8)minnext;
3736 }
3737 if (data) {
3738 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3739 pars++;
3740 if (data_fake.flags & SF_HAS_EVAL)
3741 data->flags |= SF_HAS_EVAL;
3742 data->whilem_c = data_fake.whilem_c;
3743 }
3744 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3745 if (flags & SCF_DO_STCLASS_OR) {
3746 /* OR before, AND after: ideally we would recurse with
3747 * data_fake to get the AND applied by study of the
3748 * remainder of the pattern, and then derecurse;
3749 * *** HACK *** for now just treat as "no information".
3750 * See [perl #56690].
3751 */
3752 cl_init(pRExC_state, data->start_class);
3753 } else {
3754 /* AND before and after: combine and continue */
3755 const int was = (data->start_class->flags & ANYOF_EOS);
3756
3757 cl_and(data->start_class, &intrnl);
3758 if (was)
3759 data->start_class->flags |= ANYOF_EOS;
3760 }
8aa23a47 3761 }
cb434fcc 3762 }
8aa23a47
YO
3763#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3764 else {
3765 /* Positive Lookahead/lookbehind
3766 In this case we can do fixed string optimisation,
3767 but we must be careful about it. Note in the case of
3768 lookbehind the positions will be offset by the minimum
3769 length of the pattern, something we won't know about
3770 until after the recurse.
3771 */
3772 I32 deltanext, fake = 0;
3773 regnode *nscan;
3774 struct regnode_charclass_class intrnl;
3775 int f = 0;
3776 /* We use SAVEFREEPV so that when the full compile
3777 is finished perl will clean up the allocated
3778 minlens when its all done. This was we don't
3779 have to worry about freeing them when we know
3780 they wont be used, which would be a pain.
3781 */
3782 I32 *minnextp;
3783 Newx( minnextp, 1, I32 );
3784 SAVEFREEPV(minnextp);
3785
3786 if (data) {
3787 StructCopy(data, &data_fake, scan_data_t);
3788 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3789 f |= SCF_DO_SUBSTR;
3790 if (scan->flags)
304ee84b 3791 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3792 data_fake.last_found=newSVsv(data->last_found);
3793 }
3794 }
3795 else
3796 data_fake.last_closep = &fake;
3797 data_fake.flags = 0;
58e23c8d 3798 data_fake.pos_delta = delta;
8aa23a47
YO
3799 if (is_inf)
3800 data_fake.flags |= SF_IS_INF;
3801 if ( flags & SCF_DO_STCLASS && !scan->flags
3802 && OP(scan) == IFMATCH ) { /* Lookahead */
3803 cl_init(pRExC_state, &intrnl);
3804 data_fake.start_class = &intrnl;
3805 f |= SCF_DO_STCLASS_AND;
3806 }
3807 if (flags & SCF_WHILEM_VISITED_POS)
3808 f |= SCF_WHILEM_VISITED_POS;
3809 next = regnext(scan);
3810 nscan = NEXTOPER(NEXTOPER(scan));
3811
3812 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3813 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3814 if (scan->flags) {
3815 if (deltanext) {
58e23c8d 3816 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3817 }
3818 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3819 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3820 }
3821 scan->flags = (U8)*minnextp;
3822 }
3823
3824 *minnextp += min;
3825
3826 if (f & SCF_DO_STCLASS_AND) {
3827 const int was = (data->start_class->flags & ANYOF_EOS);
3828
3829 cl_and(data->start_class, &intrnl);
3830 if (was)
3831 data->start_class->flags |= ANYOF_EOS;
3832 }
3833 if (data) {
3834 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3835 pars++;
3836 if (data_fake.flags & SF_HAS_EVAL)
3837 data->flags |= SF_HAS_EVAL;
3838 data->whilem_c = data_fake.whilem_c;
3839 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3840 if (RExC_rx->minlen<*minnextp)
3841 RExC_rx->minlen=*minnextp;
304ee84b 3842 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3843 SvREFCNT_dec(data_fake.last_found);
3844
3845 if ( data_fake.minlen_fixed != minlenp )
3846 {
3847 data->offset_fixed= data_fake.offset_fixed;
3848 data->minlen_fixed= data_fake.minlen_fixed;
3849 data->lookbehind_fixed+= scan->flags;
3850 }
3851 if ( data_fake.minlen_float != minlenp )
3852 {
3853 data->minlen_float= data_fake.minlen_float;
3854 data->offset_float_min=data_fake.offset_float_min;
3855 data->offset_float_max=data_fake.offset_float_max;
3856 data->lookbehind_float+= scan->flags;
3857 }
3858 }
3859 }
3860
3861
40d049e4 3862 }
8aa23a47
YO
3863#endif
3864 }
3865 else if (OP(scan) == OPEN) {
3866 if (stopparen != (I32)ARG(scan))
3867 pars++;
3868 }
3869 else if (OP(scan) == CLOSE) {
3870 if (stopparen == (I32)ARG(scan)) {
3871 break;
3872 }
3873 if ((I32)ARG(scan) == is_par) {
3874 next = regnext(scan);
b515a41d 3875
8aa23a47
YO
3876 if ( next && (OP(next) != WHILEM) && next < last)
3877 is_par = 0; /* Disable optimization */
40d049e4 3878 }
8aa23a47
YO
3879 if (data)
3880 *(data->last_closep) = ARG(scan);
3881 }
3882 else if (OP(scan) == EVAL) {
c277df42
IZ
3883 if (data)
3884 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3885 }
3886 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3887 if (flags & SCF_DO_SUBSTR) {
304ee84b 3888 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3889 flags &= ~SCF_DO_SUBSTR;
40d049e4 3890 }
8aa23a47
YO
3891 if (data && OP(scan)==ACCEPT) {
3892 data->flags |= SCF_SEEN_ACCEPT;
3893 if (stopmin > min)
3894 stopmin = min;
e2e6a0f1 3895 }
8aa23a47
YO
3896 }
3897 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3898 {
0f5d15d6 3899 if (flags & SCF_DO_SUBSTR) {
304ee84b 3900 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3901 data->longest = &(data->longest_float);
3902 }
3903 is_inf = is_inf_internal = 1;
653099ff 3904 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3905 cl_anything(pRExC_state, data->start_class);
96776eda 3906 flags &= ~SCF_DO_STCLASS;
8aa23a47 3907 }
58e23c8d 3908 else if (OP(scan) == GPOS) {
bbe252da 3909 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3910 !(delta || is_inf || (data && data->pos_delta)))
3911 {
bbe252da
YO
3912 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3913 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3914 if (RExC_rx->gofs < (U32)min)
3915 RExC_rx->gofs = min;
3916 } else {
bbe252da 3917 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3918 RExC_rx->gofs = 0;
3919 }
3920 }
786e8c11 3921#ifdef TRIE_STUDY_OPT
40d049e4 3922#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3923 else if (PL_regkind[OP(scan)] == TRIE) {
3924 /* NOTE - There is similar code to this block above for handling
3925 BRANCH nodes on the initial study. If you change stuff here
3926 check there too. */
3927 regnode *trie_node= scan;
3928 regnode *tail= regnext(scan);
f8fc2ecf 3929 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3930 I32 max1 = 0, min1 = I32_MAX;
3931 struct regnode_charclass_class accum;
3932
3933 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3934 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3935 if (flags & SCF_DO_STCLASS)
3936 cl_init_zero(pRExC_state, &accum);
3937
3938 if (!trie->jump) {
3939 min1= trie->minlen;
3940 max1= trie->maxlen;
3941 } else {
3942 const regnode *nextbranch= NULL;
3943 U32 word;
3944
3945 for ( word=1 ; word <= trie->wordcount ; word++)
3946 {
3947 I32 deltanext=0, minnext=0, f = 0, fake;
3948 struct regnode_charclass_class this_class;
3949
3950 data_fake.flags = 0;
3951 if (data) {
3952 data_fake.whilem_c = data->whilem_c;
3953 data_fake.last_closep = data->last_closep;
3954 }
3955 else
3956 data_fake.last_closep = &fake;
58e23c8d 3957 data_fake.pos_delta = delta;
8aa23a47
YO
3958 if (flags & SCF_DO_STCLASS) {
3959 cl_init(pRExC_state, &this_class);
3960 data_fake.start_class = &this_class;
3961 f = SCF_DO_STCLASS_AND;
3962 }
3963 if (flags & SCF_WHILEM_VISITED_POS)
3964 f |= SCF_WHILEM_VISITED_POS;
3965
3966 if (trie->jump[word]) {
3967 if (!nextbranch)
3968 nextbranch = trie_node + trie->jump[0];
3969 scan= trie_node + trie->jump[word];
3970 /* We go from the jump point to the branch that follows
3971 it. Note this means we need the vestigal unused branches
3972 even though they arent otherwise used.
3973 */
3974 minnext = study_chunk(pRExC_state, &scan, minlenp,
3975 &deltanext, (regnode *)nextbranch, &data_fake,
3976 stopparen, recursed, NULL, f,depth+1);
3977 }
3978 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3979 nextbranch= regnext((regnode*)nextbranch);
3980
3981 if (min1 > (I32)(minnext + trie->minlen))
3982 min1 = minnext + trie->minlen;
3983 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3984 max1 = minnext + deltanext + trie->maxlen;
3985 if (deltanext == I32_MAX)
3986 is_inf = is_inf_internal = 1;
3987
3988 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3989 pars++;
3990 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3991 if ( stopmin > min + min1)
3992 stopmin = min + min1;
3993 flags &= ~SCF_DO_SUBSTR;
3994 if (data)
3995 data->flags |= SCF_SEEN_ACCEPT;
3996 }
3997 if (data) {
3998 if (data_fake.flags & SF_HAS_EVAL)
3999 data->flags |= SF_HAS_EVAL;
4000 data->whilem_c = data_fake.whilem_c;
4001 }
4002 if (flags & SCF_DO_STCLASS)
4003 cl_or(pRExC_state, &accum, &this_class);
4004 }
4005 }
4006 if (flags & SCF_DO_SUBSTR) {
4007 data->pos_min += min1;
4008 data->pos_delta += max1 - min1;
4009 if (max1 != min1 || is_inf)
4010 data->longest = &(data->longest_float);
4011 }
4012 min += min1;
4013 delta += max1 - min1;
4014 if (flags & SCF_DO_STCLASS_OR) {
4015 cl_or(pRExC_state, data->start_class, &accum);
4016 if (min1) {
4017 cl_and(data->start_class, and_withp);
4018 flags &= ~SCF_DO_STCLASS;
4019 }
4020 }
4021 else if (flags & SCF_DO_STCLASS_AND) {
4022 if (min1) {
4023 cl_and(data->start_class, &accum);
4024 flags &= ~SCF_DO_STCLASS;
4025 }
4026 else {
4027 /* Switch to OR mode: cache the old value of
4028 * data->start_class */
4029 INIT_AND_WITHP;
4030 StructCopy(data->start_class, and_withp,
4031 struct regnode_charclass_class);
4032 flags &= ~SCF_DO_STCLASS_AND;
4033 StructCopy(&accum, data->start_class,
4034 struct regnode_charclass_class);
4035 flags |= SCF_DO_STCLASS_OR;
4036 data->start_class->flags |= ANYOF_EOS;
4037 }
4038 }
4039 scan= tail;
4040 continue;
4041 }
786e8c11 4042#else
8aa23a47 4043 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4044 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4045 U8*bang=NULL;
4046
4047 min += trie->minlen;
4048 delta += (trie->maxlen - trie->minlen);
4049 flags &= ~SCF_DO_STCLASS; /* xxx */
4050 if (flags & SCF_DO_SUBSTR) {
304ee84b 4051 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4052 data->pos_min += trie->minlen;
4053 data->pos_delta += (trie->maxlen - trie->minlen);
4054 if (trie->maxlen != trie->minlen)
4055 data->longest = &(data->longest_float);
4056 }
4057 if (trie->jump) /* no more substrings -- for now /grr*/
4058 flags &= ~SCF_DO_SUBSTR;
b515a41d 4059 }
8aa23a47
YO
4060#endif /* old or new */
4061#endif /* TRIE_STUDY_OPT */
e1d1eefb 4062
8aa23a47
YO
4063 /* Else: zero-length, ignore. */
4064 scan = regnext(scan);
4065 }
4066 if (frame) {
4067 last = frame->last;
4068 scan = frame->next;
4069 stopparen = frame->stop;
4070 frame = frame->prev;
4071 goto fake_study_recurse;
c277df42
IZ
4072 }
4073
4074 finish:
8aa23a47 4075 assert(!frame);
304ee84b 4076 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4077
c277df42 4078 *scanp = scan;
aca2d497 4079 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4080 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4081 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4082 if (is_par > (I32)U8_MAX)
c277df42
IZ
4083 is_par = 0;
4084 if (is_par && pars==1 && data) {
4085 data->flags |= SF_IN_PAR;
4086 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4087 }
4088 else if (pars && data) {
c277df42
IZ
4089 data->flags |= SF_HAS_PAR;
4090 data->flags &= ~SF_IN_PAR;
4091 }
653099ff 4092 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4093 cl_and(data->start_class, and_withp);
786e8c11
YO
4094 if (flags & SCF_TRIE_RESTUDY)
4095 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4096
304ee84b 4097 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4098
e2e6a0f1 4099 return min < stopmin ? min : stopmin;
c277df42
IZ
4100}
4101
2eccd3b2
NC
4102STATIC U32
4103S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4104{
4a4e7719
NC
4105 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4106
7918f24d
NC
4107 PERL_ARGS_ASSERT_ADD_DATA;
4108
4a4e7719
NC
4109 Renewc(RExC_rxi->data,
4110 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4111 char, struct reg_data);
4112 if(count)
f8fc2ecf 4113 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4114 else
f8fc2ecf 4115 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4116 RExC_rxi->data->count = count + n;
4117 Copy(s, RExC_rxi->data->what + count, n, U8);
4118 return count;
c277df42
IZ
4119}
4120
f8149455 4121/*XXX: todo make this not included in a non debugging perl */
76234dfb 4122#ifndef PERL_IN_XSUB_RE
d88dccdf 4123void
864dbfa3 4124Perl_reginitcolors(pTHX)
d88dccdf 4125{
97aff369 4126 dVAR;
1df70142 4127 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4128 if (s) {
1df70142
AL
4129 char *t = savepv(s);
4130 int i = 0;
4131 PL_colors[0] = t;
d88dccdf 4132 while (++i < 6) {
1df70142
AL
4133 t = strchr(t, '\t');
4134 if (t) {
4135 *t = '\0';
4136 PL_colors[i] = ++t;
d88dccdf
IZ
4137 }
4138 else
1df70142 4139 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4140 }
4141 } else {
1df70142 4142 int i = 0;
b81d288d 4143 while (i < 6)
06b5626a 4144 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4145 }
4146 PL_colorset = 1;
4147}
76234dfb 4148#endif
8615cb43 4149
07be1b83 4150
786e8c11
YO
4151#ifdef TRIE_STUDY_OPT
4152#define CHECK_RESTUDY_GOTO \
4153 if ( \
4154 (data.flags & SCF_TRIE_RESTUDY) \
4155 && ! restudied++ \
4156 ) goto reStudy
4157#else
4158#define CHECK_RESTUDY_GOTO
4159#endif
f9f4320a 4160
a687059c 4161/*
e50aee73 4162 - pregcomp - compile a regular expression into internal code
a687059c
LW
4163 *
4164 * We can't allocate space until we know how big the compiled form will be,
4165 * but we can't compile it (and thus know how big it is) until we've got a
4166 * place to put the code. So we cheat: we compile it twice, once with code
4167 * generation turned off and size counting turned on, and once "for real".
4168 * This also means that we don't allocate space until we are sure that the
4169 * thing really will compile successfully, and we never have to move the
4170 * code and thus invalidate pointers into it. (Note that it has to be in
4171 * one piece because free() must be able to free it all.) [NB: not true in perl]
4172 *
4173 * Beware that the optimization-preparation code in here knows about some
4174 * of the structure of the compiled regexp. [I'll say.]
4175 */
b9b4dddf
YO
4176
4177
4178
f9f4320a 4179#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4180#define RE_ENGINE_PTR &PL_core_reg_engine
4181#else
f9f4320a
YO
4182extern const struct regexp_engine my_reg_engine;
4183#define RE_ENGINE_PTR &my_reg_engine
4184#endif
6d5c990f
RGS
4185
4186#ifndef PERL_IN_XSUB_RE
3ab4a224 4187REGEXP *
1593ad57 4188Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4189{
97aff369 4190 dVAR;
6d5c990f 4191 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4192
4193 PERL_ARGS_ASSERT_PREGCOMP;
4194
f9f4320a
YO
4195 /* Dispatch a request to compile a regexp to correct
4196 regexp engine. */
f9f4320a
YO
4197 if (table) {
4198 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4199 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4200 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4201 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4202 DEBUG_COMPILE_r({
8d8756e7 4203 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4204 SvIV(*ptr));
4205 });
3ab4a224 4206 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4207 }
b9b4dddf 4208 }
3ab4a224 4209 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4210}
6d5c990f 4211#endif
2a5d9b1d 4212
3ab4a224 4213REGEXP *
1593ad57 4214Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4215{
4216 dVAR;
288b8c02
NC
4217 REGEXP *rx;
4218 struct regexp *r;
f8fc2ecf 4219 register regexp_internal *ri;
3ab4a224 4220 STRLEN plen;
1593ad57 4221 char *exp = SvPV(pattern, plen);
3ab4a224 4222 char* xend = exp + plen;
c277df42 4223 regnode *scan;
a0d0e21e 4224 I32 flags;
a0d0e21e
LW
4225 I32 minlen = 0;
4226 I32 sawplus = 0;
4227 I32 sawopen = 0;
2c2d71f5 4228 scan_data_t data;
830247a4 4229 RExC_state_t RExC_state;
be8e71aa 4230 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4231#ifdef TRIE_STUDY_OPT
4232 int restudied= 0;
4233 RExC_state_t copyRExC_state;
4234#endif
2a5d9b1d 4235 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4236
4237 PERL_ARGS_ASSERT_RE_COMPILE;
4238
6d5c990f 4239 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4240
b9ad30b4 4241 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4242
a3621e74 4243 DEBUG_COMPILE_r({
ab3bbdeb
YO
4244 SV *dsv= sv_newmortal();
4245 RE_PV_QUOTED_DECL(s, RExC_utf8,
3ab4a224 4246 dsv, exp, plen, 60);
ab3bbdeb
YO
4247 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4248 PL_colors[4],PL_colors[5],s);
a5961de5 4249 });
02daf0ab
YO
4250
4251redo_first_pass:
4252 RExC_precomp = exp;
c737faaf 4253 RExC_flags = pm_flags;
830247a4 4254 RExC_sawback = 0;
bbce6d69 4255
830247a4
IZ
4256 RExC_seen = 0;
4257 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4258 RExC_seen_evals = 0;
4259 RExC_extralen = 0;
c277df42 4260
bbce6d69 4261 /* First pass: determine size, legality. */
830247a4 4262 RExC_parse = exp;
fac92740 4263 RExC_start = exp;
830247a4
IZ
4264 RExC_end = xend;
4265 RExC_naughty = 0;
4266 RExC_npar = 1;
e2e6a0f1 4267 RExC_nestroot = 0;
830247a4
IZ
4268 RExC_size = 0L;
4269 RExC_emit = &PL_regdummy;
4270 RExC_whilem_seen = 0;
fc8cd66c 4271 RExC_charnames = NULL;
40d049e4
YO
4272 RExC_open_parens = NULL;
4273 RExC_close_parens = NULL;
4274 RExC_opend = NULL;
81714fb9 4275 RExC_paren_names = NULL;
1f1031fe
YO
4276#ifdef DEBUGGING
4277 RExC_paren_name_list = NULL;
4278#endif
40d049e4
YO
4279 RExC_recurse = NULL;
4280 RExC_recurse_count = 0;
81714fb9 4281
85ddcde9
JH
4282#if 0 /* REGC() is (currently) a NOP at the first pass.
4283 * Clever compilers notice this and complain. --jhi */
830247a4 4284 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4285#endif
3dab1dad
YO
4286 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4287 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4288 RExC_precomp = NULL;
a0d0e21e
LW
4289 return(NULL);
4290 }
02daf0ab 4291 if (RExC_utf8 && !RExC_orig_utf8) {
38a44b82 4292 /* It's possible to write a regexp in ascii that represents Unicode
02daf0ab
YO
4293 codepoints outside of the byte range, such as via \x{100}. If we
4294 detect such a sequence we have to convert the entire pattern to utf8
4295 and then recompile, as our sizing calculation will have been based
4296 on 1 byte == 1 character, but we will need to use utf8 to encode
4297 at least some part of the pattern, and therefore must convert the whole
4298 thing.
4299 XXX: somehow figure out how to make this less expensive...
4300 -- dmq */
3ab4a224 4301 STRLEN len = plen;
02daf0ab
YO
4302 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4303 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4304 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4305 xend = exp + len;
4306 RExC_orig_utf8 = RExC_utf8;
4307 SAVEFREEPV(exp);
4308 goto redo_first_pass;
4309 }
07be1b83 4310 DEBUG_PARSE_r({
81714fb9
YO
4311 PerlIO_printf(Perl_debug_log,
4312 "Required size %"IVdf" nodes\n"
4313 "Starting second pass (creation)\n",
4314 (IV)RExC_size);
07be1b83
YO
4315 RExC_lastnum=0;
4316 RExC_lastparse=NULL;
4317 });
c277df42
IZ
4318 /* Small enough for pointer-storage convention?
4319 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4320 if (RExC_size >= 0x10000L && RExC_extralen)
4321 RExC_size += RExC_extralen;
c277df42 4322 else
830247a4
IZ
4323 RExC_extralen = 0;
4324 if (RExC_whilem_seen > 15)
4325 RExC_whilem_seen = 15;
a0d0e21e 4326
f9f4320a
YO
4327 /* Allocate space and zero-initialize. Note, the two step process
4328 of zeroing when in debug mode, thus anything assigned has to
4329 happen after that */
d2f13c59 4330 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4331 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4332 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4333 char, regexp_internal);
4334 if ( r == NULL || ri == NULL )
b45f050a 4335 FAIL("Regexp out of space");
0f79a09d
GS
4336#ifdef DEBUGGING
4337 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4338 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4339#else
f8fc2ecf
YO
4340 /* bulk initialize base fields with 0. */
4341 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4342#endif
58e23c8d
YO
4343
4344 /* non-zero initialization begins here */
f8fc2ecf 4345 RXi_SET( r, ri );
f9f4320a 4346 r->engine= RE_ENGINE_PTR;
c737faaf 4347 r->extflags = pm_flags;
bcdf7404 4348 {
f7819f85 4349 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bcdf7404
YO
4350 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4351 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4352 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4353 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4354 const char *fptr = STD_PAT_MODS; /*"msix"*/
4355 char *p;
9d17798d 4356 const STRLEN wraplen = plen + has_minus + has_p + has_runon
bcdf7404
YO
4357 + (sizeof(STD_PAT_MODS) - 1)
4358 + (sizeof("(?:)") - 1);
4359
ad64d0ec 4360 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
9d17798d 4361 SvCUR_set(rx, wraplen);
f7c278bf 4362 SvPOK_on(rx);
8f6ae13c 4363 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4364 *p++='('; *p++='?';
f7819f85
A
4365 if (has_p)
4366 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404
YO
4367 {
4368 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4369 char *colon = r + 1;
4370 char ch;
4371
4372 while((ch = *fptr++)) {
4373 if(reganch & 1)
4374 *p++ = ch;
4375 else
4376 *r-- = ch;
4377 reganch >>= 1;
4378 }
4379 if(has_minus) {
4380 *r = '-';
4381 p = colon;
4382 }
4383 }
4384
28d8d7f4 4385 *p++ = ':';
bb661a58 4386 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4387 assert ((RX_WRAPPED(rx) - p) < 16);
4388 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4389 p += plen;
bcdf7404 4390 if (has_runon)
28d8d7f4
YO
4391 *p++ = '\n';
4392 *p++ = ')';
4393 *p = 0;
bcdf7404
YO
4394 }
4395
bbe252da 4396 r->intflags = 0;
830247a4 4397 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4398
6bda09f9 4399 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4400 Newxz(RExC_open_parens, RExC_npar,regnode *);
4401 SAVEFREEPV(RExC_open_parens);
4402 Newxz(RExC_close_parens,RExC_npar,regnode *);
4403 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4404 }
4405
4406 /* Useful during FAIL. */
7122b237
YO
4407#ifdef RE_TRACK_PATTERN_OFFSETS
4408 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4409 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4410 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4411 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4412 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4413#endif
4414 SetProgLen(ri,RExC_size);
288b8c02 4415 RExC_rx_sv = rx;
830247a4 4416 RExC_rx = r;
f8fc2ecf 4417 RExC_rxi = ri;
bbce6d69 4418
4419 /* Second pass: emit code. */
c737faaf 4420 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4421 RExC_parse = exp;
4422 RExC_end = xend;
4423 RExC_naughty = 0;
4424 RExC_npar = 1;
f8fc2ecf
YO
4425 RExC_emit_start = ri->program;
4426 RExC_emit = ri->program;
3b57cd43
YO
4427 RExC_emit_bound = ri->program + RExC_size + 1;
4428
2cd61cdb 4429 /* Store the count of eval-groups for security checks: */
f8149455 4430 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4431 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4432 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4433 ReREFCNT_dec(rx);
a0d0e21e 4434 return(NULL);
80757612 4435 }
07be1b83
YO
4436 /* XXXX To minimize changes to RE engine we always allocate
4437 3-units-long substrs field. */
4438 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4439 if (RExC_recurse_count) {
4440 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4441 SAVEFREEPV(RExC_recurse);
4442 }
a0d0e21e 4443
07be1b83 4444reStudy:
1de06328 4445 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4446 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4447
07be1b83 4448#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4449 if (!restudied) {
4450 StructCopy(&zero_scan_data, &data, scan_data_t);
4451 copyRExC_state = RExC_state;
4452 } else {
5d458dd8 4453 U32 seen=RExC_seen;
07be1b83 4454 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4455
4456 RExC_state = copyRExC_state;
4457 if (seen & REG_TOP_LEVEL_BRANCHES)
4458 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4459 else
4460 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4461 if (data.last_found) {
07be1b83 4462 SvREFCNT_dec(data.longest_fixed);
07be1b83 4463 SvREFCNT_dec(data.longest_float);
07be1b83 4464 SvREFCNT_dec(data.last_found);
1de06328 4465 }
40d049e4 4466 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4467 }
40d049e4
YO
4468#else
4469 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4470#endif
fc8cd66c 4471
a0d0e21e 4472 /* Dig out information for optimizations. */
f7819f85 4473 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4474 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4475
a0ed51b3 4476 if (UTF)
8f6ae13c 4477 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4478 ri->regstclass = NULL;
830247a4 4479 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4480 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4481 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4482
1de06328
YO
4483 /* testing for BRANCH here tells us whether there is "must appear"
4484 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4485 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4486 I32 fake;
c5254dd6 4487 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4488 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4489 int stclass_flag;
07be1b83 4490 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4491 regnode *first= scan;
4492 regnode *first_next= regnext(first);
4493
639081d6
YO
4494 /*
4495 * Skip introductions and multiplicators >= 1
4496 * so that we can extract the 'meat' of the pattern that must
4497 * match in the large if() sequence following.
4498 * NOTE that EXACT is NOT covered here, as it is normally
4499 * picked up by the optimiser separately.
4500 *
4501 * This is unfortunate as the optimiser isnt handling lookahead
4502 * properly currently.
4503 *
4504 */
a0d0e21e 4505 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4506 /* An OR of *one* alternative - should not happen now. */
5339e136 4507 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4508 /* for now we can't handle lookbehind IFMATCH*/
4509 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4510 (OP(first) == PLUS) ||
4511 (OP(first) == MINMOD) ||
653099ff 4512 /* An {n,m} with n>0 */
5339e136
YO
4513 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4514 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4515 {
639081d6
YO
4516 /*
4517 * the only op that could be a regnode is PLUS, all the rest
4518 * will be regnode_1 or regnode_2.
4519 *
4520 */
a0d0e21e
LW
4521 if (OP(first) == PLUS)
4522 sawplus = 1;
4523 else
3dab1dad 4524 first += regarglen[OP(first)];
639081d6
YO
4525
4526 first = NEXTOPER(first);
5339e136 4527 first_next= regnext(first);
a687059c
LW
4528 }
4529
a0d0e21e
LW
4530 /* Starting-point info. */
4531 again:
786e8c11 4532 DEBUG_PEEP("first:",first,0);
07be1b83 4533 /* Ignore EXACT as we deal with it later. */
3dab1dad 4534 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4535 if (OP(first) == EXACT)
6f207bd3 4536 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4537 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4538 ri->regstclass = first;
b3c9acc1 4539 }
07be1b83 4540#ifdef TRIE_STCLASS
786e8c11 4541 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4542 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4543 {
786e8c11 4544 regnode *trie_op;
07be1b83 4545 /* this can happen only on restudy */
786e8c11 4546 if ( OP(first) == TRIE ) {
c944940b 4547 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4548 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4549 StructCopy(first,trieop,struct regnode_1);
4550 trie_op=(regnode *)trieop;
4551 } else {
c944940b 4552 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4553 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4554 StructCopy(first,trieop,struct regnode_charclass);
4555 trie_op=(regnode *)trieop;
4556 }
1de06328 4557 OP(trie_op)+=2;
786e8c11 4558 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4559 ri->regstclass = trie_op;
07be1b83
YO
4560 }
4561#endif
bfed75c6 4562 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4563 ri->regstclass = first;
3dab1dad
YO
4564 else if (PL_regkind[OP(first)] == BOUND ||
4565 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4566 ri->regstclass = first;
3dab1dad 4567 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4568 r->extflags |= (OP(first) == MBOL
4569 ? RXf_ANCH_MBOL
cad2e5aa 4570 : (OP(first) == SBOL
bbe252da
YO
4571 ? RXf_ANCH_SBOL
4572 : RXf_ANCH_BOL));
a0d0e21e 4573 first = NEXTOPER(first);
774d564b 4574 goto again;
4575 }
4576 else if (OP(first) == GPOS) {
bbe252da 4577 r->extflags |= RXf_ANCH_GPOS;
774d564b 4578 first = NEXTOPER(first);
4579 goto again;
a0d0e21e 4580 }
cf2a2b69
YO
4581 else if ((!sawopen || !RExC_sawback) &&
4582 (OP(first) == STAR &&
3dab1dad 4583 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4584 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4585 {
4586 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4587 const int type =
4588 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4589 ? RXf_ANCH_MBOL
4590 : RXf_ANCH_SBOL;
4591 r->extflags |= type;
4592 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4593 first = NEXTOPER(first);
774d564b 4594 goto again;
a0d0e21e 4595 }
b81d288d 4596 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4597 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4598 /* x+ must match at the 1st pos of run of x's */
bbe252da 4599 r->intflags |= PREGf_SKIP;
a0d0e21e 4600
c277df42 4601 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4602#ifdef TRIE_STUDY_OPT
81714fb9 4603 DEBUG_PARSE_r(
be8e71aa
YO
4604 if (!restudied)
4605 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4606 (IV)(first - scan + 1))
4607 );
4608#else
81714fb9 4609 DEBUG_PARSE_r(
be8e71aa
YO
4610 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4611 (IV)(first - scan + 1))
4612 );
4613#endif
4614
4615
a0d0e21e
LW
4616 /*
4617 * If there's something expensive in the r.e., find the
4618 * longest literal string that must appear and make it the
4619 * regmust. Resolve ties in favor of later strings, since
4620 * the regstart check works with the beginning of the r.e.
4621 * and avoiding duplication strengthens checking. Not a
4622 * strong reason, but sufficient in the absence of others.
4623 * [Now we resolve ties in favor of the earlier string if
c277df42 4624 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4625 * earlier string may buy us something the later one won't.]
4626 */
de8c5301 4627
396482e1
GA
4628 data.longest_fixed = newSVpvs("");
4629 data.longest_float = newSVpvs("");
4630 data.last_found = newSVpvs("");
c277df42
IZ
4631 data.longest = &(data.longest_fixed);
4632 first = scan;
f8fc2ecf 4633 if (!ri->regstclass) {
830247a4 4634 cl_init(pRExC_state, &ch_class);
653099ff
GS
4635 data.start_class = &ch_class;
4636 stclass_flag = SCF_DO_STCLASS_AND;
4637 } else /* XXXX Check for BOUND? */
4638 stclass_flag = 0;
cb434fcc 4639 data.last_closep = &last_close;
de8c5301 4640
1de06328 4641 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4642 &data, -1, NULL, NULL,
4643 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4644
07be1b83 4645
786e8c11
YO
4646 CHECK_RESTUDY_GOTO;
4647
4648
830247a4 4649 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4650 && data.last_start_min == 0 && data.last_end > 0
830247a4 4651 && !RExC_seen_zerolen
2bf803e2 4652 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4653 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4654 r->extflags |= RXf_CHECK_ALL;
304ee84b 4655 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4656 SvREFCNT_dec(data.last_found);
4657
1de06328
YO
4658 /* Note that code very similar to this but for anchored string
4659 follows immediately below, changes may need to be made to both.
4660 Be careful.
4661 */
a0ed51b3 4662 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4663 if (longest_float_length
c277df42
IZ
4664 || (data.flags & SF_FL_BEFORE_EOL
4665 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4666 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4667 {
1182767e 4668 I32 t,ml;
cf93c79d 4669
1de06328 4670 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4671 && data.offset_fixed == data.offset_float_min
4672 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4673 goto remove_float; /* As in (a)+. */
4674
1de06328
YO
4675 /* copy the information about the longest float from the reg_scan_data
4676 over to the program. */
33b8afdf
JH
4677 if (SvUTF8(data.longest_float)) {
4678 r->float_utf8 = data.longest_float;
c445ea15 4679 r->float_substr = NULL;
33b8afdf
JH
4680 } else {
4681 r->float_substr = data.longest_float;
c445ea15 4682 r->float_utf8 = NULL;
33b8afdf 4683 }
1de06328
YO
4684 /* float_end_shift is how many chars that must be matched that
4685 follow this item. We calculate it ahead of time as once the
4686 lookbehind offset is added in we lose the ability to correctly
4687 calculate it.*/
4688 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4689 : (I32)longest_float_length;
1de06328
YO
4690 r->float_end_shift = ml - data.offset_float_min
4691 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4692 + data.lookbehind_float;
4693 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4694 r->float_max_offset = data.offset_float_max;
1182767e 4695 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4696 r->float_max_offset -= data.lookbehind_float;
4697
cf93c79d
IZ
4698 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4699 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4700 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4701 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4702 }
4703 else {
aca2d497 4704 remove_float:
c445ea15 4705 r->float_substr = r->float_utf8 = NULL;
c277df42 4706 SvREFCNT_dec(data.longest_float);
c5254dd6 4707 longest_float_length = 0;
a0d0e21e 4708 }
c277df42 4709
1de06328
YO
4710 /* Note that code very similar to this but for floating string
4711 is immediately above, changes may need to be made to both.
4712 Be careful.
4713 */
a0ed51b3 4714 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4715 if (longest_fixed_length
c277df42
IZ
4716 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4717 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4718 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4719 {
1182767e 4720 I32 t,ml;
cf93c79d 4721
1de06328
YO
4722 /* copy the information about the longest fixed
4723 from the reg_scan_data over to the program. */
33b8afdf
JH
4724 if (SvUTF8(data.longest_fixed)) {
4725 r->anchored_utf8 = data.longest_fixed;
c445ea15 4726 r->anchored_substr = NULL;
33b8afdf
JH
4727 } else {
4728 r->anchored_substr = data.longest_fixed;
c445ea15 4729 r->anchored_utf8 = NULL;
33b8afdf 4730 }
1de06328
YO
4731 /* fixed_end_shift is how many chars that must be matched that
4732 follow this item. We calculate it ahead of time as once the
4733 lookbehind offset is added in we lose the ability to correctly
4734 calculate it.*/
4735 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4736 : (I32)longest_fixed_length;
1de06328
YO
4737 r->anchored_end_shift = ml - data.offset_fixed
4738 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4739 + data.lookbehind_fixed;
4740 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4741
cf93c79d
IZ
4742 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4743 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4744 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4745 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4746 }
4747 else {
c445ea15 4748 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4749 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4750 longest_fixed_length = 0;
a0d0e21e 4751 }
f8fc2ecf
YO
4752 if (ri->regstclass
4753 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4754 ri->regstclass = NULL;
33b8afdf
JH
4755 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4756 && stclass_flag
653099ff 4757 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4758 && !cl_is_anything(data.start_class))
4759 {
2eccd3b2 4760 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4761
f8fc2ecf 4762 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4763 struct regnode_charclass_class);
4764 StructCopy(data.start_class,
f8fc2ecf 4765 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4766 struct regnode_charclass_class);
f8fc2ecf 4767 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4768 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4769 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4770 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4771 PerlIO_printf(Perl_debug_log,
a0288114 4772 "synthetic stclass \"%s\".\n",
3f7c398e 4773 SvPVX_const(sv));});
653099ff 4774 }
c277df42
IZ
4775
4776 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4777 if (longest_fixed_length > longest_float_length) {
1de06328 4778 r->check_end_shift = r->anchored_end_shift;
c277df42 4779 r->check_substr = r->anchored_substr;
33b8afdf 4780 r->check_utf8 = r->anchored_utf8;
c277df42 4781 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4782 if (r->extflags & RXf_ANCH_SINGLE)
4783 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4784 }
4785 else {
1de06328 4786 r->check_end_shift = r->float_end_shift;
c277df42 4787 r->check_substr = r->float_substr;
33b8afdf 4788 r->check_utf8 = r->float_utf8;
1de06328
YO
4789 r->check_offset_min = r->float_min_offset;
4790 r->check_offset_max = r->float_max_offset;
a0d0e21e 4791 }
30382c73
IZ
4792 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4793 This should be changed ASAP! */
bbe252da
YO
4794 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4795 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4796 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4797 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4798 }
1de06328
YO
4799 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4800 if ( (STRLEN)minlen < longest_float_length )
4801 minlen= longest_float_length;
4802 if ( (STRLEN)minlen < longest_fixed_length )
4803 minlen= longest_fixed_length;
4804 */
a0ed51b3
LW
4805 }
4806 else {
c277df42
IZ
4807 /* Several toplevels. Best we can is to set minlen. */
4808 I32 fake;
653099ff 4809 struct regnode_charclass_class ch_class;
cb434fcc 4810 I32 last_close = 0;
c277df42 4811
5d458dd8 4812 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4813
f8fc2ecf 4814 scan = ri->program + 1;
830247a4 4815 cl_init(pRExC_state, &ch_class);
653099ff 4816 data.start_class = &ch_class;
cb434fcc 4817 data.last_closep = &last_close;
07be1b83 4818
de8c5301 4819
1de06328 4820 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4821 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4822
786e8c11 4823 CHECK_RESTUDY_GOTO;
07be1b83 4824
33b8afdf 4825 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4826 = r->float_substr = r->float_utf8 = NULL;
653099ff 4827 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4828 && !cl_is_anything(data.start_class))
4829 {
2eccd3b2 4830 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4831
f8fc2ecf 4832 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4833 struct regnode_charclass_class);
4834 StructCopy(data.start_class,
f8fc2ecf 4835 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4836 struct regnode_charclass_class);
f8fc2ecf 4837 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4838 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4839 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4840 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4841 PerlIO_printf(Perl_debug_log,
a0288114 4842 "synthetic stclass \"%s\".\n",
3f7c398e 4843 SvPVX_const(sv));});
653099ff 4844 }
a0d0e21e
LW
4845 }
4846
1de06328
YO
4847 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4848 the "real" pattern. */
cf9788e3
RGS
4849 DEBUG_OPTIMISE_r({
4850 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4851 (IV)minlen, (IV)r->minlen);
cf9788e3 4852 });
de8c5301 4853 r->minlenret = minlen;
1de06328
YO
4854 if (r->minlen < minlen)
4855 r->minlen = minlen;
4856
b81d288d 4857 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4858 r->extflags |= RXf_GPOS_SEEN;
830247a4 4859 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4860 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4861 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4862 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4863 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4864 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4865 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4866 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4867 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4868 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 4869 if (RExC_paren_names)
85fbaab2 4870 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 4871 else
5daac39c 4872 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 4873
7bd1e614 4874#ifdef STUPID_PATTERN_CHECKS
5509d87a 4875 if (RX_PRELEN(rx) == 0)
640f820d 4876 r->extflags |= RXf_NULL;
5509d87a 4877 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
4878 /* XXX: this should happen BEFORE we compile */
4879 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 4880 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 4881 r->extflags |= RXf_WHITE;
5509d87a 4882 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 4883 r->extflags |= RXf_START_ONLY;
f1b875a0 4884#else
5509d87a 4885 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
4886 /* XXX: this should happen BEFORE we compile */
4887 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4888 else {
4889 regnode *first = ri->program + 1;
39aa8307
JH
4890 U8 fop = OP(first);
4891 U8 nop = OP(NEXTOPER(first));
7bd1e614 4892
640f820d
AB
4893 if (PL_regkind[fop] == NOTHING && nop == END)
4894 r->extflags |= RXf_NULL;
4895 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
4896 r->extflags |= RXf_START_ONLY;
4897 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4898 r->extflags |= RXf_WHITE;
4899 }
f1b875a0 4900#endif
1f1031fe
YO
4901#ifdef DEBUGGING
4902 if (RExC_paren_names) {
4903 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4904 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4905 } else
1f1031fe 4906#endif
cde0cee5 4907 ri->name_list_idx = 0;
1f1031fe 4908
40d049e4
YO
4909 if (RExC_recurse_count) {
4910 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4911 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4912 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4913 }
4914 }
f0ab9afb 4915 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
4916 /* assume we don't need to swap parens around before we match */
4917
be8e71aa
YO
4918 DEBUG_DUMP_r({
4919 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4920 regdump(r);
4921 });
7122b237
YO
4922#ifdef RE_TRACK_PATTERN_OFFSETS
4923 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4924 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4925 U32 i;
4926 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4927 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4928 for (i = 1; i <= len; i++) {
7122b237 4929 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4930 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4931 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4932 }
4933 PerlIO_printf(Perl_debug_log, "\n");
4934 });
7122b237 4935#endif
288b8c02 4936 return rx;
a687059c
LW
4937}
4938
f9f4320a 4939#undef RE_ENGINE_PTR
3dab1dad 4940
93b32b6d 4941
81714fb9 4942SV*
192b9cd1
AB
4943Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4944 const U32 flags)
4945{
7918f24d
NC
4946 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4947
192b9cd1
AB
4948 PERL_UNUSED_ARG(value);
4949
f1b875a0 4950 if (flags & RXapif_FETCH) {
192b9cd1 4951 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 4952 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
f1f66076 4953 Perl_croak(aTHX_ "%s", PL_no_modify);
192b9cd1 4954 return NULL;
f1b875a0 4955 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
4956 return reg_named_buff_exists(rx, key, flags)
4957 ? &PL_sv_yes
4958 : &PL_sv_no;
f1b875a0 4959 } else if (flags & RXapif_REGNAMES) {
192b9cd1 4960 return reg_named_buff_all(rx, flags);
f1b875a0 4961 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
4962 return reg_named_buff_scalar(rx, flags);
4963 } else {
4964 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4965 return NULL;
4966 }
4967}
4968
4969SV*
4970Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4971 const U32 flags)
4972{
7918f24d 4973 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
4974 PERL_UNUSED_ARG(lastkey);
4975
f1b875a0 4976 if (flags & RXapif_FIRSTKEY)
192b9cd1 4977 return reg_named_buff_firstkey(rx, flags);
f1b875a0 4978 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
4979 return reg_named_buff_nextkey(rx, flags);
4980 else {
4981 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4982 return NULL;
4983 }
4984}
4985
4986SV*
288b8c02
NC
4987Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4988 const U32 flags)
81714fb9 4989{
44a2ac75
YO
4990 AV *retarray = NULL;
4991 SV *ret;
288b8c02 4992 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
4993
4994 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4995
f1b875a0 4996 if (flags & RXapif_ALL)
44a2ac75 4997 retarray=newAV();
93b32b6d 4998
5daac39c
NC
4999 if (rx && RXp_PAREN_NAMES(rx)) {
5000 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5001 if (he_str) {
5002 IV i;
5003 SV* sv_dat=HeVAL(he_str);
5004 I32 *nums=(I32*)SvPVX(sv_dat);
5005 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5006 if ((I32)(rx->nparens) >= nums[i]
5007 && rx->offs[nums[i]].start != -1
5008 && rx->offs[nums[i]].end != -1)
93b32b6d 5009 {
49d7dfbc 5010 ret = newSVpvs("");
288b8c02 5011 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5012 if (!retarray)
5013 return ret;
5014 } else {
5015 ret = newSVsv(&PL_sv_undef);
5016 }
ec83ea38 5017 if (retarray)
93b32b6d 5018 av_push(retarray, ret);
81714fb9 5019 }
93b32b6d 5020 if (retarray)
ad64d0ec 5021 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5022 }
5023 }
5024 return NULL;
5025}
5026
5027bool
288b8c02 5028Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5029 const U32 flags)
5030{
288b8c02 5031 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5032
5033 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5034
5daac39c 5035 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5036 if (flags & RXapif_ALL) {
5daac39c 5037 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5038 } else {
288b8c02 5039 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5040 if (sv) {
5041 SvREFCNT_dec(sv);
192b9cd1
AB
5042 return TRUE;
5043 } else {
5044 return FALSE;
5045 }
5046 }
5047 } else {
5048 return FALSE;
5049 }
5050}
5051
5052SV*
288b8c02 5053Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5054{
288b8c02 5055 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5056
5057 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5058
5daac39c
NC
5059 if ( rx && RXp_PAREN_NAMES(rx) ) {
5060 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5061
288b8c02 5062 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5063 } else {
5064 return FALSE;
5065 }
192b9cd1
AB
5066}
5067
5068SV*
288b8c02 5069Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5070{
288b8c02 5071 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5072 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5073
5074 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5075
5daac39c
NC
5076 if (rx && RXp_PAREN_NAMES(rx)) {
5077 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5078 HE *temphe;
5079 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5080 IV i;
5081 IV parno = 0;
5082 SV* sv_dat = HeVAL(temphe);
5083 I32 *nums = (I32*)SvPVX(sv_dat);
5084 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5085 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5086 rx->offs[nums[i]].start != -1 &&
5087 rx->offs[nums[i]].end != -1)
5088 {
5089 parno = nums[i];
5090 break;
5091 }
5092 }
f1b875a0 5093 if (parno || flags & RXapif_ALL) {
a663657d 5094 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5095 }
81714fb9
YO
5096 }
5097 }
44a2ac75
YO
5098 return NULL;
5099}
5100
192b9cd1 5101SV*
288b8c02 5102Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5103{
5104 SV *ret;
5105 AV *av;
5106 I32 length;
288b8c02 5107 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5108
7918f24d
NC
5109 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5110
5daac39c 5111 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5112 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5113 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5114 } else if (flags & RXapif_ONE) {
288b8c02 5115 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5116 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5117 length = av_len(av);
ec83ea38 5118 SvREFCNT_dec(ret);
192b9cd1
AB
5119 return newSViv(length + 1);
5120 } else {
5121 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5122 return NULL;
5123 }
5124 }
5125 return &PL_sv_undef;
5126}
5127
5128SV*
288b8c02 5129Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5130{
288b8c02 5131 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5132 AV *av = newAV();
5133
7918f24d
NC
5134 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5135
5daac39c
NC
5136 if (rx && RXp_PAREN_NAMES(rx)) {
5137 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5138 HE *temphe;
5139 (void)hv_iterinit(hv);
5140 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5141 IV i;
5142 IV parno = 0;
5143 SV* sv_dat = HeVAL(temphe);
5144 I32 *nums = (I32*)SvPVX(sv_dat);
5145 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5146 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5147 rx->offs[nums[i]].start != -1 &&
5148 rx->offs[nums[i]].end != -1)
5149 {
5150 parno = nums[i];
5151 break;
5152 }
5153 }
f1b875a0 5154 if (parno || flags & RXapif_ALL) {
a663657d 5155 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5156 }
5157 }
5158 }
5159
ad64d0ec 5160 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5161}
5162
49d7dfbc 5163void
288b8c02
NC
5164Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5165 SV * const sv)
44a2ac75 5166{
288b8c02 5167 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5168 char *s = NULL;
a9d504c3 5169 I32 i = 0;
44a2ac75 5170 I32 s1, t1;
7918f24d
NC
5171
5172 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5173
cde0cee5
YO
5174 if (!rx->subbeg) {
5175 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5176 return;
cde0cee5
YO
5177 }
5178 else
f1b875a0 5179 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5180 /* $` */
f0ab9afb 5181 i = rx->offs[0].start;
cde0cee5 5182 s = rx->subbeg;
44a2ac75
YO
5183 }
5184 else
f1b875a0 5185 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5186 /* $' */
f0ab9afb
NC
5187 s = rx->subbeg + rx->offs[0].end;
5188 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5189 }
5190 else
5191 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5192 (s1 = rx->offs[paren].start) != -1 &&
5193 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5194 {
5195 /* $& $1 ... */
5196 i = t1 - s1;
5197 s = rx->subbeg + s1;
cde0cee5
YO
5198 } else {
5199 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5200 return;
cde0cee5
YO
5201 }
5202 assert(rx->sublen >= (s - rx->subbeg) + i );
5203 if (i >= 0) {
5204 const int oldtainted = PL_tainted;
5205 TAINT_NOT;
5206 sv_setpvn(sv, s, i);
5207 PL_tainted = oldtainted;
5208 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5209 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5210 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5211 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5212 {
5213 SvUTF8_on(sv);
5214 }
5215 else
5216 SvUTF8_off(sv);
5217 if (PL_tainting) {
07bc277f 5218 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5219 if (SvTYPE(sv) >= SVt_PVMG) {
5220 MAGIC* const mg = SvMAGIC(sv);
5221 MAGIC* mgt;
5222 PL_tainted = 1;
5223 SvMAGIC_set(sv, mg->mg_moremagic);
5224 SvTAINT(sv);
5225 if ((mgt = SvMAGIC(sv))) {
5226 mg->mg_moremagic = mgt;
5227 SvMAGIC_set(sv, mg);
44a2ac75 5228 }
cde0cee5
YO
5229 } else {
5230 PL_tainted = 1;
5231 SvTAINT(sv);
5232 }
5233 } else
5234 SvTAINTED_off(sv);
44a2ac75 5235 }
81714fb9 5236 } else {
44a2ac75 5237 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5238 return;
81714fb9
YO
5239 }
5240}
93b32b6d 5241
2fdbfb4d
AB
5242void
5243Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5244 SV const * const value)
5245{
7918f24d
NC
5246 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5247
2fdbfb4d
AB
5248 PERL_UNUSED_ARG(rx);
5249 PERL_UNUSED_ARG(paren);
5250 PERL_UNUSED_ARG(value);
5251
5252 if (!PL_localizing)
f1f66076 5253 Perl_croak(aTHX_ "%s", PL_no_modify);
2fdbfb4d
AB
5254}
5255
5256I32
288b8c02 5257Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5258 const I32 paren)
5259{
288b8c02 5260 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5261 I32 i;
5262 I32 s1, t1;
5263
7918f24d
NC
5264 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5265
2fdbfb4d
AB
5266 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5267 switch (paren) {
192b9cd1 5268 /* $` / ${^PREMATCH} */
f1b875a0 5269 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5270 if (rx->offs[0].start != -1) {
5271 i = rx->offs[0].start;
5272 if (i > 0) {
5273 s1 = 0;
5274 t1 = i;
5275 goto getlen;
5276 }
5277 }
5278 return 0;
192b9cd1 5279 /* $' / ${^POSTMATCH} */
f1b875a0 5280 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5281 if (rx->offs[0].end != -1) {
5282 i = rx->sublen - rx->offs[0].end;
5283 if (i > 0) {
5284 s1 = rx->offs[0].end;
5285 t1 = rx->sublen;
5286 goto getlen;
5287 }
5288 }
5289 return 0;
192b9cd1
AB
5290 /* $& / ${^MATCH}, $1, $2, ... */
5291 default:
2fdbfb4d
AB
5292 if (paren <= (I32)rx->nparens &&
5293 (s1 = rx->offs[paren].start) != -1 &&
5294 (t1 = rx->offs[paren].end) != -1)
5295 {
5296 i = t1 - s1;
5297 goto getlen;
5298 } else {
5299 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5300 report_uninit((const SV *)sv);
2fdbfb4d
AB
5301 return 0;
5302 }
5303 }
5304 getlen:
07bc277f 5305 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5306 const char * const s = rx->subbeg + s1;
5307 const U8 *ep;
5308 STRLEN el;
5309
5310 i = t1 - s1;
5311 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5312 i = el;
5313 }
5314 return i;
5315}
5316
fe578d7f 5317SV*
49d7dfbc 5318Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5319{
7918f24d 5320 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5321 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5322 if (0)
5323 return NULL;
5324 else
5325 return newSVpvs("Regexp");
fe578d7f 5326}
0a4db386 5327
894be9b7 5328/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5329 * If flags is REG_RSN_RETURN_NULL returns null.
5330 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5331 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5332 * to the parsed name as looked up in the RExC_paren_names hash.
5333 * If there is an error throws a vFAIL().. type exception.
894be9b7 5334 */
0a4db386
YO
5335
5336#define REG_RSN_RETURN_NULL 0
5337#define REG_RSN_RETURN_NAME 1
5338#define REG_RSN_RETURN_DATA 2
5339
894be9b7 5340STATIC SV*
7918f24d
NC
5341S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5342{
894be9b7 5343 char *name_start = RExC_parse;
1f1031fe 5344
7918f24d
NC
5345 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5346
1f1031fe
YO
5347 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5348 /* skip IDFIRST by using do...while */
5349 if (UTF)
5350 do {
5351 RExC_parse += UTF8SKIP(RExC_parse);
5352 } while (isALNUM_utf8((U8*)RExC_parse));
5353 else
5354 do {
5355 RExC_parse++;
5356 } while (isALNUM(*RExC_parse));
894be9b7 5357 }
1f1031fe 5358
0a4db386 5359 if ( flags ) {
59cd0e26
NC
5360 SV* sv_name
5361 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5362 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5363 if ( flags == REG_RSN_RETURN_NAME)
5364 return sv_name;
5365 else if (flags==REG_RSN_RETURN_DATA) {
5366 HE *he_str = NULL;
5367 SV *sv_dat = NULL;
5368 if ( ! sv_name ) /* should not happen*/
5369 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5370 if (RExC_paren_names)
5371 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5372 if ( he_str )
5373 sv_dat = HeVAL(he_str);
5374 if ( ! sv_dat )
5375 vFAIL("Reference to nonexistent named group");
5376 return sv_dat;
5377 }
5378 else {
5379 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5380 }
5381 /* NOT REACHED */
894be9b7 5382 }
0a4db386 5383 return NULL;
894be9b7
YO
5384}
5385
3dab1dad
YO
5386#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5387 int rem=(int)(RExC_end - RExC_parse); \
5388 int cut; \
5389 int num; \
5390 int iscut=0; \
5391 if (rem>10) { \
5392 rem=10; \
5393 iscut=1; \
5394 } \
5395 cut=10-rem; \
5396 if (RExC_lastparse!=RExC_parse) \
5397 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5398 rem, RExC_parse, \
5399 cut + 4, \
5400 iscut ? "..." : "<" \
5401 ); \
5402 else \
5403 PerlIO_printf(Perl_debug_log,"%16s",""); \
5404 \
5405 if (SIZE_ONLY) \
3b57cd43 5406 num = RExC_size + 1; \
3dab1dad
YO
5407 else \
5408 num=REG_NODE_NUM(RExC_emit); \
5409 if (RExC_lastnum!=num) \
0a4db386 5410 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5411 else \
0a4db386 5412 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5413 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5414 (int)((depth*2)), "", \
3dab1dad
YO
5415 (funcname) \
5416 ); \
5417 RExC_lastnum=num; \
5418 RExC_lastparse=RExC_parse; \
5419})
5420
07be1b83
YO
5421
5422
3dab1dad
YO
5423#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5424 DEBUG_PARSE_MSG((funcname)); \
5425 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5426})
6bda09f9
YO
5427#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5428 DEBUG_PARSE_MSG((funcname)); \
5429 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5430})
a687059c
LW
5431/*
5432 - reg - regular expression, i.e. main body or parenthesized thing
5433 *
5434 * Caller must absorb opening parenthesis.
5435 *
5436 * Combining parenthesis handling with the base level of regular expression
5437 * is a trifle forced, but the need to tie the tails of the branches to what
5438 * follows makes it hard to avoid.
5439 */
07be1b83
YO
5440#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5441#ifdef DEBUGGING
5442#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5443#else
5444#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5445#endif
3dab1dad 5446
76e3520e 5447STATIC regnode *
3dab1dad 5448S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5449 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5450{
27da23d5 5451 dVAR;
c277df42
IZ
5452 register regnode *ret; /* Will be the head of the group. */
5453 register regnode *br;
5454 register regnode *lastbr;
cbbf8932 5455 register regnode *ender = NULL;
a0d0e21e 5456 register I32 parno = 0;
cbbf8932 5457 I32 flags;
f7819f85 5458 U32 oregflags = RExC_flags;
6136c704
AL
5459 bool have_branch = 0;
5460 bool is_open = 0;
594d7033
YO
5461 I32 freeze_paren = 0;
5462 I32 after_freeze = 0;
9d1d55b5
JP
5463
5464 /* for (?g), (?gc), and (?o) warnings; warning
5465 about (?c) will warn about (?g) -- japhy */
5466
6136c704
AL
5467#define WASTED_O 0x01
5468#define WASTED_G 0x02
5469#define WASTED_C 0x04
5470#define WASTED_GC (0x02|0x04)
cbbf8932 5471 I32 wastedflags = 0x00;
9d1d55b5 5472
fac92740 5473 char * parse_start = RExC_parse; /* MJD */
a28509cc 5474 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5475
3dab1dad 5476 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5477
5478 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5479 DEBUG_PARSE("reg ");
5480
821b33a5 5481 *flagp = 0; /* Tentatively. */
a0d0e21e 5482
9d1d55b5 5483
a0d0e21e
LW
5484 /* Make an OPEN node, if parenthesized. */
5485 if (paren) {
e2e6a0f1
YO
5486 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5487 char *start_verb = RExC_parse;
5488 STRLEN verb_len = 0;
5489 char *start_arg = NULL;
5490 unsigned char op = 0;
5491 int argok = 1;
5492 int internal_argval = 0; /* internal_argval is only useful if !argok */
5493 while ( *RExC_parse && *RExC_parse != ')' ) {
5494 if ( *RExC_parse == ':' ) {
5495 start_arg = RExC_parse + 1;
5496 break;
5497 }
5498 RExC_parse++;
5499 }
5500 ++start_verb;
5501 verb_len = RExC_parse - start_verb;
5502 if ( start_arg ) {
5503 RExC_parse++;
5504 while ( *RExC_parse && *RExC_parse != ')' )
5505 RExC_parse++;
5506 if ( *RExC_parse != ')' )
5507 vFAIL("Unterminated verb pattern argument");
5508 if ( RExC_parse == start_arg )
5509 start_arg = NULL;
5510 } else {
5511 if ( *RExC_parse != ')' )
5512 vFAIL("Unterminated verb pattern");
5513 }
5d458dd8 5514
e2e6a0f1
YO
5515 switch ( *start_verb ) {
5516 case 'A': /* (*ACCEPT) */
568a785a 5517 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5518 op = ACCEPT;
5519 internal_argval = RExC_nestroot;
5520 }
5521 break;
5522 case 'C': /* (*COMMIT) */
568a785a 5523 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5524 op = COMMIT;
e2e6a0f1
YO
5525 break;
5526 case 'F': /* (*FAIL) */
568a785a 5527 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5528 op = OPFAIL;
5529 argok = 0;
5530 }
5531 break;
5d458dd8
YO
5532 case ':': /* (*:NAME) */
5533 case 'M': /* (*MARK:NAME) */
568a785a 5534 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5535 op = MARKPOINT;
5d458dd8
YO
5536 argok = -1;
5537 }
5538 break;
5539 case 'P': /* (*PRUNE) */
568a785a 5540 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5541 op = PRUNE;
e2e6a0f1 5542 break;
5d458dd8 5543 case 'S': /* (*SKIP) */
568a785a 5544 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5545 op = SKIP;
5546 break;
5547 case 'T': /* (*THEN) */
5548 /* [19:06] <TimToady> :: is then */
568a785a 5549 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5550 op = CUTGROUP;
5551 RExC_seen |= REG_SEEN_CUTGROUP;
5552 }
e2e6a0f1
YO
5553 break;
5554 }
5555 if ( ! op ) {
5556 RExC_parse++;
5557 vFAIL3("Unknown verb pattern '%.*s'",
5558 verb_len, start_verb);
5559 }
5560 if ( argok ) {
5561 if ( start_arg && internal_argval ) {
5562 vFAIL3("Verb pattern '%.*s' may not have an argument",
5563 verb_len, start_verb);
5564 } else if ( argok < 0 && !start_arg ) {
5565 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5566 verb_len, start_verb);
5567 } else {
5568 ret = reganode(pRExC_state, op, internal_argval);
5569 if ( ! internal_argval && ! SIZE_ONLY ) {
5570 if (start_arg) {
5571 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5572 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5573 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5574 ret->flags = 0;
5575 } else {
5576 ret->flags = 1;
5577 }
5578 }
5579 }
5580 if (!internal_argval)
5581 RExC_seen |= REG_SEEN_VERBARG;
5582 } else if ( start_arg ) {
5583 vFAIL3("Verb pattern '%.*s' may not have an argument",
5584 verb_len, start_verb);
5585 } else {
5586 ret = reg_node(pRExC_state, op);
5587 }
5588 nextchar(pRExC_state);
5589 return ret;
5590 } else
fac92740 5591 if (*RExC_parse == '?') { /* (?...) */
6136c704 5592 bool is_logical = 0;
a28509cc 5593 const char * const seqstart = RExC_parse;
ca9dfc88 5594
830247a4
IZ
5595 RExC_parse++;
5596 paren = *RExC_parse++;
c277df42 5597 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5598 switch (paren) {
894be9b7 5599
1f1031fe
YO
5600 case 'P': /* (?P...) variants for those used to PCRE/Python */
5601 paren = *RExC_parse++;
5602 if ( paren == '<') /* (?P<...>) named capture */
5603 goto named_capture;
5604 else if (paren == '>') { /* (?P>name) named recursion */
5605 goto named_recursion;
5606 }
5607 else if (paren == '=') { /* (?P=...) named backref */
5608 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5609 you change this make sure you change that */
5610 char* name_start = RExC_parse;
5611 U32 num = 0;
5612 SV *sv_dat = reg_scan_name(pRExC_state,
5613 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5614 if (RExC_parse == name_start || *RExC_parse != ')')
5615 vFAIL2("Sequence %.3s... not terminated",parse_start);
5616
5617 if (!SIZE_ONLY) {
5618 num = add_data( pRExC_state, 1, "S" );
5619 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5620 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5621 }
5622 RExC_sawback = 1;
5623 ret = reganode(pRExC_state,
5624 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5625 num);
5626 *flagp |= HASWIDTH;
5627
5628 Set_Node_Offset(ret, parse_start+1);
5629 Set_Node_Cur_Length(ret); /* MJD */
5630
5631 nextchar(pRExC_state);
5632 return ret;
5633 }
57b84237
YO
5634 RExC_parse++;
5635 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5636 /*NOTREACHED*/
5637 case '<': /* (?<...) */
b81d288d 5638 if (*RExC_parse == '!')
c277df42 5639 paren = ',';
0a4db386 5640 else if (*RExC_parse != '=')
1f1031fe 5641 named_capture:
0a4db386 5642 { /* (?<...>) */
81714fb9 5643 char *name_start;
894be9b7 5644 SV *svname;
81714fb9
YO
5645 paren= '>';
5646 case '\'': /* (?'...') */
5647 name_start= RExC_parse;
0a4db386
YO
5648 svname = reg_scan_name(pRExC_state,
5649 SIZE_ONLY ? /* reverse test from the others */
5650 REG_RSN_RETURN_NAME :
5651 REG_RSN_RETURN_NULL);
57b84237
YO
5652 if (RExC_parse == name_start) {
5653 RExC_parse++;
5654 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5655 /*NOTREACHED*/
5656 }
81714fb9
YO
5657 if (*RExC_parse != paren)
5658 vFAIL2("Sequence (?%c... not terminated",
5659 paren=='>' ? '<' : paren);
5660 if (SIZE_ONLY) {
e62cc96a
YO
5661 HE *he_str;
5662 SV *sv_dat = NULL;
894be9b7
YO
5663 if (!svname) /* shouldnt happen */
5664 Perl_croak(aTHX_
5665 "panic: reg_scan_name returned NULL");
81714fb9
YO
5666 if (!RExC_paren_names) {
5667 RExC_paren_names= newHV();
ad64d0ec 5668 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5669#ifdef DEBUGGING
5670 RExC_paren_name_list= newAV();
ad64d0ec 5671 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5672#endif
81714fb9
YO
5673 }
5674 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5675 if ( he_str )
81714fb9 5676 sv_dat = HeVAL(he_str);
e62cc96a 5677 if ( ! sv_dat ) {
81714fb9 5678 /* croak baby croak */
e62cc96a
YO
5679 Perl_croak(aTHX_
5680 "panic: paren_name hash element allocation failed");
5681 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5682 /* (?|...) can mean we have dupes so scan to check
5683 its already been stored. Maybe a flag indicating
5684 we are inside such a construct would be useful,
5685 but the arrays are likely to be quite small, so
5686 for now we punt -- dmq */
5687 IV count = SvIV(sv_dat);
5688 I32 *pv = (I32*)SvPVX(sv_dat);
5689 IV i;
5690 for ( i = 0 ; i < count ; i++ ) {
5691 if ( pv[i] == RExC_npar ) {
5692 count = 0;
5693 break;
5694 }
5695 }
5696 if ( count ) {
5697 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5698 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5699 pv[count] = RExC_npar;
3a92e6ae 5700 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5701 }
81714fb9
YO
5702 } else {
5703 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5704 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5705 SvIOK_on(sv_dat);
3ec35e0f 5706 SvIV_set(sv_dat, 1);
e62cc96a 5707 }
1f1031fe
YO
5708#ifdef DEBUGGING
5709 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5710 SvREFCNT_dec(svname);
5711#endif
e62cc96a 5712
81714fb9
YO
5713 /*sv_dump(sv_dat);*/
5714 }
5715 nextchar(pRExC_state);
5716 paren = 1;
5717 goto capturing_parens;
5718 }
5719 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5720 RExC_parse++;
fac92740 5721 case '=': /* (?=...) */
89c6a13e
AB
5722 RExC_seen_zerolen++;
5723 break;
fac92740 5724 case '!': /* (?!...) */
830247a4 5725 RExC_seen_zerolen++;
e2e6a0f1
YO
5726 if (*RExC_parse == ')') {
5727 ret=reg_node(pRExC_state, OPFAIL);
5728 nextchar(pRExC_state);
5729 return ret;
5730 }
594d7033
YO
5731 break;
5732 case '|': /* (?|...) */
5733 /* branch reset, behave like a (?:...) except that
5734 buffers in alternations share the same numbers */
5735 paren = ':';
5736 after_freeze = freeze_paren = RExC_npar;
5737 break;
fac92740
MJD
5738 case ':': /* (?:...) */
5739 case '>': /* (?>...) */
a0d0e21e 5740 break;
fac92740
MJD
5741 case '$': /* (?$...) */
5742 case '@': /* (?@...) */
8615cb43 5743 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5744 break;
fac92740 5745 case '#': /* (?#...) */
830247a4
IZ
5746 while (*RExC_parse && *RExC_parse != ')')
5747 RExC_parse++;
5748 if (*RExC_parse != ')')
c277df42 5749 FAIL("Sequence (?#... not terminated");
830247a4 5750 nextchar(pRExC_state);
a0d0e21e
LW
5751 *flagp = TRYAGAIN;
5752 return NULL;
894be9b7
YO
5753 case '0' : /* (?0) */
5754 case 'R' : /* (?R) */
5755 if (*RExC_parse != ')')
6bda09f9 5756 FAIL("Sequence (?R) not terminated");
1a147d38 5757 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5758 *flagp |= POSTPONED;
7f69552c
YO
5759 nextchar(pRExC_state);
5760 return ret;
5761 /*notreached*/
894be9b7
YO
5762 { /* named and numeric backreferences */
5763 I32 num;
894be9b7
YO
5764 case '&': /* (?&NAME) */
5765 parse_start = RExC_parse - 1;
1f1031fe 5766 named_recursion:
894be9b7 5767 {
0a4db386
YO
5768 SV *sv_dat = reg_scan_name(pRExC_state,
5769 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5770 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5771 }
5772 goto gen_recurse_regop;
5773 /* NOT REACHED */
542fa716
YO
5774 case '+':
5775 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5776 RExC_parse++;
5777 vFAIL("Illegal pattern");
5778 }
5779 goto parse_recursion;
5780 /* NOT REACHED*/
5781 case '-': /* (?-1) */
5782 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5783 RExC_parse--; /* rewind to let it be handled later */
5784 goto parse_flags;
5785 }
5786 /*FALLTHROUGH */
6bda09f9
YO
5787 case '1': case '2': case '3': case '4': /* (?1) */
5788 case '5': case '6': case '7': case '8': case '9':
5789 RExC_parse--;
542fa716 5790 parse_recursion:
894be9b7
YO
5791 num = atoi(RExC_parse);
5792 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5793 if (*RExC_parse == '-')
5794 RExC_parse++;
6bda09f9
YO
5795 while (isDIGIT(*RExC_parse))
5796 RExC_parse++;
5797 if (*RExC_parse!=')')
5798 vFAIL("Expecting close bracket");
894be9b7
YO
5799
5800 gen_recurse_regop:
542fa716
YO
5801 if ( paren == '-' ) {
5802 /*
5803 Diagram of capture buffer numbering.
5804 Top line is the normal capture buffer numbers
5805 Botton line is the negative indexing as from
5806 the X (the (?-2))
5807
5808 + 1 2 3 4 5 X 6 7
5809 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5810 - 5 4 3 2 1 X x x
5811
5812 */
5813 num = RExC_npar + num;
5814 if (num < 1) {
5815 RExC_parse++;
5816 vFAIL("Reference to nonexistent group");
5817 }
5818 } else if ( paren == '+' ) {
5819 num = RExC_npar + num - 1;
5820 }
5821
1a147d38 5822 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5823 if (!SIZE_ONLY) {
5824 if (num > (I32)RExC_rx->nparens) {
5825 RExC_parse++;
5826 vFAIL("Reference to nonexistent group");
5827 }
40d049e4 5828 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5829 RExC_emit++;
226de585 5830 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5831 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5832 } else {
6bda09f9 5833 RExC_size++;
6bda09f9 5834 }
0a4db386 5835 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5836 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5837 Set_Node_Offset(ret, parse_start); /* MJD */
5838
a3b492c3 5839 *flagp |= POSTPONED;
6bda09f9
YO
5840 nextchar(pRExC_state);
5841 return ret;
894be9b7
YO
5842 } /* named and numeric backreferences */
5843 /* NOT REACHED */
5844
fac92740 5845 case '?': /* (??...) */
6136c704 5846 is_logical = 1;
57b84237
YO
5847 if (*RExC_parse != '{') {
5848 RExC_parse++;
5849 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5850 /*NOTREACHED*/
5851 }
a3b492c3 5852 *flagp |= POSTPONED;
830247a4 5853 paren = *RExC_parse++;
0f5d15d6 5854 /* FALL THROUGH */
fac92740 5855 case '{': /* (?{...}) */
c277df42 5856 {
2eccd3b2
NC
5857 I32 count = 1;
5858 U32 n = 0;
c277df42 5859 char c;
830247a4 5860 char *s = RExC_parse;
c277df42 5861
830247a4
IZ
5862 RExC_seen_zerolen++;
5863 RExC_seen |= REG_SEEN_EVAL;
5864 while (count && (c = *RExC_parse)) {
6136c704
AL
5865 if (c == '\\') {
5866 if (RExC_parse[1])
5867 RExC_parse++;
5868 }
b81d288d 5869 else if (c == '{')
c277df42 5870 count++;
b81d288d 5871 else if (c == '}')
c277df42 5872 count--;
830247a4 5873 RExC_parse++;
c277df42 5874 }
6136c704 5875 if (*RExC_parse != ')') {
b81d288d 5876 RExC_parse = s;
b45f050a
JF
5877 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5878 }
c277df42 5879 if (!SIZE_ONLY) {
f3548bdc 5880 PAD *pad;
6136c704
AL
5881 OP_4tree *sop, *rop;
5882 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5883
569233ed
SB
5884 ENTER;
5885 Perl_save_re_context(aTHX);
f3548bdc 5886 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5887 sop->op_private |= OPpREFCOUNTED;
5888 /* re_dup will OpREFCNT_inc */
5889 OpREFCNT_set(sop, 1);
569233ed 5890 LEAVE;
c277df42 5891
830247a4 5892 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5893 RExC_rxi->data->data[n] = (void*)rop;
5894 RExC_rxi->data->data[n+1] = (void*)sop;
5895 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5896 SvREFCNT_dec(sv);
a0ed51b3 5897 }
e24b16f9 5898 else { /* First pass */
830247a4 5899 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5900 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5901 /* No compiled RE interpolated, has runtime
5902 components ===> unsafe. */
5903 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5904 if (PL_tainting && PL_tainted)
cc6b7395 5905 FAIL("Eval-group in insecure regular expression");
54df2634 5906#if PERL_VERSION > 8
923e4eb5 5907 if (IN_PERL_COMPILETIME)
b5c19bd7 5908 PL_cv_has_eval = 1;
54df2634 5909#endif
c277df42 5910 }
b5c19bd7 5911
830247a4 5912 nextchar(pRExC_state);
6136c704 5913 if (is_logical) {
830247a4 5914 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5915 if (!SIZE_ONLY)
5916 ret->flags = 2;
3dab1dad 5917 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5918 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5919 return ret;
5920 }
ccb2c380
MP
5921 ret = reganode(pRExC_state, EVAL, n);
5922 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5923 Set_Node_Offset(ret, parse_start);
5924 return ret;
c277df42 5925 }
fac92740 5926 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5927 {
0a4db386 5928 int is_define= 0;
fac92740 5929 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5930 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5931 || RExC_parse[1] == '<'
830247a4 5932 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5933 I32 flag;
5934
830247a4 5935 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5936 if (!SIZE_ONLY)
5937 ret->flags = 1;
3dab1dad 5938 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5939 goto insert_if;
b81d288d 5940 }
a0ed51b3 5941 }
0a4db386
YO
5942 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5943 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5944 {
5945 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5946 char *name_start= RExC_parse++;
2eccd3b2 5947 U32 num = 0;
0a4db386
YO
5948 SV *sv_dat=reg_scan_name(pRExC_state,
5949 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5950 if (RExC_parse == name_start || *RExC_parse != ch)
5951 vFAIL2("Sequence (?(%c... not terminated",
5952 (ch == '>' ? '<' : ch));
5953 RExC_parse++;
5954 if (!SIZE_ONLY) {
5955 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5956 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5957 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
5958 }
5959 ret = reganode(pRExC_state,NGROUPP,num);
5960 goto insert_if_check_paren;
5961 }
5962 else if (RExC_parse[0] == 'D' &&
5963 RExC_parse[1] == 'E' &&
5964 RExC_parse[2] == 'F' &&
5965 RExC_parse[3] == 'I' &&
5966 RExC_parse[4] == 'N' &&
5967 RExC_parse[5] == 'E')
5968 {
5969 ret = reganode(pRExC_state,DEFINEP,0);
5970 RExC_parse +=6 ;
5971 is_define = 1;
5972 goto insert_if_check_paren;
5973 }
5974 else if (RExC_parse[0] == 'R') {
5975 RExC_parse++;
5976 parno = 0;
5977 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5978 parno = atoi(RExC_parse++);
5979 while (isDIGIT(*RExC_parse))
5980 RExC_parse++;
5981 } else if (RExC_parse[0] == '&') {
5982 SV *sv_dat;
5983 RExC_parse++;
5984 sv_dat = reg_scan_name(pRExC_state,
5985 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5986 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5987 }
1a147d38 5988 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5989 goto insert_if_check_paren;
5990 }
830247a4 5991 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5992 /* (?(1)...) */
6136c704 5993 char c;
830247a4 5994 parno = atoi(RExC_parse++);
c277df42 5995
830247a4
IZ
5996 while (isDIGIT(*RExC_parse))
5997 RExC_parse++;
fac92740 5998 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5999
0a4db386 6000 insert_if_check_paren:
830247a4 6001 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6002 vFAIL("Switch condition not recognized");
c277df42 6003 insert_if:
3dab1dad
YO
6004 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6005 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6006 if (br == NULL)
830247a4 6007 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6008 else
3dab1dad 6009 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6010 c = *nextchar(pRExC_state);
d1b80229
IZ
6011 if (flags&HASWIDTH)
6012 *flagp |= HASWIDTH;
c277df42 6013 if (c == '|') {
0a4db386
YO
6014 if (is_define)
6015 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6016 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6017 regbranch(pRExC_state, &flags, 1,depth+1);
6018 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6019 if (flags&HASWIDTH)
6020 *flagp |= HASWIDTH;
830247a4 6021 c = *nextchar(pRExC_state);
a0ed51b3
LW
6022 }
6023 else
c277df42
IZ
6024 lastbr = NULL;
6025 if (c != ')')
8615cb43 6026 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6027 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6028 REGTAIL(pRExC_state, br, ender);
c277df42 6029 if (lastbr) {
3dab1dad
YO
6030 REGTAIL(pRExC_state, lastbr, ender);
6031 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6032 }
6033 else
3dab1dad 6034 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6035 RExC_size++; /* XXX WHY do we need this?!!
6036 For large programs it seems to be required
6037 but I can't figure out why. -- dmq*/
c277df42 6038 return ret;
a0ed51b3
LW
6039 }
6040 else {
830247a4 6041 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6042 }
6043 }
1b1626e4 6044 case 0:
830247a4 6045 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6046 vFAIL("Sequence (? incomplete");
1b1626e4 6047 break;
a0d0e21e 6048 default:
cde0cee5
YO
6049 --RExC_parse;
6050 parse_flags: /* (?i) */
6051 {
6052 U32 posflags = 0, negflags = 0;
6053 U32 *flagsp = &posflags;
6054
6055 while (*RExC_parse) {
6056 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6057 /* (?g), (?gc) and (?o) are useless here
6058 and must be globally applied -- japhy */
cde0cee5
YO
6059 switch (*RExC_parse) {
6060 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
f7819f85
A
6061 case ONCE_PAT_MOD: /* 'o' */
6062 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6063 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6064 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6065 if (! (wastedflags & wflagbit) ) {
6066 wastedflags |= wflagbit;
6067 vWARN5(
6068 RExC_parse + 1,
6069 "Useless (%s%c) - %suse /%c modifier",
6070 flagsp == &negflags ? "?-" : "?",
6071 *RExC_parse,
6072 flagsp == &negflags ? "don't " : "",
6073 *RExC_parse
6074 );
6075 }
6076 }
cde0cee5
YO
6077 break;
6078
f7819f85 6079 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6080 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6081 if (! (wastedflags & WASTED_C) ) {
6082 wastedflags |= WASTED_GC;
9d1d55b5
JP
6083 vWARN3(
6084 RExC_parse + 1,
6085 "Useless (%sc) - %suse /gc modifier",
6086 flagsp == &negflags ? "?-" : "?",
6087 flagsp == &negflags ? "don't " : ""
6088 );
6089 }
6090 }
cde0cee5 6091 break;
f7819f85 6092 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6093 if (flagsp == &negflags) {
668c081a
NC
6094 if (SIZE_ONLY)
6095 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6096 } else {
6097 *flagsp |= RXf_PMf_KEEPCOPY;
6098 }
6099 break;
6100 case '-':
57b84237
YO
6101 if (flagsp == &negflags) {
6102 RExC_parse++;
6103 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6104 /*NOTREACHED*/
6105 }
cde0cee5
YO
6106 flagsp = &negflags;
6107 wastedflags = 0; /* reset so (?g-c) warns twice */
6108 break;
6109 case ':':
6110 paren = ':';
6111 /*FALLTHROUGH*/
6112 case ')':
6113 RExC_flags |= posflags;
6114 RExC_flags &= ~negflags;
f7819f85
A
6115 if (paren != ':') {
6116 oregflags |= posflags;
6117 oregflags &= ~negflags;
6118 }
cde0cee5
YO
6119 nextchar(pRExC_state);
6120 if (paren != ':') {
6121 *flagp = TRYAGAIN;
6122 return NULL;
6123 } else {
6124 ret = NULL;
6125 goto parse_rest;
6126 }
6127 /*NOTREACHED*/
6128 default:
cde0cee5
YO
6129 RExC_parse++;
6130 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6131 /*NOTREACHED*/
6132 }
830247a4 6133 ++RExC_parse;
48c036b1 6134 }
cde0cee5 6135 }} /* one for the default block, one for the switch */
a0d0e21e 6136 }
fac92740 6137 else { /* (...) */
81714fb9 6138 capturing_parens:
830247a4
IZ
6139 parno = RExC_npar;
6140 RExC_npar++;
e2e6a0f1 6141
830247a4 6142 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6143 if (!SIZE_ONLY ){
6144 if (!RExC_nestroot)
6145 RExC_nestroot = parno;
c009da3d
YO
6146 if (RExC_seen & REG_SEEN_RECURSE
6147 && !RExC_open_parens[parno-1])
6148 {
e2e6a0f1 6149 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6150 "Setting open paren #%"IVdf" to %d\n",
6151 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6152 RExC_open_parens[parno-1]= ret;
6153 }
6bda09f9 6154 }
fac92740
MJD
6155 Set_Node_Length(ret, 1); /* MJD */
6156 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6157 is_open = 1;
a0d0e21e 6158 }
a0ed51b3 6159 }
fac92740 6160 else /* ! paren */
a0d0e21e 6161 ret = NULL;
cde0cee5
YO
6162
6163 parse_rest:
a0d0e21e 6164 /* Pick up the branches, linking them together. */
fac92740 6165 parse_start = RExC_parse; /* MJD */
3dab1dad 6166 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6167
6168 if (freeze_paren) {
6169 if (RExC_npar > after_freeze)
6170 after_freeze = RExC_npar;
6171 RExC_npar = freeze_paren;
6172 }
6173
fac92740 6174 /* branch_len = (paren != 0); */
2af232bd 6175
a0d0e21e
LW
6176 if (br == NULL)
6177 return(NULL);
830247a4
IZ
6178 if (*RExC_parse == '|') {
6179 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6180 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6181 }
fac92740 6182 else { /* MJD */
6bda09f9 6183 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6184 Set_Node_Length(br, paren != 0);
6185 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6186 }
c277df42
IZ
6187 have_branch = 1;
6188 if (SIZE_ONLY)
830247a4 6189 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6190 }
6191 else if (paren == ':') {
c277df42
IZ
6192 *flagp |= flags&SIMPLE;
6193 }
6136c704 6194 if (is_open) { /* Starts with OPEN. */
3dab1dad 6195 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6196 }
6197 else if (paren != '?') /* Not Conditional */
a0d0e21e 6198 ret = br;
8ae10a67 6199 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6200 lastbr = br;
830247a4
IZ
6201 while (*RExC_parse == '|') {
6202 if (!SIZE_ONLY && RExC_extralen) {
6203 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6204 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6205 }
6206 if (SIZE_ONLY)
830247a4
IZ
6207 RExC_extralen += 2; /* Account for LONGJMP. */
6208 nextchar(pRExC_state);
594d7033
YO
6209 if (freeze_paren) {
6210 if (RExC_npar > after_freeze)
6211 after_freeze = RExC_npar;
6212 RExC_npar = freeze_paren;
6213 }
3dab1dad 6214 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6215
a687059c 6216 if (br == NULL)
a0d0e21e 6217 return(NULL);
3dab1dad 6218 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6219 lastbr = br;
8ae10a67 6220 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6221 }
6222
c277df42
IZ
6223 if (have_branch || paren != ':') {
6224 /* Make a closing node, and hook it on the end. */
6225 switch (paren) {
6226 case ':':
830247a4 6227 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6228 break;
6229 case 1:
830247a4 6230 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6231 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6232 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6233 "Setting close paren #%"IVdf" to %d\n",
6234 (IV)parno, REG_NODE_NUM(ender)));
6235 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6236 if (RExC_nestroot == parno)
6237 RExC_nestroot = 0;
40d049e4 6238 }
fac92740
MJD
6239 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6240 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6241 break;
6242 case '<':
c277df42
IZ
6243 case ',':
6244 case '=':
6245 case '!':
c277df42 6246 *flagp &= ~HASWIDTH;
821b33a5
IZ
6247 /* FALL THROUGH */
6248 case '>':
830247a4 6249 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6250 break;
6251 case 0:
830247a4 6252 ender = reg_node(pRExC_state, END);
40d049e4
YO
6253 if (!SIZE_ONLY) {
6254 assert(!RExC_opend); /* there can only be one! */
6255 RExC_opend = ender;
6256 }
c277df42
IZ
6257 break;
6258 }
eaf3ca90 6259 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6260
9674d46a 6261 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6262 if (depth==1)
6263 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6264
c277df42 6265 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6266 for (br = ret; br; br = regnext(br)) {
6267 const U8 op = PL_regkind[OP(br)];
6268 if (op == BRANCH) {
07be1b83 6269 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6270 }
6271 else if (op == BRANCHJ) {
07be1b83 6272 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6273 }
c277df42
IZ
6274 }
6275 }
a0d0e21e 6276 }
c277df42
IZ
6277
6278 {
e1ec3a88
AL
6279 const char *p;
6280 static const char parens[] = "=!<,>";
c277df42
IZ
6281
6282 if (paren && (p = strchr(parens, paren))) {
eb160463 6283 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6284 int flag = (p - parens) > 1;
6285
6286 if (paren == '>')
6287 node = SUSPEND, flag = 0;
6bda09f9 6288 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6289 Set_Node_Cur_Length(ret);
6290 Set_Node_Offset(ret, parse_start + 1);
c277df42 6291 ret->flags = flag;
07be1b83 6292 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6293 }
a0d0e21e
LW
6294 }
6295
6296 /* Check for proper termination. */
ce3e6498 6297 if (paren) {
e2509266 6298 RExC_flags = oregflags;
830247a4
IZ
6299 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6300 RExC_parse = oregcomp_parse;
380a0633 6301 vFAIL("Unmatched (");
ce3e6498 6302 }
a0ed51b3 6303 }
830247a4
IZ
6304 else if (!paren && RExC_parse < RExC_end) {
6305 if (*RExC_parse == ')') {
6306 RExC_parse++;
380a0633 6307 vFAIL("Unmatched )");
a0ed51b3
LW
6308 }
6309 else
b45f050a 6310 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6311 /* NOTREACHED */
6312 }
594d7033
YO
6313 if (after_freeze)
6314 RExC_npar = after_freeze;
a0d0e21e 6315 return(ret);
a687059c
LW
6316}
6317
6318/*
6319 - regbranch - one alternative of an | operator
6320 *
6321 * Implements the concatenation operator.
6322 */
76e3520e 6323STATIC regnode *
3dab1dad 6324S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6325{
97aff369 6326 dVAR;
c277df42
IZ
6327 register regnode *ret;
6328 register regnode *chain = NULL;
6329 register regnode *latest;
6330 I32 flags = 0, c = 0;
3dab1dad 6331 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6332
6333 PERL_ARGS_ASSERT_REGBRANCH;
6334
3dab1dad 6335 DEBUG_PARSE("brnc");
02daf0ab 6336
b81d288d 6337 if (first)
c277df42
IZ
6338 ret = NULL;
6339 else {
b81d288d 6340 if (!SIZE_ONLY && RExC_extralen)
830247a4 6341 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6342 else {
830247a4 6343 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6344 Set_Node_Length(ret, 1);
6345 }
c277df42
IZ
6346 }
6347
b81d288d 6348 if (!first && SIZE_ONLY)
830247a4 6349 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6350
c277df42 6351 *flagp = WORST; /* Tentatively. */
a0d0e21e 6352
830247a4
IZ
6353 RExC_parse--;
6354 nextchar(pRExC_state);
6355 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6356 flags &= ~TRYAGAIN;
3dab1dad 6357 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6358 if (latest == NULL) {
6359 if (flags & TRYAGAIN)
6360 continue;
6361 return(NULL);
a0ed51b3
LW
6362 }
6363 else if (ret == NULL)
c277df42 6364 ret = latest;
8ae10a67 6365 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6366 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6367 *flagp |= flags&SPSTART;
6368 else {
830247a4 6369 RExC_naughty++;
3dab1dad 6370 REGTAIL(pRExC_state, chain, latest);
a687059c 6371 }
a0d0e21e 6372 chain = latest;
c277df42
IZ
6373 c++;
6374 }
6375 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6376 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6377 if (ret == NULL)
6378 ret = chain;
6379 }
6380 if (c == 1) {
6381 *flagp |= flags&SIMPLE;
a0d0e21e 6382 }
a687059c 6383
d4c19fe8 6384 return ret;
a687059c
LW
6385}
6386
6387/*
6388 - regpiece - something followed by possible [*+?]
6389 *
6390 * Note that the branching code sequences used for ? and the general cases
6391 * of * and + are somewhat optimized: they use the same NOTHING node as
6392 * both the endmarker for their branch list and the body of the last branch.
6393 * It might seem that this node could be dispensed with entirely, but the
6394 * endmarker role is not redundant.
6395 */
76e3520e 6396STATIC regnode *
3dab1dad 6397S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6398{
97aff369 6399 dVAR;
c277df42 6400 register regnode *ret;
a0d0e21e
LW
6401 register char op;
6402 register char *next;
6403 I32 flags;
1df70142 6404 const char * const origparse = RExC_parse;
a0d0e21e 6405 I32 min;
c277df42 6406 I32 max = REG_INFTY;
fac92740 6407 char *parse_start;
10edeb5d 6408 const char *maxpos = NULL;
3dab1dad 6409 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6410
6411 PERL_ARGS_ASSERT_REGPIECE;
6412
3dab1dad 6413 DEBUG_PARSE("piec");
a0d0e21e 6414
3dab1dad 6415 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6416 if (ret == NULL) {
6417 if (flags & TRYAGAIN)
6418 *flagp |= TRYAGAIN;
6419 return(NULL);
6420 }
6421
830247a4 6422 op = *RExC_parse;
a0d0e21e 6423
830247a4 6424 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6425 maxpos = NULL;
fac92740 6426 parse_start = RExC_parse; /* MJD */
830247a4 6427 next = RExC_parse + 1;
a0d0e21e
LW
6428 while (isDIGIT(*next) || *next == ',') {
6429 if (*next == ',') {
6430 if (maxpos)
6431 break;
6432 else
6433 maxpos = next;
a687059c 6434 }
a0d0e21e
LW
6435 next++;
6436 }
6437 if (*next == '}') { /* got one */
6438 if (!maxpos)
6439 maxpos = next;
830247a4
IZ
6440 RExC_parse++;
6441 min = atoi(RExC_parse);
a0d0e21e
LW
6442 if (*maxpos == ',')
6443 maxpos++;
6444 else
830247a4 6445 maxpos = RExC_parse;
a0d0e21e
LW
6446 max = atoi(maxpos);
6447 if (!max && *maxpos != '0')
c277df42
IZ
6448 max = REG_INFTY; /* meaning "infinity" */
6449 else if (max >= REG_INFTY)
8615cb43 6450 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6451 RExC_parse = next;
6452 nextchar(pRExC_state);
a0d0e21e
LW
6453
6454 do_curly:
6455 if ((flags&SIMPLE)) {
830247a4 6456 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6457 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6458 Set_Node_Offset(ret, parse_start+1); /* MJD */
6459 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6460 }
6461 else {
3dab1dad 6462 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6463
6464 w->flags = 0;
3dab1dad 6465 REGTAIL(pRExC_state, ret, w);
830247a4 6466 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6467 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6468 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6469 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6470 }
6bda09f9 6471 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6472 /* MJD hk */
6473 Set_Node_Offset(ret, parse_start+1);
2af232bd 6474 Set_Node_Length(ret,
fac92740 6475 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6476
830247a4 6477 if (!SIZE_ONLY && RExC_extralen)
c277df42 6478 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6479 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6480 if (SIZE_ONLY)
830247a4
IZ
6481 RExC_whilem_seen++, RExC_extralen += 3;
6482 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6483 }
c277df42 6484 ret->flags = 0;
a0d0e21e
LW
6485
6486 if (min > 0)
821b33a5
IZ
6487 *flagp = WORST;
6488 if (max > 0)
6489 *flagp |= HASWIDTH;
8fa23287 6490 if (max < min)
8615cb43 6491 vFAIL("Can't do {n,m} with n > m");
c277df42 6492 if (!SIZE_ONLY) {
eb160463
GS
6493 ARG1_SET(ret, (U16)min);
6494 ARG2_SET(ret, (U16)max);
a687059c 6495 }
a687059c 6496
a0d0e21e 6497 goto nest_check;
a687059c 6498 }
a0d0e21e 6499 }
a687059c 6500
a0d0e21e
LW
6501 if (!ISMULT1(op)) {
6502 *flagp = flags;
a687059c 6503 return(ret);
a0d0e21e 6504 }
bb20fd44 6505
c277df42 6506#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6507
6508 /* if this is reinstated, don't forget to put this back into perldiag:
6509
6510 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6511
6512 (F) The part of the regexp subject to either the * or + quantifier
6513 could match an empty string. The {#} shows in the regular
6514 expression about where the problem was discovered.
6515
6516 */
6517
bb20fd44 6518 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6519 vFAIL("Regexp *+ operand could be empty");
b81d288d 6520#endif
bb20fd44 6521
fac92740 6522 parse_start = RExC_parse;
830247a4 6523 nextchar(pRExC_state);
a0d0e21e 6524
821b33a5 6525 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6526
6527 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6528 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6529 ret->flags = 0;
830247a4 6530 RExC_naughty += 4;
a0d0e21e
LW
6531 }
6532 else if (op == '*') {
6533 min = 0;
6534 goto do_curly;
a0ed51b3
LW
6535 }
6536 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6537 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6538 ret->flags = 0;
830247a4 6539 RExC_naughty += 3;
a0d0e21e
LW
6540 }
6541 else if (op == '+') {
6542 min = 1;
6543 goto do_curly;
a0ed51b3
LW
6544 }
6545 else if (op == '?') {
a0d0e21e
LW
6546 min = 0; max = 1;
6547 goto do_curly;
6548 }
6549 nest_check:
668c081a
NC
6550 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6551 ckWARN3reg(RExC_parse,
6552 "%.*s matches null string many times",
6553 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6554 origparse);
a0d0e21e
LW
6555 }
6556
b9b4dddf 6557 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6558 nextchar(pRExC_state);
6bda09f9 6559 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6560 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6561 }
b9b4dddf
YO
6562#ifndef REG_ALLOW_MINMOD_SUSPEND
6563 else
6564#endif
6565 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6566 regnode *ender;
6567 nextchar(pRExC_state);
6568 ender = reg_node(pRExC_state, SUCCEED);
6569 REGTAIL(pRExC_state, ret, ender);
6570 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6571 ret->flags = 0;
6572 ender = reg_node(pRExC_state, TAIL);
6573 REGTAIL(pRExC_state, ret, ender);
6574 /*ret= ender;*/
6575 }
6576
6577 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6578 RExC_parse++;
b45f050a
JF
6579 vFAIL("Nested quantifiers");
6580 }
a0d0e21e
LW
6581
6582 return(ret);
a687059c
LW
6583}
6584
fc8cd66c
YO
6585
6586/* reg_namedseq(pRExC_state,UVp)
6587
6588 This is expected to be called by a parser routine that has
afefe6bf 6589 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6590 expected to point at the first char following the N at the time
6591 of the call.
6592
6593 If valuep is non-null then it is assumed that we are parsing inside
6594 of a charclass definition and the first codepoint in the resolved
6595 string is returned via *valuep and the routine will return NULL.
6596 In this mode if a multichar string is returned from the charnames
6597 handler a warning will be issued, and only the first char in the
6598 sequence will be examined. If the string returned is zero length
6599 then the value of *valuep is undefined and NON-NULL will
6600 be returned to indicate failure. (This will NOT be a valid pointer
6601 to a regnode.)
6602
afefe6bf 6603 If valuep is null then it is assumed that we are parsing normal text
fc8cd66c
YO
6604 and inserts a new EXACT node into the program containing the resolved
6605 string and returns a pointer to the new node. If the string is
6606 zerolength a NOTHING node is emitted.
afefe6bf 6607
fc8cd66c
YO
6608 On success RExC_parse is set to the char following the endbrace.
6609 Parsing failures will generate a fatal errorvia vFAIL(...)
6610
6611 NOTE: We cache all results from the charnames handler locally in
6612 the RExC_charnames hash (created on first use) to prevent a charnames
6613 handler from playing silly-buggers and returning a short string and
6614 then a long string for a given pattern. Since the regexp program
6615 size is calculated during an initial parse this would result
6616 in a buffer overrun so we cache to prevent the charname result from
6617 changing during the course of the parse.
6618
6619 */
6620STATIC regnode *
afefe6bf 6621S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c
YO
6622{
6623 char * name; /* start of the content of the name */
6624 char * endbrace; /* endbrace following the name */
6625 SV *sv_str = NULL;
6626 SV *sv_name = NULL;
6627 STRLEN len; /* this has various purposes throughout the code */
6628 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6629 regnode *ret = NULL;
7918f24d
NC
6630
6631 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6632
afefe6bf
RGS
6633 if (*RExC_parse != '{' ||
6634 (*RExC_parse == '{' && RExC_parse[1]
6635 && strchr("0123456789", RExC_parse[1])))
6636 {
6637 GET_RE_DEBUG_FLAGS_DECL;
6638 if (valuep)
6639 /* no bare \N in a charclass */
6640 vFAIL("Missing braces on \\N{}");
6641 GET_RE_DEBUG_FLAGS;
6642 nextchar(pRExC_state);
6643 ret = reg_node(pRExC_state, REG_ANY);
6644 *flagp |= HASWIDTH|SIMPLE;
6645 RExC_naughty++;
6646 RExC_parse--;
6647 Set_Node_Length(ret, 1); /* MJD */
6648 return ret;
fc8cd66c
YO
6649 }
6650 name = RExC_parse+1;
6651 endbrace = strchr(RExC_parse, '}');
6652 if ( ! endbrace ) {
6653 RExC_parse++;
6654 vFAIL("Missing right brace on \\N{}");
6655 }
6656 RExC_parse = endbrace + 1;
6657
6658
6659 /* RExC_parse points at the beginning brace,
6660 endbrace points at the last */
6661 if ( name[0]=='U' && name[1]=='+' ) {
38a44b82 6662 /* its a "Unicode hex" notation {U+89AB} */
fc8cd66c
YO
6663 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6664 | PERL_SCAN_DISALLOW_PREFIX
6665 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6666 UV cp;
196f1508 6667 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6668 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6669 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6670 cp = 0xFFFD;
6671 }
fc8cd66c 6672 if ( valuep ) {
a4893424 6673 if (cp > 0xff) RExC_utf8 = 1;
fc8cd66c
YO
6674 *valuep = cp;
6675 return NULL;
6676 }
a4893424
KW
6677
6678 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6679 * is going to be in utf8 and the representation changes under utf8. */
6680 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6681 U8 string[UTF8_MAXBYTES+1];
6682 U8 *tmps;
6683 RExC_utf8 = 1;
6684 tmps = uvuni_to_utf8(string, cp);
6685 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6686 } else { /* Otherwise, no need for utf8, can skip that step */
6687 char string;
6688 string = (char)cp;
6689 sv_str= newSVpvn(&string, 1);
6690 }
fc8cd66c
YO
6691 } else {
6692 /* fetch the charnames handler for this scope */
6693 HV * const table = GvHV(PL_hintgv);
6694 SV **cvp= table ?
6695 hv_fetchs(table, "charnames", FALSE) :
6696 NULL;
6697 SV *cv= cvp ? *cvp : NULL;
6698 HE *he_str;
6699 int count;
6700 /* create an SV with the name as argument */
6701 sv_name = newSVpvn(name, endbrace - name);
6702
6703 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
deda7dd1 6704 vFAIL2("Constant(\\N{%" SVf "}) unknown: "
fc8cd66c 6705 "(possibly a missing \"use charnames ...\")",
deda7dd1 6706 SVfARG(sv_name));
fc8cd66c
YO
6707 }
6708 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
deda7dd1
NC
6709 vFAIL2("Constant(\\N{%" SVf "}): "
6710 "$^H{charnames} is not defined", SVfARG(sv_name));
fc8cd66c
YO
6711 }
6712
6713
6714
6715 if (!RExC_charnames) {
6716 /* make sure our cache is allocated */
6717 RExC_charnames = newHV();
ad64d0ec 6718 sv_2mortal(MUTABLE_SV(RExC_charnames));
fc8cd66c
YO
6719 }
6720 /* see if we have looked this one up before */
6721 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6722 if ( he_str ) {
6723 sv_str = HeVAL(he_str);
6724 cached = 1;
6725 } else {
6726 dSP ;
6727
6728 ENTER ;
6729 SAVETMPS ;
6730 PUSHMARK(SP) ;
6731
6732 XPUSHs(sv_name);
6733
6734 PUTBACK ;
6735
6736 count= call_sv(cv, G_SCALAR);
2628941b 6737 SPAGAIN ;
fc8cd66c
YO
6738
6739 if (count == 1) { /* XXXX is this right? dmq */
6740 sv_str = POPs;
6741 SvREFCNT_inc_simple_void(sv_str);
6742 }
6743
fc8cd66c
YO
6744 PUTBACK ;
6745 FREETMPS ;
6746 LEAVE ;
6747
6748 if ( !sv_str || !SvOK(sv_str) ) {
deda7dd1
NC
6749 vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
6750 "did not return a defined value", SVfARG(sv_name));
fc8cd66c
YO
6751 }
6752 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6753 cached = 1;
6754 }
6755 }
6756 if (valuep) {
6757 char *p = SvPV(sv_str, len);
6758 if (len) {
6759 STRLEN numlen = 1;
6760 if ( SvUTF8(sv_str) ) {
196f1508 6761 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6762 if (*valuep > 0x7F)
6763 RExC_utf8 = 1;
6764 /* XXXX
6765 We have to turn on utf8 for high bit chars otherwise
6766 we get failures with
6767
6768 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6769 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6770
6771 This is different from what \x{} would do with the same
6772 codepoint, where the condition is > 0xFF.
6773 - dmq
6774 */
6775
6776
6777 } else {
6778 *valuep = (UV)*p;
6779 /* warn if we havent used the whole string? */
6780 }
668c081a
NC
6781 if (numlen<len && SIZE_ONLY) {
6782 ckWARN2reg(RExC_parse,
deda7dd1
NC
6783 "Ignoring excess chars from \\N{%" SVf "} in character class",
6784 SVfARG(sv_name)
fc8cd66c
YO
6785 );
6786 }
668c081a
NC
6787 } else if (SIZE_ONLY) {
6788 ckWARN2reg(RExC_parse,
deda7dd1
NC
6789 "Ignoring zero length \\N{%" SVf "} in character class",
6790 SVfARG(sv_name)
fc8cd66c
YO
6791 );
6792 }
ef8d46e8 6793 SvREFCNT_dec(sv_name);
fc8cd66c
YO
6794 if (!cached)
6795 SvREFCNT_dec(sv_str);
6796 return len ? NULL : (regnode *)&len;
6797 } else if(SvCUR(sv_str)) {
6798
6799 char *s;
6800 char *p, *pend;
6801 STRLEN charlen = 1;
d008bc60 6802#ifdef DEBUGGING
fc8cd66c 6803 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6804#endif
fc8cd66c
YO
6805 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6806
6807 ret = reg_node(pRExC_state,
6808 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6809 s= STRING(ret);
6810
6811 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6812 sv_utf8_upgrade(sv_str);
6813 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6814 RExC_utf8= 1;
6815 }
6816
6817 p = SvPV(sv_str, len);
6818 pend = p + len;
6819 /* len is the length written, charlen is the size the char read */
6820 for ( len = 0; p < pend; p += charlen ) {
6821 if (UTF) {
196f1508 6822 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6823 if (FOLD) {
6824 STRLEN foldlen,numlen;
6825 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6826 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6827 /* Emit all the Unicode characters. */
6828
6829 for (foldbuf = tmpbuf;
6830 foldlen;
6831 foldlen -= numlen)
6832 {
6833 uvc = utf8_to_uvchr(foldbuf, &numlen);
6834 if (numlen > 0) {
6835 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6836 s += unilen;
6837 len += unilen;
6838 /* In EBCDIC the numlen
6839 * and unilen can differ. */
6840 foldbuf += numlen;
6841 if (numlen >= foldlen)
6842 break;
6843 }
6844 else
6845 break; /* "Can't happen." */
6846 }
6847 } else {
6848 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6849 if (unilen > 0) {
6850 s += unilen;
6851 len += unilen;
6852 }
6853 }
6854 } else {
6855 len++;
6856 REGC(*p, s++);
6857 }
6858 }
6859 if (SIZE_ONLY) {
6860 RExC_size += STR_SZ(len);
6861 } else {
6862 STR_LEN(ret) = len;
6863 RExC_emit += STR_SZ(len);
6864 }
6865 Set_Node_Cur_Length(ret); /* MJD */
6866 RExC_parse--;
6867 nextchar(pRExC_state);
a4893424 6868 } else { /* zero length */
fc8cd66c
YO
6869 ret = reg_node(pRExC_state,NOTHING);
6870 }
ef8d46e8
VP
6871 SvREFCNT_dec(sv_name);
6872 if (!cached)
fc8cd66c 6873 SvREFCNT_dec(sv_str);
fc8cd66c
YO
6874 return ret;
6875
6876}
6877
6878
9e08bc66
TS
6879/*
6880 * reg_recode
6881 *
6882 * It returns the code point in utf8 for the value in *encp.
6883 * value: a code value in the source encoding
6884 * encp: a pointer to an Encode object
6885 *
6886 * If the result from Encode is not a single character,
6887 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6888 */
6889STATIC UV
6890S_reg_recode(pTHX_ const char value, SV **encp)
6891{
6892 STRLEN numlen = 1;
59cd0e26 6893 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 6894 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
6895 const STRLEN newlen = SvCUR(sv);
6896 UV uv = UNICODE_REPLACEMENT;
6897
7918f24d
NC
6898 PERL_ARGS_ASSERT_REG_RECODE;
6899
9e08bc66
TS
6900 if (newlen)
6901 uv = SvUTF8(sv)
6902 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6903 : *(U8*)s;
6904
6905 if (!newlen || numlen != newlen) {
6906 uv = UNICODE_REPLACEMENT;
c86f7df5 6907 *encp = NULL;
9e08bc66
TS
6908 }
6909 return uv;
6910}
6911
fc8cd66c 6912
a687059c
LW
6913/*
6914 - regatom - the lowest level
ee9b8eae
YO
6915
6916 Try to identify anything special at the start of the pattern. If there
6917 is, then handle it as required. This may involve generating a single regop,
6918 such as for an assertion; or it may involve recursing, such as to
6919 handle a () structure.
6920
6921 If the string doesn't start with something special then we gobble up
6922 as much literal text as we can.
6923
6924 Once we have been able to handle whatever type of thing started the
6925 sequence, we return.
6926
6927 Note: we have to be careful with escapes, as they can be both literal
6928 and special, and in the case of \10 and friends can either, depending
6929 on context. Specifically there are two seperate switches for handling
6930 escape sequences, with the one for handling literal escapes requiring
6931 a dummy entry for all of the special escapes that are actually handled
6932 by the other.
6933*/
6934
76e3520e 6935STATIC regnode *
3dab1dad 6936S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6937{
97aff369 6938 dVAR;
cbbf8932 6939 register regnode *ret = NULL;
a0d0e21e 6940 I32 flags;
45948336 6941 char *parse_start = RExC_parse;
3dab1dad
YO
6942 GET_RE_DEBUG_FLAGS_DECL;
6943 DEBUG_PARSE("atom");
a0d0e21e
LW
6944 *flagp = WORST; /* Tentatively. */
6945
7918f24d 6946 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 6947
a0d0e21e 6948tryagain:
f9a79580 6949 switch ((U8)*RExC_parse) {
a0d0e21e 6950 case '^':
830247a4
IZ
6951 RExC_seen_zerolen++;
6952 nextchar(pRExC_state);
bbe252da 6953 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6954 ret = reg_node(pRExC_state, MBOL);
bbe252da 6955 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6956 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6957 else
830247a4 6958 ret = reg_node(pRExC_state, BOL);
fac92740 6959 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6960 break;
6961 case '$':
830247a4 6962 nextchar(pRExC_state);
b81d288d 6963 if (*RExC_parse)
830247a4 6964 RExC_seen_zerolen++;
bbe252da 6965 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6966 ret = reg_node(pRExC_state, MEOL);
bbe252da 6967 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6968 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6969 else
830247a4 6970 ret = reg_node(pRExC_state, EOL);
fac92740 6971 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6972 break;
6973 case '.':
830247a4 6974 nextchar(pRExC_state);
bbe252da 6975 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6976 ret = reg_node(pRExC_state, SANY);
6977 else
6978 ret = reg_node(pRExC_state, REG_ANY);
6979 *flagp |= HASWIDTH|SIMPLE;
830247a4 6980 RExC_naughty++;
fac92740 6981 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6982 break;
6983 case '[':
b45f050a 6984 {
3dab1dad
YO
6985 char * const oregcomp_parse = ++RExC_parse;
6986 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6987 if (*RExC_parse != ']') {
6988 RExC_parse = oregcomp_parse;
b45f050a
JF
6989 vFAIL("Unmatched [");
6990 }
830247a4 6991 nextchar(pRExC_state);
a0d0e21e 6992 *flagp |= HASWIDTH|SIMPLE;
fac92740 6993 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6994 break;
b45f050a 6995 }
a0d0e21e 6996 case '(':
830247a4 6997 nextchar(pRExC_state);
3dab1dad 6998 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6999 if (ret == NULL) {
bf93d4cc 7000 if (flags & TRYAGAIN) {
830247a4 7001 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7002 /* Make parent create an empty node if needed. */
7003 *flagp |= TRYAGAIN;
7004 return(NULL);
7005 }
a0d0e21e 7006 goto tryagain;
bf93d4cc 7007 }
a0d0e21e
LW
7008 return(NULL);
7009 }
a3b492c3 7010 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7011 break;
7012 case '|':
7013 case ')':
7014 if (flags & TRYAGAIN) {
7015 *flagp |= TRYAGAIN;
7016 return NULL;
7017 }
b45f050a 7018 vFAIL("Internal urp");
a0d0e21e
LW
7019 /* Supposed to be caught earlier. */
7020 break;
85afd4ae 7021 case '{':
830247a4
IZ
7022 if (!regcurly(RExC_parse)) {
7023 RExC_parse++;
85afd4ae
CS
7024 goto defchar;
7025 }
7026 /* FALL THROUGH */
a0d0e21e
LW
7027 case '?':
7028 case '+':
7029 case '*':
830247a4 7030 RExC_parse++;
b45f050a 7031 vFAIL("Quantifier follows nothing");
a0d0e21e 7032 break;
f9a79580
RGS
7033 case 0xDF:
7034 case 0xC3:
7035 case 0xCE:
a0a388a1 7036 do_foldchar:
56d400ed 7037 if (!LOC && FOLD) {
e64b1bd1 7038 U32 len,cp;
7cf3a6a3 7039 len=0; /* silence a spurious compiler warning */
56d400ed 7040 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7041 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7042 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7043 ret = reganode(pRExC_state, FOLDCHAR, cp);
7044 Set_Node_Length(ret, 1); /* MJD */
7045 nextchar(pRExC_state); /* kill whitespace under /x */
7046 return ret;
7047 }
7048 }
7049 goto outer_default;
a0d0e21e 7050 case '\\':
ee9b8eae
YO
7051 /* Special Escapes
7052
7053 This switch handles escape sequences that resolve to some kind
7054 of special regop and not to literal text. Escape sequnces that
7055 resolve to literal text are handled below in the switch marked
7056 "Literal Escapes".
7057
7058 Every entry in this switch *must* have a corresponding entry
7059 in the literal escape switch. However, the opposite is not
7060 required, as the default for this switch is to jump to the
7061 literal text handling code.
7062 */
a0a388a1
YO
7063 switch ((U8)*++RExC_parse) {
7064 case 0xDF:
7065 case 0xC3:
7066 case 0xCE:
7067 goto do_foldchar;
ee9b8eae 7068 /* Special Escapes */
a0d0e21e 7069 case 'A':
830247a4
IZ
7070 RExC_seen_zerolen++;
7071 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7072 *flagp |= SIMPLE;
ee9b8eae 7073 goto finish_meta_pat;
a0d0e21e 7074 case 'G':
830247a4
IZ
7075 ret = reg_node(pRExC_state, GPOS);
7076 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7077 *flagp |= SIMPLE;
ee9b8eae
YO
7078 goto finish_meta_pat;
7079 case 'K':
7080 RExC_seen_zerolen++;
7081 ret = reg_node(pRExC_state, KEEPS);
7082 *flagp |= SIMPLE;
37923168
RGS
7083 /* XXX:dmq : disabling in-place substitution seems to
7084 * be necessary here to avoid cases of memory corruption, as
7085 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7086 */
7087 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7088 goto finish_meta_pat;
a0d0e21e 7089 case 'Z':
830247a4 7090 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7091 *flagp |= SIMPLE;
a1917ab9 7092 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7093 goto finish_meta_pat;
b85d18e9 7094 case 'z':
830247a4 7095 ret = reg_node(pRExC_state, EOS);
b85d18e9 7096 *flagp |= SIMPLE;
830247a4 7097 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7098 goto finish_meta_pat;
4a2d328f 7099 case 'C':
f33976b4
DB
7100 ret = reg_node(pRExC_state, CANY);
7101 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7102 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7103 goto finish_meta_pat;
a0ed51b3 7104 case 'X':
830247a4 7105 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7106 *flagp |= HASWIDTH;
ee9b8eae 7107 goto finish_meta_pat;
a0d0e21e 7108 case 'w':
eb160463 7109 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 7110 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7111 goto finish_meta_pat;
a0d0e21e 7112 case 'W':
eb160463 7113 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 7114 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7115 goto finish_meta_pat;
a0d0e21e 7116 case 'b':
830247a4
IZ
7117 RExC_seen_zerolen++;
7118 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7119 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 7120 *flagp |= SIMPLE;
ee9b8eae 7121 goto finish_meta_pat;
a0d0e21e 7122 case 'B':
830247a4
IZ
7123 RExC_seen_zerolen++;
7124 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7125 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 7126 *flagp |= SIMPLE;
ee9b8eae 7127 goto finish_meta_pat;
a0d0e21e 7128 case 's':
eb160463 7129 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 7130 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7131 goto finish_meta_pat;
a0d0e21e 7132 case 'S':
eb160463 7133 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 7134 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7135 goto finish_meta_pat;
a0d0e21e 7136 case 'd':
ffc61ed2 7137 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7138 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7139 goto finish_meta_pat;
a0d0e21e 7140 case 'D':
ffc61ed2 7141 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7142 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7143 goto finish_meta_pat;
e1d1eefb
YO
7144 case 'R':
7145 ret = reg_node(pRExC_state, LNBREAK);
7146 *flagp |= HASWIDTH|SIMPLE;
7147 goto finish_meta_pat;
7148 case 'h':
7149 ret = reg_node(pRExC_state, HORIZWS);
7150 *flagp |= HASWIDTH|SIMPLE;
7151 goto finish_meta_pat;
7152 case 'H':
7153 ret = reg_node(pRExC_state, NHORIZWS);
7154 *flagp |= HASWIDTH|SIMPLE;
7155 goto finish_meta_pat;
ee9b8eae 7156 case 'v':
e1d1eefb
YO
7157 ret = reg_node(pRExC_state, VERTWS);
7158 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7159 goto finish_meta_pat;
7160 case 'V':
e1d1eefb
YO
7161 ret = reg_node(pRExC_state, NVERTWS);
7162 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7163 finish_meta_pat:
830247a4 7164 nextchar(pRExC_state);
fac92740 7165 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7166 break;
a14b48bc
LW
7167 case 'p':
7168 case 'P':
3568d838 7169 {
3dab1dad 7170 char* const oldregxend = RExC_end;
d008bc60 7171#ifdef DEBUGGING
ccb2c380 7172 char* parse_start = RExC_parse - 2;
d008bc60 7173#endif
a14b48bc 7174
830247a4 7175 if (RExC_parse[1] == '{') {
3568d838 7176 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7177 RExC_end = strchr(RExC_parse, '}');
7178 if (!RExC_end) {
3dab1dad 7179 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7180 RExC_parse += 2;
7181 RExC_end = oldregxend;
0da60cf5 7182 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7183 }
830247a4 7184 RExC_end++;
a14b48bc 7185 }
af6f566e 7186 else {
830247a4 7187 RExC_end = RExC_parse + 2;
af6f566e
HS
7188 if (RExC_end > oldregxend)
7189 RExC_end = oldregxend;
7190 }
830247a4 7191 RExC_parse--;
a14b48bc 7192
3dab1dad 7193 ret = regclass(pRExC_state,depth+1);
a14b48bc 7194
830247a4
IZ
7195 RExC_end = oldregxend;
7196 RExC_parse--;
ccb2c380
MP
7197
7198 Set_Node_Offset(ret, parse_start + 2);
7199 Set_Node_Cur_Length(ret);
830247a4 7200 nextchar(pRExC_state);
a14b48bc
LW
7201 *flagp |= HASWIDTH|SIMPLE;
7202 }
7203 break;
fc8cd66c 7204 case 'N':
afefe6bf 7205 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7206 multicharacter. join_exact() will join them up later on.
7207 Also this makes sure that things like /\N{BLAH}+/ and
7208 \N{BLAH} being multi char Just Happen. dmq*/
7209 ++RExC_parse;
afefe6bf 7210 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7211 break;
0a4db386 7212 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7213 parse_named_seq:
81714fb9
YO
7214 {
7215 char ch= RExC_parse[1];
1f1031fe
YO
7216 if (ch != '<' && ch != '\'' && ch != '{') {
7217 RExC_parse++;
7218 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7219 } else {
1f1031fe
YO
7220 /* this pretty much dupes the code for (?P=...) in reg(), if
7221 you change this make sure you change that */
81714fb9 7222 char* name_start = (RExC_parse += 2);
2eccd3b2 7223 U32 num = 0;
0a4db386
YO
7224 SV *sv_dat = reg_scan_name(pRExC_state,
7225 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7226 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7227 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7228 vFAIL2("Sequence %.3s... not terminated",parse_start);
7229
7230 if (!SIZE_ONLY) {
7231 num = add_data( pRExC_state, 1, "S" );
7232 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7233 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7234 }
7235
81714fb9
YO
7236 RExC_sawback = 1;
7237 ret = reganode(pRExC_state,
7238 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7239 num);
7240 *flagp |= HASWIDTH;
1f1031fe 7241
81714fb9
YO
7242 /* override incorrect value set in reganode MJD */
7243 Set_Node_Offset(ret, parse_start+1);
7244 Set_Node_Cur_Length(ret); /* MJD */
7245 nextchar(pRExC_state);
1f1031fe 7246
81714fb9
YO
7247 }
7248 break;
1f1031fe 7249 }
2bf803e2 7250 case 'g':
a0d0e21e
LW
7251 case '1': case '2': case '3': case '4':
7252 case '5': case '6': case '7': case '8': case '9':
7253 {
c74340f9 7254 I32 num;
2bf803e2
YO
7255 bool isg = *RExC_parse == 'g';
7256 bool isrel = 0;
7257 bool hasbrace = 0;
7258 if (isg) {
c74340f9 7259 RExC_parse++;
2bf803e2
YO
7260 if (*RExC_parse == '{') {
7261 RExC_parse++;
7262 hasbrace = 1;
7263 }
7264 if (*RExC_parse == '-') {
7265 RExC_parse++;
7266 isrel = 1;
7267 }
1f1031fe
YO
7268 if (hasbrace && !isDIGIT(*RExC_parse)) {
7269 if (isrel) RExC_parse--;
7270 RExC_parse -= 2;
7271 goto parse_named_seq;
7272 } }
c74340f9 7273 num = atoi(RExC_parse);
b72d83b2
RGS
7274 if (isg && num == 0)
7275 vFAIL("Reference to invalid group 0");
c74340f9 7276 if (isrel) {
5624f11d 7277 num = RExC_npar - num;
c74340f9
YO
7278 if (num < 1)
7279 vFAIL("Reference to nonexistent or unclosed group");
7280 }
2bf803e2 7281 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7282 goto defchar;
7283 else {
3dab1dad 7284 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7285 while (isDIGIT(*RExC_parse))
7286 RExC_parse++;
1f1031fe
YO
7287 if (parse_start == RExC_parse - 1)
7288 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7289 if (hasbrace) {
7290 if (*RExC_parse != '}')
7291 vFAIL("Unterminated \\g{...} pattern");
7292 RExC_parse++;
7293 }
c74340f9
YO
7294 if (!SIZE_ONLY) {
7295 if (num > (I32)RExC_rx->nparens)
7296 vFAIL("Reference to nonexistent group");
c74340f9 7297 }
830247a4 7298 RExC_sawback = 1;
eb160463
GS
7299 ret = reganode(pRExC_state,
7300 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7301 num);
a0d0e21e 7302 *flagp |= HASWIDTH;
2af232bd 7303
fac92740 7304 /* override incorrect value set in reganode MJD */
2af232bd 7305 Set_Node_Offset(ret, parse_start+1);
fac92740 7306 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7307 RExC_parse--;
7308 nextchar(pRExC_state);
a0d0e21e
LW
7309 }
7310 }
7311 break;
7312 case '\0':
830247a4 7313 if (RExC_parse >= RExC_end)
b45f050a 7314 FAIL("Trailing \\");
a0d0e21e
LW
7315 /* FALL THROUGH */
7316 default:
a0288114 7317 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7318 back into the quick-grab loop below */
45948336 7319 parse_start--;
a0d0e21e
LW
7320 goto defchar;
7321 }
7322 break;
4633a7c4
LW
7323
7324 case '#':
bbe252da 7325 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7326 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7327 goto tryagain;
7328 }
7329 /* FALL THROUGH */
7330
f9a79580
RGS
7331 default:
7332 outer_default:{
ba210ebe 7333 register STRLEN len;
58ae7d3f 7334 register UV ender;
a0d0e21e 7335 register char *p;
3dab1dad 7336 char *s;
80aecb99 7337 STRLEN foldlen;
89ebb4a3 7338 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7339
7340 parse_start = RExC_parse - 1;
a0d0e21e 7341
830247a4 7342 RExC_parse++;
a0d0e21e
LW
7343
7344 defchar:
58ae7d3f 7345 ender = 0;
eb160463
GS
7346 ret = reg_node(pRExC_state,
7347 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7348 s = STRING(ret);
830247a4
IZ
7349 for (len = 0, p = RExC_parse - 1;
7350 len < 127 && p < RExC_end;
a0d0e21e
LW
7351 len++)
7352 {
3dab1dad 7353 char * const oldp = p;
5b5a24f7 7354
bbe252da 7355 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7356 p = regwhite( pRExC_state, p );
f9a79580
RGS
7357 switch ((U8)*p) {
7358 case 0xDF:
7359 case 0xC3:
7360 case 0xCE:
56d400ed 7361 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7362 goto normal_default;
a0d0e21e
LW
7363 case '^':
7364 case '$':
7365 case '.':
7366 case '[':
7367 case '(':
7368 case ')':
7369 case '|':
7370 goto loopdone;
7371 case '\\':
ee9b8eae
YO
7372 /* Literal Escapes Switch
7373
7374 This switch is meant to handle escape sequences that
7375 resolve to a literal character.
7376
7377 Every escape sequence that represents something
7378 else, like an assertion or a char class, is handled
7379 in the switch marked 'Special Escapes' above in this
7380 routine, but also has an entry here as anything that
7381 isn't explicitly mentioned here will be treated as
7382 an unescaped equivalent literal.
7383 */
7384
a0a388a1 7385 switch ((U8)*++p) {
ee9b8eae 7386 /* These are all the special escapes. */
a0a388a1
YO
7387 case 0xDF:
7388 case 0xC3:
7389 case 0xCE:
7390 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7391 goto normal_default;
ee9b8eae
YO
7392 case 'A': /* Start assertion */
7393 case 'b': case 'B': /* Word-boundary assertion*/
7394 case 'C': /* Single char !DANGEROUS! */
7395 case 'd': case 'D': /* digit class */
7396 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7397 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7398 case 'k': case 'K': /* named backref, keep marker */
7399 case 'N': /* named char sequence */
38a44b82 7400 case 'p': case 'P': /* Unicode property */
e1d1eefb 7401 case 'R': /* LNBREAK */
ee9b8eae 7402 case 's': case 'S': /* space class */
e1d1eefb 7403 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7404 case 'w': case 'W': /* word class */
7405 case 'X': /* eXtended Unicode "combining character sequence" */
7406 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7407 --p;
7408 goto loopdone;
ee9b8eae
YO
7409
7410 /* Anything after here is an escape that resolves to a
7411 literal. (Except digits, which may or may not)
7412 */
a0d0e21e
LW
7413 case 'n':
7414 ender = '\n';
7415 p++;
a687059c 7416 break;
a0d0e21e
LW
7417 case 'r':
7418 ender = '\r';
7419 p++;
a687059c 7420 break;
a0d0e21e
LW
7421 case 't':
7422 ender = '\t';
7423 p++;
a687059c 7424 break;
a0d0e21e
LW
7425 case 'f':
7426 ender = '\f';
7427 p++;
a687059c 7428 break;
a0d0e21e 7429 case 'e':
c7f1f016 7430 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7431 p++;
a687059c 7432 break;
a0d0e21e 7433 case 'a':
c7f1f016 7434 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7435 p++;
a687059c 7436 break;
a0d0e21e 7437 case 'x':
a0ed51b3 7438 if (*++p == '{') {
1df70142 7439 char* const e = strchr(p, '}');
b81d288d 7440
b45f050a 7441 if (!e) {
830247a4 7442 RExC_parse = p + 1;
b45f050a
JF
7443 vFAIL("Missing right brace on \\x{}");
7444 }
de5f0749 7445 else {
a4c04bdc
NC
7446 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7447 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7448 STRLEN numlen = e - p - 1;
53305cf1 7449 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
7450 if (ender > 0xff)
7451 RExC_utf8 = 1;
a0ed51b3
LW
7452 p = e + 1;
7453 }
a0ed51b3
LW
7454 }
7455 else {
a4c04bdc 7456 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7457 STRLEN numlen = 2;
53305cf1 7458 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7459 p += numlen;
7460 }
9e08bc66
TS
7461 if (PL_encoding && ender < 0x100)
7462 goto recode_encoding;
a687059c 7463 break;
a0d0e21e
LW
7464 case 'c':
7465 p++;
bbce6d69 7466 ender = UCHARAT(p++);
7467 ender = toCTRL(ender);
a687059c 7468 break;
a0d0e21e
LW
7469 case '0': case '1': case '2': case '3':case '4':
7470 case '5': case '6': case '7': case '8':case '9':
7471 if (*p == '0' ||
830247a4 7472 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 7473 I32 flags = 0;
1df70142 7474 STRLEN numlen = 3;
53305cf1 7475 ender = grok_oct(p, &numlen, &flags, NULL);
609122bd
KW
7476
7477 /* An octal above 0xff is interpreted differently
7478 * depending on if the re is in utf8 or not. If it
7479 * is in utf8, the value will be itself, otherwise
7480 * it is interpreted as modulo 0x100. It has been
7481 * decided to discourage the use of octal above the
7482 * single-byte range. For now, warn only when
7483 * it ends up modulo */
7484 if (SIZE_ONLY && ender >= 0x100
668c081a
NC
7485 && ! UTF && ! PL_encoding) {
7486 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
609122bd 7487 }
a0d0e21e
LW
7488 p += numlen;
7489 }
7490 else {
7491 --p;
7492 goto loopdone;
a687059c 7493 }
9e08bc66
TS
7494 if (PL_encoding && ender < 0x100)
7495 goto recode_encoding;
7496 break;
7497 recode_encoding:
7498 {
7499 SV* enc = PL_encoding;
7500 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7501 if (!enc && SIZE_ONLY)
7502 ckWARNreg(p, "Invalid escape in the specified encoding");
9e08bc66
TS
7503 RExC_utf8 = 1;
7504 }
a687059c 7505 break;
a0d0e21e 7506 case '\0':
830247a4 7507 if (p >= RExC_end)
b45f050a 7508 FAIL("Trailing \\");
a687059c 7509 /* FALL THROUGH */
a0d0e21e 7510 default:
668c081a
NC
7511 if (!SIZE_ONLY&& isALPHA(*p))
7512 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7513 goto normal_default;
a0d0e21e
LW
7514 }
7515 break;
a687059c 7516 default:
a0ed51b3 7517 normal_default:
fd400ab9 7518 if (UTF8_IS_START(*p) && UTF) {
1df70142 7519 STRLEN numlen;
5e12f4fb 7520 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7521 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7522 p += numlen;
7523 }
7524 else
7525 ender = *p++;
a0d0e21e 7526 break;
a687059c 7527 }
bcdf7404
YO
7528 if ( RExC_flags & RXf_PMf_EXTENDED)
7529 p = regwhite( pRExC_state, p );
60a8b682
JH
7530 if (UTF && FOLD) {
7531 /* Prime the casefolded buffer. */
ac7e0132 7532 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7533 }
bcdf7404 7534 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7535 if (len)
7536 p = oldp;
16ea2a2e 7537 else if (UTF) {
80aecb99 7538 if (FOLD) {
60a8b682 7539 /* Emit all the Unicode characters. */
1df70142 7540 STRLEN numlen;
80aecb99
JH
7541 for (foldbuf = tmpbuf;
7542 foldlen;
7543 foldlen -= numlen) {
7544 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7545 if (numlen > 0) {
71207a34 7546 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7547 s += unilen;
7548 len += unilen;
7549 /* In EBCDIC the numlen
7550 * and unilen can differ. */
9dc45d57 7551 foldbuf += numlen;
47654450
JH
7552 if (numlen >= foldlen)
7553 break;
9dc45d57
JH
7554 }
7555 else
7556 break; /* "Can't happen." */
80aecb99
JH
7557 }
7558 }
7559 else {
71207a34 7560 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7561 if (unilen > 0) {
0ebc6274
JH
7562 s += unilen;
7563 len += unilen;
9dc45d57 7564 }
80aecb99 7565 }
a0ed51b3 7566 }
a0d0e21e
LW
7567 else {
7568 len++;
eb160463 7569 REGC((char)ender, s++);
a0d0e21e
LW
7570 }
7571 break;
a687059c 7572 }
16ea2a2e 7573 if (UTF) {
80aecb99 7574 if (FOLD) {
60a8b682 7575 /* Emit all the Unicode characters. */
1df70142 7576 STRLEN numlen;
80aecb99
JH
7577 for (foldbuf = tmpbuf;
7578 foldlen;
7579 foldlen -= numlen) {
7580 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7581 if (numlen > 0) {
71207a34 7582 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7583 len += unilen;
7584 s += unilen;
7585 /* In EBCDIC the numlen
7586 * and unilen can differ. */
9dc45d57 7587 foldbuf += numlen;
47654450
JH
7588 if (numlen >= foldlen)
7589 break;
9dc45d57
JH
7590 }
7591 else
7592 break;
80aecb99
JH
7593 }
7594 }
7595 else {
71207a34 7596 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7597 if (unilen > 0) {
0ebc6274
JH
7598 s += unilen;
7599 len += unilen;
9dc45d57 7600 }
80aecb99
JH
7601 }
7602 len--;
a0ed51b3
LW
7603 }
7604 else
eb160463 7605 REGC((char)ender, s++);
a0d0e21e
LW
7606 }
7607 loopdone:
830247a4 7608 RExC_parse = p - 1;
fac92740 7609 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7610 nextchar(pRExC_state);
793db0cb
JH
7611 {
7612 /* len is STRLEN which is unsigned, need to copy to signed */
7613 IV iv = len;
7614 if (iv < 0)
7615 vFAIL("Internal disaster");
7616 }
a0d0e21e
LW
7617 if (len > 0)
7618 *flagp |= HASWIDTH;
090f7165 7619 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7620 *flagp |= SIMPLE;
3dab1dad 7621
cd439c50 7622 if (SIZE_ONLY)
830247a4 7623 RExC_size += STR_SZ(len);
3dab1dad
YO
7624 else {
7625 STR_LEN(ret) = len;
830247a4 7626 RExC_emit += STR_SZ(len);
07be1b83 7627 }
3dab1dad 7628 }
a0d0e21e
LW
7629 break;
7630 }
a687059c 7631
a0d0e21e 7632 return(ret);
a687059c
LW
7633}
7634
873ef191 7635STATIC char *
bcdf7404 7636S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7637{
bcdf7404 7638 const char *e = RExC_end;
7918f24d
NC
7639
7640 PERL_ARGS_ASSERT_REGWHITE;
7641
5b5a24f7
CS
7642 while (p < e) {
7643 if (isSPACE(*p))
7644 ++p;
7645 else if (*p == '#') {
bcdf7404 7646 bool ended = 0;
5b5a24f7 7647 do {
bcdf7404
YO
7648 if (*p++ == '\n') {
7649 ended = 1;
7650 break;
7651 }
7652 } while (p < e);
7653 if (!ended)
7654 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7655 }
7656 else
7657 break;
7658 }
7659 return p;
7660}
7661
b8c5462f
JH
7662/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7663 Character classes ([:foo:]) can also be negated ([:^foo:]).
7664 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7665 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7666 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7667
7668#define POSIXCC_DONE(c) ((c) == ':')
7669#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7670#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7671
b8c5462f 7672STATIC I32
830247a4 7673S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7674{
97aff369 7675 dVAR;
936ed897 7676 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7677
7918f24d
NC
7678 PERL_ARGS_ASSERT_REGPPOSIXCC;
7679
830247a4 7680 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7681 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7682 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7683 const char c = UCHARAT(RExC_parse);
097eb12c 7684 char* const s = RExC_parse++;
b81d288d 7685
9a86a77b 7686 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7687 RExC_parse++;
7688 if (RExC_parse == RExC_end)
620e46c5 7689 /* Grandfather lone [:, [=, [. */
830247a4 7690 RExC_parse = s;
620e46c5 7691 else {
3dab1dad 7692 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7693 assert(*t == c);
7694
9a86a77b 7695 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7696 const char *posixcc = s + 1;
830247a4 7697 RExC_parse++; /* skip over the ending ] */
3dab1dad 7698
b8c5462f 7699 if (*s == ':') {
1df70142
AL
7700 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7701 const I32 skip = t - posixcc;
80916619
NC
7702
7703 /* Initially switch on the length of the name. */
7704 switch (skip) {
7705 case 4:
3dab1dad
YO
7706 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7707 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7708 break;
80916619
NC
7709 case 5:
7710 /* Names all of length 5. */
7711 /* alnum alpha ascii blank cntrl digit graph lower
7712 print punct space upper */
7713 /* Offset 4 gives the best switch position. */
7714 switch (posixcc[4]) {
7715 case 'a':
3dab1dad
YO
7716 if (memEQ(posixcc, "alph", 4)) /* alpha */
7717 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7718 break;
7719 case 'e':
3dab1dad
YO
7720 if (memEQ(posixcc, "spac", 4)) /* space */
7721 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7722 break;
7723 case 'h':
3dab1dad
YO
7724 if (memEQ(posixcc, "grap", 4)) /* graph */
7725 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7726 break;
7727 case 'i':
3dab1dad
YO
7728 if (memEQ(posixcc, "asci", 4)) /* ascii */
7729 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7730 break;
7731 case 'k':
3dab1dad
YO
7732 if (memEQ(posixcc, "blan", 4)) /* blank */
7733 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7734 break;
7735 case 'l':
3dab1dad
YO
7736 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7737 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7738 break;
7739 case 'm':
3dab1dad
YO
7740 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7741 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7742 break;
7743 case 'r':
3dab1dad
YO
7744 if (memEQ(posixcc, "lowe", 4)) /* lower */
7745 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7746 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7747 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7748 break;
7749 case 't':
3dab1dad
YO
7750 if (memEQ(posixcc, "digi", 4)) /* digit */
7751 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7752 else if (memEQ(posixcc, "prin", 4)) /* print */
7753 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7754 else if (memEQ(posixcc, "punc", 4)) /* punct */
7755 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7756 break;
b8c5462f
JH
7757 }
7758 break;
80916619 7759 case 6:
3dab1dad
YO
7760 if (memEQ(posixcc, "xdigit", 6))
7761 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7762 break;
7763 }
80916619
NC
7764
7765 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7766 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7767 t - s - 1, s + 1);
80916619
NC
7768 assert (posixcc[skip] == ':');
7769 assert (posixcc[skip+1] == ']');
b45f050a 7770 } else if (!SIZE_ONLY) {
b8c5462f 7771 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7772
830247a4 7773 /* adjust RExC_parse so the warning shows after
b45f050a 7774 the class closes */
9a86a77b 7775 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7776 RExC_parse++;
b45f050a
JF
7777 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7778 }
b8c5462f
JH
7779 } else {
7780 /* Maternal grandfather:
7781 * "[:" ending in ":" but not in ":]" */
830247a4 7782 RExC_parse = s;
767d463e 7783 }
620e46c5
JH
7784 }
7785 }
7786
b8c5462f
JH
7787 return namedclass;
7788}
7789
7790STATIC void
830247a4 7791S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7792{
97aff369 7793 dVAR;
7918f24d
NC
7794
7795 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7796
3dab1dad 7797 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7798 const char *s = RExC_parse;
7799 const char c = *s++;
b8c5462f 7800
3dab1dad 7801 while (isALNUM(*s))
b8c5462f
JH
7802 s++;
7803 if (*s && c == *s && s[1] == ']') {
668c081a
NC
7804 ckWARN3reg(s+2,
7805 "POSIX syntax [%c %c] belongs inside character classes",
7806 c, c);
b45f050a
JF
7807
7808 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7809 if (POSIXCC_NOTYET(c)) {
830247a4 7810 /* adjust RExC_parse so the error shows after
b45f050a 7811 the class closes */
9a86a77b 7812 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7813 NOOP;
b45f050a
JF
7814 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7815 }
b8c5462f
JH
7816 }
7817 }
620e46c5
JH
7818}
7819
7f6f358c 7820
89836f1f
YO
7821#define _C_C_T_(NAME,TEST,WORD) \
7822ANYOF_##NAME: \
7823 if (LOC) \
7824 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7825 else { \
7826 for (value = 0; value < 256; value++) \
7827 if (TEST) \
7828 ANYOF_BITMAP_SET(ret, value); \
7829 } \
7830 yesno = '+'; \
7831 what = WORD; \
7832 break; \
7833case ANYOF_N##NAME: \
7834 if (LOC) \
7835 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7836 else { \
7837 for (value = 0; value < 256; value++) \
7838 if (!TEST) \
7839 ANYOF_BITMAP_SET(ret, value); \
7840 } \
7841 yesno = '!'; \
7842 what = WORD; \
7843 break
7844
e1d1eefb
YO
7845#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7846ANYOF_##NAME: \
7847 for (value = 0; value < 256; value++) \
7848 if (TEST) \
7849 ANYOF_BITMAP_SET(ret, value); \
7850 yesno = '+'; \
7851 what = WORD; \
7852 break; \
7853case ANYOF_N##NAME: \
7854 for (value = 0; value < 256; value++) \
7855 if (!TEST) \
7856 ANYOF_BITMAP_SET(ret, value); \
7857 yesno = '!'; \
7858 what = WORD; \
7859 break
89836f1f 7860
da7fcca4
YO
7861/*
7862 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7863 so that it is possible to override the option here without having to
7864 rebuild the entire core. as we are required to do if we change regcomp.h
7865 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7866*/
7867#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7868#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7869#endif
7870
7871#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7872#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7873#else
7874#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7875#endif
7876
7f6f358c
YO
7877/*
7878 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7879 matches the pattern or if the pattern matches a single char only and
7880 that char is < 256 and we are case insensitive then we produce an
7881 EXACT node instead.
7f6f358c 7882*/
89836f1f 7883
76e3520e 7884STATIC regnode *
3dab1dad 7885S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7886{
97aff369 7887 dVAR;
9a86a77b 7888 register UV nextvalue;
3568d838 7889 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7890 register IV range = 0;
e1d1eefb 7891 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 7892 register regnode *ret;
ba210ebe 7893 STRLEN numlen;
ffc61ed2 7894 IV namedclass;
cbbf8932 7895 char *rangebegin = NULL;
936ed897 7896 bool need_class = 0;
c445ea15 7897 SV *listsv = NULL;
ffc61ed2 7898 UV n;
9e55ce06 7899 bool optimize_invert = TRUE;
cbbf8932 7900 AV* unicode_alternate = NULL;
1b2d223b
JH
7901#ifdef EBCDIC
7902 UV literal_endpoint = 0;
7903#endif
7f6f358c 7904 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7905
3dab1dad 7906 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7907 case we need to change the emitted regop to an EXACT. */
07be1b83 7908 const char * orig_parse = RExC_parse;
72f13be8 7909 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7910
7911 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
7912#ifndef DEBUGGING
7913 PERL_UNUSED_ARG(depth);
7914#endif
72f13be8 7915
3dab1dad 7916 DEBUG_PARSE("clas");
7f6f358c
YO
7917
7918 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7919 ret = reganode(pRExC_state, ANYOF, 0);
7920
7921 if (!SIZE_ONLY)
7922 ANYOF_FLAGS(ret) = 0;
7923
9a86a77b 7924 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7925 RExC_naughty++;
7926 RExC_parse++;
7927 if (!SIZE_ONLY)
7928 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7929 }
a0d0e21e 7930
73060fc4 7931 if (SIZE_ONLY) {
830247a4 7932 RExC_size += ANYOF_SKIP;
73060fc4
JH
7933 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7934 }
936ed897 7935 else {
830247a4 7936 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7937 if (FOLD)
7938 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7939 if (LOC)
7940 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7941 ANYOF_BITMAP_ZERO(ret);
396482e1 7942 listsv = newSVpvs("# comment\n");
a0d0e21e 7943 }
b8c5462f 7944
9a86a77b
JH
7945 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7946
b938889d 7947 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7948 checkposixcc(pRExC_state);
b8c5462f 7949
f064b6ad
HS
7950 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7951 if (UCHARAT(RExC_parse) == ']')
7952 goto charclassloop;
ffc61ed2 7953
fc8cd66c 7954parseit:
9a86a77b 7955 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7956
7957 charclassloop:
7958
7959 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7960
73b437c8 7961 if (!range)
830247a4 7962 rangebegin = RExC_parse;
ffc61ed2 7963 if (UTF) {
5e12f4fb 7964 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7965 RExC_end - RExC_parse,
9f7f3913 7966 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7967 RExC_parse += numlen;
7968 }
7969 else
7970 value = UCHARAT(RExC_parse++);
7f6f358c 7971
9a86a77b
JH
7972 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7973 if (value == '[' && POSIXCC(nextvalue))
830247a4 7974 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7975 else if (value == '\\') {
ffc61ed2 7976 if (UTF) {
5e12f4fb 7977 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7978 RExC_end - RExC_parse,
9f7f3913 7979 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7980 RExC_parse += numlen;
7981 }
7982 else
7983 value = UCHARAT(RExC_parse++);
470c3474 7984 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7985 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7986 * be a problem later if we want switch on Unicode.
7987 * A similar issue a little bit later when switching on
7988 * namedclass. --jhi */
ffc61ed2 7989 switch ((I32)value) {
b8c5462f
JH
7990 case 'w': namedclass = ANYOF_ALNUM; break;
7991 case 'W': namedclass = ANYOF_NALNUM; break;
7992 case 's': namedclass = ANYOF_SPACE; break;
7993 case 'S': namedclass = ANYOF_NSPACE; break;
7994 case 'd': namedclass = ANYOF_DIGIT; break;
7995 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
7996 case 'v': namedclass = ANYOF_VERTWS; break;
7997 case 'V': namedclass = ANYOF_NVERTWS; break;
7998 case 'h': namedclass = ANYOF_HORIZWS; break;
7999 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8000 case 'N': /* Handle \N{NAME} in class */
8001 {
8002 /* We only pay attention to the first char of
8003 multichar strings being returned. I kinda wonder
8004 if this makes sense as it does change the behaviour
8005 from earlier versions, OTOH that behaviour was broken
8006 as well. */
8007 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8008 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8009 goto parseit;
8010 }
8011 value= v;
8012 }
8013 break;
ffc61ed2
JH
8014 case 'p':
8015 case 'P':
3dab1dad
YO
8016 {
8017 char *e;
af6f566e 8018 if (RExC_parse >= RExC_end)
2a4859cd 8019 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8020 if (*RExC_parse == '{') {
1df70142 8021 const U8 c = (U8)value;
ffc61ed2
JH
8022 e = strchr(RExC_parse++, '}');
8023 if (!e)
0da60cf5 8024 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8025 while (isSPACE(UCHARAT(RExC_parse)))
8026 RExC_parse++;
8027 if (e == RExC_parse)
0da60cf5 8028 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8029 n = e - RExC_parse;
ab13f0c7
JH
8030 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8031 n--;
ffc61ed2
JH
8032 }
8033 else {
8034 e = RExC_parse;
8035 n = 1;
8036 }
8037 if (!SIZE_ONLY) {
ab13f0c7
JH
8038 if (UCHARAT(RExC_parse) == '^') {
8039 RExC_parse++;
8040 n--;
8041 value = value == 'p' ? 'P' : 'p'; /* toggle */
8042 while (isSPACE(UCHARAT(RExC_parse))) {
8043 RExC_parse++;
8044 n--;
8045 }
8046 }
097eb12c
AL
8047 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8048 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8049 }
8050 RExC_parse = e + 1;
8051 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8052 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8053 }
f81125e2 8054 break;
b8c5462f
JH
8055 case 'n': value = '\n'; break;
8056 case 'r': value = '\r'; break;
8057 case 't': value = '\t'; break;
8058 case 'f': value = '\f'; break;
8059 case 'b': value = '\b'; break;
c7f1f016
NIS
8060 case 'e': value = ASCII_TO_NATIVE('\033');break;
8061 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 8062 case 'x':
ffc61ed2 8063 if (*RExC_parse == '{') {
a4c04bdc
NC
8064 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8065 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8066 char * const e = strchr(RExC_parse++, '}');
b81d288d 8067 if (!e)
ffc61ed2 8068 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8069
8070 numlen = e - RExC_parse;
8071 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8072 RExC_parse = e + 1;
8073 }
8074 else {
a4c04bdc 8075 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8076 numlen = 2;
8077 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8078 RExC_parse += numlen;
8079 }
9e08bc66
TS
8080 if (PL_encoding && value < 0x100)
8081 goto recode_encoding;
b8c5462f
JH
8082 break;
8083 case 'c':
830247a4 8084 value = UCHARAT(RExC_parse++);
b8c5462f
JH
8085 value = toCTRL(value);
8086 break;
8087 case '0': case '1': case '2': case '3': case '4':
8088 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
8089 {
8090 I32 flags = 0;
8091 numlen = 3;
8092 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8093 RExC_parse += numlen;
8094 if (PL_encoding && value < 0x100)
8095 goto recode_encoding;
8096 break;
8097 }
8098 recode_encoding:
8099 {
8100 SV* enc = PL_encoding;
8101 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8102 if (!enc && SIZE_ONLY)
8103 ckWARNreg(RExC_parse,
8104 "Invalid escape in the specified encoding");
9e08bc66
TS
8105 break;
8106 }
1028017a 8107 default:
668c081a
NC
8108 if (!SIZE_ONLY && isALPHA(value))
8109 ckWARN2reg(RExC_parse,
8110 "Unrecognized escape \\%c in character class passed through",
8111 (int)value);
1028017a 8112 break;
b8c5462f 8113 }
ffc61ed2 8114 } /* end of \blah */
1b2d223b
JH
8115#ifdef EBCDIC
8116 else
8117 literal_endpoint++;
8118#endif
ffc61ed2
JH
8119
8120 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8121
8122 if (!SIZE_ONLY && !need_class)
936ed897 8123 ANYOF_CLASS_ZERO(ret);
ffc61ed2 8124
936ed897 8125 need_class = 1;
ffc61ed2
JH
8126
8127 /* a bad range like a-\d, a-[:digit:] ? */
8128 if (range) {
73b437c8 8129 if (!SIZE_ONLY) {
668c081a
NC
8130 const int w =
8131 RExC_parse >= rangebegin ?
8132 RExC_parse - rangebegin : 0;
8133 ckWARN4reg(RExC_parse,
b45f050a 8134 "False [] range \"%*.*s\"",
097eb12c 8135 w, w, rangebegin);
668c081a 8136
3568d838
JH
8137 if (prevvalue < 256) {
8138 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8139 ANYOF_BITMAP_SET(ret, '-');
8140 }
8141 else {
8142 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8143 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8144 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8145 }
b8c5462f 8146 }
ffc61ed2
JH
8147
8148 range = 0; /* this was not a true range */
73b437c8 8149 }
ffc61ed2 8150
89836f1f
YO
8151
8152
73b437c8 8153 if (!SIZE_ONLY) {
c49a72a9
NC
8154 const char *what = NULL;
8155 char yesno = 0;
8156
3568d838
JH
8157 if (namedclass > OOB_NAMEDCLASS)
8158 optimize_invert = FALSE;
e2962f66
JH
8159 /* Possible truncation here but in some 64-bit environments
8160 * the compiler gets heartburn about switch on 64-bit values.
8161 * A similar issue a little earlier when switching on value.
98f323fa 8162 * --jhi */
e2962f66 8163 switch ((I32)namedclass) {
da7fcca4
YO
8164
8165 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8166 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8167 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8168 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8169 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8170 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8171 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8172 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8173 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8174 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8175#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
89836f1f 8176 case _C_C_T_(ALNUM, isALNUM(value), "Word");
89836f1f 8177 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
da7fcca4
YO
8178#else
8179 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8180 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8181#endif
89836f1f 8182 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
8183 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8184 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8185 case ANYOF_ASCII:
8186 if (LOC)
936ed897 8187 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8188 else {
c7f1f016 8189#ifndef EBCDIC
1ba5c669
JH
8190 for (value = 0; value < 128; value++)
8191 ANYOF_BITMAP_SET(ret, value);
8192#else /* EBCDIC */
ffbc6a93 8193 for (value = 0; value < 256; value++) {
3a3c4447
JH
8194 if (isASCII(value))
8195 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8196 }
1ba5c669 8197#endif /* EBCDIC */
73b437c8 8198 }
c49a72a9
NC
8199 yesno = '+';
8200 what = "ASCII";
73b437c8
JH
8201 break;
8202 case ANYOF_NASCII:
8203 if (LOC)
936ed897 8204 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8205 else {
c7f1f016 8206#ifndef EBCDIC
1ba5c669
JH
8207 for (value = 128; value < 256; value++)
8208 ANYOF_BITMAP_SET(ret, value);
8209#else /* EBCDIC */
ffbc6a93 8210 for (value = 0; value < 256; value++) {
3a3c4447
JH
8211 if (!isASCII(value))
8212 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8213 }
1ba5c669 8214#endif /* EBCDIC */
73b437c8 8215 }
c49a72a9
NC
8216 yesno = '!';
8217 what = "ASCII";
89836f1f 8218 break;
ffc61ed2
JH
8219 case ANYOF_DIGIT:
8220 if (LOC)
8221 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8222 else {
8223 /* consecutive digits assumed */
8224 for (value = '0'; value <= '9'; value++)
8225 ANYOF_BITMAP_SET(ret, value);
8226 }
c49a72a9 8227 yesno = '+';
da7fcca4 8228 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8229 break;
8230 case ANYOF_NDIGIT:
8231 if (LOC)
8232 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8233 else {
8234 /* consecutive digits assumed */
8235 for (value = 0; value < '0'; value++)
8236 ANYOF_BITMAP_SET(ret, value);
8237 for (value = '9' + 1; value < 256; value++)
8238 ANYOF_BITMAP_SET(ret, value);
8239 }
c49a72a9 8240 yesno = '!';
da7fcca4 8241 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8242 break;
f81125e2
JP
8243 case ANYOF_MAX:
8244 /* this is to handle \p and \P */
8245 break;
73b437c8 8246 default:
b45f050a 8247 vFAIL("Invalid [::] class");
73b437c8 8248 break;
b8c5462f 8249 }
c49a72a9
NC
8250 if (what) {
8251 /* Strings such as "+utf8::isWord\n" */
8252 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8253 }
b8c5462f 8254 if (LOC)
936ed897 8255 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 8256 continue;
a0d0e21e 8257 }
ffc61ed2
JH
8258 } /* end of namedclass \blah */
8259
a0d0e21e 8260 if (range) {
eb160463 8261 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8262 const int w = RExC_parse - rangebegin;
8263 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8264 range = 0; /* not a valid range */
73b437c8 8265 }
a0d0e21e
LW
8266 }
8267 else {
3568d838 8268 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8269 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8270 RExC_parse[1] != ']') {
8271 RExC_parse++;
ffc61ed2
JH
8272
8273 /* a bad range like \w-, [:word:]- ? */
8274 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8275 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8276 const int w =
afd78fd5
JH
8277 RExC_parse >= rangebegin ?
8278 RExC_parse - rangebegin : 0;
830247a4 8279 vWARN4(RExC_parse,
b45f050a 8280 "False [] range \"%*.*s\"",
097eb12c 8281 w, w, rangebegin);
afd78fd5 8282 }
73b437c8 8283 if (!SIZE_ONLY)
936ed897 8284 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8285 } else
ffc61ed2
JH
8286 range = 1; /* yeah, it's a range! */
8287 continue; /* but do it the next time */
a0d0e21e 8288 }
a687059c 8289 }
ffc61ed2 8290
93733859 8291 /* now is the next time */
07be1b83 8292 /*stored += (value - prevvalue + 1);*/
ae5c130c 8293 if (!SIZE_ONLY) {
3568d838 8294 if (prevvalue < 256) {
1df70142 8295 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8296 IV i;
3568d838 8297#ifdef EBCDIC
1b2d223b
JH
8298 /* In EBCDIC [\x89-\x91] should include
8299 * the \x8e but [i-j] should not. */
8300 if (literal_endpoint == 2 &&
8301 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8302 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8303 {
3568d838
JH
8304 if (isLOWER(prevvalue)) {
8305 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8306 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8307 stored++;
ffc61ed2 8308 ANYOF_BITMAP_SET(ret, i);
2670d666 8309 }
ffc61ed2 8310 } else {
3568d838 8311 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8312 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8313 stored++;
ffc61ed2 8314 ANYOF_BITMAP_SET(ret, i);
2670d666 8315 }
ffc61ed2 8316 }
8ada0baa 8317 }
ffc61ed2 8318 else
8ada0baa 8319#endif
07be1b83
YO
8320 for (i = prevvalue; i <= ceilvalue; i++) {
8321 if (!ANYOF_BITMAP_TEST(ret,i)) {
8322 stored++;
8323 ANYOF_BITMAP_SET(ret, i);
8324 }
8325 }
3568d838 8326 }
a5961de5 8327 if (value > 255 || UTF) {
1df70142
AL
8328 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8329 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8330 stored+=2; /* can't optimize this class */
ffc61ed2 8331 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8332 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8333 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8334 prevnatvalue, natvalue);
8335 }
8336 else if (prevnatvalue == natvalue) {
8337 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8338 if (FOLD) {
89ebb4a3 8339 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8340 STRLEN foldlen;
1df70142 8341 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8342
e294cc5d
JH
8343#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8344 if (RExC_precomp[0] == ':' &&
8345 RExC_precomp[1] == '[' &&
8346 (f == 0xDF || f == 0x92)) {
8347 f = NATIVE_TO_UNI(f);
8348 }
8349#endif
c840d2a2
JH
8350 /* If folding and foldable and a single
8351 * character, insert also the folded version
8352 * to the charclass. */
9e55ce06 8353 if (f != value) {
e294cc5d
JH
8354#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8355 if ((RExC_precomp[0] == ':' &&
8356 RExC_precomp[1] == '[' &&
8357 (f == 0xA2 &&
8358 (value == 0xFB05 || value == 0xFB06))) ?
8359 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8360 foldlen == (STRLEN)UNISKIP(f) )
8361#else
eb160463 8362 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8363#endif
9e55ce06
JH
8364 Perl_sv_catpvf(aTHX_ listsv,
8365 "%04"UVxf"\n", f);
8366 else {
8367 /* Any multicharacter foldings
8368 * require the following transform:
8369 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8370 * where E folds into "pq" and F folds
8371 * into "rst", all other characters
8372 * fold to single characters. We save
8373 * away these multicharacter foldings,
8374 * to be later saved as part of the
8375 * additional "s" data. */
8376 SV *sv;
8377
8378 if (!unicode_alternate)
8379 unicode_alternate = newAV();
740cce10
NC
8380 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8381 TRUE);
9e55ce06
JH
8382 av_push(unicode_alternate, sv);
8383 }
8384 }
254ba52a 8385
60a8b682
JH
8386 /* If folding and the value is one of the Greek
8387 * sigmas insert a few more sigmas to make the
8388 * folding rules of the sigmas to work right.
8389 * Note that not all the possible combinations
8390 * are handled here: some of them are handled
9e55ce06
JH
8391 * by the standard folding rules, and some of
8392 * them (literal or EXACTF cases) are handled
8393 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8394 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8395 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8396 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8397 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8398 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8399 }
8400 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8401 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8402 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8403 }
8404 }
ffc61ed2 8405 }
1b2d223b
JH
8406#ifdef EBCDIC
8407 literal_endpoint = 0;
8408#endif
8ada0baa 8409 }
ffc61ed2
JH
8410
8411 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8412 }
ffc61ed2 8413
936ed897 8414 if (need_class) {
4f66b38d 8415 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 8416 if (SIZE_ONLY)
830247a4 8417 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 8418 else
830247a4 8419 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 8420 }
ffc61ed2 8421
7f6f358c
YO
8422
8423 if (SIZE_ONLY)
8424 return ret;
8425 /****** !SIZE_ONLY AFTER HERE *********/
8426
3b57cd43 8427 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8428 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8429 ) {
8430 /* optimize single char class to an EXACT node
8431 but *only* when its not a UTF/high char */
07be1b83
YO
8432 const char * cur_parse= RExC_parse;
8433 RExC_emit = (regnode *)orig_emit;
8434 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8435 ret = reg_node(pRExC_state,
8436 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8437 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8438 *STRING(ret)= (char)value;
8439 STR_LEN(ret)= 1;
8440 RExC_emit += STR_SZ(1);
ef8d46e8 8441 SvREFCNT_dec(listsv);
7f6f358c
YO
8442 return ret;
8443 }
ae5c130c 8444 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8445 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8446 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8447 ) {
a0ed51b3 8448 for (value = 0; value < 256; ++value) {
936ed897 8449 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8450 UV fold = PL_fold[value];
ffc61ed2
JH
8451
8452 if (fold != value)
8453 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8454 }
8455 }
936ed897 8456 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8457 }
ffc61ed2 8458
ae5c130c 8459 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8460 if (optimize_invert &&
ffc61ed2
JH
8461 /* If the only flag is inversion. */
8462 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8463 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8464 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8465 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8466 }
7f6f358c 8467 {
097eb12c 8468 AV * const av = newAV();
ffc61ed2 8469 SV *rv;
9e55ce06 8470 /* The 0th element stores the character class description
6a0407ee 8471 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8472 * to initialize the appropriate swash (which gets stored in
8473 * the 1st element), and also useful for dumping the regnode.
8474 * The 2nd element stores the multicharacter foldings,
6a0407ee 8475 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8476 av_store(av, 0, listsv);
8477 av_store(av, 1, NULL);
ad64d0ec
NC
8478 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8479 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8480 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8481 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8482 ARG_SET(ret, n);
a0ed51b3 8483 }
a0ed51b3
LW
8484 return ret;
8485}
89836f1f
YO
8486#undef _C_C_T_
8487
a0ed51b3 8488
bcdf7404
YO
8489/* reg_skipcomment()
8490
8491 Absorbs an /x style # comments from the input stream.
8492 Returns true if there is more text remaining in the stream.
8493 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8494 terminates the pattern without including a newline.
8495
8496 Note its the callers responsibility to ensure that we are
8497 actually in /x mode
8498
8499*/
8500
8501STATIC bool
8502S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8503{
8504 bool ended = 0;
7918f24d
NC
8505
8506 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8507
bcdf7404
YO
8508 while (RExC_parse < RExC_end)
8509 if (*RExC_parse++ == '\n') {
8510 ended = 1;
8511 break;
8512 }
8513 if (!ended) {
8514 /* we ran off the end of the pattern without ending
8515 the comment, so we have to add an \n when wrapping */
8516 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8517 return 0;
8518 } else
8519 return 1;
8520}
8521
8522/* nextchar()
8523
8524 Advance that parse position, and optionall absorbs
8525 "whitespace" from the inputstream.
8526
8527 Without /x "whitespace" means (?#...) style comments only,
8528 with /x this means (?#...) and # comments and whitespace proper.
8529
8530 Returns the RExC_parse point from BEFORE the scan occurs.
8531
8532 This is the /x friendly way of saying RExC_parse++.
8533*/
8534
76e3520e 8535STATIC char*
830247a4 8536S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8537{
097eb12c 8538 char* const retval = RExC_parse++;
a0d0e21e 8539
7918f24d
NC
8540 PERL_ARGS_ASSERT_NEXTCHAR;
8541
4633a7c4 8542 for (;;) {
830247a4
IZ
8543 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8544 RExC_parse[2] == '#') {
e994fd66
AE
8545 while (*RExC_parse != ')') {
8546 if (RExC_parse == RExC_end)
8547 FAIL("Sequence (?#... not terminated");
830247a4 8548 RExC_parse++;
e994fd66 8549 }
830247a4 8550 RExC_parse++;
4633a7c4
LW
8551 continue;
8552 }
bbe252da 8553 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8554 if (isSPACE(*RExC_parse)) {
8555 RExC_parse++;
748a9306
LW
8556 continue;
8557 }
830247a4 8558 else if (*RExC_parse == '#') {
bcdf7404
YO
8559 if ( reg_skipcomment( pRExC_state ) )
8560 continue;
748a9306 8561 }
748a9306 8562 }
4633a7c4 8563 return retval;
a0d0e21e 8564 }
a687059c
LW
8565}
8566
8567/*
c277df42 8568- reg_node - emit a node
a0d0e21e 8569*/
76e3520e 8570STATIC regnode * /* Location. */
830247a4 8571S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8572{
97aff369 8573 dVAR;
c277df42 8574 register regnode *ptr;
504618e9 8575 regnode * const ret = RExC_emit;
07be1b83 8576 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8577
7918f24d
NC
8578 PERL_ARGS_ASSERT_REG_NODE;
8579
c277df42 8580 if (SIZE_ONLY) {
830247a4
IZ
8581 SIZE_ALIGN(RExC_size);
8582 RExC_size += 1;
a0d0e21e
LW
8583 return(ret);
8584 }
3b57cd43
YO
8585 if (RExC_emit >= RExC_emit_bound)
8586 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8587
c277df42 8588 NODE_ALIGN_FILL(ret);
a0d0e21e 8589 ptr = ret;
c277df42 8590 FILL_ADVANCE_NODE(ptr, op);
7122b237 8591#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8592 if (RExC_offsets) { /* MJD */
07be1b83 8593 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8594 "reg_node", __LINE__,
13d6edb4 8595 PL_reg_name[op],
07be1b83
YO
8596 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8597 ? "Overwriting end of array!\n" : "OK",
8598 (UV)(RExC_emit - RExC_emit_start),
8599 (UV)(RExC_parse - RExC_start),
8600 (UV)RExC_offsets[0]));
ccb2c380 8601 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8602 }
7122b237 8603#endif
830247a4 8604 RExC_emit = ptr;
a0d0e21e 8605 return(ret);
a687059c
LW
8606}
8607
8608/*
a0d0e21e
LW
8609- reganode - emit a node with an argument
8610*/
76e3520e 8611STATIC regnode * /* Location. */
830247a4 8612S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8613{
97aff369 8614 dVAR;
c277df42 8615 register regnode *ptr;
504618e9 8616 regnode * const ret = RExC_emit;
07be1b83 8617 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8618
7918f24d
NC
8619 PERL_ARGS_ASSERT_REGANODE;
8620
c277df42 8621 if (SIZE_ONLY) {
830247a4
IZ
8622 SIZE_ALIGN(RExC_size);
8623 RExC_size += 2;
6bda09f9
YO
8624 /*
8625 We can't do this:
8626
8627 assert(2==regarglen[op]+1);
8628
8629 Anything larger than this has to allocate the extra amount.
8630 If we changed this to be:
8631
8632 RExC_size += (1 + regarglen[op]);
8633
8634 then it wouldn't matter. Its not clear what side effect
8635 might come from that so its not done so far.
8636 -- dmq
8637 */
a0d0e21e
LW
8638 return(ret);
8639 }
3b57cd43
YO
8640 if (RExC_emit >= RExC_emit_bound)
8641 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8642
c277df42 8643 NODE_ALIGN_FILL(ret);
a0d0e21e 8644 ptr = ret;
c277df42 8645 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8646#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8647 if (RExC_offsets) { /* MJD */
07be1b83 8648 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8649 "reganode",
ccb2c380 8650 __LINE__,
13d6edb4 8651 PL_reg_name[op],
07be1b83 8652 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8653 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8654 (UV)(RExC_emit - RExC_emit_start),
8655 (UV)(RExC_parse - RExC_start),
8656 (UV)RExC_offsets[0]));
ccb2c380 8657 Set_Cur_Node_Offset;
fac92740 8658 }
7122b237 8659#endif
830247a4 8660 RExC_emit = ptr;
a0d0e21e 8661 return(ret);
fe14fcc3
LW
8662}
8663
8664/*
cd439c50 8665- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8666*/
71207a34
AL
8667STATIC STRLEN
8668S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8669{
97aff369 8670 dVAR;
7918f24d
NC
8671
8672 PERL_ARGS_ASSERT_REGUNI;
8673
71207a34 8674 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8675}
8676
8677/*
a0d0e21e
LW
8678- reginsert - insert an operator in front of already-emitted operand
8679*
8680* Means relocating the operand.
8681*/
76e3520e 8682STATIC void
6bda09f9 8683S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8684{
97aff369 8685 dVAR;
c277df42
IZ
8686 register regnode *src;
8687 register regnode *dst;
8688 register regnode *place;
504618e9 8689 const int offset = regarglen[(U8)op];
6bda09f9 8690 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8691 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8692
8693 PERL_ARGS_ASSERT_REGINSERT;
def51078 8694 PERL_UNUSED_ARG(depth);
22c35a8c 8695/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8696 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8697 if (SIZE_ONLY) {
6bda09f9 8698 RExC_size += size;
a0d0e21e
LW
8699 return;
8700 }
a687059c 8701
830247a4 8702 src = RExC_emit;
6bda09f9 8703 RExC_emit += size;
830247a4 8704 dst = RExC_emit;
40d049e4 8705 if (RExC_open_parens) {
6bda09f9 8706 int paren;
3b57cd43 8707 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 8708 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 8709 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 8710 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
8711 RExC_open_parens[paren] += size;
8712 } else {
3b57cd43 8713 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
8714 }
8715 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 8716 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
8717 RExC_close_parens[paren] += size;
8718 } else {
3b57cd43 8719 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
8720 }
8721 }
6bda09f9 8722 }
40d049e4 8723
fac92740 8724 while (src > opnd) {
c277df42 8725 StructCopy(--src, --dst, regnode);
7122b237 8726#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8727 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8728 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8729 "reg_insert",
ccb2c380 8730 __LINE__,
13d6edb4 8731 PL_reg_name[op],
07be1b83
YO
8732 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8733 ? "Overwriting end of array!\n" : "OK",
8734 (UV)(src - RExC_emit_start),
8735 (UV)(dst - RExC_emit_start),
8736 (UV)RExC_offsets[0]));
ccb2c380
MP
8737 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8738 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8739 }
7122b237 8740#endif
fac92740
MJD
8741 }
8742
a0d0e21e
LW
8743
8744 place = opnd; /* Op node, where operand used to be. */
7122b237 8745#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8746 if (RExC_offsets) { /* MJD */
07be1b83 8747 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8748 "reginsert",
ccb2c380 8749 __LINE__,
13d6edb4 8750 PL_reg_name[op],
07be1b83 8751 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8752 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8753 (UV)(place - RExC_emit_start),
8754 (UV)(RExC_parse - RExC_start),
786e8c11 8755 (UV)RExC_offsets[0]));
ccb2c380 8756 Set_Node_Offset(place, RExC_parse);
45948336 8757 Set_Node_Length(place, 1);
fac92740 8758 }
7122b237 8759#endif
c277df42
IZ
8760 src = NEXTOPER(place);
8761 FILL_ADVANCE_NODE(place, op);
8762 Zero(src, offset, regnode);
a687059c
LW
8763}
8764
8765/*
c277df42 8766- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8767- SEE ALSO: regtail_study
a0d0e21e 8768*/
097eb12c 8769/* TODO: All three parms should be const */
76e3520e 8770STATIC void
3dab1dad 8771S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8772{
97aff369 8773 dVAR;
c277df42 8774 register regnode *scan;
72f13be8 8775 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8776
8777 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
8778#ifndef DEBUGGING
8779 PERL_UNUSED_ARG(depth);
8780#endif
a0d0e21e 8781
c277df42 8782 if (SIZE_ONLY)
a0d0e21e
LW
8783 return;
8784
8785 /* Find last node. */
8786 scan = p;
8787 for (;;) {
504618e9 8788 regnode * const temp = regnext(scan);
3dab1dad
YO
8789 DEBUG_PARSE_r({
8790 SV * const mysv=sv_newmortal();
8791 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8792 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8793 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8794 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8795 (temp == NULL ? "->" : ""),
13d6edb4 8796 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 8797 );
3dab1dad
YO
8798 });
8799 if (temp == NULL)
8800 break;
8801 scan = temp;
8802 }
8803
8804 if (reg_off_by_arg[OP(scan)]) {
8805 ARG_SET(scan, val - scan);
8806 }
8807 else {
8808 NEXT_OFF(scan) = val - scan;
8809 }
8810}
8811
07be1b83 8812#ifdef DEBUGGING
3dab1dad
YO
8813/*
8814- regtail_study - set the next-pointer at the end of a node chain of p to val.
8815- Look for optimizable sequences at the same time.
8816- currently only looks for EXACT chains.
07be1b83
YO
8817
8818This is expermental code. The idea is to use this routine to perform
8819in place optimizations on branches and groups as they are constructed,
8820with the long term intention of removing optimization from study_chunk so
8821that it is purely analytical.
8822
8823Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8824to control which is which.
8825
3dab1dad
YO
8826*/
8827/* TODO: All four parms should be const */
07be1b83 8828
3dab1dad
YO
8829STATIC U8
8830S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8831{
8832 dVAR;
8833 register regnode *scan;
07be1b83
YO
8834 U8 exact = PSEUDO;
8835#ifdef EXPERIMENTAL_INPLACESCAN
8836 I32 min = 0;
8837#endif
3dab1dad
YO
8838 GET_RE_DEBUG_FLAGS_DECL;
8839
7918f24d
NC
8840 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8841
07be1b83 8842
3dab1dad
YO
8843 if (SIZE_ONLY)
8844 return exact;
8845
8846 /* Find last node. */
8847
8848 scan = p;
8849 for (;;) {
8850 regnode * const temp = regnext(scan);
07be1b83
YO
8851#ifdef EXPERIMENTAL_INPLACESCAN
8852 if (PL_regkind[OP(scan)] == EXACT)
8853 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8854 return EXACT;
8855#endif
3dab1dad
YO
8856 if ( exact ) {
8857 switch (OP(scan)) {
8858 case EXACT:
8859 case EXACTF:
8860 case EXACTFL:
8861 if( exact == PSEUDO )
8862 exact= OP(scan);
07be1b83
YO
8863 else if ( exact != OP(scan) )
8864 exact= 0;
3dab1dad
YO
8865 case NOTHING:
8866 break;
8867 default:
8868 exact= 0;
8869 }
8870 }
8871 DEBUG_PARSE_r({
8872 SV * const mysv=sv_newmortal();
8873 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8874 regprop(RExC_rx, mysv, scan);
eaf3ca90 8875 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8876 SvPV_nolen_const(mysv),
eaf3ca90 8877 REG_NODE_NUM(scan),
13d6edb4 8878 PL_reg_name[exact]);
3dab1dad 8879 });
a0d0e21e
LW
8880 if (temp == NULL)
8881 break;
8882 scan = temp;
8883 }
07be1b83
YO
8884 DEBUG_PARSE_r({
8885 SV * const mysv_val=sv_newmortal();
8886 DEBUG_PARSE_MSG("");
8887 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8888 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8889 SvPV_nolen_const(mysv_val),
8890 (IV)REG_NODE_NUM(val),
8891 (IV)(val - scan)
07be1b83
YO
8892 );
8893 });
c277df42
IZ
8894 if (reg_off_by_arg[OP(scan)]) {
8895 ARG_SET(scan, val - scan);
a0ed51b3
LW
8896 }
8897 else {
c277df42
IZ
8898 NEXT_OFF(scan) = val - scan;
8899 }
3dab1dad
YO
8900
8901 return exact;
a687059c 8902}
07be1b83 8903#endif
a687059c
LW
8904
8905/*
a687059c
LW
8906 - regcurly - a little FSA that accepts {\d+,?\d*}
8907 */
79072805 8908STATIC I32
5f66b61c 8909S_regcurly(register const char *s)
a687059c 8910{
7918f24d
NC
8911 PERL_ARGS_ASSERT_REGCURLY;
8912
a687059c
LW
8913 if (*s++ != '{')
8914 return FALSE;
f0fcb552 8915 if (!isDIGIT(*s))
a687059c 8916 return FALSE;
f0fcb552 8917 while (isDIGIT(*s))
a687059c
LW
8918 s++;
8919 if (*s == ',')
8920 s++;
f0fcb552 8921 while (isDIGIT(*s))
a687059c
LW
8922 s++;
8923 if (*s != '}')
8924 return FALSE;
8925 return TRUE;
8926}
8927
a687059c
LW
8928
8929/*
fd181c75 8930 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 8931 */
f7819f85 8932#ifdef DEBUGGING
c33269f7 8933static void
7918f24d
NC
8934S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8935{
f7819f85
A
8936 int bit;
8937 int set=0;
7918f24d 8938
f7819f85
A
8939 for (bit=0; bit<32; bit++) {
8940 if (flags & (1<<bit)) {
8941 if (!set++ && lead)
8942 PerlIO_printf(Perl_debug_log, "%s",lead);
8943 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8944 }
8945 }
8946 if (lead) {
8947 if (set)
8948 PerlIO_printf(Perl_debug_log, "\n");
8949 else
8950 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8951 }
8952}
8953#endif
8954
a687059c 8955void
097eb12c 8956Perl_regdump(pTHX_ const regexp *r)
a687059c 8957{
35ff7856 8958#ifdef DEBUGGING
97aff369 8959 dVAR;
c445ea15 8960 SV * const sv = sv_newmortal();
ab3bbdeb 8961 SV *dsv= sv_newmortal();
f8fc2ecf 8962 RXi_GET_DECL(r,ri);
f7819f85 8963 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8964
7918f24d
NC
8965 PERL_ARGS_ASSERT_REGDUMP;
8966
f8fc2ecf 8967 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8968
8969 /* Header fields of interest. */
ab3bbdeb
YO
8970 if (r->anchored_substr) {
8971 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8972 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8973 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8974 "anchored %s%s at %"IVdf" ",
8975 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8976 (IV)r->anchored_offset);
ab3bbdeb
YO
8977 } else if (r->anchored_utf8) {
8978 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8979 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8980 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8981 "anchored utf8 %s%s at %"IVdf" ",
8982 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8983 (IV)r->anchored_offset);
ab3bbdeb
YO
8984 }
8985 if (r->float_substr) {
8986 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8987 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8988 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8989 "floating %s%s at %"IVdf"..%"UVuf" ",
8990 s, RE_SV_TAIL(r->float_substr),
7b0972df 8991 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8992 } else if (r->float_utf8) {
8993 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8994 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8995 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8996 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8997 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8998 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8999 }
33b8afdf 9000 if (r->check_substr || r->check_utf8)
b81d288d 9001 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9002 (const char *)
9003 (r->check_substr == r->float_substr
9004 && r->check_utf8 == r->float_utf8
9005 ? "(checking floating" : "(checking anchored"));
bbe252da 9006 if (r->extflags & RXf_NOSCAN)
c277df42 9007 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9008 if (r->extflags & RXf_CHECK_ALL)
c277df42 9009 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9010 if (r->check_substr || r->check_utf8)
c277df42
IZ
9011 PerlIO_printf(Perl_debug_log, ") ");
9012
f8fc2ecf
YO
9013 if (ri->regstclass) {
9014 regprop(r, sv, ri->regstclass);
1de06328 9015 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9016 }
bbe252da 9017 if (r->extflags & RXf_ANCH) {
774d564b 9018 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9019 if (r->extflags & RXf_ANCH_BOL)
774d564b 9020 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9021 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9022 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9023 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9024 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9025 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9026 PerlIO_printf(Perl_debug_log, "(GPOS)");
9027 PerlIO_putc(Perl_debug_log, ' ');
9028 }
bbe252da 9029 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9030 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9031 if (r->intflags & PREGf_SKIP)
760ac839 9032 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9033 if (r->intflags & PREGf_IMPLICIT)
760ac839 9034 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9035 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9036 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9037 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9038 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9039 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9040#else
7918f24d 9041 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9042 PERL_UNUSED_CONTEXT;
65e66c80 9043 PERL_UNUSED_ARG(r);
17c3b450 9044#endif /* DEBUGGING */
a687059c
LW
9045}
9046
9047/*
a0d0e21e
LW
9048- regprop - printable representation of opcode
9049*/
3339dfd8
YO
9050#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9051STMT_START { \
9052 if (do_sep) { \
9053 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9054 if (flags & ANYOF_INVERT) \
9055 /*make sure the invert info is in each */ \
9056 sv_catpvs(sv, "^"); \
9057 do_sep = 0; \
9058 } \
9059} STMT_END
9060
46fc3d4c 9061void
32fc9b6a 9062Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9063{
35ff7856 9064#ifdef DEBUGGING
97aff369 9065 dVAR;
9b155405 9066 register int k;
f8fc2ecf 9067 RXi_GET_DECL(prog,progi);
1de06328 9068 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9069
7918f24d 9070 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9071
76f68e9b 9072 sv_setpvs(sv, "");
8aa23a47 9073
03363afd 9074 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9075 /* It would be nice to FAIL() here, but this may be called from
9076 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9077 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9078 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9079
3dab1dad 9080 k = PL_regkind[OP(o)];
9b155405 9081
2a782b5b 9082 if (k == EXACT) {
f92a2122 9083 sv_catpvs(sv, " ");
ab3bbdeb
YO
9084 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9085 * is a crude hack but it may be the best for now since
9086 * we have no flag "this EXACTish node was UTF-8"
9087 * --jhi */
f92a2122
NC
9088 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9089 PERL_PV_ESCAPE_UNI_DETECT |
9090 PERL_PV_PRETTY_ELLIPSES |
9091 PERL_PV_PRETTY_LTGT |
9092 PERL_PV_PRETTY_NOCLEAR
9093 );
bb263b4e 9094 } else if (k == TRIE) {
3dab1dad 9095 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9096 * progi->data isn't available here */
1de06328 9097 const char op = OP(o);
647f639f 9098 const U32 n = ARG(o);
1de06328 9099 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9100 (reg_ac_data *)progi->data->data[n] :
1de06328 9101 NULL;
3251b653
NC
9102 const reg_trie_data * const trie
9103 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9104
13d6edb4 9105 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9106 DEBUG_TRIE_COMPILE_r(
9107 Perl_sv_catpvf(aTHX_ sv,
9108 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9109 (UV)trie->startstate,
1e2e3d02 9110 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9111 (UV)trie->wordcount,
9112 (UV)trie->minlen,
9113 (UV)trie->maxlen,
9114 (UV)TRIE_CHARCOUNT(trie),
9115 (UV)trie->uniquecharcount
9116 )
9117 );
9118 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9119 int i;
9120 int rangestart = -1;
f46cb337 9121 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9122 sv_catpvs(sv, "[");
1de06328
YO
9123 for (i = 0; i <= 256; i++) {
9124 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9125 if (rangestart == -1)
9126 rangestart = i;
9127 } else if (rangestart != -1) {
9128 if (i <= rangestart + 3)
9129 for (; rangestart < i; rangestart++)
9130 put_byte(sv, rangestart);
9131 else {
9132 put_byte(sv, rangestart);
9133 sv_catpvs(sv, "-");
9134 put_byte(sv, i - 1);
9135 }
9136 rangestart = -1;
9137 }
9138 }
f3a2811a 9139 sv_catpvs(sv, "]");
1de06328
YO
9140 }
9141
a3621e74 9142 } else if (k == CURLY) {
cb434fcc 9143 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9144 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9145 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9146 }
2c2d71f5
JH
9147 else if (k == WHILEM && o->flags) /* Ordinal/of */
9148 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9149 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9150 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9151 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9152 if ( k != REF || OP(o) < NREF) {
502c6561 9153 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9154 SV **name= av_fetch(list, ARG(o), 0 );
9155 if (name)
9156 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9157 }
9158 else {
502c6561 9159 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9160 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9161 I32 *nums=(I32*)SvPVX(sv_dat);
9162 SV **name= av_fetch(list, nums[0], 0 );
9163 I32 n;
9164 if (name) {
9165 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9166 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9167 (n ? "," : ""), (IV)nums[n]);
9168 }
9169 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9170 }
1f1031fe 9171 }
ee9b8eae 9172 }
1f1031fe 9173 } else if (k == GOSUB)
6bda09f9 9174 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9175 else if (k == VERB) {
9176 if (!o->flags)
9177 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9178 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9179 } else if (k == LOGICAL)
04ebc1ab 9180 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9181 else if (k == FOLDCHAR)
df44d732 9182 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9183 else if (k == ANYOF) {
9184 int i, rangestart = -1;
2d03de9c 9185 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9186 int do_sep = 0;
0bd48802
AL
9187
9188 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9189 static const char * const anyofs[] = {
653099ff
GS
9190 "\\w",
9191 "\\W",
9192 "\\s",
9193 "\\S",
9194 "\\d",
9195 "\\D",
9196 "[:alnum:]",
9197 "[:^alnum:]",
9198 "[:alpha:]",
9199 "[:^alpha:]",
9200 "[:ascii:]",
9201 "[:^ascii:]",
24d786f4
YO
9202 "[:cntrl:]",
9203 "[:^cntrl:]",
653099ff
GS
9204 "[:graph:]",
9205 "[:^graph:]",
9206 "[:lower:]",
9207 "[:^lower:]",
9208 "[:print:]",
9209 "[:^print:]",
9210 "[:punct:]",
9211 "[:^punct:]",
9212 "[:upper:]",
aaa51d5e 9213 "[:^upper:]",
653099ff 9214 "[:xdigit:]",
aaa51d5e
JF
9215 "[:^xdigit:]",
9216 "[:space:]",
9217 "[:^space:]",
9218 "[:blank:]",
9219 "[:^blank:]"
653099ff
GS
9220 };
9221
19860706 9222 if (flags & ANYOF_LOCALE)
396482e1 9223 sv_catpvs(sv, "{loc}");
19860706 9224 if (flags & ANYOF_FOLD)
396482e1 9225 sv_catpvs(sv, "{i}");
653099ff 9226 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9227 if (flags & ANYOF_INVERT)
396482e1 9228 sv_catpvs(sv, "^");
3339dfd8
YO
9229
9230 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9231 for (i = 0; i <= 256; i++) {
9232 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9233 if (rangestart == -1)
9234 rangestart = i;
9235 } else if (rangestart != -1) {
9236 if (i <= rangestart + 3)
9237 for (; rangestart < i; rangestart++)
653099ff 9238 put_byte(sv, rangestart);
ffc61ed2
JH
9239 else {
9240 put_byte(sv, rangestart);
396482e1 9241 sv_catpvs(sv, "-");
ffc61ed2 9242 put_byte(sv, i - 1);
653099ff 9243 }
24d786f4 9244 do_sep = 1;
ffc61ed2 9245 rangestart = -1;
653099ff 9246 }
847a199f 9247 }
3339dfd8
YO
9248
9249 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9250 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9251 if (o->flags & ANYOF_CLASS)
bb7a0f54 9252 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9253 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9254 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9255 do_sep = 1;
9256 }
9257
3339dfd8
YO
9258 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9259
9260 /* output information about the unicode matching */
ffc61ed2 9261 if (flags & ANYOF_UNICODE)
396482e1 9262 sv_catpvs(sv, "{unicode}");
1aa99e6b 9263 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9264 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9265
9266 {
9267 SV *lv;
32fc9b6a 9268 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9269
ffc61ed2
JH
9270 if (lv) {
9271 if (sw) {
89ebb4a3 9272 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9273
ffc61ed2 9274 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9275 uvchr_to_utf8(s, i);
ffc61ed2 9276
3568d838 9277 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9278 if (rangestart == -1)
9279 rangestart = i;
9280 } else if (rangestart != -1) {
ffc61ed2
JH
9281 if (i <= rangestart + 3)
9282 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9283 const U8 * const e = uvchr_to_utf8(s,rangestart);
9284 U8 *p;
9285 for(p = s; p < e; p++)
ffc61ed2
JH
9286 put_byte(sv, *p);
9287 }
9288 else {
2d03de9c
AL
9289 const U8 *e = uvchr_to_utf8(s,rangestart);
9290 U8 *p;
9291 for (p = s; p < e; p++)
ffc61ed2 9292 put_byte(sv, *p);
396482e1 9293 sv_catpvs(sv, "-");
2d03de9c
AL
9294 e = uvchr_to_utf8(s, i-1);
9295 for (p = s; p < e; p++)
1df70142 9296 put_byte(sv, *p);
ffc61ed2
JH
9297 }
9298 rangestart = -1;
9299 }
19860706 9300 }
ffc61ed2 9301
396482e1 9302 sv_catpvs(sv, "..."); /* et cetera */
19860706 9303 }
fde631ed 9304
ffc61ed2 9305 {
2e0de35c 9306 char *s = savesvpv(lv);
c445ea15 9307 char * const origs = s;
b81d288d 9308
3dab1dad
YO
9309 while (*s && *s != '\n')
9310 s++;
b81d288d 9311
ffc61ed2 9312 if (*s == '\n') {
2d03de9c 9313 const char * const t = ++s;
ffc61ed2
JH
9314
9315 while (*s) {
9316 if (*s == '\n')
9317 *s = ' ';
9318 s++;
9319 }
9320 if (s[-1] == ' ')
9321 s[-1] = 0;
9322
9323 sv_catpv(sv, t);
fde631ed 9324 }
b81d288d 9325
ffc61ed2 9326 Safefree(origs);
fde631ed
JH
9327 }
9328 }
653099ff 9329 }
ffc61ed2 9330
653099ff
GS
9331 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9332 }
9b155405 9333 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9334 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9335#else
96a5add6 9336 PERL_UNUSED_CONTEXT;
65e66c80
SP
9337 PERL_UNUSED_ARG(sv);
9338 PERL_UNUSED_ARG(o);
f9049ba1 9339 PERL_UNUSED_ARG(prog);
17c3b450 9340#endif /* DEBUGGING */
35ff7856 9341}
a687059c 9342
cad2e5aa 9343SV *
288b8c02 9344Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9345{ /* Assume that RE_INTUIT is set */
97aff369 9346 dVAR;
288b8c02 9347 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9348 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9349
9350 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9351 PERL_UNUSED_CONTEXT;
9352
a3621e74 9353 DEBUG_COMPILE_r(
cfd0369c 9354 {
2d03de9c 9355 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9356 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9357
9358 if (!PL_colorset) reginitcolors();
9359 PerlIO_printf(Perl_debug_log,
a0288114 9360 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9361 PL_colors[4],
9362 prog->check_substr ? "" : "utf8 ",
9363 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9364 s,
9365 PL_colors[1],
9366 (strlen(s) > 60 ? "..." : ""));
9367 } );
9368
33b8afdf 9369 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9370}
9371
84da74a7 9372/*
f8149455 9373 pregfree()
84da74a7 9374
f8149455
YO
9375 handles refcounting and freeing the perl core regexp structure. When
9376 it is necessary to actually free the structure the first thing it
9377 does is call the 'free' method of the regexp_engine associated to to
9378 the regexp, allowing the handling of the void *pprivate; member
9379 first. (This routine is not overridable by extensions, which is why
9380 the extensions free is called first.)
9381
9382 See regdupe and regdupe_internal if you change anything here.
84da74a7 9383*/
f8149455 9384#ifndef PERL_IN_XSUB_RE
2b69d0c2 9385void
84679df5 9386Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9387{
288b8c02
NC
9388 SvREFCNT_dec(r);
9389}
9390
9391void
9392Perl_pregfree2(pTHX_ REGEXP *rx)
9393{
27da23d5 9394 dVAR;
288b8c02 9395 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9396 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9397
7918f24d
NC
9398 PERL_ARGS_ASSERT_PREGFREE2;
9399
28d8d7f4
YO
9400 if (r->mother_re) {
9401 ReREFCNT_dec(r->mother_re);
9402 } else {
288b8c02 9403 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9404 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9405 }
9406 if (r->substrs) {
ef8d46e8
VP
9407 SvREFCNT_dec(r->anchored_substr);
9408 SvREFCNT_dec(r->anchored_utf8);
9409 SvREFCNT_dec(r->float_substr);
9410 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9411 Safefree(r->substrs);
9412 }
288b8c02 9413 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9414#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9415 SvREFCNT_dec(r->saved_copy);
ed252734 9416#endif
f0ab9afb 9417 Safefree(r->offs);
f8149455 9418}
28d8d7f4
YO
9419
9420/* reg_temp_copy()
9421
9422 This is a hacky workaround to the structural issue of match results
9423 being stored in the regexp structure which is in turn stored in
9424 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9425 could be PL_curpm in multiple contexts, and could require multiple
9426 result sets being associated with the pattern simultaneously, such
9427 as when doing a recursive match with (??{$qr})
9428
9429 The solution is to make a lightweight copy of the regexp structure
9430 when a qr// is returned from the code executed by (??{$qr}) this
9431 lightweight copy doesnt actually own any of its data except for
9432 the starp/end and the actual regexp structure itself.
9433
9434*/
9435
9436
84679df5 9437REGEXP *
f0826785 9438Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9439{
f0826785 9440 struct regexp *ret;
288b8c02 9441 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9442 register const I32 npar = r->nparens+1;
7918f24d
NC
9443
9444 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9445
f0826785
BM
9446 if (!ret_x)
9447 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9448 ret = (struct regexp *)SvANY(ret_x);
9449
288b8c02 9450 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9451 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9452 by pointing directly at the buffer, but flagging that the allocated
9453 space in the copy is zero. As we've just done a struct copy, it's now
9454 a case of zero-ing that, rather than copying the current length. */
9455 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9456 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9457 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9458 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9459 SvLEN_set(ret_x, 0);
f0ab9afb
NC
9460 Newx(ret->offs, npar, regexp_paren_pair);
9461 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9462 if (r->substrs) {
28d8d7f4 9463 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9464 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9465
9466 SvREFCNT_inc_void(ret->anchored_substr);
9467 SvREFCNT_inc_void(ret->anchored_utf8);
9468 SvREFCNT_inc_void(ret->float_substr);
9469 SvREFCNT_inc_void(ret->float_utf8);
9470
9471 /* check_substr and check_utf8, if non-NULL, point to either their
9472 anchored or float namesakes, and don't hold a second reference. */
486913e4 9473 }
288b8c02 9474 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9475#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9476 ret->saved_copy = NULL;
28d8d7f4 9477#endif
288b8c02 9478 ret->mother_re = rx;
28d8d7f4 9479
288b8c02 9480 return ret_x;
28d8d7f4 9481}
f8149455
YO
9482#endif
9483
9484/* regfree_internal()
9485
9486 Free the private data in a regexp. This is overloadable by
9487 extensions. Perl takes care of the regexp structure in pregfree(),
9488 this covers the *pprivate pointer which technically perldoesnt
9489 know about, however of course we have to handle the
9490 regexp_internal structure when no extension is in use.
9491
9492 Note this is called before freeing anything in the regexp
9493 structure.
9494 */
9495
9496void
288b8c02 9497Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9498{
9499 dVAR;
288b8c02 9500 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9501 RXi_GET_DECL(r,ri);
9502 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9503
9504 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9505
f8149455
YO
9506 DEBUG_COMPILE_r({
9507 if (!PL_colorset)
9508 reginitcolors();
9509 {
9510 SV *dsv= sv_newmortal();
3c8556c3 9511 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9512 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9513 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9514 PL_colors[4],PL_colors[5],s);
9515 }
9516 });
7122b237
YO
9517#ifdef RE_TRACK_PATTERN_OFFSETS
9518 if (ri->u.offsets)
9519 Safefree(ri->u.offsets); /* 20010421 MJD */
9520#endif
f8fc2ecf
YO
9521 if (ri->data) {
9522 int n = ri->data->count;
f3548bdc
DM
9523 PAD* new_comppad = NULL;
9524 PAD* old_comppad;
4026c95a 9525 PADOFFSET refcnt;
dfad63ad 9526
c277df42 9527 while (--n >= 0) {
261faec3 9528 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9529 switch (ri->data->what[n]) {
c277df42 9530 case 's':
81714fb9 9531 case 'S':
55eed653 9532 case 'u':
ad64d0ec 9533 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9534 break;
653099ff 9535 case 'f':
f8fc2ecf 9536 Safefree(ri->data->data[n]);
653099ff 9537 break;
dfad63ad 9538 case 'p':
502c6561 9539 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9540 break;
c277df42 9541 case 'o':
dfad63ad 9542 if (new_comppad == NULL)
cea2e8a9 9543 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9544 PAD_SAVE_LOCAL(old_comppad,
9545 /* Watch out for global destruction's random ordering. */
c445ea15 9546 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9547 );
b34c0dd4 9548 OP_REFCNT_LOCK;
f8fc2ecf 9549 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9550 OP_REFCNT_UNLOCK;
9551 if (!refcnt)
f8fc2ecf 9552 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9553
f3548bdc 9554 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9555 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9556 new_comppad = NULL;
c277df42
IZ
9557 break;
9558 case 'n':
9e55ce06 9559 break;
07be1b83 9560 case 'T':
be8e71aa
YO
9561 { /* Aho Corasick add-on structure for a trie node.
9562 Used in stclass optimization only */
07be1b83 9563 U32 refcount;
f8fc2ecf 9564 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9565 OP_REFCNT_LOCK;
9566 refcount = --aho->refcount;
9567 OP_REFCNT_UNLOCK;
9568 if ( !refcount ) {
446bd890
NC
9569 PerlMemShared_free(aho->states);
9570 PerlMemShared_free(aho->fail);
446bd890
NC
9571 /* do this last!!!! */
9572 PerlMemShared_free(ri->data->data[n]);
9573 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9574 }
9575 }
9576 break;
a3621e74 9577 case 't':
07be1b83 9578 {
be8e71aa 9579 /* trie structure. */
07be1b83 9580 U32 refcount;
f8fc2ecf 9581 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9582 OP_REFCNT_LOCK;
9583 refcount = --trie->refcount;
9584 OP_REFCNT_UNLOCK;
9585 if ( !refcount ) {
446bd890 9586 PerlMemShared_free(trie->charmap);
446bd890
NC
9587 PerlMemShared_free(trie->states);
9588 PerlMemShared_free(trie->trans);
07be1b83 9589 if (trie->bitmap)
446bd890 9590 PerlMemShared_free(trie->bitmap);
07be1b83 9591 if (trie->wordlen)
446bd890 9592 PerlMemShared_free(trie->wordlen);
786e8c11 9593 if (trie->jump)
446bd890 9594 PerlMemShared_free(trie->jump);
786e8c11 9595 if (trie->nextword)
446bd890 9596 PerlMemShared_free(trie->nextword);
446bd890
NC
9597 /* do this last!!!! */
9598 PerlMemShared_free(ri->data->data[n]);
a3621e74 9599 }
07be1b83
YO
9600 }
9601 break;
c277df42 9602 default:
f8fc2ecf 9603 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9604 }
9605 }
f8fc2ecf
YO
9606 Safefree(ri->data->what);
9607 Safefree(ri->data);
a0d0e21e 9608 }
28d8d7f4 9609
f8fc2ecf 9610 Safefree(ri);
a687059c 9611}
c277df42 9612
84da74a7 9613#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
502c6561 9614#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
85fbaab2 9615#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
84da74a7
YO
9616#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9617
9618/*
32cd70f6 9619 re_dup - duplicate a regexp.
84da74a7 9620
8233f606
DM
9621 This routine is expected to clone a given regexp structure. It is only
9622 compiled under USE_ITHREADS.
32cd70f6 9623
f8149455
YO
9624 After all of the core data stored in struct regexp is duplicated
9625 the regexp_engine.dupe method is used to copy any private data
9626 stored in the *pprivate pointer. This allows extensions to handle
9627 any duplication it needs to do.
9628
9629 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9630*/
a3c0e9ca 9631#if defined(USE_ITHREADS)
f8149455 9632#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9633void
9634Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9635{
84da74a7 9636 dVAR;
a86a1ca7 9637 I32 npar;
288b8c02
NC
9638 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9639 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9640
7918f24d
NC
9641 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9642
84da74a7 9643 npar = r->nparens+1;
f0ab9afb
NC
9644 Newx(ret->offs, npar, regexp_paren_pair);
9645 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9646 if(ret->swap) {
28d8d7f4 9647 /* no need to copy these */
f0ab9afb 9648 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9649 }
84da74a7 9650
6057429f 9651 if (ret->substrs) {
32cd70f6
NC
9652 /* Do it this way to avoid reading from *r after the StructCopy().
9653 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9654 cache, it doesn't matter. */
66b1de87
NC
9655 const bool anchored = r->check_substr
9656 ? r->check_substr == r->anchored_substr
9657 : r->check_utf8 == r->anchored_utf8;
785a26d5 9658 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9659 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9660
32cd70f6
NC
9661 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9662 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9663 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9664 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9665
32cd70f6
NC
9666 /* check_substr and check_utf8, if non-NULL, point to either their
9667 anchored or float namesakes, and don't hold a second reference. */
9668
9669 if (ret->check_substr) {
9670 if (anchored) {
9671 assert(r->check_utf8 == r->anchored_utf8);
9672 ret->check_substr = ret->anchored_substr;
9673 ret->check_utf8 = ret->anchored_utf8;
9674 } else {
9675 assert(r->check_substr == r->float_substr);
9676 assert(r->check_utf8 == r->float_utf8);
9677 ret->check_substr = ret->float_substr;
9678 ret->check_utf8 = ret->float_utf8;
9679 }
66b1de87
NC
9680 } else if (ret->check_utf8) {
9681 if (anchored) {
9682 ret->check_utf8 = ret->anchored_utf8;
9683 } else {
9684 ret->check_utf8 = ret->float_utf8;
9685 }
32cd70f6 9686 }
6057429f 9687 }
f8149455 9688
5daac39c 9689 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9690
6057429f 9691 if (ret->pprivate)
288b8c02 9692 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9693
288b8c02 9694 if (RX_MATCH_COPIED(dstr))
6057429f 9695 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9696 else
9697 ret->subbeg = NULL;
9698#ifdef PERL_OLD_COPY_ON_WRITE
9699 ret->saved_copy = NULL;
9700#endif
6057429f 9701
c2123ae3
NC
9702 if (ret->mother_re) {
9703 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9704 /* Our storage points directly to our mother regexp, but that's
9705 1: a buffer in a different thread
9706 2: something we no longer hold a reference on
9707 so we need to copy it locally. */
9708 /* Note we need to sue SvCUR() on our mother_re, because it, in
9709 turn, may well be pointing to its own mother_re. */
9710 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9711 SvCUR(ret->mother_re)+1));
9712 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9713 }
9714 ret->mother_re = NULL;
9715 }
6057429f 9716 ret->gofs = 0;
f8149455
YO
9717}
9718#endif /* PERL_IN_XSUB_RE */
9719
9720/*
9721 regdupe_internal()
9722
9723 This is the internal complement to regdupe() which is used to copy
9724 the structure pointed to by the *pprivate pointer in the regexp.
9725 This is the core version of the extension overridable cloning hook.
9726 The regexp structure being duplicated will be copied by perl prior
9727 to this and will be provided as the regexp *r argument, however
9728 with the /old/ structures pprivate pointer value. Thus this routine
9729 may override any copying normally done by perl.
9730
9731 It returns a pointer to the new regexp_internal structure.
9732*/
9733
9734void *
288b8c02 9735Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
9736{
9737 dVAR;
288b8c02 9738 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9739 regexp_internal *reti;
9740 int len, npar;
9741 RXi_GET_DECL(r,ri);
7918f24d
NC
9742
9743 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
9744
9745 npar = r->nparens+1;
7122b237 9746 len = ProgLen(ri);
f8149455 9747
45cf4570 9748 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
9749 Copy(ri->program, reti->program, len+1, regnode);
9750
f8149455 9751
f8fc2ecf 9752 reti->regstclass = NULL;
bcdf7404 9753
f8fc2ecf 9754 if (ri->data) {
84da74a7 9755 struct reg_data *d;
f8fc2ecf 9756 const int count = ri->data->count;
84da74a7
YO
9757 int i;
9758
9759 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9760 char, struct reg_data);
9761 Newx(d->what, count, U8);
9762
9763 d->count = count;
9764 for (i = 0; i < count; i++) {
f8fc2ecf 9765 d->what[i] = ri->data->what[i];
84da74a7 9766 switch (d->what[i]) {
55eed653 9767 /* legal options are one of: sSfpontTu
84da74a7
YO
9768 see also regcomp.h and pregfree() */
9769 case 's':
81714fb9 9770 case 'S':
0536c0a7 9771 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 9772 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 9773 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 9774 break;
84da74a7
YO
9775 case 'f':
9776 /* This is cheating. */
9777 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 9778 StructCopy(ri->data->data[i], d->data[i],
84da74a7 9779 struct regnode_charclass_class);
f8fc2ecf 9780 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
9781 break;
9782 case 'o':
bbe252da
YO
9783 /* Compiled op trees are readonly and in shared memory,
9784 and can thus be shared without duplication. */
84da74a7 9785 OP_REFCNT_LOCK;
f8fc2ecf 9786 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
9787 OP_REFCNT_UNLOCK;
9788 break;
23eab42c
NC
9789 case 'T':
9790 /* Trie stclasses are readonly and can thus be shared
9791 * without duplication. We free the stclass in pregfree
9792 * when the corresponding reg_ac_data struct is freed.
9793 */
9794 reti->regstclass= ri->regstclass;
9795 /* Fall through */
84da74a7 9796 case 't':
84da74a7 9797 OP_REFCNT_LOCK;
0536c0a7 9798 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 9799 OP_REFCNT_UNLOCK;
0536c0a7
NC
9800 /* Fall through */
9801 case 'n':
9802 d->data[i] = ri->data->data[i];
84da74a7 9803 break;
84da74a7 9804 default:
f8fc2ecf 9805 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
9806 }
9807 }
9808
f8fc2ecf 9809 reti->data = d;
84da74a7
YO
9810 }
9811 else
f8fc2ecf 9812 reti->data = NULL;
84da74a7 9813
cde0cee5
YO
9814 reti->name_list_idx = ri->name_list_idx;
9815
7122b237
YO
9816#ifdef RE_TRACK_PATTERN_OFFSETS
9817 if (ri->u.offsets) {
9818 Newx(reti->u.offsets, 2*len+1, U32);
9819 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9820 }
9821#else
9822 SetProgLen(reti,len);
9823#endif
9824
f8149455 9825 return (void*)reti;
84da74a7 9826}
f8149455
YO
9827
9828#endif /* USE_ITHREADS */
84da74a7 9829
f8149455 9830#ifndef PERL_IN_XSUB_RE
bcdf7404 9831
c277df42
IZ
9832/*
9833 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9834 */
9835regnode *
864dbfa3 9836Perl_regnext(pTHX_ register regnode *p)
c277df42 9837{
97aff369 9838 dVAR;
c277df42
IZ
9839 register I32 offset;
9840
f8fc2ecf 9841 if (!p)
c277df42
IZ
9842 return(NULL);
9843
9844 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9845 if (offset == 0)
9846 return(NULL);
9847
c277df42 9848 return(p+offset);
c277df42 9849}
76234dfb 9850#endif
c277df42 9851
01f988be 9852STATIC void
cea2e8a9 9853S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9854{
9855 va_list args;
9856 STRLEN l1 = strlen(pat1);
9857 STRLEN l2 = strlen(pat2);
9858 char buf[512];
06bf62c7 9859 SV *msv;
73d840c0 9860 const char *message;
c277df42 9861
7918f24d
NC
9862 PERL_ARGS_ASSERT_RE_CROAK2;
9863
c277df42
IZ
9864 if (l1 > 510)
9865 l1 = 510;
9866 if (l1 + l2 > 510)
9867 l2 = 510 - l1;
9868 Copy(pat1, buf, l1 , char);
9869 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9870 buf[l1 + l2] = '\n';
9871 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9872#ifdef I_STDARG
9873 /* ANSI variant takes additional second argument */
c277df42 9874 va_start(args, pat2);
8736538c
AS
9875#else
9876 va_start(args);
9877#endif
5a844595 9878 msv = vmess(buf, &args);
c277df42 9879 va_end(args);
cfd0369c 9880 message = SvPV_const(msv,l1);
c277df42
IZ
9881 if (l1 > 512)
9882 l1 = 512;
9883 Copy(message, buf, l1 , char);
197cf9b9 9884 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9885 Perl_croak(aTHX_ "%s", buf);
c277df42 9886}
a0ed51b3
LW
9887
9888/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9889
76234dfb 9890#ifndef PERL_IN_XSUB_RE
a0ed51b3 9891void
864dbfa3 9892Perl_save_re_context(pTHX)
b81d288d 9893{
97aff369 9894 dVAR;
1ade1aa1
NC
9895
9896 struct re_save_state *state;
9897
9898 SAVEVPTR(PL_curcop);
9899 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9900
9901 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9902 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9903 SSPUSHINT(SAVEt_RE_STATE);
9904
46ab3289 9905 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9906
a0ed51b3 9907 PL_reg_start_tmp = 0;
a0ed51b3 9908 PL_reg_start_tmpl = 0;
c445ea15 9909 PL_reg_oldsaved = NULL;
a5db57d6 9910 PL_reg_oldsavedlen = 0;
a5db57d6 9911 PL_reg_maxiter = 0;
a5db57d6 9912 PL_reg_leftiter = 0;
c445ea15 9913 PL_reg_poscache = NULL;
a5db57d6 9914 PL_reg_poscache_size = 0;
1ade1aa1
NC
9915#ifdef PERL_OLD_COPY_ON_WRITE
9916 PL_nrs = NULL;
9917#endif
ada6e8a9 9918
c445ea15
AL
9919 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9920 if (PL_curpm) {
9921 const REGEXP * const rx = PM_GETRE(PL_curpm);
9922 if (rx) {
1df70142 9923 U32 i;
07bc277f 9924 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 9925 char digits[TYPE_CHARS(long)];
d9fad198 9926 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9927 GV *const *const gvp
9928 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9929
b37c2d43
AL
9930 if (gvp) {
9931 GV * const gv = *gvp;
9932 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9933 save_scalar(gv);
49f27e4b 9934 }
ada6e8a9
AMS
9935 }
9936 }
9937 }
a0ed51b3 9938}
76234dfb 9939#endif
51371543 9940
51371543 9941static void
acfe0abc 9942clear_re(pTHX_ void *r)
51371543 9943{
97aff369 9944 dVAR;
84679df5 9945 ReREFCNT_dec((REGEXP *)r);
51371543 9946}
ffbc6a93 9947
a28509cc
AL
9948#ifdef DEBUGGING
9949
9950STATIC void
9951S_put_byte(pTHX_ SV *sv, int c)
9952{
7918f24d
NC
9953 PERL_ARGS_ASSERT_PUT_BYTE;
9954
7fddd944
NC
9955 /* Our definition of isPRINT() ignores locales, so only bytes that are
9956 not part of UTF-8 are considered printable. I assume that the same
9957 holds for UTF-EBCDIC.
9958 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9959 which Wikipedia says:
9960
9961 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9962 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9963 identical, to the ASCII delete (DEL) or rubout control character.
9964 ) So the old condition can be simplified to !isPRINT(c) */
9965 if (!isPRINT(c))
a28509cc 9966 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 9967 else {
88c9ea1e 9968 const char string = c;
5e7aa789
NC
9969 if (c == '-' || c == ']' || c == '\\' || c == '^')
9970 sv_catpvs(sv, "\\");
9971 sv_catpvn(sv, &string, 1);
9972 }
a28509cc
AL
9973}
9974
786e8c11 9975
3dab1dad
YO
9976#define CLEAR_OPTSTART \
9977 if (optstart) STMT_START { \
70685ca0 9978 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9979 optstart=NULL; \
9980 } STMT_END
9981
786e8c11 9982#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9983
b5a2f8d8
NC
9984STATIC const regnode *
9985S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9986 const regnode *last, const regnode *plast,
9987 SV* sv, I32 indent, U32 depth)
a28509cc 9988{
97aff369 9989 dVAR;
786e8c11 9990 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9991 register const regnode *next;
3dab1dad 9992 const regnode *optstart= NULL;
1f1031fe 9993
f8fc2ecf 9994 RXi_GET_DECL(r,ri);
3dab1dad 9995 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9996
9997 PERL_ARGS_ASSERT_DUMPUNTIL;
9998
786e8c11
YO
9999#ifdef DEBUG_DUMPUNTIL
10000 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10001 last ? last-start : 0,plast ? plast-start : 0);
10002#endif
10003
10004 if (plast && plast < last)
10005 last= plast;
10006
10007 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10008 /* While that wasn't END last time... */
a28509cc
AL
10009 NODE_ALIGN(node);
10010 op = OP(node);
de734bd5 10011 if (op == CLOSE || op == WHILEM)
786e8c11 10012 indent--;
b5a2f8d8 10013 next = regnext((regnode *)node);
1f1031fe 10014
a28509cc 10015 /* Where, what. */
8e11feef 10016 if (OP(node) == OPTIMIZED) {
e68ec53f 10017 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10018 optstart = node;
3dab1dad 10019 else
8e11feef 10020 goto after_print;
3dab1dad
YO
10021 } else
10022 CLEAR_OPTSTART;
1f1031fe 10023
32fc9b6a 10024 regprop(r, sv, node);
a28509cc 10025 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10026 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10027
10028 if (OP(node) != OPTIMIZED) {
10029 if (next == NULL) /* Next ptr. */
10030 PerlIO_printf(Perl_debug_log, " (0)");
10031 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10032 PerlIO_printf(Perl_debug_log, " (FAIL)");
10033 else
10034 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10035 (void)PerlIO_putc(Perl_debug_log, '\n');
10036 }
10037
a28509cc
AL
10038 after_print:
10039 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10040 assert(next);
10041 {
10042 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10043 ? regnext((regnode *)next)
10044 : next);
be8e71aa
YO
10045 if (last && nnode > last)
10046 nnode = last;
786e8c11 10047 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10048 }
a28509cc
AL
10049 }
10050 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10051 assert(next);
786e8c11 10052 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10053 }
10054 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10055 const regnode *this_trie = node;
1de06328 10056 const char op = OP(node);
647f639f 10057 const U32 n = ARG(node);
1de06328 10058 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10059 (reg_ac_data *)ri->data->data[n] :
1de06328 10060 NULL;
3251b653
NC
10061 const reg_trie_data * const trie =
10062 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10063#ifdef DEBUGGING
502c6561 10064 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10065#endif
786e8c11 10066 const regnode *nextbranch= NULL;
a28509cc 10067 I32 word_idx;
76f68e9b 10068 sv_setpvs(sv, "");
786e8c11 10069 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10070 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10071
10072 PerlIO_printf(Perl_debug_log, "%*s%s ",
10073 (int)(2*(indent+3)), "",
10074 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10075 PL_colors[0], PL_colors[1],
10076 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10077 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10078 PERL_PV_PRETTY_LTGT
786e8c11
YO
10079 )
10080 : "???"
10081 );
10082 if (trie->jump) {
40d049e4 10083 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10084 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10085 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10086 if (dist) {
10087 if (!nextbranch)
24b23f37 10088 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10089 DUMPUNTIL(this_trie + dist, nextbranch);
10090 }
786e8c11
YO
10091 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10092 nextbranch= regnext((regnode *)nextbranch);
10093 } else {
10094 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10095 }
786e8c11
YO
10096 }
10097 if (last && next > last)
10098 node= last;
10099 else
10100 node= next;
a28509cc 10101 }
786e8c11
YO
10102 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10103 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10104 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10105 }
10106 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10107 assert(next);
786e8c11 10108 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10109 }
10110 else if ( op == PLUS || op == STAR) {
786e8c11 10111 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10112 }
10113 else if (op == ANYOF) {
10114 /* arglen 1 + class block */
10115 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10116 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10117 node = NEXTOPER(node);
10118 }
10119 else if (PL_regkind[(U8)op] == EXACT) {
10120 /* Literal string, where present. */
10121 node += NODE_SZ_STR(node) - 1;
10122 node = NEXTOPER(node);
10123 }
10124 else {
10125 node = NEXTOPER(node);
10126 node += regarglen[(U8)op];
10127 }
10128 if (op == CURLYX || op == OPEN)
786e8c11 10129 indent++;
a28509cc 10130 }
3dab1dad 10131 CLEAR_OPTSTART;
786e8c11 10132#ifdef DEBUG_DUMPUNTIL
70685ca0 10133 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10134#endif
1de06328 10135 return node;
a28509cc
AL
10136}
10137
10138#endif /* DEBUGGING */
10139
241d1a3b
NC
10140/*
10141 * Local variables:
10142 * c-indentation-style: bsd
10143 * c-basic-offset: 4
10144 * indent-tabs-mode: t
10145 * End:
10146 *
37442d52
RGS
10147 * ex: set ts=8 sts=4 sw=4 noet:
10148 */