This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix a bug in mad - regexps can be 8 bit, not just ASCII or UTF-8.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
288b8c02 105 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
106 regexp *rx; /* perl core regexp structure */
107 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 108 char *start; /* Start of input for compile */
830247a4
IZ
109 char *end; /* End of input for compile */
110 char *parse; /* Input-scan pointer. */
111 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 112 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 113 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 114 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
115 I32 naughty; /* How bad is this pattern? */
116 I32 sawback; /* Did we see \1, ...? */
117 U32 seen;
118 I32 size; /* Code size. */
c74340f9
YO
119 I32 npar; /* Capture buffer count, (OPEN). */
120 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 121 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
122 I32 extralen;
123 I32 seen_zerolen;
124 I32 seen_evals;
40d049e4
YO
125 regnode **open_parens; /* pointers to open parens */
126 regnode **close_parens; /* pointers to close parens */
127 regnode *opend; /* END node in program */
02daf0ab
YO
128 I32 utf8; /* whether the pattern is utf8 or not */
129 I32 orig_utf8; /* whether the pattern was originally in utf8 */
130 /* XXX use this for future optimisation of case
131 * where pattern must be upgraded to utf8. */
6bda09f9 132 HV *charnames; /* cache of named sequences */
81714fb9 133 HV *paren_names; /* Paren names */
1f1031fe 134
40d049e4
YO
135 regnode **recurse; /* Recurse regops */
136 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
137#if ADD_TO_REGEXEC
138 char *starttry; /* -Dr: where regtry was called. */
139#define RExC_starttry (pRExC_state->starttry)
140#endif
3dab1dad 141#ifdef DEBUGGING
be8e71aa 142 const char *lastparse;
3dab1dad 143 I32 lastnum;
1f1031fe 144 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
145#define RExC_lastparse (pRExC_state->lastparse)
146#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 147#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 148#endif
830247a4
IZ
149} RExC_state_t;
150
e2509266 151#define RExC_flags (pRExC_state->flags)
830247a4 152#define RExC_precomp (pRExC_state->precomp)
288b8c02 153#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 154#define RExC_rx (pRExC_state->rx)
f8fc2ecf 155#define RExC_rxi (pRExC_state->rxi)
fac92740 156#define RExC_start (pRExC_state->start)
830247a4
IZ
157#define RExC_end (pRExC_state->end)
158#define RExC_parse (pRExC_state->parse)
159#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
160#ifdef RE_TRACK_PATTERN_OFFSETS
161#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
162#endif
830247a4 163#define RExC_emit (pRExC_state->emit)
fac92740 164#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 165#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
166#define RExC_naughty (pRExC_state->naughty)
167#define RExC_sawback (pRExC_state->sawback)
168#define RExC_seen (pRExC_state->seen)
169#define RExC_size (pRExC_state->size)
170#define RExC_npar (pRExC_state->npar)
e2e6a0f1 171#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
172#define RExC_extralen (pRExC_state->extralen)
173#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
174#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 175#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 176#define RExC_orig_utf8 (pRExC_state->orig_utf8)
fc8cd66c 177#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
178#define RExC_open_parens (pRExC_state->open_parens)
179#define RExC_close_parens (pRExC_state->close_parens)
180#define RExC_opend (pRExC_state->opend)
81714fb9 181#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
182#define RExC_recurse (pRExC_state->recurse)
183#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 184
cde0cee5 185
a687059c
LW
186#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
187#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
188 ((*s) == '{' && regcurly(s)))
a687059c 189
35c8bce7
LW
190#ifdef SPSTART
191#undef SPSTART /* dratted cpp namespace... */
192#endif
a687059c
LW
193/*
194 * Flags to be passed up and down.
195 */
a687059c 196#define WORST 0 /* Worst case. */
a3b492c3
YO
197#define HASWIDTH 0x01 /* Known to match non-null strings. */
198#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
199#define SPSTART 0x04 /* Starts with * or +. */
200#define TRYAGAIN 0x08 /* Weeded out a declaration. */
201#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 202
3dab1dad
YO
203#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
204
07be1b83
YO
205/* whether trie related optimizations are enabled */
206#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
207#define TRIE_STUDY_OPT
786e8c11 208#define FULL_TRIE_STUDY
07be1b83
YO
209#define TRIE_STCLASS
210#endif
1de06328
YO
211
212
40d049e4
YO
213
214#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
215#define PBITVAL(paren) (1 << ((paren) & 7))
216#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
217#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
218#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
219
220
1de06328
YO
221/* About scan_data_t.
222
223 During optimisation we recurse through the regexp program performing
224 various inplace (keyhole style) optimisations. In addition study_chunk
225 and scan_commit populate this data structure with information about
226 what strings MUST appear in the pattern. We look for the longest
227 string that must appear for at a fixed location, and we look for the
228 longest string that may appear at a floating location. So for instance
229 in the pattern:
230
231 /FOO[xX]A.*B[xX]BAR/
232
233 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
234 strings (because they follow a .* construct). study_chunk will identify
235 both FOO and BAR as being the longest fixed and floating strings respectively.
236
237 The strings can be composites, for instance
238
239 /(f)(o)(o)/
240
241 will result in a composite fixed substring 'foo'.
242
243 For each string some basic information is maintained:
244
245 - offset or min_offset
246 This is the position the string must appear at, or not before.
247 It also implicitly (when combined with minlenp) tells us how many
248 character must match before the string we are searching.
249 Likewise when combined with minlenp and the length of the string
250 tells us how many characters must appear after the string we have
251 found.
252
253 - max_offset
254 Only used for floating strings. This is the rightmost point that
255 the string can appear at. Ifset to I32 max it indicates that the
256 string can occur infinitely far to the right.
257
258 - minlenp
259 A pointer to the minimum length of the pattern that the string
260 was found inside. This is important as in the case of positive
261 lookahead or positive lookbehind we can have multiple patterns
262 involved. Consider
263
264 /(?=FOO).*F/
265
266 The minimum length of the pattern overall is 3, the minimum length
267 of the lookahead part is 3, but the minimum length of the part that
268 will actually match is 1. So 'FOO's minimum length is 3, but the
269 minimum length for the F is 1. This is important as the minimum length
270 is used to determine offsets in front of and behind the string being
271 looked for. Since strings can be composites this is the length of the
272 pattern at the time it was commited with a scan_commit. Note that
273 the length is calculated by study_chunk, so that the minimum lengths
274 are not known until the full pattern has been compiled, thus the
275 pointer to the value.
276
277 - lookbehind
278
279 In the case of lookbehind the string being searched for can be
280 offset past the start point of the final matching string.
281 If this value was just blithely removed from the min_offset it would
282 invalidate some of the calculations for how many chars must match
283 before or after (as they are derived from min_offset and minlen and
284 the length of the string being searched for).
285 When the final pattern is compiled and the data is moved from the
286 scan_data_t structure into the regexp structure the information
287 about lookbehind is factored in, with the information that would
288 have been lost precalculated in the end_shift field for the
289 associated string.
290
291 The fields pos_min and pos_delta are used to store the minimum offset
292 and the delta to the maximum offset at the current point in the pattern.
293
294*/
2c2d71f5
JH
295
296typedef struct scan_data_t {
1de06328
YO
297 /*I32 len_min; unused */
298 /*I32 len_delta; unused */
2c2d71f5
JH
299 I32 pos_min;
300 I32 pos_delta;
301 SV *last_found;
1de06328 302 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
303 I32 last_start_min;
304 I32 last_start_max;
1de06328
YO
305 SV **longest; /* Either &l_fixed, or &l_float. */
306 SV *longest_fixed; /* longest fixed string found in pattern */
307 I32 offset_fixed; /* offset where it starts */
308 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
309 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
310 SV *longest_float; /* longest floating string found in pattern */
311 I32 offset_float_min; /* earliest point in string it can appear */
312 I32 offset_float_max; /* latest point in string it can appear */
313 I32 *minlen_float; /* pointer to the minlen relevent to the string */
314 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
315 I32 flags;
316 I32 whilem_c;
cb434fcc 317 I32 *last_closep;
653099ff 318 struct regnode_charclass_class *start_class;
2c2d71f5
JH
319} scan_data_t;
320
a687059c 321/*
e50aee73 322 * Forward declarations for pregcomp()'s friends.
a687059c 323 */
a0d0e21e 324
27da23d5 325static const scan_data_t zero_scan_data =
1de06328 326 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
327
328#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
329#define SF_BEFORE_SEOL 0x0001
330#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
331#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
332#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
333
09b7f37c
CB
334#ifdef NO_UNARY_PLUS
335# define SF_FIX_SHIFT_EOL (0+2)
336# define SF_FL_SHIFT_EOL (0+4)
337#else
338# define SF_FIX_SHIFT_EOL (+2)
339# define SF_FL_SHIFT_EOL (+4)
340#endif
c277df42
IZ
341
342#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
343#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
344
345#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
346#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
347#define SF_IS_INF 0x0040
348#define SF_HAS_PAR 0x0080
349#define SF_IN_PAR 0x0100
350#define SF_HAS_EVAL 0x0200
351#define SCF_DO_SUBSTR 0x0400
653099ff
GS
352#define SCF_DO_STCLASS_AND 0x0800
353#define SCF_DO_STCLASS_OR 0x1000
354#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 355#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 356
786e8c11 357#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 358#define SCF_SEEN_ACCEPT 0x8000
07be1b83 359
eb160463 360#define UTF (RExC_utf8 != 0)
bbe252da
YO
361#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
362#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 363
ffc61ed2 364#define OOB_UNICODE 12345678
93733859 365#define OOB_NAMEDCLASS -1
b8c5462f 366
a0ed51b3
LW
367#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
368#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
369
8615cb43 370
b45f050a
JF
371/* length of regex to show in messages that don't mark a position within */
372#define RegexLengthToShowInErrorMessages 127
373
374/*
375 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
376 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
377 * op/pragma/warn/regcomp.
378 */
7253e4e3
RK
379#define MARKER1 "<-- HERE" /* marker as it appears in the description */
380#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 381
7253e4e3 382#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
383
384/*
385 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
386 * arg. Show regex, up to a maximum length. If it's too long, chop and add
387 * "...".
388 */
58e23c8d 389#define _FAIL(code) STMT_START { \
bfed75c6 390 const char *ellipses = ""; \
ccb2c380
MP
391 IV len = RExC_end - RExC_precomp; \
392 \
393 if (!SIZE_ONLY) \
288b8c02 394 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
395 if (len > RegexLengthToShowInErrorMessages) { \
396 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
397 len = RegexLengthToShowInErrorMessages - 10; \
398 ellipses = "..."; \
399 } \
58e23c8d 400 code; \
ccb2c380 401} STMT_END
8615cb43 402
58e23c8d
YO
403#define FAIL(msg) _FAIL( \
404 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
405 msg, (int)len, RExC_precomp, ellipses))
406
407#define FAIL2(msg,arg) _FAIL( \
408 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
409 arg, (int)len, RExC_precomp, ellipses))
410
b45f050a 411/*
b45f050a
JF
412 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
413 */
ccb2c380 414#define Simple_vFAIL(m) STMT_START { \
a28509cc 415 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
416 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
417 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
418} STMT_END
b45f050a
JF
419
420/*
421 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
422 */
ccb2c380
MP
423#define vFAIL(m) STMT_START { \
424 if (!SIZE_ONLY) \
288b8c02 425 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
426 Simple_vFAIL(m); \
427} STMT_END
b45f050a
JF
428
429/*
430 * Like Simple_vFAIL(), but accepts two arguments.
431 */
ccb2c380 432#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 433 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
434 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
435 (int)offset, RExC_precomp, RExC_precomp + offset); \
436} STMT_END
b45f050a
JF
437
438/*
439 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
440 */
ccb2c380
MP
441#define vFAIL2(m,a1) STMT_START { \
442 if (!SIZE_ONLY) \
288b8c02 443 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
444 Simple_vFAIL2(m, a1); \
445} STMT_END
b45f050a
JF
446
447
448/*
449 * Like Simple_vFAIL(), but accepts three arguments.
450 */
ccb2c380 451#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 452 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
453 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
454 (int)offset, RExC_precomp, RExC_precomp + offset); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
459 */
ccb2c380
MP
460#define vFAIL3(m,a1,a2) STMT_START { \
461 if (!SIZE_ONLY) \
288b8c02 462 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
463 Simple_vFAIL3(m, a1, a2); \
464} STMT_END
b45f050a
JF
465
466/*
467 * Like Simple_vFAIL(), but accepts four arguments.
468 */
ccb2c380 469#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 470 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
471 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
472 (int)offset, RExC_precomp, RExC_precomp + offset); \
473} STMT_END
b45f050a 474
ccb2c380 475#define vWARN(loc,m) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
478 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARNdep(loc,m) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
484 "%s" REPORT_LOCATION, \
485 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
487
488
489#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 490 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
491 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493} STMT_END
494
495#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 496 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
500
501#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 502 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
503 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
505} STMT_END
506
507#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
511} STMT_END
9d1d55b5 512
8615cb43 513
cd439c50 514/* Allow for side effects in s */
ccb2c380
MP
515#define REGC(c,s) STMT_START { \
516 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
517} STMT_END
cd439c50 518
fac92740
MJD
519/* Macros for recording node offsets. 20001227 mjd@plover.com
520 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
521 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
522 * Element 0 holds the number n.
07be1b83 523 * Position is 1 indexed.
fac92740 524 */
7122b237
YO
525#ifndef RE_TRACK_PATTERN_OFFSETS
526#define Set_Node_Offset_To_R(node,byte)
527#define Set_Node_Offset(node,byte)
528#define Set_Cur_Node_Offset
529#define Set_Node_Length_To_R(node,len)
530#define Set_Node_Length(node,len)
531#define Set_Node_Cur_Length(node)
532#define Node_Offset(n)
533#define Node_Length(n)
534#define Set_Node_Offset_Length(node,offset,len)
535#define ProgLen(ri) ri->u.proglen
536#define SetProgLen(ri,x) ri->u.proglen = x
537#else
538#define ProgLen(ri) ri->u.offsets[0]
539#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
540#define Set_Node_Offset_To_R(node,byte) STMT_START { \
541 if (! SIZE_ONLY) { \
542 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 543 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 544 if((node) < 0) { \
551405c4 545 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
546 } else { \
547 RExC_offsets[2*(node)-1] = (byte); \
548 } \
549 } \
550} STMT_END
551
552#define Set_Node_Offset(node,byte) \
553 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
554#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
555
556#define Set_Node_Length_To_R(node,len) STMT_START { \
557 if (! SIZE_ONLY) { \
558 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 559 __LINE__, (int)(node), (int)(len))); \
ccb2c380 560 if((node) < 0) { \
551405c4 561 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
562 } else { \
563 RExC_offsets[2*(node)] = (len); \
564 } \
565 } \
566} STMT_END
567
568#define Set_Node_Length(node,len) \
569 Set_Node_Length_To_R((node)-RExC_emit_start, len)
570#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
571#define Set_Node_Cur_Length(node) \
572 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
573
574/* Get offsets and lengths */
575#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
576#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
577
07be1b83
YO
578#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
579 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
580 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
581} STMT_END
7122b237 582#endif
07be1b83
YO
583
584#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
585#define EXPERIMENTAL_INPLACESCAN
7122b237 586#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 587
304ee84b
YO
588#define DEBUG_STUDYDATA(str,data,depth) \
589DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 590 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
591 "%*s" str "Pos:%"IVdf"/%"IVdf \
592 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
593 (int)(depth)*2, "", \
594 (IV)((data)->pos_min), \
595 (IV)((data)->pos_delta), \
304ee84b 596 (UV)((data)->flags), \
1de06328 597 (IV)((data)->whilem_c), \
304ee84b
YO
598 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
599 is_inf ? "INF " : "" \
1de06328
YO
600 ); \
601 if ((data)->last_found) \
602 PerlIO_printf(Perl_debug_log, \
603 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
604 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
605 SvPVX_const((data)->last_found), \
606 (IV)((data)->last_end), \
607 (IV)((data)->last_start_min), \
608 (IV)((data)->last_start_max), \
609 ((data)->longest && \
610 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
611 SvPVX_const((data)->longest_fixed), \
612 (IV)((data)->offset_fixed), \
613 ((data)->longest && \
614 (data)->longest==&((data)->longest_float)) ? "*" : "", \
615 SvPVX_const((data)->longest_float), \
616 (IV)((data)->offset_float_min), \
617 (IV)((data)->offset_float_max) \
618 ); \
619 PerlIO_printf(Perl_debug_log,"\n"); \
620});
621
acfe0abc 622static void clear_re(pTHX_ void *r);
4327152a 623
653099ff 624/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 625 Update the longest found anchored substring and the longest found
653099ff
GS
626 floating substrings if needed. */
627
4327152a 628STATIC void
304ee84b 629S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 630{
e1ec3a88
AL
631 const STRLEN l = CHR_SVLEN(data->last_found);
632 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 633 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 634
c277df42 635 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 636 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
637 if (*data->longest == data->longest_fixed) {
638 data->offset_fixed = l ? data->last_start_min : data->pos_min;
639 if (data->flags & SF_BEFORE_EOL)
b81d288d 640 data->flags
c277df42
IZ
641 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
642 else
643 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
644 data->minlen_fixed=minlenp;
645 data->lookbehind_fixed=0;
a0ed51b3 646 }
304ee84b 647 else { /* *data->longest == data->longest_float */
c277df42 648 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
649 data->offset_float_max = (l
650 ? data->last_start_max
c277df42 651 : data->pos_min + data->pos_delta);
304ee84b 652 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 653 data->offset_float_max = I32_MAX;
c277df42 654 if (data->flags & SF_BEFORE_EOL)
b81d288d 655 data->flags
c277df42
IZ
656 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
657 else
658 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
659 data->minlen_float=minlenp;
660 data->lookbehind_float=0;
c277df42
IZ
661 }
662 }
663 SvCUR_set(data->last_found, 0);
0eda9292 664 {
a28509cc 665 SV * const sv = data->last_found;
097eb12c
AL
666 if (SvUTF8(sv) && SvMAGICAL(sv)) {
667 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
668 if (mg)
669 mg->mg_len = 0;
670 }
0eda9292 671 }
c277df42
IZ
672 data->last_end = -1;
673 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 674 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
675}
676
653099ff
GS
677/* Can match anything (initialization) */
678STATIC void
097eb12c 679S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 680{
653099ff 681 ANYOF_CLASS_ZERO(cl);
f8bef550 682 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 683 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
684 if (LOC)
685 cl->flags |= ANYOF_LOCALE;
686}
687
688/* Can match anything (initialization) */
689STATIC int
5f66b61c 690S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
691{
692 int value;
693
aaa51d5e 694 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
695 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
696 return 1;
1aa99e6b
IH
697 if (!(cl->flags & ANYOF_UNICODE_ALL))
698 return 0;
10edeb5d 699 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 700 return 0;
653099ff
GS
701 return 1;
702}
703
704/* Can match anything (initialization) */
705STATIC void
097eb12c 706S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 707{
8ecf7187 708 Zero(cl, 1, struct regnode_charclass_class);
653099ff 709 cl->type = ANYOF;
830247a4 710 cl_anything(pRExC_state, cl);
653099ff
GS
711}
712
713STATIC void
097eb12c 714S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 715{
8ecf7187 716 Zero(cl, 1, struct regnode_charclass_class);
653099ff 717 cl->type = ANYOF;
830247a4 718 cl_anything(pRExC_state, cl);
653099ff
GS
719 if (LOC)
720 cl->flags |= ANYOF_LOCALE;
721}
722
723/* 'And' a given class with another one. Can create false positives */
724/* We assume that cl is not inverted */
725STATIC void
5f66b61c 726S_cl_and(struct regnode_charclass_class *cl,
a28509cc 727 const struct regnode_charclass_class *and_with)
653099ff 728{
40d049e4
YO
729
730 assert(and_with->type == ANYOF);
653099ff
GS
731 if (!(and_with->flags & ANYOF_CLASS)
732 && !(cl->flags & ANYOF_CLASS)
733 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
734 && !(and_with->flags & ANYOF_FOLD)
735 && !(cl->flags & ANYOF_FOLD)) {
736 int i;
737
738 if (and_with->flags & ANYOF_INVERT)
739 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
740 cl->bitmap[i] &= ~and_with->bitmap[i];
741 else
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] &= and_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
745 if (!(and_with->flags & ANYOF_EOS))
746 cl->flags &= ~ANYOF_EOS;
1aa99e6b 747
14ebb1a2
JH
748 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
749 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
750 cl->flags &= ~ANYOF_UNICODE_ALL;
751 cl->flags |= ANYOF_UNICODE;
752 ARG_SET(cl, ARG(and_with));
753 }
14ebb1a2
JH
754 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
755 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 756 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
757 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
758 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 759 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
760}
761
762/* 'OR' a given class with another one. Can create false positives */
763/* We assume that cl is not inverted */
764STATIC void
097eb12c 765S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 766{
653099ff
GS
767 if (or_with->flags & ANYOF_INVERT) {
768 /* We do not use
769 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
770 * <= (B1 | !B2) | (CL1 | !CL2)
771 * which is wasteful if CL2 is small, but we ignore CL2:
772 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
773 * XXXX Can we handle case-fold? Unclear:
774 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
775 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
776 */
777 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
778 && !(or_with->flags & ANYOF_FOLD)
779 && !(cl->flags & ANYOF_FOLD) ) {
780 int i;
781
782 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
783 cl->bitmap[i] |= ~or_with->bitmap[i];
784 } /* XXXX: logic is complicated otherwise */
785 else {
830247a4 786 cl_anything(pRExC_state, cl);
653099ff
GS
787 }
788 } else {
789 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
790 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 791 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
792 || (cl->flags & ANYOF_FOLD)) ) {
793 int i;
794
795 /* OR char bitmap and class bitmap separately */
796 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
797 cl->bitmap[i] |= or_with->bitmap[i];
798 if (or_with->flags & ANYOF_CLASS) {
799 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
800 cl->classflags[i] |= or_with->classflags[i];
801 cl->flags |= ANYOF_CLASS;
802 }
803 }
804 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 805 cl_anything(pRExC_state, cl);
653099ff
GS
806 }
807 }
808 if (or_with->flags & ANYOF_EOS)
809 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
810
811 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
812 ARG(cl) != ARG(or_with)) {
813 cl->flags |= ANYOF_UNICODE_ALL;
814 cl->flags &= ~ANYOF_UNICODE;
815 }
816 if (or_with->flags & ANYOF_UNICODE_ALL) {
817 cl->flags |= ANYOF_UNICODE_ALL;
818 cl->flags &= ~ANYOF_UNICODE;
819 }
653099ff
GS
820}
821
a3621e74
YO
822#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
823#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
824#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
825#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
826
3dab1dad
YO
827
828#ifdef DEBUGGING
07be1b83 829/*
2b8b4781
NC
830 dump_trie(trie,widecharmap,revcharmap)
831 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
832 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
833
834 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
835 The _interim_ variants are used for debugging the interim
836 tables that are used to generate the final compressed
837 representation which is what dump_trie expects.
838
3dab1dad
YO
839 Part of the reason for their existance is to provide a form
840 of documentation as to how the different representations function.
07be1b83
YO
841
842*/
3dab1dad
YO
843
844/*
3dab1dad
YO
845 Dumps the final compressed table form of the trie to Perl_debug_log.
846 Used for debugging make_trie().
847*/
848
849STATIC void
2b8b4781
NC
850S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
851 AV *revcharmap, U32 depth)
3dab1dad
YO
852{
853 U32 state;
ab3bbdeb 854 SV *sv=sv_newmortal();
55eed653 855 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
856 GET_RE_DEBUG_FLAGS_DECL;
857
ab3bbdeb 858
3dab1dad
YO
859 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
860 (int)depth * 2 + 2,"",
861 "Match","Base","Ofs" );
862
863 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 864 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 865 if ( tmp ) {
ab3bbdeb
YO
866 PerlIO_printf( Perl_debug_log, "%*s",
867 colwidth,
ddc5bc0f 868 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
869 PL_colors[0], PL_colors[1],
870 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
871 PERL_PV_ESCAPE_FIRSTCHAR
872 )
873 );
3dab1dad
YO
874 }
875 }
876 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
877 (int)depth * 2 + 2,"");
878
879 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 880 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
881 PerlIO_printf( Perl_debug_log, "\n");
882
1e2e3d02 883 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 884 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
885
886 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
887
888 if ( trie->states[ state ].wordnum ) {
889 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
890 } else {
891 PerlIO_printf( Perl_debug_log, "%6s", "" );
892 }
893
894 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
895
896 if ( base ) {
897 U32 ofs = 0;
898
899 while( ( base + ofs < trie->uniquecharcount ) ||
900 ( base + ofs - trie->uniquecharcount < trie->lasttrans
901 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
902 ofs++;
903
904 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
905
906 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
907 if ( ( base + ofs >= trie->uniquecharcount ) &&
908 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
909 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
910 {
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*"UVXf,
912 colwidth,
3dab1dad
YO
913 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
914 } else {
ab3bbdeb 915 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
916 }
917 }
918
919 PerlIO_printf( Perl_debug_log, "]");
920
921 }
922 PerlIO_printf( Perl_debug_log, "\n" );
923 }
924}
925/*
3dab1dad
YO
926 Dumps a fully constructed but uncompressed trie in list form.
927 List tries normally only are used for construction when the number of
928 possible chars (trie->uniquecharcount) is very high.
929 Used for debugging make_trie().
930*/
931STATIC void
55eed653 932S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
933 HV *widecharmap, AV *revcharmap, U32 next_alloc,
934 U32 depth)
3dab1dad
YO
935{
936 U32 state;
ab3bbdeb 937 SV *sv=sv_newmortal();
55eed653 938 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
939 GET_RE_DEBUG_FLAGS_DECL;
940 /* print out the table precompression. */
ab3bbdeb
YO
941 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
942 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
943 "------:-----+-----------------\n" );
3dab1dad
YO
944
945 for( state=1 ; state < next_alloc ; state ++ ) {
946 U16 charid;
947
ab3bbdeb 948 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
949 (int)depth * 2 + 2,"", (UV)state );
950 if ( ! trie->states[ state ].wordnum ) {
951 PerlIO_printf( Perl_debug_log, "%5s| ","");
952 } else {
953 PerlIO_printf( Perl_debug_log, "W%4x| ",
954 trie->states[ state ].wordnum
955 );
956 }
957 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 958 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
959 if ( tmp ) {
960 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
961 colwidth,
ddc5bc0f 962 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
963 PL_colors[0], PL_colors[1],
964 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
965 PERL_PV_ESCAPE_FIRSTCHAR
966 ) ,
1e2e3d02
YO
967 TRIE_LIST_ITEM(state,charid).forid,
968 (UV)TRIE_LIST_ITEM(state,charid).newstate
969 );
970 if (!(charid % 10))
664e119d
RGS
971 PerlIO_printf(Perl_debug_log, "\n%*s| ",
972 (int)((depth * 2) + 14), "");
1e2e3d02 973 }
ab3bbdeb
YO
974 }
975 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
976 }
977}
978
979/*
3dab1dad
YO
980 Dumps a fully constructed but uncompressed trie in table form.
981 This is the normal DFA style state transition table, with a few
982 twists to facilitate compression later.
983 Used for debugging make_trie().
984*/
985STATIC void
55eed653 986S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
987 HV *widecharmap, AV *revcharmap, U32 next_alloc,
988 U32 depth)
3dab1dad
YO
989{
990 U32 state;
991 U16 charid;
ab3bbdeb 992 SV *sv=sv_newmortal();
55eed653 993 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
994 GET_RE_DEBUG_FLAGS_DECL;
995
996 /*
997 print out the table precompression so that we can do a visual check
998 that they are identical.
999 */
1000
1001 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1002
1003 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1004 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1005 if ( tmp ) {
ab3bbdeb
YO
1006 PerlIO_printf( Perl_debug_log, "%*s",
1007 colwidth,
ddc5bc0f 1008 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1009 PL_colors[0], PL_colors[1],
1010 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1011 PERL_PV_ESCAPE_FIRSTCHAR
1012 )
1013 );
3dab1dad
YO
1014 }
1015 }
1016
1017 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1018
1019 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1020 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1021 }
1022
1023 PerlIO_printf( Perl_debug_log, "\n" );
1024
1025 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1026
1027 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1028 (int)depth * 2 + 2,"",
1029 (UV)TRIE_NODENUM( state ) );
1030
1031 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1032 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1033 if (v)
1034 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1035 else
1036 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1037 }
1038 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1040 } else {
1041 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1042 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1043 }
1044 }
07be1b83 1045}
3dab1dad
YO
1046
1047#endif
1048
786e8c11
YO
1049/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1050 startbranch: the first branch in the whole branch sequence
1051 first : start branch of sequence of branch-exact nodes.
1052 May be the same as startbranch
1053 last : Thing following the last branch.
1054 May be the same as tail.
1055 tail : item following the branch sequence
1056 count : words in the sequence
1057 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1058 depth : indent depth
3dab1dad 1059
786e8c11 1060Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1061
786e8c11
YO
1062A trie is an N'ary tree where the branches are determined by digital
1063decomposition of the key. IE, at the root node you look up the 1st character and
1064follow that branch repeat until you find the end of the branches. Nodes can be
1065marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1066
786e8c11 1067 /he|she|his|hers/
72f13be8 1068
786e8c11
YO
1069would convert into the following structure. Numbers represent states, letters
1070following numbers represent valid transitions on the letter from that state, if
1071the number is in square brackets it represents an accepting state, otherwise it
1072will be in parenthesis.
07be1b83 1073
786e8c11
YO
1074 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1075 | |
1076 | (2)
1077 | |
1078 (1) +-i->(6)-+-s->[7]
1079 |
1080 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1081
786e8c11
YO
1082 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1083
1084This shows that when matching against the string 'hers' we will begin at state 1
1085read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1086then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1087is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1088single traverse. We store a mapping from accepting to state to which word was
1089matched, and then when we have multiple possibilities we try to complete the
1090rest of the regex in the order in which they occured in the alternation.
1091
1092The only prior NFA like behaviour that would be changed by the TRIE support is
1093the silent ignoring of duplicate alternations which are of the form:
1094
1095 / (DUPE|DUPE) X? (?{ ... }) Y /x
1096
1097Thus EVAL blocks follwing a trie may be called a different number of times with
1098and without the optimisation. With the optimisations dupes will be silently
1099ignored. This inconsistant behaviour of EVAL type nodes is well established as
1100the following demonstrates:
1101
1102 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1103
1104which prints out 'word' three times, but
1105
1106 'words'=~/(word|word|word)(?{ print $1 })S/
1107
1108which doesnt print it out at all. This is due to other optimisations kicking in.
1109
1110Example of what happens on a structural level:
1111
1112The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1113
1114 1: CURLYM[1] {1,32767}(18)
1115 5: BRANCH(8)
1116 6: EXACT <ac>(16)
1117 8: BRANCH(11)
1118 9: EXACT <ad>(16)
1119 11: BRANCH(14)
1120 12: EXACT <ab>(16)
1121 16: SUCCEED(0)
1122 17: NOTHING(18)
1123 18: END(0)
1124
1125This would be optimizable with startbranch=5, first=5, last=16, tail=16
1126and should turn into:
1127
1128 1: CURLYM[1] {1,32767}(18)
1129 5: TRIE(16)
1130 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1131 <ac>
1132 <ad>
1133 <ab>
1134 16: SUCCEED(0)
1135 17: NOTHING(18)
1136 18: END(0)
1137
1138Cases where tail != last would be like /(?foo|bar)baz/:
1139
1140 1: BRANCH(4)
1141 2: EXACT <foo>(8)
1142 4: BRANCH(7)
1143 5: EXACT <bar>(8)
1144 7: TAIL(8)
1145 8: EXACT <baz>(10)
1146 10: END(0)
1147
1148which would be optimizable with startbranch=1, first=1, last=7, tail=8
1149and would end up looking like:
1150
1151 1: TRIE(8)
1152 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1153 <foo>
1154 <bar>
1155 7: TAIL(8)
1156 8: EXACT <baz>(10)
1157 10: END(0)
1158
1159 d = uvuni_to_utf8_flags(d, uv, 0);
1160
1161is the recommended Unicode-aware way of saying
1162
1163 *(d++) = uv;
1164*/
1165
1e2e3d02 1166#define TRIE_STORE_REVCHAR \
786e8c11 1167 STMT_START { \
73031816
NC
1168 if (UTF) { \
1169 SV *zlopp = newSV(2); \
88c9ea1e
CB
1170 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1171 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1172 SvCUR_set(zlopp, kapow - flrbbbbb); \
1173 SvPOK_on(zlopp); \
1174 SvUTF8_on(zlopp); \
1175 av_push(revcharmap, zlopp); \
1176 } else { \
6bdeddd2 1177 char ooooff = (char)uvc; \
73031816
NC
1178 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1179 } \
1180 } STMT_END
786e8c11
YO
1181
1182#define TRIE_READ_CHAR STMT_START { \
1183 wordlen++; \
1184 if ( UTF ) { \
1185 if ( folder ) { \
1186 if ( foldlen > 0 ) { \
1187 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1188 foldlen -= len; \
1189 scan += len; \
1190 len = 0; \
1191 } else { \
1192 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1193 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1194 foldlen -= UNISKIP( uvc ); \
1195 scan = foldbuf + UNISKIP( uvc ); \
1196 } \
1197 } else { \
1198 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1199 } \
1200 } else { \
1201 uvc = (U32)*uc; \
1202 len = 1; \
1203 } \
1204} STMT_END
1205
1206
1207
1208#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1209 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1210 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1211 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1212 } \
1213 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1214 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1215 TRIE_LIST_CUR( state )++; \
1216} STMT_END
07be1b83 1217
786e8c11
YO
1218#define TRIE_LIST_NEW(state) STMT_START { \
1219 Newxz( trie->states[ state ].trans.list, \
1220 4, reg_trie_trans_le ); \
1221 TRIE_LIST_CUR( state ) = 1; \
1222 TRIE_LIST_LEN( state ) = 4; \
1223} STMT_END
07be1b83 1224
786e8c11
YO
1225#define TRIE_HANDLE_WORD(state) STMT_START { \
1226 U16 dupe= trie->states[ state ].wordnum; \
1227 regnode * const noper_next = regnext( noper ); \
1228 \
1229 if (trie->wordlen) \
1230 trie->wordlen[ curword ] = wordlen; \
1231 DEBUG_r({ \
1232 /* store the word for dumping */ \
1233 SV* tmp; \
1234 if (OP(noper) != NOTHING) \
740cce10 1235 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1236 else \
740cce10 1237 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1238 av_push( trie_words, tmp ); \
786e8c11
YO
1239 }); \
1240 \
1241 curword++; \
1242 \
1243 if ( noper_next < tail ) { \
1244 if (!trie->jump) \
c944940b 1245 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1246 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1247 if (!jumper) \
1248 jumper = noper_next; \
1249 if (!nextbranch) \
1250 nextbranch= regnext(cur); \
1251 } \
1252 \
1253 if ( dupe ) { \
1254 /* So it's a dupe. This means we need to maintain a */\
1255 /* linked-list from the first to the next. */\
1256 /* we only allocate the nextword buffer when there */\
1257 /* a dupe, so first time we have to do the allocation */\
1258 if (!trie->nextword) \
c944940b 1259 trie->nextword = (U16 *) \
446bd890 1260 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1261 while ( trie->nextword[dupe] ) \
1262 dupe= trie->nextword[dupe]; \
1263 trie->nextword[dupe]= curword; \
1264 } else { \
1265 /* we haven't inserted this word yet. */ \
1266 trie->states[ state ].wordnum = curword; \
1267 } \
1268} STMT_END
07be1b83 1269
3dab1dad 1270
786e8c11
YO
1271#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1272 ( ( base + charid >= ucharcount \
1273 && base + charid < ubound \
1274 && state == trie->trans[ base - ucharcount + charid ].check \
1275 && trie->trans[ base - ucharcount + charid ].next ) \
1276 ? trie->trans[ base - ucharcount + charid ].next \
1277 : ( state==1 ? special : 0 ) \
1278 )
3dab1dad 1279
786e8c11
YO
1280#define MADE_TRIE 1
1281#define MADE_JUMP_TRIE 2
1282#define MADE_EXACT_TRIE 4
3dab1dad 1283
a3621e74 1284STATIC I32
786e8c11 1285S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1286{
27da23d5 1287 dVAR;
a3621e74
YO
1288 /* first pass, loop through and scan words */
1289 reg_trie_data *trie;
55eed653 1290 HV *widecharmap = NULL;
2b8b4781 1291 AV *revcharmap = newAV();
a3621e74 1292 regnode *cur;
9f7f3913 1293 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1294 STRLEN len = 0;
1295 UV uvc = 0;
1296 U16 curword = 0;
1297 U32 next_alloc = 0;
786e8c11
YO
1298 regnode *jumper = NULL;
1299 regnode *nextbranch = NULL;
7f69552c 1300 regnode *convert = NULL;
a3621e74 1301 /* we just use folder as a flag in utf8 */
e1ec3a88 1302 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1303 ? PL_fold
1304 : ( flags == EXACTFL
1305 ? PL_fold_locale
1306 : NULL
1307 )
1308 );
1309
2b8b4781
NC
1310#ifdef DEBUGGING
1311 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1312 AV *trie_words = NULL;
1313 /* along with revcharmap, this only used during construction but both are
1314 * useful during debugging so we store them in the struct when debugging.
8e11feef 1315 */
2b8b4781
NC
1316#else
1317 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1318 STRLEN trie_charcount=0;
3dab1dad 1319#endif
2b8b4781 1320 SV *re_trie_maxbuff;
a3621e74 1321 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1322#ifndef DEBUGGING
1323 PERL_UNUSED_ARG(depth);
1324#endif
a3621e74 1325
c944940b 1326 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1327 trie->refcount = 1;
3dab1dad 1328 trie->startstate = 1;
786e8c11 1329 trie->wordcount = word_count;
f8fc2ecf 1330 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1331 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1332 if (!(UTF && folder))
c944940b 1333 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1334 DEBUG_r({
2b8b4781 1335 trie_words = newAV();
a3621e74 1336 });
a3621e74 1337
0111c4fd 1338 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1339 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1340 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1341 }
3dab1dad
YO
1342 DEBUG_OPTIMISE_r({
1343 PerlIO_printf( Perl_debug_log,
786e8c11 1344 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1345 (int)depth * 2 + 2, "",
1346 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1347 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1348 (int)depth);
3dab1dad 1349 });
7f69552c
YO
1350
1351 /* Find the node we are going to overwrite */
1352 if ( first == startbranch && OP( last ) != BRANCH ) {
1353 /* whole branch chain */
1354 convert = first;
1355 } else {
1356 /* branch sub-chain */
1357 convert = NEXTOPER( first );
1358 }
1359
a3621e74
YO
1360 /* -- First loop and Setup --
1361
1362 We first traverse the branches and scan each word to determine if it
1363 contains widechars, and how many unique chars there are, this is
1364 important as we have to build a table with at least as many columns as we
1365 have unique chars.
1366
1367 We use an array of integers to represent the character codes 0..255
38a44b82 1368 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1369 native representation of the character value as the key and IV's for the
1370 coded index.
1371
1372 *TODO* If we keep track of how many times each character is used we can
1373 remap the columns so that the table compression later on is more
1374 efficient in terms of memory by ensuring most common value is in the
1375 middle and the least common are on the outside. IMO this would be better
1376 than a most to least common mapping as theres a decent chance the most
1377 common letter will share a node with the least common, meaning the node
1378 will not be compressable. With a middle is most common approach the worst
1379 case is when we have the least common nodes twice.
1380
1381 */
1382
a3621e74 1383 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1384 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1385 const U8 *uc = (U8*)STRING( noper );
a28509cc 1386 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1387 STRLEN foldlen = 0;
1388 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1389 const U8 *scan = (U8*)NULL;
07be1b83 1390 U32 wordlen = 0; /* required init */
02daf0ab
YO
1391 STRLEN chars = 0;
1392 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1393
3dab1dad
YO
1394 if (OP(noper) == NOTHING) {
1395 trie->minlen= 0;
1396 continue;
1397 }
02daf0ab
YO
1398 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1399 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1400 regardless of encoding */
1401
a3621e74 1402 for ( ; uc < e ; uc += len ) {
3dab1dad 1403 TRIE_CHARCOUNT(trie)++;
a3621e74 1404 TRIE_READ_CHAR;
3dab1dad 1405 chars++;
a3621e74
YO
1406 if ( uvc < 256 ) {
1407 if ( !trie->charmap[ uvc ] ) {
1408 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1409 if ( folder )
1410 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1411 TRIE_STORE_REVCHAR;
a3621e74 1412 }
02daf0ab
YO
1413 if ( set_bit ) {
1414 /* store the codepoint in the bitmap, and if its ascii
1415 also store its folded equivelent. */
1416 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1417
1418 /* store the folded codepoint */
1419 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1420
1421 if ( !UTF ) {
1422 /* store first byte of utf8 representation of
1423 codepoints in the 127 < uvc < 256 range */
1424 if (127 < uvc && uvc < 192) {
1425 TRIE_BITMAP_SET(trie,194);
1426 } else if (191 < uvc ) {
1427 TRIE_BITMAP_SET(trie,195);
1428 /* && uvc < 256 -- we know uvc is < 256 already */
1429 }
1430 }
02daf0ab
YO
1431 set_bit = 0; /* We've done our bit :-) */
1432 }
a3621e74
YO
1433 } else {
1434 SV** svpp;
55eed653
NC
1435 if ( !widecharmap )
1436 widecharmap = newHV();
a3621e74 1437
55eed653 1438 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1439
1440 if ( !svpp )
e4584336 1441 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1442
1443 if ( !SvTRUE( *svpp ) ) {
1444 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1445 TRIE_STORE_REVCHAR;
a3621e74
YO
1446 }
1447 }
1448 }
3dab1dad
YO
1449 if( cur == first ) {
1450 trie->minlen=chars;
1451 trie->maxlen=chars;
1452 } else if (chars < trie->minlen) {
1453 trie->minlen=chars;
1454 } else if (chars > trie->maxlen) {
1455 trie->maxlen=chars;
1456 }
1457
a3621e74
YO
1458 } /* end first pass */
1459 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1460 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1461 (int)depth * 2 + 2,"",
55eed653 1462 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1463 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1464 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1465 );
c944940b 1466 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1467
1468 /*
1469 We now know what we are dealing with in terms of unique chars and
1470 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1471 representation using a flat table will take. If it's over a reasonable
1472 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1473 conservative but potentially much slower representation using an array
1474 of lists.
1475
1476 At the end we convert both representations into the same compressed
1477 form that will be used in regexec.c for matching with. The latter
1478 is a form that cannot be used to construct with but has memory
1479 properties similar to the list form and access properties similar
1480 to the table form making it both suitable for fast searches and
1481 small enough that its feasable to store for the duration of a program.
1482
1483 See the comment in the code where the compressed table is produced
1484 inplace from the flat tabe representation for an explanation of how
1485 the compression works.
1486
1487 */
1488
1489
3dab1dad 1490 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1491 /*
1492 Second Pass -- Array Of Lists Representation
1493
1494 Each state will be represented by a list of charid:state records
1495 (reg_trie_trans_le) the first such element holds the CUR and LEN
1496 points of the allocated array. (See defines above).
1497
1498 We build the initial structure using the lists, and then convert
1499 it into the compressed table form which allows faster lookups
1500 (but cant be modified once converted).
a3621e74
YO
1501 */
1502
a3621e74
YO
1503 STRLEN transcount = 1;
1504
1e2e3d02
YO
1505 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1506 "%*sCompiling trie using list compiler\n",
1507 (int)depth * 2 + 2, ""));
446bd890 1508
c944940b
JH
1509 trie->states = (reg_trie_state *)
1510 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1511 sizeof(reg_trie_state) );
a3621e74
YO
1512 TRIE_LIST_NEW(1);
1513 next_alloc = 2;
1514
1515 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1516
c445ea15
AL
1517 regnode * const noper = NEXTOPER( cur );
1518 U8 *uc = (U8*)STRING( noper );
1519 const U8 * const e = uc + STR_LEN( noper );
1520 U32 state = 1; /* required init */
1521 U16 charid = 0; /* sanity init */
1522 U8 *scan = (U8*)NULL; /* sanity init */
1523 STRLEN foldlen = 0; /* required init */
07be1b83 1524 U32 wordlen = 0; /* required init */
c445ea15
AL
1525 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1526
3dab1dad 1527 if (OP(noper) != NOTHING) {
786e8c11 1528 for ( ; uc < e ; uc += len ) {
c445ea15 1529
786e8c11 1530 TRIE_READ_CHAR;
c445ea15 1531
786e8c11
YO
1532 if ( uvc < 256 ) {
1533 charid = trie->charmap[ uvc ];
c445ea15 1534 } else {
55eed653 1535 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1536 if ( !svpp ) {
1537 charid = 0;
1538 } else {
1539 charid=(U16)SvIV( *svpp );
1540 }
c445ea15 1541 }
786e8c11
YO
1542 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1543 if ( charid ) {
a3621e74 1544
786e8c11
YO
1545 U16 check;
1546 U32 newstate = 0;
a3621e74 1547
786e8c11
YO
1548 charid--;
1549 if ( !trie->states[ state ].trans.list ) {
1550 TRIE_LIST_NEW( state );
c445ea15 1551 }
786e8c11
YO
1552 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1553 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1554 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1555 break;
1556 }
1557 }
1558 if ( ! newstate ) {
1559 newstate = next_alloc++;
1560 TRIE_LIST_PUSH( state, charid, newstate );
1561 transcount++;
1562 }
1563 state = newstate;
1564 } else {
1565 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1566 }
a28509cc 1567 }
c445ea15 1568 }
3dab1dad 1569 TRIE_HANDLE_WORD(state);
a3621e74
YO
1570
1571 } /* end second pass */
1572
1e2e3d02
YO
1573 /* next alloc is the NEXT state to be allocated */
1574 trie->statecount = next_alloc;
c944940b
JH
1575 trie->states = (reg_trie_state *)
1576 PerlMemShared_realloc( trie->states,
1577 next_alloc
1578 * sizeof(reg_trie_state) );
a3621e74 1579
3dab1dad 1580 /* and now dump it out before we compress it */
2b8b4781
NC
1581 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1582 revcharmap, next_alloc,
1583 depth+1)
1e2e3d02 1584 );
a3621e74 1585
c944940b
JH
1586 trie->trans = (reg_trie_trans *)
1587 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1588 {
1589 U32 state;
a3621e74
YO
1590 U32 tp = 0;
1591 U32 zp = 0;
1592
1593
1594 for( state=1 ; state < next_alloc ; state ++ ) {
1595 U32 base=0;
1596
1597 /*
1598 DEBUG_TRIE_COMPILE_MORE_r(
1599 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1600 );
1601 */
1602
1603 if (trie->states[state].trans.list) {
1604 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1605 U16 maxid=minid;
a28509cc 1606 U16 idx;
a3621e74
YO
1607
1608 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1609 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1610 if ( forid < minid ) {
1611 minid=forid;
1612 } else if ( forid > maxid ) {
1613 maxid=forid;
1614 }
a3621e74
YO
1615 }
1616 if ( transcount < tp + maxid - minid + 1) {
1617 transcount *= 2;
c944940b
JH
1618 trie->trans = (reg_trie_trans *)
1619 PerlMemShared_realloc( trie->trans,
446bd890
NC
1620 transcount
1621 * sizeof(reg_trie_trans) );
a3621e74
YO
1622 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1623 }
1624 base = trie->uniquecharcount + tp - minid;
1625 if ( maxid == minid ) {
1626 U32 set = 0;
1627 for ( ; zp < tp ; zp++ ) {
1628 if ( ! trie->trans[ zp ].next ) {
1629 base = trie->uniquecharcount + zp - minid;
1630 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1631 trie->trans[ zp ].check = state;
1632 set = 1;
1633 break;
1634 }
1635 }
1636 if ( !set ) {
1637 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1638 trie->trans[ tp ].check = state;
1639 tp++;
1640 zp = tp;
1641 }
1642 } else {
1643 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1644 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1645 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1646 trie->trans[ tid ].check = state;
1647 }
1648 tp += ( maxid - minid + 1 );
1649 }
1650 Safefree(trie->states[ state ].trans.list);
1651 }
1652 /*
1653 DEBUG_TRIE_COMPILE_MORE_r(
1654 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1655 );
1656 */
1657 trie->states[ state ].trans.base=base;
1658 }
cc601c31 1659 trie->lasttrans = tp + 1;
a3621e74
YO
1660 }
1661 } else {
1662 /*
1663 Second Pass -- Flat Table Representation.
1664
1665 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1666 We know that we will need Charcount+1 trans at most to store the data
1667 (one row per char at worst case) So we preallocate both structures
1668 assuming worst case.
1669
1670 We then construct the trie using only the .next slots of the entry
1671 structs.
1672
1673 We use the .check field of the first entry of the node temporarily to
1674 make compression both faster and easier by keeping track of how many non
1675 zero fields are in the node.
1676
1677 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1678 transition.
1679
1680 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1681 number representing the first entry of the node, and state as a
1682 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1683 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1684 are 2 entrys per node. eg:
1685
1686 A B A B
1687 1. 2 4 1. 3 7
1688 2. 0 3 3. 0 5
1689 3. 0 0 5. 0 0
1690 4. 0 0 7. 0 0
1691
1692 The table is internally in the right hand, idx form. However as we also
1693 have to deal with the states array which is indexed by nodenum we have to
1694 use TRIE_NODENUM() to convert.
1695
1696 */
1e2e3d02
YO
1697 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1698 "%*sCompiling trie using table compiler\n",
1699 (int)depth * 2 + 2, ""));
3dab1dad 1700
c944940b
JH
1701 trie->trans = (reg_trie_trans *)
1702 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1703 * trie->uniquecharcount + 1,
1704 sizeof(reg_trie_trans) );
1705 trie->states = (reg_trie_state *)
1706 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1707 sizeof(reg_trie_state) );
a3621e74
YO
1708 next_alloc = trie->uniquecharcount + 1;
1709
3dab1dad 1710
a3621e74
YO
1711 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1712
c445ea15 1713 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1714 const U8 *uc = (U8*)STRING( noper );
1715 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1716
1717 U32 state = 1; /* required init */
1718
1719 U16 charid = 0; /* sanity init */
1720 U32 accept_state = 0; /* sanity init */
1721 U8 *scan = (U8*)NULL; /* sanity init */
1722
1723 STRLEN foldlen = 0; /* required init */
07be1b83 1724 U32 wordlen = 0; /* required init */
a3621e74
YO
1725 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726
3dab1dad 1727 if ( OP(noper) != NOTHING ) {
786e8c11 1728 for ( ; uc < e ; uc += len ) {
a3621e74 1729
786e8c11 1730 TRIE_READ_CHAR;
a3621e74 1731
786e8c11
YO
1732 if ( uvc < 256 ) {
1733 charid = trie->charmap[ uvc ];
1734 } else {
55eed653 1735 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1736 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1737 }
786e8c11
YO
1738 if ( charid ) {
1739 charid--;
1740 if ( !trie->trans[ state + charid ].next ) {
1741 trie->trans[ state + charid ].next = next_alloc;
1742 trie->trans[ state ].check++;
1743 next_alloc += trie->uniquecharcount;
1744 }
1745 state = trie->trans[ state + charid ].next;
1746 } else {
1747 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1748 }
1749 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1750 }
a3621e74 1751 }
3dab1dad
YO
1752 accept_state = TRIE_NODENUM( state );
1753 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1754
1755 } /* end second pass */
1756
3dab1dad 1757 /* and now dump it out before we compress it */
2b8b4781
NC
1758 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1759 revcharmap,
1760 next_alloc, depth+1));
a3621e74 1761
a3621e74
YO
1762 {
1763 /*
1764 * Inplace compress the table.*
1765
1766 For sparse data sets the table constructed by the trie algorithm will
1767 be mostly 0/FAIL transitions or to put it another way mostly empty.
1768 (Note that leaf nodes will not contain any transitions.)
1769
1770 This algorithm compresses the tables by eliminating most such
1771 transitions, at the cost of a modest bit of extra work during lookup:
1772
1773 - Each states[] entry contains a .base field which indicates the
1774 index in the state[] array wheres its transition data is stored.
1775
1776 - If .base is 0 there are no valid transitions from that node.
1777
1778 - If .base is nonzero then charid is added to it to find an entry in
1779 the trans array.
1780
1781 -If trans[states[state].base+charid].check!=state then the
1782 transition is taken to be a 0/Fail transition. Thus if there are fail
1783 transitions at the front of the node then the .base offset will point
1784 somewhere inside the previous nodes data (or maybe even into a node
1785 even earlier), but the .check field determines if the transition is
1786 valid.
1787
786e8c11 1788 XXX - wrong maybe?
a3621e74
YO
1789 The following process inplace converts the table to the compressed
1790 table: We first do not compress the root node 1,and mark its all its
1791 .check pointers as 1 and set its .base pointer as 1 as well. This
1792 allows to do a DFA construction from the compressed table later, and
1793 ensures that any .base pointers we calculate later are greater than
1794 0.
1795
1796 - We set 'pos' to indicate the first entry of the second node.
1797
1798 - We then iterate over the columns of the node, finding the first and
1799 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1800 and set the .check pointers accordingly, and advance pos
1801 appropriately and repreat for the next node. Note that when we copy
1802 the next pointers we have to convert them from the original
1803 NODEIDX form to NODENUM form as the former is not valid post
1804 compression.
1805
1806 - If a node has no transitions used we mark its base as 0 and do not
1807 advance the pos pointer.
1808
1809 - If a node only has one transition we use a second pointer into the
1810 structure to fill in allocated fail transitions from other states.
1811 This pointer is independent of the main pointer and scans forward
1812 looking for null transitions that are allocated to a state. When it
1813 finds one it writes the single transition into the "hole". If the
786e8c11 1814 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1815
1816 - Once compressed we can Renew/realloc the structures to release the
1817 excess space.
1818
1819 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1820 specifically Fig 3.47 and the associated pseudocode.
1821
1822 demq
1823 */
a3b680e6 1824 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1825 U32 state, charid;
a3621e74 1826 U32 pos = 0, zp=0;
1e2e3d02 1827 trie->statecount = laststate;
a3621e74
YO
1828
1829 for ( state = 1 ; state < laststate ; state++ ) {
1830 U8 flag = 0;
a28509cc
AL
1831 const U32 stateidx = TRIE_NODEIDX( state );
1832 const U32 o_used = trie->trans[ stateidx ].check;
1833 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1834 trie->trans[ stateidx ].check = 0;
1835
1836 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1837 if ( flag || trie->trans[ stateidx + charid ].next ) {
1838 if ( trie->trans[ stateidx + charid ].next ) {
1839 if (o_used == 1) {
1840 for ( ; zp < pos ; zp++ ) {
1841 if ( ! trie->trans[ zp ].next ) {
1842 break;
1843 }
1844 }
1845 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1846 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1847 trie->trans[ zp ].check = state;
1848 if ( ++zp > pos ) pos = zp;
1849 break;
1850 }
1851 used--;
1852 }
1853 if ( !flag ) {
1854 flag = 1;
1855 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1856 }
1857 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1858 trie->trans[ pos ].check = state;
1859 pos++;
1860 }
1861 }
1862 }
cc601c31 1863 trie->lasttrans = pos + 1;
c944940b
JH
1864 trie->states = (reg_trie_state *)
1865 PerlMemShared_realloc( trie->states, laststate
1866 * sizeof(reg_trie_state) );
a3621e74 1867 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1868 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1869 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1870 (int)depth * 2 + 2,"",
1871 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1872 (IV)next_alloc,
1873 (IV)pos,
a3621e74
YO
1874 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1875 );
1876
1877 } /* end table compress */
1878 }
1e2e3d02
YO
1879 DEBUG_TRIE_COMPILE_MORE_r(
1880 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1881 (int)depth * 2 + 2, "",
1882 (UV)trie->statecount,
1883 (UV)trie->lasttrans)
1884 );
cc601c31 1885 /* resize the trans array to remove unused space */
c944940b
JH
1886 trie->trans = (reg_trie_trans *)
1887 PerlMemShared_realloc( trie->trans, trie->lasttrans
1888 * sizeof(reg_trie_trans) );
a3621e74 1889
3dab1dad 1890 /* and now dump out the compressed format */
2b8b4781 1891 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1892
3dab1dad 1893 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1894 U8 nodetype =(U8)(flags & 0xFF);
1895 char *str=NULL;
786e8c11 1896
07be1b83 1897#ifdef DEBUGGING
e62cc96a 1898 regnode *optimize = NULL;
7122b237
YO
1899#ifdef RE_TRACK_PATTERN_OFFSETS
1900
b57a0404
JH
1901 U32 mjd_offset = 0;
1902 U32 mjd_nodelen = 0;
7122b237
YO
1903#endif /* RE_TRACK_PATTERN_OFFSETS */
1904#endif /* DEBUGGING */
a3621e74 1905 /*
3dab1dad
YO
1906 This means we convert either the first branch or the first Exact,
1907 depending on whether the thing following (in 'last') is a branch
1908 or not and whther first is the startbranch (ie is it a sub part of
1909 the alternation or is it the whole thing.)
1910 Assuming its a sub part we conver the EXACT otherwise we convert
1911 the whole branch sequence, including the first.
a3621e74 1912 */
3dab1dad 1913 /* Find the node we are going to overwrite */
7f69552c 1914 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1915 /* branch sub-chain */
3dab1dad 1916 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1917#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1918 DEBUG_r({
1919 mjd_offset= Node_Offset((convert));
1920 mjd_nodelen= Node_Length((convert));
1921 });
7122b237 1922#endif
7f69552c 1923 /* whole branch chain */
7122b237
YO
1924 }
1925#ifdef RE_TRACK_PATTERN_OFFSETS
1926 else {
7f69552c
YO
1927 DEBUG_r({
1928 const regnode *nop = NEXTOPER( convert );
1929 mjd_offset= Node_Offset((nop));
1930 mjd_nodelen= Node_Length((nop));
1931 });
07be1b83
YO
1932 }
1933 DEBUG_OPTIMISE_r(
1934 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1935 (int)depth * 2 + 2, "",
786e8c11 1936 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1937 );
7122b237 1938#endif
3dab1dad
YO
1939 /* But first we check to see if there is a common prefix we can
1940 split out as an EXACT and put in front of the TRIE node. */
1941 trie->startstate= 1;
55eed653 1942 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1943 U32 state;
1e2e3d02 1944 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1945 U32 ofs = 0;
8e11feef
RGS
1946 I32 idx = -1;
1947 U32 count = 0;
1948 const U32 base = trie->states[ state ].trans.base;
a3621e74 1949
3dab1dad 1950 if ( trie->states[state].wordnum )
8e11feef 1951 count = 1;
a3621e74 1952
8e11feef 1953 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1954 if ( ( base + ofs >= trie->uniquecharcount ) &&
1955 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1956 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1957 {
3dab1dad 1958 if ( ++count > 1 ) {
2b8b4781 1959 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1960 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1961 if ( state == 1 ) break;
3dab1dad
YO
1962 if ( count == 2 ) {
1963 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1964 DEBUG_OPTIMISE_r(
8e11feef
RGS
1965 PerlIO_printf(Perl_debug_log,
1966 "%*sNew Start State=%"UVuf" Class: [",
1967 (int)depth * 2 + 2, "",
786e8c11 1968 (UV)state));
be8e71aa 1969 if (idx >= 0) {
2b8b4781 1970 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1971 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1972
3dab1dad 1973 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1974 if ( folder )
1975 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1976 DEBUG_OPTIMISE_r(
07be1b83 1977 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1978 );
8e11feef
RGS
1979 }
1980 }
1981 TRIE_BITMAP_SET(trie,*ch);
1982 if ( folder )
1983 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1984 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1985 }
1986 idx = ofs;
1987 }
3dab1dad
YO
1988 }
1989 if ( count == 1 ) {
2b8b4781 1990 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
1991 STRLEN len;
1992 char *ch = SvPV( *tmp, len );
de734bd5
A
1993 DEBUG_OPTIMISE_r({
1994 SV *sv=sv_newmortal();
8e11feef
RGS
1995 PerlIO_printf( Perl_debug_log,
1996 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1997 (int)depth * 2 + 2, "",
de734bd5
A
1998 (UV)state, (UV)idx,
1999 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2000 PL_colors[0], PL_colors[1],
2001 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2002 PERL_PV_ESCAPE_FIRSTCHAR
2003 )
2004 );
2005 });
3dab1dad
YO
2006 if ( state==1 ) {
2007 OP( convert ) = nodetype;
2008 str=STRING(convert);
2009 STR_LEN(convert)=0;
2010 }
c490c714
YO
2011 STR_LEN(convert) += len;
2012 while (len--)
de734bd5 2013 *str++ = *ch++;
8e11feef 2014 } else {
f9049ba1 2015#ifdef DEBUGGING
8e11feef
RGS
2016 if (state>1)
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2018#endif
8e11feef
RGS
2019 break;
2020 }
2021 }
3dab1dad 2022 if (str) {
8e11feef 2023 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2024 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2025 trie->startstate = state;
07be1b83
YO
2026 trie->minlen -= (state - 1);
2027 trie->maxlen -= (state - 1);
33809eae
JH
2028#ifdef DEBUGGING
2029 /* At least the UNICOS C compiler choked on this
2030 * being argument to DEBUG_r(), so let's just have
2031 * it right here. */
2032 if (
2033#ifdef PERL_EXT_RE_BUILD
2034 1
2035#else
2036 DEBUG_r_TEST
2037#endif
2038 ) {
2039 regnode *fix = convert;
2040 U32 word = trie->wordcount;
2041 mjd_nodelen++;
2042 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2043 while( ++fix < n ) {
2044 Set_Node_Offset_Length(fix, 0, 0);
2045 }
2046 while (word--) {
2047 SV ** const tmp = av_fetch( trie_words, word, 0 );
2048 if (tmp) {
2049 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2050 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2051 else
2052 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2053 }
2054 }
2055 }
2056#endif
8e11feef
RGS
2057 if (trie->maxlen) {
2058 convert = n;
2059 } else {
3dab1dad 2060 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2061 DEBUG_r(optimize= n);
3dab1dad
YO
2062 }
2063 }
2064 }
a5ca303d
YO
2065 if (!jumper)
2066 jumper = last;
3dab1dad 2067 if ( trie->maxlen ) {
8e11feef
RGS
2068 NEXT_OFF( convert ) = (U16)(tail - convert);
2069 ARG_SET( convert, data_slot );
786e8c11
YO
2070 /* Store the offset to the first unabsorbed branch in
2071 jump[0], which is otherwise unused by the jump logic.
2072 We use this when dumping a trie and during optimisation. */
2073 if (trie->jump)
7f69552c 2074 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2075
786e8c11
YO
2076 /* XXXX */
2077 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2078 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2079 {
2080 OP( convert ) = TRIEC;
2081 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2082 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2083 trie->bitmap= NULL;
2084 } else
2085 OP( convert ) = TRIE;
a3621e74 2086
3dab1dad
YO
2087 /* store the type in the flags */
2088 convert->flags = nodetype;
a5ca303d
YO
2089 DEBUG_r({
2090 optimize = convert
2091 + NODE_STEP_REGNODE
2092 + regarglen[ OP( convert ) ];
2093 });
2094 /* XXX We really should free up the resource in trie now,
2095 as we won't use them - (which resources?) dmq */
3dab1dad 2096 }
a3621e74 2097 /* needed for dumping*/
e62cc96a 2098 DEBUG_r(if (optimize) {
07be1b83 2099 regnode *opt = convert;
bcdf7404 2100
e62cc96a 2101 while ( ++opt < optimize) {
07be1b83
YO
2102 Set_Node_Offset_Length(opt,0,0);
2103 }
786e8c11
YO
2104 /*
2105 Try to clean up some of the debris left after the
2106 optimisation.
a3621e74 2107 */
786e8c11 2108 while( optimize < jumper ) {
07be1b83 2109 mjd_nodelen += Node_Length((optimize));
a3621e74 2110 OP( optimize ) = OPTIMIZED;
07be1b83 2111 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2112 optimize++;
2113 }
07be1b83 2114 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2115 });
2116 } /* end node insert */
55eed653 2117 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2118#ifdef DEBUGGING
2119 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2120 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2121#else
2122 SvREFCNT_dec(revcharmap);
07be1b83 2123#endif
786e8c11
YO
2124 return trie->jump
2125 ? MADE_JUMP_TRIE
2126 : trie->startstate>1
2127 ? MADE_EXACT_TRIE
2128 : MADE_TRIE;
2129}
2130
2131STATIC void
2132S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2133{
2134/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2135
2136 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2137 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2138 ISBN 0-201-10088-6
2139
2140 We find the fail state for each state in the trie, this state is the longest proper
2141 suffix of the current states 'word' that is also a proper prefix of another word in our
2142 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2143 the DFA not to have to restart after its tried and failed a word at a given point, it
2144 simply continues as though it had been matching the other word in the first place.
2145 Consider
2146 'abcdgu'=~/abcdefg|cdgu/
2147 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2148 fail, which would bring use to the state representing 'd' in the second word where we would
2149 try 'g' and succeed, prodceding to match 'cdgu'.
2150 */
2151 /* add a fail transition */
3251b653
NC
2152 const U32 trie_offset = ARG(source);
2153 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2154 U32 *q;
2155 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2156 const U32 numstates = trie->statecount;
786e8c11
YO
2157 const U32 ubound = trie->lasttrans + ucharcount;
2158 U32 q_read = 0;
2159 U32 q_write = 0;
2160 U32 charid;
2161 U32 base = trie->states[ 1 ].trans.base;
2162 U32 *fail;
2163 reg_ac_data *aho;
2164 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2165 GET_RE_DEBUG_FLAGS_DECL;
2166#ifndef DEBUGGING
2167 PERL_UNUSED_ARG(depth);
2168#endif
2169
2170
2171 ARG_SET( stclass, data_slot );
c944940b 2172 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2173 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2174 aho->trie=trie_offset;
446bd890
NC
2175 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2176 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2177 Newxz( q, numstates, U32);
c944940b 2178 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2179 aho->refcount = 1;
2180 fail = aho->fail;
2181 /* initialize fail[0..1] to be 1 so that we always have
2182 a valid final fail state */
2183 fail[ 0 ] = fail[ 1 ] = 1;
2184
2185 for ( charid = 0; charid < ucharcount ; charid++ ) {
2186 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2187 if ( newstate ) {
2188 q[ q_write ] = newstate;
2189 /* set to point at the root */
2190 fail[ q[ q_write++ ] ]=1;
2191 }
2192 }
2193 while ( q_read < q_write) {
2194 const U32 cur = q[ q_read++ % numstates ];
2195 base = trie->states[ cur ].trans.base;
2196
2197 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2198 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2199 if (ch_state) {
2200 U32 fail_state = cur;
2201 U32 fail_base;
2202 do {
2203 fail_state = fail[ fail_state ];
2204 fail_base = aho->states[ fail_state ].trans.base;
2205 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2206
2207 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2208 fail[ ch_state ] = fail_state;
2209 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2210 {
2211 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2212 }
2213 q[ q_write++ % numstates] = ch_state;
2214 }
2215 }
2216 }
2217 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2218 when we fail in state 1, this allows us to use the
2219 charclass scan to find a valid start char. This is based on the principle
2220 that theres a good chance the string being searched contains lots of stuff
2221 that cant be a start char.
2222 */
2223 fail[ 0 ] = fail[ 1 ] = 0;
2224 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2225 PerlIO_printf(Perl_debug_log,
2226 "%*sStclass Failtable (%"UVuf" states): 0",
2227 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2228 );
786e8c11
YO
2229 for( q_read=1; q_read<numstates; q_read++ ) {
2230 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2231 }
2232 PerlIO_printf(Perl_debug_log, "\n");
2233 });
2234 Safefree(q);
2235 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2236}
2237
786e8c11 2238
a3621e74 2239/*
5d1c421c
JH
2240 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2241 * These need to be revisited when a newer toolchain becomes available.
2242 */
2243#if defined(__sparc64__) && defined(__GNUC__)
2244# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2245# undef SPARC64_GCC_WORKAROUND
2246# define SPARC64_GCC_WORKAROUND 1
2247# endif
2248#endif
2249
07be1b83 2250#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2251 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2252 SV * const mysv=sv_newmortal(); \
2253 regnode *Next = regnext(scan); \
2254 regprop(RExC_rx, mysv, scan); \
7f69552c 2255 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2256 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2257 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2258 }});
07be1b83 2259
1de06328
YO
2260
2261
2262
2263
07be1b83
YO
2264#define JOIN_EXACT(scan,min,flags) \
2265 if (PL_regkind[OP(scan)] == EXACT) \
2266 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2267
be8e71aa 2268STATIC U32
07be1b83
YO
2269S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2270 /* Merge several consecutive EXACTish nodes into one. */
2271 regnode *n = regnext(scan);
2272 U32 stringok = 1;
2273 regnode *next = scan + NODE_SZ_STR(scan);
2274 U32 merged = 0;
2275 U32 stopnow = 0;
2276#ifdef DEBUGGING
2277 regnode *stop = scan;
72f13be8 2278 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2279#else
d47053eb
RGS
2280 PERL_UNUSED_ARG(depth);
2281#endif
2282#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2283 PERL_UNUSED_ARG(flags);
2284 PERL_UNUSED_ARG(val);
07be1b83 2285#endif
07be1b83
YO
2286 DEBUG_PEEP("join",scan,depth);
2287
2288 /* Skip NOTHING, merge EXACT*. */
2289 while (n &&
2290 ( PL_regkind[OP(n)] == NOTHING ||
2291 (stringok && (OP(n) == OP(scan))))
2292 && NEXT_OFF(n)
2293 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2294
2295 if (OP(n) == TAIL || n > next)
2296 stringok = 0;
2297 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2298 DEBUG_PEEP("skip:",n,depth);
2299 NEXT_OFF(scan) += NEXT_OFF(n);
2300 next = n + NODE_STEP_REGNODE;
2301#ifdef DEBUGGING
2302 if (stringok)
2303 stop = n;
2304#endif
2305 n = regnext(n);
2306 }
2307 else if (stringok) {
786e8c11 2308 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2309 regnode * const nnext = regnext(n);
2310
2311 DEBUG_PEEP("merg",n,depth);
2312
2313 merged++;
2314 if (oldl + STR_LEN(n) > U8_MAX)
2315 break;
2316 NEXT_OFF(scan) += NEXT_OFF(n);
2317 STR_LEN(scan) += STR_LEN(n);
2318 next = n + NODE_SZ_STR(n);
2319 /* Now we can overwrite *n : */
2320 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2321#ifdef DEBUGGING
2322 stop = next - 1;
2323#endif
2324 n = nnext;
2325 if (stopnow) break;
2326 }
2327
d47053eb
RGS
2328#ifdef EXPERIMENTAL_INPLACESCAN
2329 if (flags && !NEXT_OFF(n)) {
2330 DEBUG_PEEP("atch", val, depth);
2331 if (reg_off_by_arg[OP(n)]) {
2332 ARG_SET(n, val - n);
2333 }
2334 else {
2335 NEXT_OFF(n) = val - n;
2336 }
2337 stopnow = 1;
2338 }
07be1b83
YO
2339#endif
2340 }
2341
2342 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2343 /*
2344 Two problematic code points in Unicode casefolding of EXACT nodes:
2345
2346 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2347 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2348
2349 which casefold to
2350
2351 Unicode UTF-8
2352
2353 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2354 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2355
2356 This means that in case-insensitive matching (or "loose matching",
2357 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2358 length of the above casefolded versions) can match a target string
2359 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2360 This would rather mess up the minimum length computation.
2361
2362 What we'll do is to look for the tail four bytes, and then peek
2363 at the preceding two bytes to see whether we need to decrease
2364 the minimum length by four (six minus two).
2365
2366 Thanks to the design of UTF-8, there cannot be false matches:
2367 A sequence of valid UTF-8 bytes cannot be a subsequence of
2368 another valid sequence of UTF-8 bytes.
2369
2370 */
2371 char * const s0 = STRING(scan), *s, *t;
2372 char * const s1 = s0 + STR_LEN(scan) - 1;
2373 char * const s2 = s1 - 4;
e294cc5d
JH
2374#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2375 const char t0[] = "\xaf\x49\xaf\x42";
2376#else
07be1b83 2377 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2378#endif
07be1b83
YO
2379 const char * const t1 = t0 + 3;
2380
2381 for (s = s0 + 2;
2382 s < s2 && (t = ninstr(s, s1, t0, t1));
2383 s = t + 4) {
e294cc5d
JH
2384#ifdef EBCDIC
2385 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2386 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2387#else
07be1b83
YO
2388 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2389 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2390#endif
07be1b83
YO
2391 *min -= 4;
2392 }
2393 }
2394
2395#ifdef DEBUGGING
2396 /* Allow dumping */
2397 n = scan + NODE_SZ_STR(scan);
2398 while (n <= stop) {
2399 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2400 OP(n) = OPTIMIZED;
2401 NEXT_OFF(n) = 0;
2402 }
2403 n++;
2404 }
2405#endif
2406 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2407 return stopnow;
2408}
2409
653099ff
GS
2410/* REx optimizer. Converts nodes into quickier variants "in place".
2411 Finds fixed substrings. */
2412
a0288114 2413/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2414 to the position after last scanned or to NULL. */
2415
40d049e4
YO
2416#define INIT_AND_WITHP \
2417 assert(!and_withp); \
2418 Newx(and_withp,1,struct regnode_charclass_class); \
2419 SAVEFREEPV(and_withp)
07be1b83 2420
b515a41d
YO
2421/* this is a chain of data about sub patterns we are processing that
2422 need to be handled seperately/specially in study_chunk. Its so
2423 we can simulate recursion without losing state. */
2424struct scan_frame;
2425typedef struct scan_frame {
2426 regnode *last; /* last node to process in this frame */
2427 regnode *next; /* next node to process when last is reached */
2428 struct scan_frame *prev; /*previous frame*/
2429 I32 stop; /* what stopparen do we use */
2430} scan_frame;
2431
304ee84b
YO
2432
2433#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2434
e1d1eefb
YO
2435#define CASE_SYNST_FNC(nAmE) \
2436case nAmE: \
2437 if (flags & SCF_DO_STCLASS_AND) { \
2438 for (value = 0; value < 256; value++) \
2439 if (!is_ ## nAmE ## _cp(value)) \
2440 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2441 } \
2442 else { \
2443 for (value = 0; value < 256; value++) \
2444 if (is_ ## nAmE ## _cp(value)) \
2445 ANYOF_BITMAP_SET(data->start_class, value); \
2446 } \
2447 break; \
2448case N ## nAmE: \
2449 if (flags & SCF_DO_STCLASS_AND) { \
2450 for (value = 0; value < 256; value++) \
2451 if (is_ ## nAmE ## _cp(value)) \
2452 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2453 } \
2454 else { \
2455 for (value = 0; value < 256; value++) \
2456 if (!is_ ## nAmE ## _cp(value)) \
2457 ANYOF_BITMAP_SET(data->start_class, value); \
2458 } \
2459 break
2460
2461
2462
76e3520e 2463STATIC I32
40d049e4 2464S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2465 I32 *minlenp, I32 *deltap,
40d049e4
YO
2466 regnode *last,
2467 scan_data_t *data,
2468 I32 stopparen,
2469 U8* recursed,
2470 struct regnode_charclass_class *and_withp,
2471 U32 flags, U32 depth)
c277df42
IZ
2472 /* scanp: Start here (read-write). */
2473 /* deltap: Write maxlen-minlen here. */
2474 /* last: Stop before this one. */
40d049e4
YO
2475 /* data: string data about the pattern */
2476 /* stopparen: treat close N as END */
2477 /* recursed: which subroutines have we recursed into */
2478 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2479{
97aff369 2480 dVAR;
c277df42
IZ
2481 I32 min = 0, pars = 0, code;
2482 regnode *scan = *scanp, *next;
2483 I32 delta = 0;
2484 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2485 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2486 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2487 scan_data_t data_fake;
a3621e74 2488 SV *re_trie_maxbuff = NULL;
786e8c11 2489 regnode *first_non_open = scan;
e2e6a0f1 2490 I32 stopmin = I32_MAX;
8aa23a47
YO
2491 scan_frame *frame = NULL;
2492
a3621e74 2493 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2494
13a24bad 2495#ifdef DEBUGGING
40d049e4 2496 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2497#endif
40d049e4 2498
786e8c11 2499 if ( depth == 0 ) {
40d049e4 2500 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2501 first_non_open=regnext(first_non_open);
2502 }
2503
b81d288d 2504
8aa23a47
YO
2505 fake_study_recurse:
2506 while ( scan && OP(scan) != END && scan < last ){
2507 /* Peephole optimizer: */
304ee84b 2508 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2509 DEBUG_PEEP("Peep",scan,depth);
2510 JOIN_EXACT(scan,&min,0);
2511
2512 /* Follow the next-chain of the current node and optimize
2513 away all the NOTHINGs from it. */
2514 if (OP(scan) != CURLYX) {
2515 const int max = (reg_off_by_arg[OP(scan)]
2516 ? I32_MAX
2517 /* I32 may be smaller than U16 on CRAYs! */
2518 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2519 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2520 int noff;
2521 regnode *n = scan;
2522
2523 /* Skip NOTHING and LONGJMP. */
2524 while ((n = regnext(n))
2525 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2526 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2527 && off + noff < max)
2528 off += noff;
2529 if (reg_off_by_arg[OP(scan)])
2530 ARG(scan) = off;
2531 else
2532 NEXT_OFF(scan) = off;
2533 }
a3621e74 2534
c277df42 2535
8aa23a47
YO
2536
2537 /* The principal pseudo-switch. Cannot be a switch, since we
2538 look into several different things. */
2539 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2540 || OP(scan) == IFTHEN) {
2541 next = regnext(scan);
2542 code = OP(scan);
2543 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2544
2545 if (OP(next) == code || code == IFTHEN) {
2546 /* NOTE - There is similar code to this block below for handling
2547 TRIE nodes on a re-study. If you change stuff here check there
2548 too. */
2549 I32 max1 = 0, min1 = I32_MAX, num = 0;
2550 struct regnode_charclass_class accum;
2551 regnode * const startbranch=scan;
2552
2553 if (flags & SCF_DO_SUBSTR)
304ee84b 2554 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2555 if (flags & SCF_DO_STCLASS)
2556 cl_init_zero(pRExC_state, &accum);
2557
2558 while (OP(scan) == code) {
2559 I32 deltanext, minnext, f = 0, fake;
2560 struct regnode_charclass_class this_class;
2561
2562 num++;
2563 data_fake.flags = 0;
2564 if (data) {
2565 data_fake.whilem_c = data->whilem_c;
2566 data_fake.last_closep = data->last_closep;
2567 }
2568 else
2569 data_fake.last_closep = &fake;
58e23c8d
YO
2570
2571 data_fake.pos_delta = delta;
8aa23a47
YO
2572 next = regnext(scan);
2573 scan = NEXTOPER(scan);
2574 if (code != BRANCH)
c277df42 2575 scan = NEXTOPER(scan);
8aa23a47
YO
2576 if (flags & SCF_DO_STCLASS) {
2577 cl_init(pRExC_state, &this_class);
2578 data_fake.start_class = &this_class;
2579 f = SCF_DO_STCLASS_AND;
58e23c8d 2580 }
8aa23a47
YO
2581 if (flags & SCF_WHILEM_VISITED_POS)
2582 f |= SCF_WHILEM_VISITED_POS;
2583
2584 /* we suppose the run is continuous, last=next...*/
2585 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2586 next, &data_fake,
2587 stopparen, recursed, NULL, f,depth+1);
2588 if (min1 > minnext)
2589 min1 = minnext;
2590 if (max1 < minnext + deltanext)
2591 max1 = minnext + deltanext;
2592 if (deltanext == I32_MAX)
2593 is_inf = is_inf_internal = 1;
2594 scan = next;
2595 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2596 pars++;
2597 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2598 if ( stopmin > minnext)
2599 stopmin = min + min1;
2600 flags &= ~SCF_DO_SUBSTR;
2601 if (data)
2602 data->flags |= SCF_SEEN_ACCEPT;
2603 }
2604 if (data) {
2605 if (data_fake.flags & SF_HAS_EVAL)
2606 data->flags |= SF_HAS_EVAL;
2607 data->whilem_c = data_fake.whilem_c;
3dab1dad 2608 }
8aa23a47
YO
2609 if (flags & SCF_DO_STCLASS)
2610 cl_or(pRExC_state, &accum, &this_class);
2611 }
2612 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2613 min1 = 0;
2614 if (flags & SCF_DO_SUBSTR) {
2615 data->pos_min += min1;
2616 data->pos_delta += max1 - min1;
2617 if (max1 != min1 || is_inf)
2618 data->longest = &(data->longest_float);
2619 }
2620 min += min1;
2621 delta += max1 - min1;
2622 if (flags & SCF_DO_STCLASS_OR) {
2623 cl_or(pRExC_state, data->start_class, &accum);
2624 if (min1) {
2625 cl_and(data->start_class, and_withp);
2626 flags &= ~SCF_DO_STCLASS;
653099ff 2627 }
8aa23a47
YO
2628 }
2629 else if (flags & SCF_DO_STCLASS_AND) {
2630 if (min1) {
2631 cl_and(data->start_class, &accum);
2632 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2633 }
8aa23a47
YO
2634 else {
2635 /* Switch to OR mode: cache the old value of
2636 * data->start_class */
2637 INIT_AND_WITHP;
2638 StructCopy(data->start_class, and_withp,
2639 struct regnode_charclass_class);
2640 flags &= ~SCF_DO_STCLASS_AND;
2641 StructCopy(&accum, data->start_class,
2642 struct regnode_charclass_class);
2643 flags |= SCF_DO_STCLASS_OR;
2644 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2645 }
8aa23a47 2646 }
a3621e74 2647
8aa23a47
YO
2648 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2649 /* demq.
a3621e74 2650
8aa23a47
YO
2651 Assuming this was/is a branch we are dealing with: 'scan' now
2652 points at the item that follows the branch sequence, whatever
2653 it is. We now start at the beginning of the sequence and look
2654 for subsequences of
a3621e74 2655
8aa23a47
YO
2656 BRANCH->EXACT=>x1
2657 BRANCH->EXACT=>x2
2658 tail
a3621e74 2659
8aa23a47 2660 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2661
8aa23a47
YO
2662 If we can find such a subseqence we need to turn the first
2663 element into a trie and then add the subsequent branch exact
2664 strings to the trie.
a3621e74 2665
8aa23a47 2666 We have two cases
a3621e74 2667
8aa23a47 2668 1. patterns where the whole set of branch can be converted.
a3621e74 2669
8aa23a47 2670 2. patterns where only a subset can be converted.
a3621e74 2671
8aa23a47
YO
2672 In case 1 we can replace the whole set with a single regop
2673 for the trie. In case 2 we need to keep the start and end
2674 branchs so
a3621e74 2675
8aa23a47
YO
2676 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2677 becomes BRANCH TRIE; BRANCH X;
786e8c11 2678
8aa23a47
YO
2679 There is an additional case, that being where there is a
2680 common prefix, which gets split out into an EXACT like node
2681 preceding the TRIE node.
a3621e74 2682
8aa23a47
YO
2683 If x(1..n)==tail then we can do a simple trie, if not we make
2684 a "jump" trie, such that when we match the appropriate word
2685 we "jump" to the appopriate tail node. Essentailly we turn
2686 a nested if into a case structure of sorts.
b515a41d 2687
8aa23a47
YO
2688 */
2689
2690 int made=0;
2691 if (!re_trie_maxbuff) {
2692 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2693 if (!SvIOK(re_trie_maxbuff))
2694 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2695 }
2696 if ( SvIV(re_trie_maxbuff)>=0 ) {
2697 regnode *cur;
2698 regnode *first = (regnode *)NULL;
2699 regnode *last = (regnode *)NULL;
2700 regnode *tail = scan;
2701 U8 optype = 0;
2702 U32 count=0;
a3621e74
YO
2703
2704#ifdef DEBUGGING
8aa23a47 2705 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2706#endif
8aa23a47
YO
2707 /* var tail is used because there may be a TAIL
2708 regop in the way. Ie, the exacts will point to the
2709 thing following the TAIL, but the last branch will
2710 point at the TAIL. So we advance tail. If we
2711 have nested (?:) we may have to move through several
2712 tails.
2713 */
2714
2715 while ( OP( tail ) == TAIL ) {
2716 /* this is the TAIL generated by (?:) */
2717 tail = regnext( tail );
2718 }
a3621e74 2719
8aa23a47
YO
2720
2721 DEBUG_OPTIMISE_r({
2722 regprop(RExC_rx, mysv, tail );
2723 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2724 (int)depth * 2 + 2, "",
2725 "Looking for TRIE'able sequences. Tail node is: ",
2726 SvPV_nolen_const( mysv )
2727 );
2728 });
2729
2730 /*
2731
2732 step through the branches, cur represents each
2733 branch, noper is the first thing to be matched
2734 as part of that branch and noper_next is the
2735 regnext() of that node. if noper is an EXACT
2736 and noper_next is the same as scan (our current
2737 position in the regex) then the EXACT branch is
2738 a possible optimization target. Once we have
2739 two or more consequetive such branches we can
2740 create a trie of the EXACT's contents and stich
2741 it in place. If the sequence represents all of
2742 the branches we eliminate the whole thing and
2743 replace it with a single TRIE. If it is a
2744 subsequence then we need to stitch it in. This
2745 means the first branch has to remain, and needs
2746 to be repointed at the item on the branch chain
2747 following the last branch optimized. This could
2748 be either a BRANCH, in which case the
2749 subsequence is internal, or it could be the
2750 item following the branch sequence in which
2751 case the subsequence is at the end.
2752
2753 */
2754
2755 /* dont use tail as the end marker for this traverse */
2756 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2757 regnode * const noper = NEXTOPER( cur );
b515a41d 2758#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2759 regnode * const noper_next = regnext( noper );
b515a41d
YO
2760#endif
2761
8aa23a47
YO
2762 DEBUG_OPTIMISE_r({
2763 regprop(RExC_rx, mysv, cur);
2764 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2765 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2766
2767 regprop(RExC_rx, mysv, noper);
2768 PerlIO_printf( Perl_debug_log, " -> %s",
2769 SvPV_nolen_const(mysv));
2770
2771 if ( noper_next ) {
2772 regprop(RExC_rx, mysv, noper_next );
2773 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2774 SvPV_nolen_const(mysv));
2775 }
2776 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2777 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2778 });
2779 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2780 : PL_regkind[ OP( noper ) ] == EXACT )
2781 || OP(noper) == NOTHING )
786e8c11 2782#ifdef NOJUMPTRIE
8aa23a47 2783 && noper_next == tail
786e8c11 2784#endif
8aa23a47
YO
2785 && count < U16_MAX)
2786 {
2787 count++;
2788 if ( !first || optype == NOTHING ) {
2789 if (!first) first = cur;
2790 optype = OP( noper );
2791 } else {
2792 last = cur;
2793 }
2794 } else {
a0a388a1
YO
2795/*
2796 Currently we assume that the trie can handle unicode and ascii
2797 matches fold cased matches. If this proves true then the following
2798 define will prevent tries in this situation.
2799
2800 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2801*/
2802#define TRIE_TYPE_IS_SAFE 1
2803 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2804 make_trie( pRExC_state,
2805 startbranch, first, cur, tail, count,
2806 optype, depth+1 );
2807 }
2808 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2809#ifdef NOJUMPTRIE
8aa23a47 2810 && noper_next == tail
786e8c11 2811#endif
8aa23a47
YO
2812 ){
2813 count = 1;
2814 first = cur;
2815 optype = OP( noper );
2816 } else {
2817 count = 0;
2818 first = NULL;
2819 optype = 0;
2820 }
2821 last = NULL;
2822 }
2823 }
2824 DEBUG_OPTIMISE_r({
2825 regprop(RExC_rx, mysv, cur);
2826 PerlIO_printf( Perl_debug_log,
2827 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2828 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2829
2830 });
a0a388a1
YO
2831
2832 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2833 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2834#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2835 if ( ((made == MADE_EXACT_TRIE &&
2836 startbranch == first)
2837 || ( first_non_open == first )) &&
2838 depth==0 ) {
2839 flags |= SCF_TRIE_RESTUDY;
2840 if ( startbranch == first
2841 && scan == tail )
2842 {
2843 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2844 }
2845 }
3dab1dad 2846#endif
8aa23a47
YO
2847 }
2848 }
2849
2850 } /* do trie */
2851
653099ff 2852 }
8aa23a47
YO
2853 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2854 scan = NEXTOPER(NEXTOPER(scan));
2855 } else /* single branch is optimized. */
2856 scan = NEXTOPER(scan);
2857 continue;
2858 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2859 scan_frame *newframe = NULL;
2860 I32 paren;
2861 regnode *start;
2862 regnode *end;
2863
2864 if (OP(scan) != SUSPEND) {
2865 /* set the pointer */
2866 if (OP(scan) == GOSUB) {
2867 paren = ARG(scan);
2868 RExC_recurse[ARG2L(scan)] = scan;
2869 start = RExC_open_parens[paren-1];
2870 end = RExC_close_parens[paren-1];
2871 } else {
2872 paren = 0;
f8fc2ecf 2873 start = RExC_rxi->program + 1;
8aa23a47
YO
2874 end = RExC_opend;
2875 }
2876 if (!recursed) {
2877 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2878 SAVEFREEPV(recursed);
2879 }
2880 if (!PAREN_TEST(recursed,paren+1)) {
2881 PAREN_SET(recursed,paren+1);
2882 Newx(newframe,1,scan_frame);
2883 } else {
2884 if (flags & SCF_DO_SUBSTR) {
304ee84b 2885 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2886 data->longest = &(data->longest_float);
2887 }
2888 is_inf = is_inf_internal = 1;
2889 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2890 cl_anything(pRExC_state, data->start_class);
2891 flags &= ~SCF_DO_STCLASS;
2892 }
2893 } else {
2894 Newx(newframe,1,scan_frame);
2895 paren = stopparen;
2896 start = scan+2;
2897 end = regnext(scan);
2898 }
2899 if (newframe) {
2900 assert(start);
2901 assert(end);
2902 SAVEFREEPV(newframe);
2903 newframe->next = regnext(scan);
2904 newframe->last = last;
2905 newframe->stop = stopparen;
2906 newframe->prev = frame;
2907
2908 frame = newframe;
2909 scan = start;
2910 stopparen = paren;
2911 last = end;
2912
2913 continue;
2914 }
2915 }
2916 else if (OP(scan) == EXACT) {
2917 I32 l = STR_LEN(scan);
2918 UV uc;
2919 if (UTF) {
2920 const U8 * const s = (U8*)STRING(scan);
2921 l = utf8_length(s, s + l);
2922 uc = utf8_to_uvchr(s, NULL);
2923 } else {
2924 uc = *((U8*)STRING(scan));
2925 }
2926 min += l;
2927 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2928 /* The code below prefers earlier match for fixed
2929 offset, later match for variable offset. */
2930 if (data->last_end == -1) { /* Update the start info. */
2931 data->last_start_min = data->pos_min;
2932 data->last_start_max = is_inf
2933 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2934 }
8aa23a47
YO
2935 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2936 if (UTF)
2937 SvUTF8_on(data->last_found);
2938 {
2939 SV * const sv = data->last_found;
2940 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2941 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2942 if (mg && mg->mg_len >= 0)
2943 mg->mg_len += utf8_length((U8*)STRING(scan),
2944 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2945 }
8aa23a47
YO
2946 data->last_end = data->pos_min + l;
2947 data->pos_min += l; /* As in the first entry. */
2948 data->flags &= ~SF_BEFORE_EOL;
2949 }
2950 if (flags & SCF_DO_STCLASS_AND) {
2951 /* Check whether it is compatible with what we know already! */
2952 int compat = 1;
2953
2954 if (uc >= 0x100 ||
2955 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2956 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2957 && (!(data->start_class->flags & ANYOF_FOLD)
2958 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2959 )
2960 compat = 0;
2961 ANYOF_CLASS_ZERO(data->start_class);
2962 ANYOF_BITMAP_ZERO(data->start_class);
2963 if (compat)
2964 ANYOF_BITMAP_SET(data->start_class, uc);
2965 data->start_class->flags &= ~ANYOF_EOS;
2966 if (uc < 0x100)
2967 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2968 }
2969 else if (flags & SCF_DO_STCLASS_OR) {
2970 /* false positive possible if the class is case-folded */
2971 if (uc < 0x100)
2972 ANYOF_BITMAP_SET(data->start_class, uc);
2973 else
2974 data->start_class->flags |= ANYOF_UNICODE_ALL;
2975 data->start_class->flags &= ~ANYOF_EOS;
2976 cl_and(data->start_class, and_withp);
2977 }
2978 flags &= ~SCF_DO_STCLASS;
2979 }
2980 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2981 I32 l = STR_LEN(scan);
2982 UV uc = *((U8*)STRING(scan));
2983
2984 /* Search for fixed substrings supports EXACT only. */
2985 if (flags & SCF_DO_SUBSTR) {
2986 assert(data);
304ee84b 2987 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2988 }
2989 if (UTF) {
2990 const U8 * const s = (U8 *)STRING(scan);
2991 l = utf8_length(s, s + l);
2992 uc = utf8_to_uvchr(s, NULL);
2993 }
2994 min += l;
2995 if (flags & SCF_DO_SUBSTR)
2996 data->pos_min += l;
2997 if (flags & SCF_DO_STCLASS_AND) {
2998 /* Check whether it is compatible with what we know already! */
2999 int compat = 1;
3000
3001 if (uc >= 0x100 ||
3002 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3003 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3004 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3005 compat = 0;
3006 ANYOF_CLASS_ZERO(data->start_class);
3007 ANYOF_BITMAP_ZERO(data->start_class);
3008 if (compat) {
3009 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3010 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3011 data->start_class->flags |= ANYOF_FOLD;
3012 if (OP(scan) == EXACTFL)
3013 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3014 }
8aa23a47
YO
3015 }
3016 else if (flags & SCF_DO_STCLASS_OR) {
3017 if (data->start_class->flags & ANYOF_FOLD) {
3018 /* false positive possible if the class is case-folded.
3019 Assume that the locale settings are the same... */
1aa99e6b
IH
3020 if (uc < 0x100)
3021 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3022 data->start_class->flags &= ~ANYOF_EOS;
3023 }
8aa23a47 3024 cl_and(data->start_class, and_withp);
653099ff 3025 }
8aa23a47
YO
3026 flags &= ~SCF_DO_STCLASS;
3027 }
3028 else if (strchr((const char*)PL_varies,OP(scan))) {
3029 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3030 I32 f = flags, pos_before = 0;
3031 regnode * const oscan = scan;
3032 struct regnode_charclass_class this_class;
3033 struct regnode_charclass_class *oclass = NULL;
3034 I32 next_is_eval = 0;
3035
3036 switch (PL_regkind[OP(scan)]) {
3037 case WHILEM: /* End of (?:...)* . */
3038 scan = NEXTOPER(scan);
3039 goto finish;
3040 case PLUS:
3041 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3042 next = NEXTOPER(scan);
3043 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3044 mincount = 1;
3045 maxcount = REG_INFTY;
3046 next = regnext(scan);
3047 scan = NEXTOPER(scan);
3048 goto do_curly;
3049 }
3050 }
3051 if (flags & SCF_DO_SUBSTR)
3052 data->pos_min++;
3053 min++;
3054 /* Fall through. */
3055 case STAR:
3056 if (flags & SCF_DO_STCLASS) {
3057 mincount = 0;
3058 maxcount = REG_INFTY;
3059 next = regnext(scan);
3060 scan = NEXTOPER(scan);
3061 goto do_curly;
3062 }
3063 is_inf = is_inf_internal = 1;
3064 scan = regnext(scan);
c277df42 3065 if (flags & SCF_DO_SUBSTR) {
304ee84b 3066 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3067 data->longest = &(data->longest_float);
c277df42 3068 }
8aa23a47
YO
3069 goto optimize_curly_tail;
3070 case CURLY:
3071 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3072 && (scan->flags == stopparen))
3073 {
3074 mincount = 1;
3075 maxcount = 1;
3076 } else {
3077 mincount = ARG1(scan);
3078 maxcount = ARG2(scan);
653099ff 3079 }
8aa23a47
YO
3080 next = regnext(scan);
3081 if (OP(scan) == CURLYX) {
3082 I32 lp = (data ? *(data->last_closep) : 0);
3083 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3084 }
8aa23a47
YO
3085 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3086 next_is_eval = (OP(scan) == EVAL);
3087 do_curly:
3088 if (flags & SCF_DO_SUBSTR) {
304ee84b 3089 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3090 pos_before = data->pos_min;
b45f050a 3091 }
8aa23a47
YO
3092 if (data) {
3093 fl = data->flags;
3094 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3095 if (is_inf)
3096 data->flags |= SF_IS_INF;
3097 }
3098 if (flags & SCF_DO_STCLASS) {
3099 cl_init(pRExC_state, &this_class);
3100 oclass = data->start_class;
3101 data->start_class = &this_class;
3102 f |= SCF_DO_STCLASS_AND;
3103 f &= ~SCF_DO_STCLASS_OR;
3104 }
3105 /* These are the cases when once a subexpression
3106 fails at a particular position, it cannot succeed
3107 even after backtracking at the enclosing scope.
3108
3109 XXXX what if minimal match and we are at the
3110 initial run of {n,m}? */
3111 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3112 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3113
8aa23a47
YO
3114 /* This will finish on WHILEM, setting scan, or on NULL: */
3115 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3116 last, data, stopparen, recursed, NULL,
3117 (mincount == 0
3118 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3119
8aa23a47
YO
3120 if (flags & SCF_DO_STCLASS)
3121 data->start_class = oclass;
3122 if (mincount == 0 || minnext == 0) {
3123 if (flags & SCF_DO_STCLASS_OR) {
3124 cl_or(pRExC_state, data->start_class, &this_class);
3125 }
3126 else if (flags & SCF_DO_STCLASS_AND) {
3127 /* Switch to OR mode: cache the old value of
3128 * data->start_class */
3129 INIT_AND_WITHP;
3130 StructCopy(data->start_class, and_withp,
3131 struct regnode_charclass_class);
3132 flags &= ~SCF_DO_STCLASS_AND;
3133 StructCopy(&this_class, data->start_class,
3134 struct regnode_charclass_class);
3135 flags |= SCF_DO_STCLASS_OR;
3136 data->start_class->flags |= ANYOF_EOS;
3137 }
3138 } else { /* Non-zero len */
3139 if (flags & SCF_DO_STCLASS_OR) {
3140 cl_or(pRExC_state, data->start_class, &this_class);
3141 cl_and(data->start_class, and_withp);
3142 }
3143 else if (flags & SCF_DO_STCLASS_AND)
3144 cl_and(data->start_class, &this_class);
3145 flags &= ~SCF_DO_STCLASS;
3146 }
3147 if (!scan) /* It was not CURLYX, but CURLY. */
3148 scan = next;
3149 if ( /* ? quantifier ok, except for (?{ ... }) */
3150 (next_is_eval || !(mincount == 0 && maxcount == 1))
3151 && (minnext == 0) && (deltanext == 0)
3152 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3153 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3154 && ckWARN(WARN_REGEXP))
3155 {
3156 vWARN(RExC_parse,
3157 "Quantifier unexpected on zero-length expression");
3158 }
3159
3160 min += minnext * mincount;
3161 is_inf_internal |= ((maxcount == REG_INFTY
3162 && (minnext + deltanext) > 0)
3163 || deltanext == I32_MAX);
3164 is_inf |= is_inf_internal;
3165 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3166
3167 /* Try powerful optimization CURLYX => CURLYN. */
3168 if ( OP(oscan) == CURLYX && data
3169 && data->flags & SF_IN_PAR
3170 && !(data->flags & SF_HAS_EVAL)
3171 && !deltanext && minnext == 1 ) {
3172 /* Try to optimize to CURLYN. */
3173 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3174 regnode * const nxt1 = nxt;
497b47a8 3175#ifdef DEBUGGING
8aa23a47 3176 regnode *nxt2;
497b47a8 3177#endif
c277df42 3178
8aa23a47
YO
3179 /* Skip open. */
3180 nxt = regnext(nxt);
3181 if (!strchr((const char*)PL_simple,OP(nxt))
3182 && !(PL_regkind[OP(nxt)] == EXACT
3183 && STR_LEN(nxt) == 1))
3184 goto nogo;
497b47a8 3185#ifdef DEBUGGING
8aa23a47 3186 nxt2 = nxt;
497b47a8 3187#endif
8aa23a47
YO
3188 nxt = regnext(nxt);
3189 if (OP(nxt) != CLOSE)
3190 goto nogo;
3191 if (RExC_open_parens) {
3192 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3193 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3194 }
3195 /* Now we know that nxt2 is the only contents: */
3196 oscan->flags = (U8)ARG(nxt);
3197 OP(oscan) = CURLYN;
3198 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3199
c277df42 3200#ifdef DEBUGGING
8aa23a47
YO
3201 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3202 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3203 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3204 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3205 OP(nxt + 1) = OPTIMIZED; /* was count. */
3206 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3207#endif
8aa23a47
YO
3208 }
3209 nogo:
3210
3211 /* Try optimization CURLYX => CURLYM. */
3212 if ( OP(oscan) == CURLYX && data
3213 && !(data->flags & SF_HAS_PAR)
3214 && !(data->flags & SF_HAS_EVAL)
3215 && !deltanext /* atom is fixed width */
3216 && minnext != 0 /* CURLYM can't handle zero width */
3217 ) {
3218 /* XXXX How to optimize if data == 0? */
3219 /* Optimize to a simpler form. */
3220 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3221 regnode *nxt2;
3222
3223 OP(oscan) = CURLYM;
3224 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3225 && (OP(nxt2) != WHILEM))
3226 nxt = nxt2;
3227 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3228 /* Need to optimize away parenths. */
3229 if (data->flags & SF_IN_PAR) {
3230 /* Set the parenth number. */
3231 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3232
3233 if (OP(nxt) != CLOSE)
3234 FAIL("Panic opt close");
3235 oscan->flags = (U8)ARG(nxt);
3236 if (RExC_open_parens) {
3237 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3238 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3239 }
8aa23a47
YO
3240 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3241 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3242
c277df42 3243#ifdef DEBUGGING
8aa23a47
YO
3244 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3245 OP(nxt + 1) = OPTIMIZED; /* was count. */
3246 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3247 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3248#endif
c277df42 3249#if 0
8aa23a47
YO
3250 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3251 regnode *nnxt = regnext(nxt1);
3252
3253 if (nnxt == nxt) {
3254 if (reg_off_by_arg[OP(nxt1)])
3255 ARG_SET(nxt1, nxt2 - nxt1);
3256 else if (nxt2 - nxt1 < U16_MAX)
3257 NEXT_OFF(nxt1) = nxt2 - nxt1;
3258 else
3259 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3260 }
8aa23a47 3261 nxt1 = nnxt;
c277df42 3262 }
5d1c421c 3263#endif
8aa23a47
YO
3264 /* Optimize again: */
3265 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3266 NULL, stopparen, recursed, NULL, 0,depth+1);
3267 }
3268 else
3269 oscan->flags = 0;
3270 }
3271 else if ((OP(oscan) == CURLYX)
3272 && (flags & SCF_WHILEM_VISITED_POS)
3273 /* See the comment on a similar expression above.
3274 However, this time it not a subexpression
3275 we care about, but the expression itself. */
3276 && (maxcount == REG_INFTY)
3277 && data && ++data->whilem_c < 16) {
3278 /* This stays as CURLYX, we can put the count/of pair. */
3279 /* Find WHILEM (as in regexec.c) */
3280 regnode *nxt = oscan + NEXT_OFF(oscan);
3281
3282 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3283 nxt += ARG(nxt);
3284 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3285 | (RExC_whilem_seen << 4)); /* On WHILEM */
3286 }
3287 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3288 pars++;
3289 if (flags & SCF_DO_SUBSTR) {
3290 SV *last_str = NULL;
3291 int counted = mincount != 0;
a0ed51b3 3292
8aa23a47
YO
3293 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3294#if defined(SPARC64_GCC_WORKAROUND)
3295 I32 b = 0;
3296 STRLEN l = 0;
3297 const char *s = NULL;
3298 I32 old = 0;
b515a41d 3299
8aa23a47
YO
3300 if (pos_before >= data->last_start_min)
3301 b = pos_before;
3302 else
3303 b = data->last_start_min;
b515a41d 3304
8aa23a47
YO
3305 l = 0;
3306 s = SvPV_const(data->last_found, l);
3307 old = b - data->last_start_min;
3308
3309#else
3310 I32 b = pos_before >= data->last_start_min
3311 ? pos_before : data->last_start_min;
3312 STRLEN l;
3313 const char * const s = SvPV_const(data->last_found, l);
3314 I32 old = b - data->last_start_min;
3315#endif
3316
3317 if (UTF)
3318 old = utf8_hop((U8*)s, old) - (U8*)s;
3319
3320 l -= old;
3321 /* Get the added string: */
740cce10 3322 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3323 if (deltanext == 0 && pos_before == b) {
3324 /* What was added is a constant string */
3325 if (mincount > 1) {
3326 SvGROW(last_str, (mincount * l) + 1);
3327 repeatcpy(SvPVX(last_str) + l,
3328 SvPVX_const(last_str), l, mincount - 1);
3329 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3330 /* Add additional parts. */
3331 SvCUR_set(data->last_found,
3332 SvCUR(data->last_found) - l);
3333 sv_catsv(data->last_found, last_str);
3334 {
3335 SV * sv = data->last_found;
3336 MAGIC *mg =
3337 SvUTF8(sv) && SvMAGICAL(sv) ?
3338 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3339 if (mg && mg->mg_len >= 0)
bd94e887 3340 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3341 }
8aa23a47 3342 data->last_end += l * (mincount - 1);
b515a41d 3343 }
8aa23a47
YO
3344 } else {
3345 /* start offset must point into the last copy */
3346 data->last_start_min += minnext * (mincount - 1);
3347 data->last_start_max += is_inf ? I32_MAX
3348 : (maxcount - 1) * (minnext + data->pos_delta);
3349 }
c277df42 3350 }
8aa23a47
YO
3351 /* It is counted once already... */
3352 data->pos_min += minnext * (mincount - counted);
3353 data->pos_delta += - counted * deltanext +
3354 (minnext + deltanext) * maxcount - minnext * mincount;
3355 if (mincount != maxcount) {
3356 /* Cannot extend fixed substrings found inside
3357 the group. */
304ee84b 3358 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3359 if (mincount && last_str) {
3360 SV * const sv = data->last_found;
3361 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3362 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3363
3364 if (mg)
3365 mg->mg_len = -1;
3366 sv_setsv(sv, last_str);
3367 data->last_end = data->pos_min;
3368 data->last_start_min =
3369 data->pos_min - CHR_SVLEN(last_str);
3370 data->last_start_max = is_inf
3371 ? I32_MAX
3372 : data->pos_min + data->pos_delta
3373 - CHR_SVLEN(last_str);
3374 }
3375 data->longest = &(data->longest_float);
3376 }
3377 SvREFCNT_dec(last_str);
c277df42 3378 }
8aa23a47
YO
3379 if (data && (fl & SF_HAS_EVAL))
3380 data->flags |= SF_HAS_EVAL;
3381 optimize_curly_tail:
3382 if (OP(oscan) != CURLYX) {
3383 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3384 && NEXT_OFF(next))
3385 NEXT_OFF(oscan) += NEXT_OFF(next);
3386 }
3387 continue;
3388 default: /* REF and CLUMP only? */
3389 if (flags & SCF_DO_SUBSTR) {
304ee84b 3390 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3391 data->longest = &(data->longest_float);
3392 }
3393 is_inf = is_inf_internal = 1;
3394 if (flags & SCF_DO_STCLASS_OR)
3395 cl_anything(pRExC_state, data->start_class);
3396 flags &= ~SCF_DO_STCLASS;
3397 break;
c277df42 3398 }
8aa23a47 3399 }
e1d1eefb
YO
3400 else if (OP(scan) == LNBREAK) {
3401 if (flags & SCF_DO_STCLASS) {
3402 int value = 0;
3403 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3404 if (flags & SCF_DO_STCLASS_AND) {
3405 for (value = 0; value < 256; value++)
e64b1bd1 3406 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3407 ANYOF_BITMAP_CLEAR(data->start_class, value);
3408 }
3409 else {
3410 for (value = 0; value < 256; value++)
e64b1bd1 3411 if (is_VERTWS_cp(value))
e1d1eefb
YO
3412 ANYOF_BITMAP_SET(data->start_class, value);
3413 }
3414 if (flags & SCF_DO_STCLASS_OR)
3415 cl_and(data->start_class, and_withp);
3416 flags &= ~SCF_DO_STCLASS;
3417 }
3418 min += 1;
f9a79580 3419 delta += 1;
e1d1eefb
YO
3420 if (flags & SCF_DO_SUBSTR) {
3421 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3422 data->pos_min += 1;
f9a79580 3423 data->pos_delta += 1;
e1d1eefb
YO
3424 data->longest = &(data->longest_float);
3425 }
3426
3427 }
f9a79580
RGS
3428 else if (OP(scan) == FOLDCHAR) {
3429 int d = ARG(scan)==0xDF ? 1 : 2;
3430 flags &= ~SCF_DO_STCLASS;
3431 min += 1;
3432 delta += d;
3433 if (flags & SCF_DO_SUBSTR) {
3434 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3435 data->pos_min += 1;
3436 data->pos_delta += d;
3437 data->longest = &(data->longest_float);
3438 }
3439 }
8aa23a47
YO
3440 else if (strchr((const char*)PL_simple,OP(scan))) {
3441 int value = 0;
653099ff 3442
8aa23a47 3443 if (flags & SCF_DO_SUBSTR) {
304ee84b 3444 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3445 data->pos_min++;
3446 }
3447 min++;
3448 if (flags & SCF_DO_STCLASS) {
3449 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3450
8aa23a47
YO
3451 /* Some of the logic below assumes that switching
3452 locale on will only add false positives. */
3453 switch (PL_regkind[OP(scan)]) {
3454 case SANY:
3455 default:
3456 do_default:
3457 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3458 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3459 cl_anything(pRExC_state, data->start_class);
3460 break;
3461 case REG_ANY:
3462 if (OP(scan) == SANY)
3463 goto do_default;
3464 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3465 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3466 || (data->start_class->flags & ANYOF_CLASS));
3467 cl_anything(pRExC_state, data->start_class);
653099ff 3468 }
8aa23a47
YO
3469 if (flags & SCF_DO_STCLASS_AND || !value)
3470 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3471 break;
3472 case ANYOF:
3473 if (flags & SCF_DO_STCLASS_AND)
3474 cl_and(data->start_class,
3475 (struct regnode_charclass_class*)scan);
653099ff 3476 else
8aa23a47
YO
3477 cl_or(pRExC_state, data->start_class,
3478 (struct regnode_charclass_class*)scan);
3479 break;
3480 case ALNUM:
3481 if (flags & SCF_DO_STCLASS_AND) {
3482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3484 for (value = 0; value < 256; value++)
3485 if (!isALNUM(value))
3486 ANYOF_BITMAP_CLEAR(data->start_class, value);
3487 }
653099ff 3488 }
8aa23a47
YO
3489 else {
3490 if (data->start_class->flags & ANYOF_LOCALE)
3491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3492 else {
3493 for (value = 0; value < 256; value++)
3494 if (isALNUM(value))
3495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3496 }
8aa23a47
YO
3497 }
3498 break;
3499 case ALNUML:
3500 if (flags & SCF_DO_STCLASS_AND) {
3501 if (data->start_class->flags & ANYOF_LOCALE)
3502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3503 }
3504 else {
3505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3506 data->start_class->flags |= ANYOF_LOCALE;
3507 }
3508 break;
3509 case NALNUM:
3510 if (flags & SCF_DO_STCLASS_AND) {
3511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3513 for (value = 0; value < 256; value++)
3514 if (isALNUM(value))
3515 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3516 }
3517 }
8aa23a47
YO
3518 else {
3519 if (data->start_class->flags & ANYOF_LOCALE)
3520 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3521 else {
3522 for (value = 0; value < 256; value++)
3523 if (!isALNUM(value))
3524 ANYOF_BITMAP_SET(data->start_class, value);
3525 }
653099ff 3526 }
8aa23a47
YO
3527 break;
3528 case NALNUML:
3529 if (flags & SCF_DO_STCLASS_AND) {
3530 if (data->start_class->flags & ANYOF_LOCALE)
3531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3532 }
8aa23a47
YO
3533 else {
3534 data->start_class->flags |= ANYOF_LOCALE;
3535 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3536 }
3537 break;
3538 case SPACE:
3539 if (flags & SCF_DO_STCLASS_AND) {
3540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3542 for (value = 0; value < 256; value++)
3543 if (!isSPACE(value))
3544 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3545 }
3546 }
8aa23a47
YO
3547 else {
3548 if (data->start_class->flags & ANYOF_LOCALE)
3549 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3550 else {
3551 for (value = 0; value < 256; value++)
3552 if (isSPACE(value))
3553 ANYOF_BITMAP_SET(data->start_class, value);
3554 }
653099ff 3555 }
8aa23a47
YO
3556 break;
3557 case SPACEL:
3558 if (flags & SCF_DO_STCLASS_AND) {
3559 if (data->start_class->flags & ANYOF_LOCALE)
3560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3561 }
3562 else {
3563 data->start_class->flags |= ANYOF_LOCALE;
3564 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3565 }
3566 break;
3567 case NSPACE:
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3571 for (value = 0; value < 256; value++)
3572 if (isSPACE(value))
3573 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3574 }
8aa23a47
YO
3575 }
3576 else {
3577 if (data->start_class->flags & ANYOF_LOCALE)
3578 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3579 else {
3580 for (value = 0; value < 256; value++)
3581 if (!isSPACE(value))
3582 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3583 }
3584 }
8aa23a47
YO
3585 break;
3586 case NSPACEL:
3587 if (flags & SCF_DO_STCLASS_AND) {
3588 if (data->start_class->flags & ANYOF_LOCALE) {
3589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3590 for (value = 0; value < 256; value++)
3591 if (!isSPACE(value))
3592 ANYOF_BITMAP_CLEAR(data->start_class, value);
3593 }
653099ff 3594 }
8aa23a47
YO
3595 else {
3596 data->start_class->flags |= ANYOF_LOCALE;
3597 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3598 }
3599 break;
3600 case DIGIT:
3601 if (flags & SCF_DO_STCLASS_AND) {
3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3603 for (value = 0; value < 256; value++)
3604 if (!isDIGIT(value))
3605 ANYOF_BITMAP_CLEAR(data->start_class, value);
3606 }
3607 else {
3608 if (data->start_class->flags & ANYOF_LOCALE)
3609 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3610 else {
3611 for (value = 0; value < 256; value++)
3612 if (isDIGIT(value))
3613 ANYOF_BITMAP_SET(data->start_class, value);
3614 }
3615 }
3616 break;
3617 case NDIGIT:
3618 if (flags & SCF_DO_STCLASS_AND) {
3619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3620 for (value = 0; value < 256; value++)
3621 if (isDIGIT(value))
3622 ANYOF_BITMAP_CLEAR(data->start_class, value);
3623 }
3624 else {
3625 if (data->start_class->flags & ANYOF_LOCALE)
3626 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3627 else {
3628 for (value = 0; value < 256; value++)
3629 if (!isDIGIT(value))
3630 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3631 }
3632 }
8aa23a47 3633 break;
e1d1eefb
YO
3634 CASE_SYNST_FNC(VERTWS);
3635 CASE_SYNST_FNC(HORIZWS);
3636
8aa23a47
YO
3637 }
3638 if (flags & SCF_DO_STCLASS_OR)
3639 cl_and(data->start_class, and_withp);
3640 flags &= ~SCF_DO_STCLASS;
3641 }
3642 }
3643 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3644 data->flags |= (OP(scan) == MEOL
3645 ? SF_BEFORE_MEOL
3646 : SF_BEFORE_SEOL);
3647 }
3648 else if ( PL_regkind[OP(scan)] == BRANCHJ
3649 /* Lookbehind, or need to calculate parens/evals/stclass: */
3650 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3651 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3652 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3653 || OP(scan) == UNLESSM )
3654 {
3655 /* Negative Lookahead/lookbehind
3656 In this case we can't do fixed string optimisation.
3657 */
1de06328 3658
8aa23a47
YO
3659 I32 deltanext, minnext, fake = 0;
3660 regnode *nscan;
3661 struct regnode_charclass_class intrnl;
3662 int f = 0;
1de06328 3663
8aa23a47
YO
3664 data_fake.flags = 0;
3665 if (data) {
3666 data_fake.whilem_c = data->whilem_c;
3667 data_fake.last_closep = data->last_closep;
c277df42 3668 }
8aa23a47
YO
3669 else
3670 data_fake.last_closep = &fake;
58e23c8d 3671 data_fake.pos_delta = delta;
8aa23a47
YO
3672 if ( flags & SCF_DO_STCLASS && !scan->flags
3673 && OP(scan) == IFMATCH ) { /* Lookahead */
3674 cl_init(pRExC_state, &intrnl);
3675 data_fake.start_class = &intrnl;
3676 f |= SCF_DO_STCLASS_AND;
3677 }
3678 if (flags & SCF_WHILEM_VISITED_POS)
3679 f |= SCF_WHILEM_VISITED_POS;
3680 next = regnext(scan);
3681 nscan = NEXTOPER(NEXTOPER(scan));
3682 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3683 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3684 if (scan->flags) {
3685 if (deltanext) {
58e23c8d 3686 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3687 }
3688 else if (minnext > (I32)U8_MAX) {
58e23c8d 3689 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3690 }
3691 scan->flags = (U8)minnext;
3692 }
3693 if (data) {
3694 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3695 pars++;
3696 if (data_fake.flags & SF_HAS_EVAL)
3697 data->flags |= SF_HAS_EVAL;
3698 data->whilem_c = data_fake.whilem_c;
3699 }
3700 if (f & SCF_DO_STCLASS_AND) {
3701 const int was = (data->start_class->flags & ANYOF_EOS);
3702
3703 cl_and(data->start_class, &intrnl);
3704 if (was)
3705 data->start_class->flags |= ANYOF_EOS;
3706 }
cb434fcc 3707 }
8aa23a47
YO
3708#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3709 else {
3710 /* Positive Lookahead/lookbehind
3711 In this case we can do fixed string optimisation,
3712 but we must be careful about it. Note in the case of
3713 lookbehind the positions will be offset by the minimum
3714 length of the pattern, something we won't know about
3715 until after the recurse.
3716 */
3717 I32 deltanext, fake = 0;
3718 regnode *nscan;
3719 struct regnode_charclass_class intrnl;
3720 int f = 0;
3721 /* We use SAVEFREEPV so that when the full compile
3722 is finished perl will clean up the allocated
3723 minlens when its all done. This was we don't
3724 have to worry about freeing them when we know
3725 they wont be used, which would be a pain.
3726 */
3727 I32 *minnextp;
3728 Newx( minnextp, 1, I32 );
3729 SAVEFREEPV(minnextp);
3730
3731 if (data) {
3732 StructCopy(data, &data_fake, scan_data_t);
3733 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3734 f |= SCF_DO_SUBSTR;
3735 if (scan->flags)
304ee84b 3736 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3737 data_fake.last_found=newSVsv(data->last_found);
3738 }
3739 }
3740 else
3741 data_fake.last_closep = &fake;
3742 data_fake.flags = 0;
58e23c8d 3743 data_fake.pos_delta = delta;
8aa23a47
YO
3744 if (is_inf)
3745 data_fake.flags |= SF_IS_INF;
3746 if ( flags & SCF_DO_STCLASS && !scan->flags
3747 && OP(scan) == IFMATCH ) { /* Lookahead */
3748 cl_init(pRExC_state, &intrnl);
3749 data_fake.start_class = &intrnl;
3750 f |= SCF_DO_STCLASS_AND;
3751 }
3752 if (flags & SCF_WHILEM_VISITED_POS)
3753 f |= SCF_WHILEM_VISITED_POS;
3754 next = regnext(scan);
3755 nscan = NEXTOPER(NEXTOPER(scan));
3756
3757 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3758 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3759 if (scan->flags) {
3760 if (deltanext) {
58e23c8d 3761 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3762 }
3763 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3764 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3765 }
3766 scan->flags = (U8)*minnextp;
3767 }
3768
3769 *minnextp += min;
3770
3771 if (f & SCF_DO_STCLASS_AND) {
3772 const int was = (data->start_class->flags & ANYOF_EOS);
3773
3774 cl_and(data->start_class, &intrnl);
3775 if (was)
3776 data->start_class->flags |= ANYOF_EOS;
3777 }
3778 if (data) {
3779 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3780 pars++;
3781 if (data_fake.flags & SF_HAS_EVAL)
3782 data->flags |= SF_HAS_EVAL;
3783 data->whilem_c = data_fake.whilem_c;
3784 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3785 if (RExC_rx->minlen<*minnextp)
3786 RExC_rx->minlen=*minnextp;
304ee84b 3787 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3788 SvREFCNT_dec(data_fake.last_found);
3789
3790 if ( data_fake.minlen_fixed != minlenp )
3791 {
3792 data->offset_fixed= data_fake.offset_fixed;
3793 data->minlen_fixed= data_fake.minlen_fixed;
3794 data->lookbehind_fixed+= scan->flags;
3795 }
3796 if ( data_fake.minlen_float != minlenp )
3797 {
3798 data->minlen_float= data_fake.minlen_float;
3799 data->offset_float_min=data_fake.offset_float_min;
3800 data->offset_float_max=data_fake.offset_float_max;
3801 data->lookbehind_float+= scan->flags;
3802 }
3803 }
3804 }
3805
3806
40d049e4 3807 }
8aa23a47
YO
3808#endif
3809 }
3810 else if (OP(scan) == OPEN) {
3811 if (stopparen != (I32)ARG(scan))
3812 pars++;
3813 }
3814 else if (OP(scan) == CLOSE) {
3815 if (stopparen == (I32)ARG(scan)) {
3816 break;
3817 }
3818 if ((I32)ARG(scan) == is_par) {
3819 next = regnext(scan);
b515a41d 3820
8aa23a47
YO
3821 if ( next && (OP(next) != WHILEM) && next < last)
3822 is_par = 0; /* Disable optimization */
40d049e4 3823 }
8aa23a47
YO
3824 if (data)
3825 *(data->last_closep) = ARG(scan);
3826 }
3827 else if (OP(scan) == EVAL) {
c277df42
IZ
3828 if (data)
3829 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3830 }
3831 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3832 if (flags & SCF_DO_SUBSTR) {
304ee84b 3833 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3834 flags &= ~SCF_DO_SUBSTR;
40d049e4 3835 }
8aa23a47
YO
3836 if (data && OP(scan)==ACCEPT) {
3837 data->flags |= SCF_SEEN_ACCEPT;
3838 if (stopmin > min)
3839 stopmin = min;
e2e6a0f1 3840 }
8aa23a47
YO
3841 }
3842 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3843 {
0f5d15d6 3844 if (flags & SCF_DO_SUBSTR) {
304ee84b 3845 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3846 data->longest = &(data->longest_float);
3847 }
3848 is_inf = is_inf_internal = 1;
653099ff 3849 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3850 cl_anything(pRExC_state, data->start_class);
96776eda 3851 flags &= ~SCF_DO_STCLASS;
8aa23a47 3852 }
58e23c8d 3853 else if (OP(scan) == GPOS) {
bbe252da 3854 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3855 !(delta || is_inf || (data && data->pos_delta)))
3856 {
bbe252da
YO
3857 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3858 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3859 if (RExC_rx->gofs < (U32)min)
3860 RExC_rx->gofs = min;
3861 } else {
bbe252da 3862 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3863 RExC_rx->gofs = 0;
3864 }
3865 }
786e8c11 3866#ifdef TRIE_STUDY_OPT
40d049e4 3867#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3868 else if (PL_regkind[OP(scan)] == TRIE) {
3869 /* NOTE - There is similar code to this block above for handling
3870 BRANCH nodes on the initial study. If you change stuff here
3871 check there too. */
3872 regnode *trie_node= scan;
3873 regnode *tail= regnext(scan);
f8fc2ecf 3874 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3875 I32 max1 = 0, min1 = I32_MAX;
3876 struct regnode_charclass_class accum;
3877
3878 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3879 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3880 if (flags & SCF_DO_STCLASS)
3881 cl_init_zero(pRExC_state, &accum);
3882
3883 if (!trie->jump) {
3884 min1= trie->minlen;
3885 max1= trie->maxlen;
3886 } else {
3887 const regnode *nextbranch= NULL;
3888 U32 word;
3889
3890 for ( word=1 ; word <= trie->wordcount ; word++)
3891 {
3892 I32 deltanext=0, minnext=0, f = 0, fake;
3893 struct regnode_charclass_class this_class;
3894
3895 data_fake.flags = 0;
3896 if (data) {
3897 data_fake.whilem_c = data->whilem_c;
3898 data_fake.last_closep = data->last_closep;
3899 }
3900 else
3901 data_fake.last_closep = &fake;
58e23c8d 3902 data_fake.pos_delta = delta;
8aa23a47
YO
3903 if (flags & SCF_DO_STCLASS) {
3904 cl_init(pRExC_state, &this_class);
3905 data_fake.start_class = &this_class;
3906 f = SCF_DO_STCLASS_AND;
3907 }
3908 if (flags & SCF_WHILEM_VISITED_POS)
3909 f |= SCF_WHILEM_VISITED_POS;
3910
3911 if (trie->jump[word]) {
3912 if (!nextbranch)
3913 nextbranch = trie_node + trie->jump[0];
3914 scan= trie_node + trie->jump[word];
3915 /* We go from the jump point to the branch that follows
3916 it. Note this means we need the vestigal unused branches
3917 even though they arent otherwise used.
3918 */
3919 minnext = study_chunk(pRExC_state, &scan, minlenp,
3920 &deltanext, (regnode *)nextbranch, &data_fake,
3921 stopparen, recursed, NULL, f,depth+1);
3922 }
3923 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3924 nextbranch= regnext((regnode*)nextbranch);
3925
3926 if (min1 > (I32)(minnext + trie->minlen))
3927 min1 = minnext + trie->minlen;
3928 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3929 max1 = minnext + deltanext + trie->maxlen;
3930 if (deltanext == I32_MAX)
3931 is_inf = is_inf_internal = 1;
3932
3933 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3934 pars++;
3935 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3936 if ( stopmin > min + min1)
3937 stopmin = min + min1;
3938 flags &= ~SCF_DO_SUBSTR;
3939 if (data)
3940 data->flags |= SCF_SEEN_ACCEPT;
3941 }
3942 if (data) {
3943 if (data_fake.flags & SF_HAS_EVAL)
3944 data->flags |= SF_HAS_EVAL;
3945 data->whilem_c = data_fake.whilem_c;
3946 }
3947 if (flags & SCF_DO_STCLASS)
3948 cl_or(pRExC_state, &accum, &this_class);
3949 }
3950 }
3951 if (flags & SCF_DO_SUBSTR) {
3952 data->pos_min += min1;
3953 data->pos_delta += max1 - min1;
3954 if (max1 != min1 || is_inf)
3955 data->longest = &(data->longest_float);
3956 }
3957 min += min1;
3958 delta += max1 - min1;
3959 if (flags & SCF_DO_STCLASS_OR) {
3960 cl_or(pRExC_state, data->start_class, &accum);
3961 if (min1) {
3962 cl_and(data->start_class, and_withp);
3963 flags &= ~SCF_DO_STCLASS;
3964 }
3965 }
3966 else if (flags & SCF_DO_STCLASS_AND) {
3967 if (min1) {
3968 cl_and(data->start_class, &accum);
3969 flags &= ~SCF_DO_STCLASS;
3970 }
3971 else {
3972 /* Switch to OR mode: cache the old value of
3973 * data->start_class */
3974 INIT_AND_WITHP;
3975 StructCopy(data->start_class, and_withp,
3976 struct regnode_charclass_class);
3977 flags &= ~SCF_DO_STCLASS_AND;
3978 StructCopy(&accum, data->start_class,
3979 struct regnode_charclass_class);
3980 flags |= SCF_DO_STCLASS_OR;
3981 data->start_class->flags |= ANYOF_EOS;
3982 }
3983 }
3984 scan= tail;
3985 continue;
3986 }
786e8c11 3987#else
8aa23a47 3988 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3989 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3990 U8*bang=NULL;
3991
3992 min += trie->minlen;
3993 delta += (trie->maxlen - trie->minlen);
3994 flags &= ~SCF_DO_STCLASS; /* xxx */
3995 if (flags & SCF_DO_SUBSTR) {
304ee84b 3996 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3997 data->pos_min += trie->minlen;
3998 data->pos_delta += (trie->maxlen - trie->minlen);
3999 if (trie->maxlen != trie->minlen)
4000 data->longest = &(data->longest_float);
4001 }
4002 if (trie->jump) /* no more substrings -- for now /grr*/
4003 flags &= ~SCF_DO_SUBSTR;
b515a41d 4004 }
8aa23a47
YO
4005#endif /* old or new */
4006#endif /* TRIE_STUDY_OPT */
e1d1eefb 4007
8aa23a47
YO
4008 /* Else: zero-length, ignore. */
4009 scan = regnext(scan);
4010 }
4011 if (frame) {
4012 last = frame->last;
4013 scan = frame->next;
4014 stopparen = frame->stop;
4015 frame = frame->prev;
4016 goto fake_study_recurse;
c277df42
IZ
4017 }
4018
4019 finish:
8aa23a47 4020 assert(!frame);
304ee84b 4021 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4022
c277df42 4023 *scanp = scan;
aca2d497 4024 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4025 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4026 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4027 if (is_par > (I32)U8_MAX)
c277df42
IZ
4028 is_par = 0;
4029 if (is_par && pars==1 && data) {
4030 data->flags |= SF_IN_PAR;
4031 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4032 }
4033 else if (pars && data) {
c277df42
IZ
4034 data->flags |= SF_HAS_PAR;
4035 data->flags &= ~SF_IN_PAR;
4036 }
653099ff 4037 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4038 cl_and(data->start_class, and_withp);
786e8c11
YO
4039 if (flags & SCF_TRIE_RESTUDY)
4040 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4041
304ee84b 4042 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4043
e2e6a0f1 4044 return min < stopmin ? min : stopmin;
c277df42
IZ
4045}
4046
2eccd3b2
NC
4047STATIC U32
4048S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4049{
4a4e7719
NC
4050 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4051
4052 Renewc(RExC_rxi->data,
4053 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4054 char, struct reg_data);
4055 if(count)
f8fc2ecf 4056 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4057 else
f8fc2ecf 4058 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4059 RExC_rxi->data->count = count + n;
4060 Copy(s, RExC_rxi->data->what + count, n, U8);
4061 return count;
c277df42
IZ
4062}
4063
f8149455 4064/*XXX: todo make this not included in a non debugging perl */
76234dfb 4065#ifndef PERL_IN_XSUB_RE
d88dccdf 4066void
864dbfa3 4067Perl_reginitcolors(pTHX)
d88dccdf 4068{
97aff369 4069 dVAR;
1df70142 4070 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4071 if (s) {
1df70142
AL
4072 char *t = savepv(s);
4073 int i = 0;
4074 PL_colors[0] = t;
d88dccdf 4075 while (++i < 6) {
1df70142
AL
4076 t = strchr(t, '\t');
4077 if (t) {
4078 *t = '\0';
4079 PL_colors[i] = ++t;
d88dccdf
IZ
4080 }
4081 else
1df70142 4082 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4083 }
4084 } else {
1df70142 4085 int i = 0;
b81d288d 4086 while (i < 6)
06b5626a 4087 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4088 }
4089 PL_colorset = 1;
4090}
76234dfb 4091#endif
8615cb43 4092
07be1b83 4093
786e8c11
YO
4094#ifdef TRIE_STUDY_OPT
4095#define CHECK_RESTUDY_GOTO \
4096 if ( \
4097 (data.flags & SCF_TRIE_RESTUDY) \
4098 && ! restudied++ \
4099 ) goto reStudy
4100#else
4101#define CHECK_RESTUDY_GOTO
4102#endif
f9f4320a 4103
a687059c 4104/*
e50aee73 4105 - pregcomp - compile a regular expression into internal code
a687059c
LW
4106 *
4107 * We can't allocate space until we know how big the compiled form will be,
4108 * but we can't compile it (and thus know how big it is) until we've got a
4109 * place to put the code. So we cheat: we compile it twice, once with code
4110 * generation turned off and size counting turned on, and once "for real".
4111 * This also means that we don't allocate space until we are sure that the
4112 * thing really will compile successfully, and we never have to move the
4113 * code and thus invalidate pointers into it. (Note that it has to be in
4114 * one piece because free() must be able to free it all.) [NB: not true in perl]
4115 *
4116 * Beware that the optimization-preparation code in here knows about some
4117 * of the structure of the compiled regexp. [I'll say.]
4118 */
b9b4dddf
YO
4119
4120
4121
f9f4320a 4122#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4123#define RE_ENGINE_PTR &PL_core_reg_engine
4124#else
f9f4320a
YO
4125extern const struct regexp_engine my_reg_engine;
4126#define RE_ENGINE_PTR &my_reg_engine
4127#endif
6d5c990f
RGS
4128
4129#ifndef PERL_IN_XSUB_RE
3ab4a224
AB
4130REGEXP *
4131Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
a687059c 4132{
97aff369 4133 dVAR;
6d5c990f 4134 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
4135 /* Dispatch a request to compile a regexp to correct
4136 regexp engine. */
f9f4320a
YO
4137 if (table) {
4138 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4139 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4140 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4141 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4142 DEBUG_COMPILE_r({
8d8756e7 4143 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4144 SvIV(*ptr));
4145 });
3ab4a224 4146 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4147 }
b9b4dddf 4148 }
3ab4a224 4149 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4150}
6d5c990f 4151#endif
2a5d9b1d 4152
3ab4a224 4153REGEXP *
b9ad30b4 4154Perl_re_compile(pTHX_ const SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4155{
4156 dVAR;
288b8c02
NC
4157 REGEXP *rx;
4158 struct regexp *r;
f8fc2ecf 4159 register regexp_internal *ri;
3ab4a224
AB
4160 STRLEN plen;
4161 char* exp = SvPV((SV*)pattern, plen);
4162 char* xend = exp + plen;
c277df42 4163 regnode *scan;
a0d0e21e 4164 I32 flags;
a0d0e21e
LW
4165 I32 minlen = 0;
4166 I32 sawplus = 0;
4167 I32 sawopen = 0;
2c2d71f5 4168 scan_data_t data;
830247a4 4169 RExC_state_t RExC_state;
be8e71aa 4170 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4171#ifdef TRIE_STUDY_OPT
4172 int restudied= 0;
4173 RExC_state_t copyRExC_state;
4174#endif
2a5d9b1d 4175 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f 4176 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4177
b9ad30b4 4178 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4179
a3621e74 4180 DEBUG_COMPILE_r({
ab3bbdeb
YO
4181 SV *dsv= sv_newmortal();
4182 RE_PV_QUOTED_DECL(s, RExC_utf8,
3ab4a224 4183 dsv, exp, plen, 60);
ab3bbdeb
YO
4184 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4185 PL_colors[4],PL_colors[5],s);
a5961de5 4186 });
02daf0ab
YO
4187
4188redo_first_pass:
4189 RExC_precomp = exp;
c737faaf 4190 RExC_flags = pm_flags;
830247a4 4191 RExC_sawback = 0;
bbce6d69 4192
830247a4
IZ
4193 RExC_seen = 0;
4194 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4195 RExC_seen_evals = 0;
4196 RExC_extralen = 0;
c277df42 4197
bbce6d69 4198 /* First pass: determine size, legality. */
830247a4 4199 RExC_parse = exp;
fac92740 4200 RExC_start = exp;
830247a4
IZ
4201 RExC_end = xend;
4202 RExC_naughty = 0;
4203 RExC_npar = 1;
e2e6a0f1 4204 RExC_nestroot = 0;
830247a4
IZ
4205 RExC_size = 0L;
4206 RExC_emit = &PL_regdummy;
4207 RExC_whilem_seen = 0;
fc8cd66c 4208 RExC_charnames = NULL;
40d049e4
YO
4209 RExC_open_parens = NULL;
4210 RExC_close_parens = NULL;
4211 RExC_opend = NULL;
81714fb9 4212 RExC_paren_names = NULL;
1f1031fe
YO
4213#ifdef DEBUGGING
4214 RExC_paren_name_list = NULL;
4215#endif
40d049e4
YO
4216 RExC_recurse = NULL;
4217 RExC_recurse_count = 0;
81714fb9 4218
85ddcde9
JH
4219#if 0 /* REGC() is (currently) a NOP at the first pass.
4220 * Clever compilers notice this and complain. --jhi */
830247a4 4221 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4222#endif
3dab1dad
YO
4223 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4224 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4225 RExC_precomp = NULL;
a0d0e21e
LW
4226 return(NULL);
4227 }
02daf0ab 4228 if (RExC_utf8 && !RExC_orig_utf8) {
38a44b82 4229 /* It's possible to write a regexp in ascii that represents Unicode
02daf0ab
YO
4230 codepoints outside of the byte range, such as via \x{100}. If we
4231 detect such a sequence we have to convert the entire pattern to utf8
4232 and then recompile, as our sizing calculation will have been based
4233 on 1 byte == 1 character, but we will need to use utf8 to encode
4234 at least some part of the pattern, and therefore must convert the whole
4235 thing.
4236 XXX: somehow figure out how to make this less expensive...
4237 -- dmq */
3ab4a224 4238 STRLEN len = plen;
02daf0ab
YO
4239 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4240 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4241 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4242 xend = exp + len;
4243 RExC_orig_utf8 = RExC_utf8;
4244 SAVEFREEPV(exp);
4245 goto redo_first_pass;
4246 }
07be1b83 4247 DEBUG_PARSE_r({
81714fb9
YO
4248 PerlIO_printf(Perl_debug_log,
4249 "Required size %"IVdf" nodes\n"
4250 "Starting second pass (creation)\n",
4251 (IV)RExC_size);
07be1b83
YO
4252 RExC_lastnum=0;
4253 RExC_lastparse=NULL;
4254 });
c277df42
IZ
4255 /* Small enough for pointer-storage convention?
4256 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4257 if (RExC_size >= 0x10000L && RExC_extralen)
4258 RExC_size += RExC_extralen;
c277df42 4259 else
830247a4
IZ
4260 RExC_extralen = 0;
4261 if (RExC_whilem_seen > 15)
4262 RExC_whilem_seen = 15;
a0d0e21e 4263
f9f4320a
YO
4264 /* Allocate space and zero-initialize. Note, the two step process
4265 of zeroing when in debug mode, thus anything assigned has to
4266 happen after that */
d2f13c59 4267 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4268 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4269 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4270 char, regexp_internal);
4271 if ( r == NULL || ri == NULL )
b45f050a 4272 FAIL("Regexp out of space");
0f79a09d
GS
4273#ifdef DEBUGGING
4274 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4275 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4276#else
f8fc2ecf
YO
4277 /* bulk initialize base fields with 0. */
4278 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4279#endif
58e23c8d
YO
4280
4281 /* non-zero initialization begins here */
f8fc2ecf 4282 RXi_SET( r, ri );
f9f4320a 4283 r->engine= RE_ENGINE_PTR;
c737faaf 4284 r->extflags = pm_flags;
bcdf7404 4285 {
f7819f85 4286 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bcdf7404
YO
4287 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4288 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4289 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4290 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4291 const char *fptr = STD_PAT_MODS; /*"msix"*/
4292 char *p;
9d17798d 4293 const STRLEN wraplen = plen + has_minus + has_p + has_runon
bcdf7404
YO
4294 + (sizeof(STD_PAT_MODS) - 1)
4295 + (sizeof("(?:)") - 1);
4296
d2f13c59 4297 p = sv_grow((SV *)rx, wraplen + 1);
9d17798d 4298 SvCUR_set(rx, wraplen);
f7c278bf 4299 SvPOK_on(rx);
8f6ae13c 4300 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4301 *p++='('; *p++='?';
f7819f85
A
4302 if (has_p)
4303 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404
YO
4304 {
4305 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4306 char *colon = r + 1;
4307 char ch;
4308
4309 while((ch = *fptr++)) {
4310 if(reganch & 1)
4311 *p++ = ch;
4312 else
4313 *r-- = ch;
4314 reganch >>= 1;
4315 }
4316 if(has_minus) {
4317 *r = '-';
4318 p = colon;
4319 }
4320 }
4321
28d8d7f4 4322 *p++ = ':';
bb661a58 4323 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4324 assert ((RX_WRAPPED(rx) - p) < 16);
4325 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4326 p += plen;
bcdf7404 4327 if (has_runon)
28d8d7f4
YO
4328 *p++ = '\n';
4329 *p++ = ')';
4330 *p = 0;
bcdf7404
YO
4331 }
4332
bbe252da 4333 r->intflags = 0;
830247a4 4334 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4335
6bda09f9 4336 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4337 Newxz(RExC_open_parens, RExC_npar,regnode *);
4338 SAVEFREEPV(RExC_open_parens);
4339 Newxz(RExC_close_parens,RExC_npar,regnode *);
4340 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4341 }
4342
4343 /* Useful during FAIL. */
7122b237
YO
4344#ifdef RE_TRACK_PATTERN_OFFSETS
4345 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4346 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4347 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4348 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4349 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4350#endif
4351 SetProgLen(ri,RExC_size);
288b8c02 4352 RExC_rx_sv = rx;
830247a4 4353 RExC_rx = r;
f8fc2ecf 4354 RExC_rxi = ri;
bbce6d69 4355
4356 /* Second pass: emit code. */
c737faaf 4357 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4358 RExC_parse = exp;
4359 RExC_end = xend;
4360 RExC_naughty = 0;
4361 RExC_npar = 1;
f8fc2ecf
YO
4362 RExC_emit_start = ri->program;
4363 RExC_emit = ri->program;
3b57cd43
YO
4364 RExC_emit_bound = ri->program + RExC_size + 1;
4365
2cd61cdb 4366 /* Store the count of eval-groups for security checks: */
f8149455 4367 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4368 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4369 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4370 ReREFCNT_dec(rx);
a0d0e21e 4371 return(NULL);
80757612 4372 }
07be1b83
YO
4373 /* XXXX To minimize changes to RE engine we always allocate
4374 3-units-long substrs field. */
4375 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4376 if (RExC_recurse_count) {
4377 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4378 SAVEFREEPV(RExC_recurse);
4379 }
a0d0e21e 4380
07be1b83 4381reStudy:
1de06328 4382 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4383 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4384
07be1b83
YO
4385#ifdef TRIE_STUDY_OPT
4386 if ( restudied ) {
5d458dd8 4387 U32 seen=RExC_seen;
07be1b83 4388 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4389
4390 RExC_state = copyRExC_state;
4391 if (seen & REG_TOP_LEVEL_BRANCHES)
4392 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4393 else
4394 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4395 if (data.last_found) {
07be1b83 4396 SvREFCNT_dec(data.longest_fixed);
07be1b83 4397 SvREFCNT_dec(data.longest_float);
07be1b83 4398 SvREFCNT_dec(data.last_found);
1de06328 4399 }
40d049e4 4400 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4401 } else {
40d049e4 4402 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4403 copyRExC_state = RExC_state;
07be1b83 4404 }
40d049e4
YO
4405#else
4406 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4407#endif
fc8cd66c 4408
a0d0e21e 4409 /* Dig out information for optimizations. */
f7819f85 4410 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4411 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4412
a0ed51b3 4413 if (UTF)
8f6ae13c 4414 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4415 ri->regstclass = NULL;
830247a4 4416 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4417 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4418 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4419
1de06328
YO
4420 /* testing for BRANCH here tells us whether there is "must appear"
4421 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4422 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4423 I32 fake;
c5254dd6 4424 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4425 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4426 int stclass_flag;
07be1b83 4427 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4428 regnode *first= scan;
4429 regnode *first_next= regnext(first);
4430
c277df42 4431 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4432 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4433 /* An OR of *one* alternative - should not happen now. */
5339e136 4434 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4435 /* for now we can't handle lookbehind IFMATCH*/
4436 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4437 (OP(first) == PLUS) ||
4438 (OP(first) == MINMOD) ||
653099ff 4439 /* An {n,m} with n>0 */
5339e136
YO
4440 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4441 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4442 {
786e8c11 4443
a0d0e21e
LW
4444 if (OP(first) == PLUS)
4445 sawplus = 1;
4446 else
3dab1dad 4447 first += regarglen[OP(first)];
07be1b83
YO
4448 if (OP(first) == IFMATCH) {
4449 first = NEXTOPER(first);
4450 first += EXTRA_STEP_2ARGS;
7c167cea 4451 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4452 first = NEXTOPER(first);
5339e136 4453 first_next= regnext(first);
a687059c
LW
4454 }
4455
a0d0e21e
LW
4456 /* Starting-point info. */
4457 again:
786e8c11 4458 DEBUG_PEEP("first:",first,0);
07be1b83 4459 /* Ignore EXACT as we deal with it later. */
3dab1dad 4460 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4461 if (OP(first) == EXACT)
6f207bd3 4462 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4463 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4464 ri->regstclass = first;
b3c9acc1 4465 }
07be1b83 4466#ifdef TRIE_STCLASS
786e8c11 4467 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4468 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4469 {
786e8c11 4470 regnode *trie_op;
07be1b83 4471 /* this can happen only on restudy */
786e8c11 4472 if ( OP(first) == TRIE ) {
c944940b 4473 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4474 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4475 StructCopy(first,trieop,struct regnode_1);
4476 trie_op=(regnode *)trieop;
4477 } else {
c944940b 4478 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4479 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4480 StructCopy(first,trieop,struct regnode_charclass);
4481 trie_op=(regnode *)trieop;
4482 }
1de06328 4483 OP(trie_op)+=2;
786e8c11 4484 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4485 ri->regstclass = trie_op;
07be1b83
YO
4486 }
4487#endif
bfed75c6 4488 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4489 ri->regstclass = first;
3dab1dad
YO
4490 else if (PL_regkind[OP(first)] == BOUND ||
4491 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4492 ri->regstclass = first;
3dab1dad 4493 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4494 r->extflags |= (OP(first) == MBOL
4495 ? RXf_ANCH_MBOL
cad2e5aa 4496 : (OP(first) == SBOL
bbe252da
YO
4497 ? RXf_ANCH_SBOL
4498 : RXf_ANCH_BOL));
a0d0e21e 4499 first = NEXTOPER(first);
774d564b 4500 goto again;
4501 }
4502 else if (OP(first) == GPOS) {
bbe252da 4503 r->extflags |= RXf_ANCH_GPOS;
774d564b 4504 first = NEXTOPER(first);
4505 goto again;
a0d0e21e 4506 }
cf2a2b69
YO
4507 else if ((!sawopen || !RExC_sawback) &&
4508 (OP(first) == STAR &&
3dab1dad 4509 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4510 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4511 {
4512 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4513 const int type =
4514 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4515 ? RXf_ANCH_MBOL
4516 : RXf_ANCH_SBOL;
4517 r->extflags |= type;
4518 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4519 first = NEXTOPER(first);
774d564b 4520 goto again;
a0d0e21e 4521 }
b81d288d 4522 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4523 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4524 /* x+ must match at the 1st pos of run of x's */
bbe252da 4525 r->intflags |= PREGf_SKIP;
a0d0e21e 4526
c277df42 4527 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4528#ifdef TRIE_STUDY_OPT
81714fb9 4529 DEBUG_PARSE_r(
be8e71aa
YO
4530 if (!restudied)
4531 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4532 (IV)(first - scan + 1))
4533 );
4534#else
81714fb9 4535 DEBUG_PARSE_r(
be8e71aa
YO
4536 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4537 (IV)(first - scan + 1))
4538 );
4539#endif
4540
4541
a0d0e21e
LW
4542 /*
4543 * If there's something expensive in the r.e., find the
4544 * longest literal string that must appear and make it the
4545 * regmust. Resolve ties in favor of later strings, since
4546 * the regstart check works with the beginning of the r.e.
4547 * and avoiding duplication strengthens checking. Not a
4548 * strong reason, but sufficient in the absence of others.
4549 * [Now we resolve ties in favor of the earlier string if
c277df42 4550 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4551 * earlier string may buy us something the later one won't.]
4552 */
de8c5301 4553
396482e1
GA
4554 data.longest_fixed = newSVpvs("");
4555 data.longest_float = newSVpvs("");
4556 data.last_found = newSVpvs("");
c277df42
IZ
4557 data.longest = &(data.longest_fixed);
4558 first = scan;
f8fc2ecf 4559 if (!ri->regstclass) {
830247a4 4560 cl_init(pRExC_state, &ch_class);
653099ff
GS
4561 data.start_class = &ch_class;
4562 stclass_flag = SCF_DO_STCLASS_AND;
4563 } else /* XXXX Check for BOUND? */
4564 stclass_flag = 0;
cb434fcc 4565 data.last_closep = &last_close;
de8c5301 4566
1de06328 4567 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4568 &data, -1, NULL, NULL,
4569 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4570
07be1b83 4571
786e8c11
YO
4572 CHECK_RESTUDY_GOTO;
4573
4574
830247a4 4575 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4576 && data.last_start_min == 0 && data.last_end > 0
830247a4 4577 && !RExC_seen_zerolen
2bf803e2 4578 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4579 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4580 r->extflags |= RXf_CHECK_ALL;
304ee84b 4581 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4582 SvREFCNT_dec(data.last_found);
4583
1de06328
YO
4584 /* Note that code very similar to this but for anchored string
4585 follows immediately below, changes may need to be made to both.
4586 Be careful.
4587 */
a0ed51b3 4588 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4589 if (longest_float_length
c277df42
IZ
4590 || (data.flags & SF_FL_BEFORE_EOL
4591 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4592 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4593 {
1182767e 4594 I32 t,ml;
cf93c79d 4595
1de06328 4596 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4597 && data.offset_fixed == data.offset_float_min
4598 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4599 goto remove_float; /* As in (a)+. */
4600
1de06328
YO
4601 /* copy the information about the longest float from the reg_scan_data
4602 over to the program. */
33b8afdf
JH
4603 if (SvUTF8(data.longest_float)) {
4604 r->float_utf8 = data.longest_float;
c445ea15 4605 r->float_substr = NULL;
33b8afdf
JH
4606 } else {
4607 r->float_substr = data.longest_float;
c445ea15 4608 r->float_utf8 = NULL;
33b8afdf 4609 }
1de06328
YO
4610 /* float_end_shift is how many chars that must be matched that
4611 follow this item. We calculate it ahead of time as once the
4612 lookbehind offset is added in we lose the ability to correctly
4613 calculate it.*/
4614 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4615 : (I32)longest_float_length;
1de06328
YO
4616 r->float_end_shift = ml - data.offset_float_min
4617 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4618 + data.lookbehind_float;
4619 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4620 r->float_max_offset = data.offset_float_max;
1182767e 4621 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4622 r->float_max_offset -= data.lookbehind_float;
4623
cf93c79d
IZ
4624 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4625 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4626 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4627 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4628 }
4629 else {
aca2d497 4630 remove_float:
c445ea15 4631 r->float_substr = r->float_utf8 = NULL;
c277df42 4632 SvREFCNT_dec(data.longest_float);
c5254dd6 4633 longest_float_length = 0;
a0d0e21e 4634 }
c277df42 4635
1de06328
YO
4636 /* Note that code very similar to this but for floating string
4637 is immediately above, changes may need to be made to both.
4638 Be careful.
4639 */
a0ed51b3 4640 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4641 if (longest_fixed_length
c277df42
IZ
4642 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4643 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4644 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4645 {
1182767e 4646 I32 t,ml;
cf93c79d 4647
1de06328
YO
4648 /* copy the information about the longest fixed
4649 from the reg_scan_data over to the program. */
33b8afdf
JH
4650 if (SvUTF8(data.longest_fixed)) {
4651 r->anchored_utf8 = data.longest_fixed;
c445ea15 4652 r->anchored_substr = NULL;
33b8afdf
JH
4653 } else {
4654 r->anchored_substr = data.longest_fixed;
c445ea15 4655 r->anchored_utf8 = NULL;
33b8afdf 4656 }
1de06328
YO
4657 /* fixed_end_shift is how many chars that must be matched that
4658 follow this item. We calculate it ahead of time as once the
4659 lookbehind offset is added in we lose the ability to correctly
4660 calculate it.*/
4661 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4662 : (I32)longest_fixed_length;
1de06328
YO
4663 r->anchored_end_shift = ml - data.offset_fixed
4664 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4665 + data.lookbehind_fixed;
4666 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4667
cf93c79d
IZ
4668 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4669 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4670 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4671 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4672 }
4673 else {
c445ea15 4674 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4675 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4676 longest_fixed_length = 0;
a0d0e21e 4677 }
f8fc2ecf
YO
4678 if (ri->regstclass
4679 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4680 ri->regstclass = NULL;
33b8afdf
JH
4681 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4682 && stclass_flag
653099ff 4683 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4684 && !cl_is_anything(data.start_class))
4685 {
2eccd3b2 4686 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4687
f8fc2ecf 4688 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4689 struct regnode_charclass_class);
4690 StructCopy(data.start_class,
f8fc2ecf 4691 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4692 struct regnode_charclass_class);
f8fc2ecf 4693 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4694 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4695 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4696 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4697 PerlIO_printf(Perl_debug_log,
a0288114 4698 "synthetic stclass \"%s\".\n",
3f7c398e 4699 SvPVX_const(sv));});
653099ff 4700 }
c277df42
IZ
4701
4702 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4703 if (longest_fixed_length > longest_float_length) {
1de06328 4704 r->check_end_shift = r->anchored_end_shift;
c277df42 4705 r->check_substr = r->anchored_substr;
33b8afdf 4706 r->check_utf8 = r->anchored_utf8;
c277df42 4707 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4708 if (r->extflags & RXf_ANCH_SINGLE)
4709 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4710 }
4711 else {
1de06328 4712 r->check_end_shift = r->float_end_shift;
c277df42 4713 r->check_substr = r->float_substr;
33b8afdf 4714 r->check_utf8 = r->float_utf8;
1de06328
YO
4715 r->check_offset_min = r->float_min_offset;
4716 r->check_offset_max = r->float_max_offset;
a0d0e21e 4717 }
30382c73
IZ
4718 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4719 This should be changed ASAP! */
bbe252da
YO
4720 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4721 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4722 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4723 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4724 }
1de06328
YO
4725 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4726 if ( (STRLEN)minlen < longest_float_length )
4727 minlen= longest_float_length;
4728 if ( (STRLEN)minlen < longest_fixed_length )
4729 minlen= longest_fixed_length;
4730 */
a0ed51b3
LW
4731 }
4732 else {
c277df42
IZ
4733 /* Several toplevels. Best we can is to set minlen. */
4734 I32 fake;
653099ff 4735 struct regnode_charclass_class ch_class;
cb434fcc 4736 I32 last_close = 0;
c277df42 4737
5d458dd8 4738 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4739
f8fc2ecf 4740 scan = ri->program + 1;
830247a4 4741 cl_init(pRExC_state, &ch_class);
653099ff 4742 data.start_class = &ch_class;
cb434fcc 4743 data.last_closep = &last_close;
07be1b83 4744
de8c5301 4745
1de06328 4746 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4747 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4748
786e8c11 4749 CHECK_RESTUDY_GOTO;
07be1b83 4750
33b8afdf 4751 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4752 = r->float_substr = r->float_utf8 = NULL;
653099ff 4753 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4754 && !cl_is_anything(data.start_class))
4755 {
2eccd3b2 4756 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4757
f8fc2ecf 4758 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4759 struct regnode_charclass_class);
4760 StructCopy(data.start_class,
f8fc2ecf 4761 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4762 struct regnode_charclass_class);
f8fc2ecf 4763 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4764 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4765 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4766 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4767 PerlIO_printf(Perl_debug_log,
a0288114 4768 "synthetic stclass \"%s\".\n",
3f7c398e 4769 SvPVX_const(sv));});
653099ff 4770 }
a0d0e21e
LW
4771 }
4772
1de06328
YO
4773 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4774 the "real" pattern. */
cf9788e3
RGS
4775 DEBUG_OPTIMISE_r({
4776 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4777 (IV)minlen, (IV)r->minlen);
cf9788e3 4778 });
de8c5301 4779 r->minlenret = minlen;
1de06328
YO
4780 if (r->minlen < minlen)
4781 r->minlen = minlen;
4782
b81d288d 4783 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4784 r->extflags |= RXf_GPOS_SEEN;
830247a4 4785 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4786 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4787 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4788 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4789 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4790 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4791 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4792 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4793 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4794 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 4795 if (RExC_paren_names)
5daac39c 4796 RXp_PAREN_NAMES(r) = (HV*)SvREFCNT_inc(RExC_paren_names);
81714fb9 4797 else
5daac39c 4798 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 4799
7bd1e614 4800#ifdef STUPID_PATTERN_CHECKS
5509d87a 4801 if (RX_PRELEN(rx) == 0)
640f820d 4802 r->extflags |= RXf_NULL;
5509d87a 4803 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
4804 /* XXX: this should happen BEFORE we compile */
4805 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 4806 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 4807 r->extflags |= RXf_WHITE;
5509d87a 4808 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 4809 r->extflags |= RXf_START_ONLY;
f1b875a0 4810#else
5509d87a 4811 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
4812 /* XXX: this should happen BEFORE we compile */
4813 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4814 else {
4815 regnode *first = ri->program + 1;
39aa8307
JH
4816 U8 fop = OP(first);
4817 U8 nop = OP(NEXTOPER(first));
7bd1e614 4818
640f820d
AB
4819 if (PL_regkind[fop] == NOTHING && nop == END)
4820 r->extflags |= RXf_NULL;
4821 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
4822 r->extflags |= RXf_START_ONLY;
4823 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4824 r->extflags |= RXf_WHITE;
4825 }
f1b875a0 4826#endif
1f1031fe
YO
4827#ifdef DEBUGGING
4828 if (RExC_paren_names) {
4829 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4830 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4831 } else
1f1031fe 4832#endif
cde0cee5 4833 ri->name_list_idx = 0;
1f1031fe 4834
40d049e4
YO
4835 if (RExC_recurse_count) {
4836 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4837 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4838 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4839 }
4840 }
f0ab9afb 4841 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
4842 /* assume we don't need to swap parens around before we match */
4843
be8e71aa
YO
4844 DEBUG_DUMP_r({
4845 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4846 regdump(r);
4847 });
7122b237
YO
4848#ifdef RE_TRACK_PATTERN_OFFSETS
4849 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4850 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4851 U32 i;
4852 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4853 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4854 for (i = 1; i <= len; i++) {
7122b237 4855 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4856 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4857 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4858 }
4859 PerlIO_printf(Perl_debug_log, "\n");
4860 });
7122b237 4861#endif
288b8c02 4862 return rx;
a687059c
LW
4863}
4864
f9f4320a 4865#undef RE_ENGINE_PTR
3dab1dad 4866
93b32b6d 4867
81714fb9 4868SV*
192b9cd1
AB
4869Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4870 const U32 flags)
4871{
4872 PERL_UNUSED_ARG(value);
4873
f1b875a0 4874 if (flags & RXapif_FETCH) {
192b9cd1 4875 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 4876 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
192b9cd1
AB
4877 Perl_croak(aTHX_ PL_no_modify);
4878 return NULL;
f1b875a0 4879 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
4880 return reg_named_buff_exists(rx, key, flags)
4881 ? &PL_sv_yes
4882 : &PL_sv_no;
f1b875a0 4883 } else if (flags & RXapif_REGNAMES) {
192b9cd1 4884 return reg_named_buff_all(rx, flags);
f1b875a0 4885 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
4886 return reg_named_buff_scalar(rx, flags);
4887 } else {
4888 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4889 return NULL;
4890 }
4891}
4892
4893SV*
4894Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4895 const U32 flags)
4896{
4897 PERL_UNUSED_ARG(lastkey);
4898
f1b875a0 4899 if (flags & RXapif_FIRSTKEY)
192b9cd1 4900 return reg_named_buff_firstkey(rx, flags);
f1b875a0 4901 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
4902 return reg_named_buff_nextkey(rx, flags);
4903 else {
4904 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4905 return NULL;
4906 }
4907}
4908
4909SV*
288b8c02
NC
4910Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4911 const U32 flags)
81714fb9 4912{
44a2ac75
YO
4913 AV *retarray = NULL;
4914 SV *ret;
288b8c02 4915 struct regexp *const rx = (struct regexp *)SvANY(r);
f1b875a0 4916 if (flags & RXapif_ALL)
44a2ac75 4917 retarray=newAV();
93b32b6d 4918
5daac39c
NC
4919 if (rx && RXp_PAREN_NAMES(rx)) {
4920 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
4921 if (he_str) {
4922 IV i;
4923 SV* sv_dat=HeVAL(he_str);
4924 I32 *nums=(I32*)SvPVX(sv_dat);
4925 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
4926 if ((I32)(rx->nparens) >= nums[i]
4927 && rx->offs[nums[i]].start != -1
4928 && rx->offs[nums[i]].end != -1)
93b32b6d 4929 {
49d7dfbc 4930 ret = newSVpvs("");
288b8c02 4931 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
4932 if (!retarray)
4933 return ret;
4934 } else {
4935 ret = newSVsv(&PL_sv_undef);
4936 }
4937 if (retarray) {
5a5094bd 4938 SvREFCNT_inc_simple_void(ret);
93b32b6d 4939 av_push(retarray, ret);
81714fb9
YO
4940 }
4941 }
93b32b6d 4942 if (retarray)
192b9cd1
AB
4943 return newRV((SV*)retarray);
4944 }
4945 }
4946 return NULL;
4947}
4948
4949bool
288b8c02 4950Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
4951 const U32 flags)
4952{
288b8c02 4953 struct regexp *const rx = (struct regexp *)SvANY(r);
5daac39c 4954 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 4955 if (flags & RXapif_ALL) {
5daac39c 4956 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 4957 } else {
288b8c02 4958 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
4959 if (sv) {
4960 SvREFCNT_dec(sv);
192b9cd1
AB
4961 return TRUE;
4962 } else {
4963 return FALSE;
4964 }
4965 }
4966 } else {
4967 return FALSE;
4968 }
4969}
4970
4971SV*
288b8c02 4972Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 4973{
288b8c02 4974 struct regexp *const rx = (struct regexp *)SvANY(r);
5daac39c
NC
4975 if ( rx && RXp_PAREN_NAMES(rx) ) {
4976 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 4977
288b8c02 4978 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
4979 } else {
4980 return FALSE;
4981 }
192b9cd1
AB
4982}
4983
4984SV*
288b8c02 4985Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 4986{
288b8c02 4987 struct regexp *const rx = (struct regexp *)SvANY(r);
5daac39c
NC
4988 if (rx && RXp_PAREN_NAMES(rx)) {
4989 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
4990 HE *temphe;
4991 while ( (temphe = hv_iternext_flags(hv,0)) ) {
4992 IV i;
4993 IV parno = 0;
4994 SV* sv_dat = HeVAL(temphe);
4995 I32 *nums = (I32*)SvPVX(sv_dat);
4996 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
4997 if ((I32)(rx->lastcloseparen) >= nums[i] &&
4998 rx->offs[nums[i]].start != -1 &&
4999 rx->offs[nums[i]].end != -1)
5000 {
5001 parno = nums[i];
5002 break;
5003 }
5004 }
f1b875a0 5005 if (parno || flags & RXapif_ALL) {
a663657d 5006 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5007 }
81714fb9
YO
5008 }
5009 }
44a2ac75
YO
5010 return NULL;
5011}
5012
192b9cd1 5013SV*
288b8c02 5014Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5015{
5016 SV *ret;
5017 AV *av;
5018 I32 length;
288b8c02 5019 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5020
5daac39c 5021 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5022 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5023 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5024 } else if (flags & RXapif_ONE) {
288b8c02 5025 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
192b9cd1
AB
5026 av = (AV*)SvRV(ret);
5027 length = av_len(av);
5028 return newSViv(length + 1);
5029 } else {
5030 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5031 return NULL;
5032 }
5033 }
5034 return &PL_sv_undef;
5035}
5036
5037SV*
288b8c02 5038Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5039{
288b8c02 5040 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5041 AV *av = newAV();
5042
5daac39c
NC
5043 if (rx && RXp_PAREN_NAMES(rx)) {
5044 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5045 HE *temphe;
5046 (void)hv_iterinit(hv);
5047 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5048 IV i;
5049 IV parno = 0;
5050 SV* sv_dat = HeVAL(temphe);
5051 I32 *nums = (I32*)SvPVX(sv_dat);
5052 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5053 if ((I32)(rx->lastcloseparen) >= nums[i] &&
5054 rx->offs[nums[i]].start != -1 &&
5055 rx->offs[nums[i]].end != -1)
5056 {
5057 parno = nums[i];
5058 break;
5059 }
5060 }
f1b875a0 5061 if (parno || flags & RXapif_ALL) {
a663657d 5062 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5063 }
5064 }
5065 }
5066
5067 return newRV((SV*)av);
5068}
5069
49d7dfbc 5070void
288b8c02
NC
5071Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5072 SV * const sv)
44a2ac75 5073{
288b8c02 5074 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5075 char *s = NULL;
a9d504c3 5076 I32 i = 0;
44a2ac75 5077 I32 s1, t1;
44a2ac75 5078
cde0cee5
YO
5079 if (!rx->subbeg) {
5080 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5081 return;
cde0cee5
YO
5082 }
5083 else
f1b875a0 5084 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5085 /* $` */
f0ab9afb 5086 i = rx->offs[0].start;
cde0cee5 5087 s = rx->subbeg;
44a2ac75
YO
5088 }
5089 else
f1b875a0 5090 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5091 /* $' */
f0ab9afb
NC
5092 s = rx->subbeg + rx->offs[0].end;
5093 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5094 }
5095 else
5096 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5097 (s1 = rx->offs[paren].start) != -1 &&
5098 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5099 {
5100 /* $& $1 ... */
5101 i = t1 - s1;
5102 s = rx->subbeg + s1;
cde0cee5
YO
5103 } else {
5104 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5105 return;
cde0cee5
YO
5106 }
5107 assert(rx->sublen >= (s - rx->subbeg) + i );
5108 if (i >= 0) {
5109 const int oldtainted = PL_tainted;
5110 TAINT_NOT;
5111 sv_setpvn(sv, s, i);
5112 PL_tainted = oldtainted;
5113 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5114 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5115 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5116 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5117 {
5118 SvUTF8_on(sv);
5119 }
5120 else
5121 SvUTF8_off(sv);
5122 if (PL_tainting) {
07bc277f 5123 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5124 if (SvTYPE(sv) >= SVt_PVMG) {
5125 MAGIC* const mg = SvMAGIC(sv);
5126 MAGIC* mgt;
5127 PL_tainted = 1;
5128 SvMAGIC_set(sv, mg->mg_moremagic);
5129 SvTAINT(sv);
5130 if ((mgt = SvMAGIC(sv))) {
5131 mg->mg_moremagic = mgt;
5132 SvMAGIC_set(sv, mg);
44a2ac75 5133 }
cde0cee5
YO
5134 } else {
5135 PL_tainted = 1;
5136 SvTAINT(sv);
5137 }
5138 } else
5139 SvTAINTED_off(sv);
44a2ac75 5140 }
81714fb9 5141 } else {
44a2ac75 5142 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5143 return;
81714fb9
YO
5144 }
5145}
93b32b6d 5146
2fdbfb4d
AB
5147void
5148Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5149 SV const * const value)
5150{
5151 PERL_UNUSED_ARG(rx);
5152 PERL_UNUSED_ARG(paren);
5153 PERL_UNUSED_ARG(value);
5154
5155 if (!PL_localizing)
5156 Perl_croak(aTHX_ PL_no_modify);
5157}
5158
5159I32
288b8c02 5160Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5161 const I32 paren)
5162{
288b8c02 5163 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5164 I32 i;
5165 I32 s1, t1;
5166
5167 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5168 switch (paren) {
192b9cd1 5169 /* $` / ${^PREMATCH} */
f1b875a0 5170 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5171 if (rx->offs[0].start != -1) {
5172 i = rx->offs[0].start;
5173 if (i > 0) {
5174 s1 = 0;
5175 t1 = i;
5176 goto getlen;
5177 }
5178 }
5179 return 0;
192b9cd1 5180 /* $' / ${^POSTMATCH} */
f1b875a0 5181 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5182 if (rx->offs[0].end != -1) {
5183 i = rx->sublen - rx->offs[0].end;
5184 if (i > 0) {
5185 s1 = rx->offs[0].end;
5186 t1 = rx->sublen;
5187 goto getlen;
5188 }
5189 }
5190 return 0;
192b9cd1
AB
5191 /* $& / ${^MATCH}, $1, $2, ... */
5192 default:
2fdbfb4d
AB
5193 if (paren <= (I32)rx->nparens &&
5194 (s1 = rx->offs[paren].start) != -1 &&
5195 (t1 = rx->offs[paren].end) != -1)
5196 {
5197 i = t1 - s1;
5198 goto getlen;
5199 } else {
5200 if (ckWARN(WARN_UNINITIALIZED))
5201 report_uninit((SV*)sv);
5202 return 0;
5203 }
5204 }
5205 getlen:
07bc277f 5206 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5207 const char * const s = rx->subbeg + s1;
5208 const U8 *ep;
5209 STRLEN el;
5210
5211 i = t1 - s1;
5212 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5213 i = el;
5214 }
5215 return i;
5216}
5217
fe578d7f 5218SV*
49d7dfbc 5219Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f
AB
5220{
5221 PERL_UNUSED_ARG(rx);
5c35adbb 5222 return NULL;
fe578d7f 5223}
0a4db386 5224
894be9b7 5225/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5226 * If flags is REG_RSN_RETURN_NULL returns null.
5227 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5228 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5229 * to the parsed name as looked up in the RExC_paren_names hash.
5230 * If there is an error throws a vFAIL().. type exception.
894be9b7 5231 */
0a4db386
YO
5232
5233#define REG_RSN_RETURN_NULL 0
5234#define REG_RSN_RETURN_NAME 1
5235#define REG_RSN_RETURN_DATA 2
5236
894be9b7
YO
5237STATIC SV*
5238S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
5239 char *name_start = RExC_parse;
1f1031fe
YO
5240
5241 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5242 /* skip IDFIRST by using do...while */
5243 if (UTF)
5244 do {
5245 RExC_parse += UTF8SKIP(RExC_parse);
5246 } while (isALNUM_utf8((U8*)RExC_parse));
5247 else
5248 do {
5249 RExC_parse++;
5250 } while (isALNUM(*RExC_parse));
894be9b7 5251 }
1f1031fe 5252
0a4db386 5253 if ( flags ) {
59cd0e26
NC
5254 SV* sv_name
5255 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5256 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5257 if ( flags == REG_RSN_RETURN_NAME)
5258 return sv_name;
5259 else if (flags==REG_RSN_RETURN_DATA) {
5260 HE *he_str = NULL;
5261 SV *sv_dat = NULL;
5262 if ( ! sv_name ) /* should not happen*/
5263 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5264 if (RExC_paren_names)
5265 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5266 if ( he_str )
5267 sv_dat = HeVAL(he_str);
5268 if ( ! sv_dat )
5269 vFAIL("Reference to nonexistent named group");
5270 return sv_dat;
5271 }
5272 else {
5273 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5274 }
5275 /* NOT REACHED */
894be9b7 5276 }
0a4db386 5277 return NULL;
894be9b7
YO
5278}
5279
3dab1dad
YO
5280#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5281 int rem=(int)(RExC_end - RExC_parse); \
5282 int cut; \
5283 int num; \
5284 int iscut=0; \
5285 if (rem>10) { \
5286 rem=10; \
5287 iscut=1; \
5288 } \
5289 cut=10-rem; \
5290 if (RExC_lastparse!=RExC_parse) \
5291 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5292 rem, RExC_parse, \
5293 cut + 4, \
5294 iscut ? "..." : "<" \
5295 ); \
5296 else \
5297 PerlIO_printf(Perl_debug_log,"%16s",""); \
5298 \
5299 if (SIZE_ONLY) \
3b57cd43 5300 num = RExC_size + 1; \
3dab1dad
YO
5301 else \
5302 num=REG_NODE_NUM(RExC_emit); \
5303 if (RExC_lastnum!=num) \
0a4db386 5304 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5305 else \
0a4db386 5306 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5307 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5308 (int)((depth*2)), "", \
3dab1dad
YO
5309 (funcname) \
5310 ); \
5311 RExC_lastnum=num; \
5312 RExC_lastparse=RExC_parse; \
5313})
5314
07be1b83
YO
5315
5316
3dab1dad
YO
5317#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5318 DEBUG_PARSE_MSG((funcname)); \
5319 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5320})
6bda09f9
YO
5321#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5322 DEBUG_PARSE_MSG((funcname)); \
5323 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5324})
a687059c
LW
5325/*
5326 - reg - regular expression, i.e. main body or parenthesized thing
5327 *
5328 * Caller must absorb opening parenthesis.
5329 *
5330 * Combining parenthesis handling with the base level of regular expression
5331 * is a trifle forced, but the need to tie the tails of the branches to what
5332 * follows makes it hard to avoid.
5333 */
07be1b83
YO
5334#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5335#ifdef DEBUGGING
5336#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5337#else
5338#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5339#endif
3dab1dad 5340
76e3520e 5341STATIC regnode *
3dab1dad 5342S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5343 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5344{
27da23d5 5345 dVAR;
c277df42
IZ
5346 register regnode *ret; /* Will be the head of the group. */
5347 register regnode *br;
5348 register regnode *lastbr;
cbbf8932 5349 register regnode *ender = NULL;
a0d0e21e 5350 register I32 parno = 0;
cbbf8932 5351 I32 flags;
f7819f85 5352 U32 oregflags = RExC_flags;
6136c704
AL
5353 bool have_branch = 0;
5354 bool is_open = 0;
594d7033
YO
5355 I32 freeze_paren = 0;
5356 I32 after_freeze = 0;
9d1d55b5
JP
5357
5358 /* for (?g), (?gc), and (?o) warnings; warning
5359 about (?c) will warn about (?g) -- japhy */
5360
6136c704
AL
5361#define WASTED_O 0x01
5362#define WASTED_G 0x02
5363#define WASTED_C 0x04
5364#define WASTED_GC (0x02|0x04)
cbbf8932 5365 I32 wastedflags = 0x00;
9d1d55b5 5366
fac92740 5367 char * parse_start = RExC_parse; /* MJD */
a28509cc 5368 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5369
3dab1dad
YO
5370 GET_RE_DEBUG_FLAGS_DECL;
5371 DEBUG_PARSE("reg ");
5372
821b33a5 5373 *flagp = 0; /* Tentatively. */
a0d0e21e 5374
9d1d55b5 5375
a0d0e21e
LW
5376 /* Make an OPEN node, if parenthesized. */
5377 if (paren) {
e2e6a0f1
YO
5378 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5379 char *start_verb = RExC_parse;
5380 STRLEN verb_len = 0;
5381 char *start_arg = NULL;
5382 unsigned char op = 0;
5383 int argok = 1;
5384 int internal_argval = 0; /* internal_argval is only useful if !argok */
5385 while ( *RExC_parse && *RExC_parse != ')' ) {
5386 if ( *RExC_parse == ':' ) {
5387 start_arg = RExC_parse + 1;
5388 break;
5389 }
5390 RExC_parse++;
5391 }
5392 ++start_verb;
5393 verb_len = RExC_parse - start_verb;
5394 if ( start_arg ) {
5395 RExC_parse++;
5396 while ( *RExC_parse && *RExC_parse != ')' )
5397 RExC_parse++;
5398 if ( *RExC_parse != ')' )
5399 vFAIL("Unterminated verb pattern argument");
5400 if ( RExC_parse == start_arg )
5401 start_arg = NULL;
5402 } else {
5403 if ( *RExC_parse != ')' )
5404 vFAIL("Unterminated verb pattern");
5405 }
5d458dd8 5406
e2e6a0f1
YO
5407 switch ( *start_verb ) {
5408 case 'A': /* (*ACCEPT) */
568a785a 5409 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5410 op = ACCEPT;
5411 internal_argval = RExC_nestroot;
5412 }
5413 break;
5414 case 'C': /* (*COMMIT) */
568a785a 5415 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5416 op = COMMIT;
e2e6a0f1
YO
5417 break;
5418 case 'F': /* (*FAIL) */
568a785a 5419 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5420 op = OPFAIL;
5421 argok = 0;
5422 }
5423 break;
5d458dd8
YO
5424 case ':': /* (*:NAME) */
5425 case 'M': /* (*MARK:NAME) */
568a785a 5426 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5427 op = MARKPOINT;
5d458dd8
YO
5428 argok = -1;
5429 }
5430 break;
5431 case 'P': /* (*PRUNE) */
568a785a 5432 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5433 op = PRUNE;
e2e6a0f1 5434 break;
5d458dd8 5435 case 'S': /* (*SKIP) */
568a785a 5436 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5437 op = SKIP;
5438 break;
5439 case 'T': /* (*THEN) */
5440 /* [19:06] <TimToady> :: is then */
568a785a 5441 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5442 op = CUTGROUP;
5443 RExC_seen |= REG_SEEN_CUTGROUP;
5444 }
e2e6a0f1
YO
5445 break;
5446 }
5447 if ( ! op ) {
5448 RExC_parse++;
5449 vFAIL3("Unknown verb pattern '%.*s'",
5450 verb_len, start_verb);
5451 }
5452 if ( argok ) {
5453 if ( start_arg && internal_argval ) {
5454 vFAIL3("Verb pattern '%.*s' may not have an argument",
5455 verb_len, start_verb);
5456 } else if ( argok < 0 && !start_arg ) {
5457 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5458 verb_len, start_verb);
5459 } else {
5460 ret = reganode(pRExC_state, op, internal_argval);
5461 if ( ! internal_argval && ! SIZE_ONLY ) {
5462 if (start_arg) {
5463 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5464 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5465 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5466 ret->flags = 0;
5467 } else {
5468 ret->flags = 1;
5469 }
5470 }
5471 }
5472 if (!internal_argval)
5473 RExC_seen |= REG_SEEN_VERBARG;
5474 } else if ( start_arg ) {
5475 vFAIL3("Verb pattern '%.*s' may not have an argument",
5476 verb_len, start_verb);
5477 } else {
5478 ret = reg_node(pRExC_state, op);
5479 }
5480 nextchar(pRExC_state);
5481 return ret;
5482 } else
fac92740 5483 if (*RExC_parse == '?') { /* (?...) */
6136c704 5484 bool is_logical = 0;
a28509cc 5485 const char * const seqstart = RExC_parse;
ca9dfc88 5486
830247a4
IZ
5487 RExC_parse++;
5488 paren = *RExC_parse++;
c277df42 5489 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5490 switch (paren) {
894be9b7 5491
1f1031fe
YO
5492 case 'P': /* (?P...) variants for those used to PCRE/Python */
5493 paren = *RExC_parse++;
5494 if ( paren == '<') /* (?P<...>) named capture */
5495 goto named_capture;
5496 else if (paren == '>') { /* (?P>name) named recursion */
5497 goto named_recursion;
5498 }
5499 else if (paren == '=') { /* (?P=...) named backref */
5500 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5501 you change this make sure you change that */
5502 char* name_start = RExC_parse;
5503 U32 num = 0;
5504 SV *sv_dat = reg_scan_name(pRExC_state,
5505 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5506 if (RExC_parse == name_start || *RExC_parse != ')')
5507 vFAIL2("Sequence %.3s... not terminated",parse_start);
5508
5509 if (!SIZE_ONLY) {
5510 num = add_data( pRExC_state, 1, "S" );
5511 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5512 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5513 }
5514 RExC_sawback = 1;
5515 ret = reganode(pRExC_state,
5516 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5517 num);
5518 *flagp |= HASWIDTH;
5519
5520 Set_Node_Offset(ret, parse_start+1);
5521 Set_Node_Cur_Length(ret); /* MJD */
5522
5523 nextchar(pRExC_state);
5524 return ret;
5525 }
57b84237
YO
5526 RExC_parse++;
5527 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5528 /*NOTREACHED*/
5529 case '<': /* (?<...) */
b81d288d 5530 if (*RExC_parse == '!')
c277df42 5531 paren = ',';
0a4db386 5532 else if (*RExC_parse != '=')
1f1031fe 5533 named_capture:
0a4db386 5534 { /* (?<...>) */
81714fb9 5535 char *name_start;
894be9b7 5536 SV *svname;
81714fb9
YO
5537 paren= '>';
5538 case '\'': /* (?'...') */
5539 name_start= RExC_parse;
0a4db386
YO
5540 svname = reg_scan_name(pRExC_state,
5541 SIZE_ONLY ? /* reverse test from the others */
5542 REG_RSN_RETURN_NAME :
5543 REG_RSN_RETURN_NULL);
57b84237
YO
5544 if (RExC_parse == name_start) {
5545 RExC_parse++;
5546 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5547 /*NOTREACHED*/
5548 }
81714fb9
YO
5549 if (*RExC_parse != paren)
5550 vFAIL2("Sequence (?%c... not terminated",
5551 paren=='>' ? '<' : paren);
5552 if (SIZE_ONLY) {
e62cc96a
YO
5553 HE *he_str;
5554 SV *sv_dat = NULL;
894be9b7
YO
5555 if (!svname) /* shouldnt happen */
5556 Perl_croak(aTHX_
5557 "panic: reg_scan_name returned NULL");
81714fb9
YO
5558 if (!RExC_paren_names) {
5559 RExC_paren_names= newHV();
5560 sv_2mortal((SV*)RExC_paren_names);
1f1031fe
YO
5561#ifdef DEBUGGING
5562 RExC_paren_name_list= newAV();
5563 sv_2mortal((SV*)RExC_paren_name_list);
5564#endif
81714fb9
YO
5565 }
5566 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5567 if ( he_str )
81714fb9 5568 sv_dat = HeVAL(he_str);
e62cc96a 5569 if ( ! sv_dat ) {
81714fb9 5570 /* croak baby croak */
e62cc96a
YO
5571 Perl_croak(aTHX_
5572 "panic: paren_name hash element allocation failed");
5573 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5574 /* (?|...) can mean we have dupes so scan to check
5575 its already been stored. Maybe a flag indicating
5576 we are inside such a construct would be useful,
5577 but the arrays are likely to be quite small, so
5578 for now we punt -- dmq */
5579 IV count = SvIV(sv_dat);
5580 I32 *pv = (I32*)SvPVX(sv_dat);
5581 IV i;
5582 for ( i = 0 ; i < count ; i++ ) {
5583 if ( pv[i] == RExC_npar ) {
5584 count = 0;
5585 break;
5586 }
5587 }
5588 if ( count ) {
5589 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5590 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5591 pv[count] = RExC_npar;
5592 SvIVX(sv_dat)++;
5593 }
81714fb9
YO
5594 } else {
5595 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5596 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5597 SvIOK_on(sv_dat);
5598 SvIVX(sv_dat)= 1;
e62cc96a 5599 }
1f1031fe
YO
5600#ifdef DEBUGGING
5601 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5602 SvREFCNT_dec(svname);
5603#endif
e62cc96a 5604
81714fb9
YO
5605 /*sv_dump(sv_dat);*/
5606 }
5607 nextchar(pRExC_state);
5608 paren = 1;
5609 goto capturing_parens;
5610 }
5611 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5612 RExC_parse++;
fac92740
MJD
5613 case '=': /* (?=...) */
5614 case '!': /* (?!...) */
830247a4 5615 RExC_seen_zerolen++;
e2e6a0f1
YO
5616 if (*RExC_parse == ')') {
5617 ret=reg_node(pRExC_state, OPFAIL);
5618 nextchar(pRExC_state);
5619 return ret;
5620 }
594d7033
YO
5621 break;
5622 case '|': /* (?|...) */
5623 /* branch reset, behave like a (?:...) except that
5624 buffers in alternations share the same numbers */
5625 paren = ':';
5626 after_freeze = freeze_paren = RExC_npar;
5627 break;
fac92740
MJD
5628 case ':': /* (?:...) */
5629 case '>': /* (?>...) */
a0d0e21e 5630 break;
fac92740
MJD
5631 case '$': /* (?$...) */
5632 case '@': /* (?@...) */
8615cb43 5633 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5634 break;
fac92740 5635 case '#': /* (?#...) */
830247a4
IZ
5636 while (*RExC_parse && *RExC_parse != ')')
5637 RExC_parse++;
5638 if (*RExC_parse != ')')
c277df42 5639 FAIL("Sequence (?#... not terminated");
830247a4 5640 nextchar(pRExC_state);
a0d0e21e
LW
5641 *flagp = TRYAGAIN;
5642 return NULL;
894be9b7
YO
5643 case '0' : /* (?0) */
5644 case 'R' : /* (?R) */
5645 if (*RExC_parse != ')')
6bda09f9 5646 FAIL("Sequence (?R) not terminated");
1a147d38 5647 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5648 *flagp |= POSTPONED;
7f69552c
YO
5649 nextchar(pRExC_state);
5650 return ret;
5651 /*notreached*/
894be9b7
YO
5652 { /* named and numeric backreferences */
5653 I32 num;
894be9b7
YO
5654 case '&': /* (?&NAME) */
5655 parse_start = RExC_parse - 1;
1f1031fe 5656 named_recursion:
894be9b7 5657 {
0a4db386
YO
5658 SV *sv_dat = reg_scan_name(pRExC_state,
5659 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5660 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5661 }
5662 goto gen_recurse_regop;
5663 /* NOT REACHED */
542fa716
YO
5664 case '+':
5665 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5666 RExC_parse++;
5667 vFAIL("Illegal pattern");
5668 }
5669 goto parse_recursion;
5670 /* NOT REACHED*/
5671 case '-': /* (?-1) */
5672 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5673 RExC_parse--; /* rewind to let it be handled later */
5674 goto parse_flags;
5675 }
5676 /*FALLTHROUGH */
6bda09f9
YO
5677 case '1': case '2': case '3': case '4': /* (?1) */
5678 case '5': case '6': case '7': case '8': case '9':
5679 RExC_parse--;
542fa716 5680 parse_recursion:
894be9b7
YO
5681 num = atoi(RExC_parse);
5682 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5683 if (*RExC_parse == '-')
5684 RExC_parse++;
6bda09f9
YO
5685 while (isDIGIT(*RExC_parse))
5686 RExC_parse++;
5687 if (*RExC_parse!=')')
5688 vFAIL("Expecting close bracket");
894be9b7
YO
5689
5690 gen_recurse_regop:
542fa716
YO
5691 if ( paren == '-' ) {
5692 /*
5693 Diagram of capture buffer numbering.
5694 Top line is the normal capture buffer numbers
5695 Botton line is the negative indexing as from
5696 the X (the (?-2))
5697
5698 + 1 2 3 4 5 X 6 7
5699 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5700 - 5 4 3 2 1 X x x
5701
5702 */
5703 num = RExC_npar + num;
5704 if (num < 1) {
5705 RExC_parse++;
5706 vFAIL("Reference to nonexistent group");
5707 }
5708 } else if ( paren == '+' ) {
5709 num = RExC_npar + num - 1;
5710 }
5711
1a147d38 5712 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5713 if (!SIZE_ONLY) {
5714 if (num > (I32)RExC_rx->nparens) {
5715 RExC_parse++;
5716 vFAIL("Reference to nonexistent group");
5717 }
40d049e4 5718 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5719 RExC_emit++;
226de585 5720 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5721 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5722 } else {
6bda09f9 5723 RExC_size++;
6bda09f9 5724 }
0a4db386 5725 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5726 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5727 Set_Node_Offset(ret, parse_start); /* MJD */
5728
a3b492c3 5729 *flagp |= POSTPONED;
6bda09f9
YO
5730 nextchar(pRExC_state);
5731 return ret;
894be9b7
YO
5732 } /* named and numeric backreferences */
5733 /* NOT REACHED */
5734
fac92740 5735 case '?': /* (??...) */
6136c704 5736 is_logical = 1;
57b84237
YO
5737 if (*RExC_parse != '{') {
5738 RExC_parse++;
5739 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5740 /*NOTREACHED*/
5741 }
a3b492c3 5742 *flagp |= POSTPONED;
830247a4 5743 paren = *RExC_parse++;
0f5d15d6 5744 /* FALL THROUGH */
fac92740 5745 case '{': /* (?{...}) */
c277df42 5746 {
2eccd3b2
NC
5747 I32 count = 1;
5748 U32 n = 0;
c277df42 5749 char c;
830247a4 5750 char *s = RExC_parse;
c277df42 5751
830247a4
IZ
5752 RExC_seen_zerolen++;
5753 RExC_seen |= REG_SEEN_EVAL;
5754 while (count && (c = *RExC_parse)) {
6136c704
AL
5755 if (c == '\\') {
5756 if (RExC_parse[1])
5757 RExC_parse++;
5758 }
b81d288d 5759 else if (c == '{')
c277df42 5760 count++;
b81d288d 5761 else if (c == '}')
c277df42 5762 count--;
830247a4 5763 RExC_parse++;
c277df42 5764 }
6136c704 5765 if (*RExC_parse != ')') {
b81d288d 5766 RExC_parse = s;
b45f050a
JF
5767 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5768 }
c277df42 5769 if (!SIZE_ONLY) {
f3548bdc 5770 PAD *pad;
6136c704
AL
5771 OP_4tree *sop, *rop;
5772 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5773
569233ed
SB
5774 ENTER;
5775 Perl_save_re_context(aTHX);
f3548bdc 5776 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5777 sop->op_private |= OPpREFCOUNTED;
5778 /* re_dup will OpREFCNT_inc */
5779 OpREFCNT_set(sop, 1);
569233ed 5780 LEAVE;
c277df42 5781
830247a4 5782 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5783 RExC_rxi->data->data[n] = (void*)rop;
5784 RExC_rxi->data->data[n+1] = (void*)sop;
5785 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5786 SvREFCNT_dec(sv);
a0ed51b3 5787 }
e24b16f9 5788 else { /* First pass */
830247a4 5789 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5790 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5791 /* No compiled RE interpolated, has runtime
5792 components ===> unsafe. */
5793 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5794 if (PL_tainting && PL_tainted)
cc6b7395 5795 FAIL("Eval-group in insecure regular expression");
54df2634 5796#if PERL_VERSION > 8
923e4eb5 5797 if (IN_PERL_COMPILETIME)
b5c19bd7 5798 PL_cv_has_eval = 1;
54df2634 5799#endif
c277df42 5800 }
b5c19bd7 5801
830247a4 5802 nextchar(pRExC_state);
6136c704 5803 if (is_logical) {
830247a4 5804 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5805 if (!SIZE_ONLY)
5806 ret->flags = 2;
3dab1dad 5807 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5808 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5809 return ret;
5810 }
ccb2c380
MP
5811 ret = reganode(pRExC_state, EVAL, n);
5812 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5813 Set_Node_Offset(ret, parse_start);
5814 return ret;
c277df42 5815 }
fac92740 5816 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5817 {
0a4db386 5818 int is_define= 0;
fac92740 5819 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5820 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5821 || RExC_parse[1] == '<'
830247a4 5822 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5823 I32 flag;
5824
830247a4 5825 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5826 if (!SIZE_ONLY)
5827 ret->flags = 1;
3dab1dad 5828 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5829 goto insert_if;
b81d288d 5830 }
a0ed51b3 5831 }
0a4db386
YO
5832 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5833 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5834 {
5835 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5836 char *name_start= RExC_parse++;
2eccd3b2 5837 U32 num = 0;
0a4db386
YO
5838 SV *sv_dat=reg_scan_name(pRExC_state,
5839 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5840 if (RExC_parse == name_start || *RExC_parse != ch)
5841 vFAIL2("Sequence (?(%c... not terminated",
5842 (ch == '>' ? '<' : ch));
5843 RExC_parse++;
5844 if (!SIZE_ONLY) {
5845 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5846 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5847 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
5848 }
5849 ret = reganode(pRExC_state,NGROUPP,num);
5850 goto insert_if_check_paren;
5851 }
5852 else if (RExC_parse[0] == 'D' &&
5853 RExC_parse[1] == 'E' &&
5854 RExC_parse[2] == 'F' &&
5855 RExC_parse[3] == 'I' &&
5856 RExC_parse[4] == 'N' &&
5857 RExC_parse[5] == 'E')
5858 {
5859 ret = reganode(pRExC_state,DEFINEP,0);
5860 RExC_parse +=6 ;
5861 is_define = 1;
5862 goto insert_if_check_paren;
5863 }
5864 else if (RExC_parse[0] == 'R') {
5865 RExC_parse++;
5866 parno = 0;
5867 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5868 parno = atoi(RExC_parse++);
5869 while (isDIGIT(*RExC_parse))
5870 RExC_parse++;
5871 } else if (RExC_parse[0] == '&') {
5872 SV *sv_dat;
5873 RExC_parse++;
5874 sv_dat = reg_scan_name(pRExC_state,
5875 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5876 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5877 }
1a147d38 5878 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5879 goto insert_if_check_paren;
5880 }
830247a4 5881 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5882 /* (?(1)...) */
6136c704 5883 char c;
830247a4 5884 parno = atoi(RExC_parse++);
c277df42 5885
830247a4
IZ
5886 while (isDIGIT(*RExC_parse))
5887 RExC_parse++;
fac92740 5888 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5889
0a4db386 5890 insert_if_check_paren:
830247a4 5891 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5892 vFAIL("Switch condition not recognized");
c277df42 5893 insert_if:
3dab1dad
YO
5894 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5895 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5896 if (br == NULL)
830247a4 5897 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5898 else
3dab1dad 5899 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5900 c = *nextchar(pRExC_state);
d1b80229
IZ
5901 if (flags&HASWIDTH)
5902 *flagp |= HASWIDTH;
c277df42 5903 if (c == '|') {
0a4db386
YO
5904 if (is_define)
5905 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5906 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5907 regbranch(pRExC_state, &flags, 1,depth+1);
5908 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5909 if (flags&HASWIDTH)
5910 *flagp |= HASWIDTH;
830247a4 5911 c = *nextchar(pRExC_state);
a0ed51b3
LW
5912 }
5913 else
c277df42
IZ
5914 lastbr = NULL;
5915 if (c != ')')
8615cb43 5916 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5917 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5918 REGTAIL(pRExC_state, br, ender);
c277df42 5919 if (lastbr) {
3dab1dad
YO
5920 REGTAIL(pRExC_state, lastbr, ender);
5921 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5922 }
5923 else
3dab1dad 5924 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
5925 RExC_size++; /* XXX WHY do we need this?!!
5926 For large programs it seems to be required
5927 but I can't figure out why. -- dmq*/
c277df42 5928 return ret;
a0ed51b3
LW
5929 }
5930 else {
830247a4 5931 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5932 }
5933 }
1b1626e4 5934 case 0:
830247a4 5935 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5936 vFAIL("Sequence (? incomplete");
1b1626e4 5937 break;
a0d0e21e 5938 default:
cde0cee5
YO
5939 --RExC_parse;
5940 parse_flags: /* (?i) */
5941 {
5942 U32 posflags = 0, negflags = 0;
5943 U32 *flagsp = &posflags;
5944
5945 while (*RExC_parse) {
5946 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
5947 /* (?g), (?gc) and (?o) are useless here
5948 and must be globally applied -- japhy */
cde0cee5
YO
5949 switch (*RExC_parse) {
5950 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
f7819f85
A
5951 case ONCE_PAT_MOD: /* 'o' */
5952 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 5953 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5954 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5955 if (! (wastedflags & wflagbit) ) {
5956 wastedflags |= wflagbit;
5957 vWARN5(
5958 RExC_parse + 1,
5959 "Useless (%s%c) - %suse /%c modifier",
5960 flagsp == &negflags ? "?-" : "?",
5961 *RExC_parse,
5962 flagsp == &negflags ? "don't " : "",
5963 *RExC_parse
5964 );
5965 }
5966 }
cde0cee5
YO
5967 break;
5968
f7819f85 5969 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 5970 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5971 if (! (wastedflags & WASTED_C) ) {
5972 wastedflags |= WASTED_GC;
9d1d55b5
JP
5973 vWARN3(
5974 RExC_parse + 1,
5975 "Useless (%sc) - %suse /gc modifier",
5976 flagsp == &negflags ? "?-" : "?",
5977 flagsp == &negflags ? "don't " : ""
5978 );
5979 }
5980 }
cde0cee5 5981 break;
f7819f85 5982 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5
YO
5983 if (flagsp == &negflags) {
5984 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
f7819f85 5985 vWARN(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
5986 } else {
5987 *flagsp |= RXf_PMf_KEEPCOPY;
5988 }
5989 break;
5990 case '-':
57b84237
YO
5991 if (flagsp == &negflags) {
5992 RExC_parse++;
5993 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5994 /*NOTREACHED*/
5995 }
cde0cee5
YO
5996 flagsp = &negflags;
5997 wastedflags = 0; /* reset so (?g-c) warns twice */
5998 break;
5999 case ':':
6000 paren = ':';
6001 /*FALLTHROUGH*/
6002 case ')':
6003 RExC_flags |= posflags;
6004 RExC_flags &= ~negflags;
f7819f85
A
6005 if (paren != ':') {
6006 oregflags |= posflags;
6007 oregflags &= ~negflags;
6008 }
cde0cee5
YO
6009 nextchar(pRExC_state);
6010 if (paren != ':') {
6011 *flagp = TRYAGAIN;
6012 return NULL;
6013 } else {
6014 ret = NULL;
6015 goto parse_rest;
6016 }
6017 /*NOTREACHED*/
6018 default:
cde0cee5
YO
6019 RExC_parse++;
6020 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6021 /*NOTREACHED*/
6022 }
830247a4 6023 ++RExC_parse;
48c036b1 6024 }
cde0cee5 6025 }} /* one for the default block, one for the switch */
a0d0e21e 6026 }
fac92740 6027 else { /* (...) */
81714fb9 6028 capturing_parens:
830247a4
IZ
6029 parno = RExC_npar;
6030 RExC_npar++;
e2e6a0f1 6031
830247a4 6032 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6033 if (!SIZE_ONLY ){
6034 if (!RExC_nestroot)
6035 RExC_nestroot = parno;
c009da3d
YO
6036 if (RExC_seen & REG_SEEN_RECURSE
6037 && !RExC_open_parens[parno-1])
6038 {
e2e6a0f1 6039 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6040 "Setting open paren #%"IVdf" to %d\n",
6041 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6042 RExC_open_parens[parno-1]= ret;
6043 }
6bda09f9 6044 }
fac92740
MJD
6045 Set_Node_Length(ret, 1); /* MJD */
6046 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6047 is_open = 1;
a0d0e21e 6048 }
a0ed51b3 6049 }
fac92740 6050 else /* ! paren */
a0d0e21e 6051 ret = NULL;
cde0cee5
YO
6052
6053 parse_rest:
a0d0e21e 6054 /* Pick up the branches, linking them together. */
fac92740 6055 parse_start = RExC_parse; /* MJD */
3dab1dad 6056 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 6057 /* branch_len = (paren != 0); */
2af232bd 6058
a0d0e21e
LW
6059 if (br == NULL)
6060 return(NULL);
830247a4
IZ
6061 if (*RExC_parse == '|') {
6062 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6063 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6064 }
fac92740 6065 else { /* MJD */
6bda09f9 6066 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6067 Set_Node_Length(br, paren != 0);
6068 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6069 }
c277df42
IZ
6070 have_branch = 1;
6071 if (SIZE_ONLY)
830247a4 6072 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6073 }
6074 else if (paren == ':') {
c277df42
IZ
6075 *flagp |= flags&SIMPLE;
6076 }
6136c704 6077 if (is_open) { /* Starts with OPEN. */
3dab1dad 6078 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6079 }
6080 else if (paren != '?') /* Not Conditional */
a0d0e21e 6081 ret = br;
8ae10a67 6082 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6083 lastbr = br;
830247a4
IZ
6084 while (*RExC_parse == '|') {
6085 if (!SIZE_ONLY && RExC_extralen) {
6086 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6087 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6088 }
6089 if (SIZE_ONLY)
830247a4
IZ
6090 RExC_extralen += 2; /* Account for LONGJMP. */
6091 nextchar(pRExC_state);
594d7033
YO
6092 if (freeze_paren) {
6093 if (RExC_npar > after_freeze)
6094 after_freeze = RExC_npar;
6095 RExC_npar = freeze_paren;
6096 }
3dab1dad 6097 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6098
a687059c 6099 if (br == NULL)
a0d0e21e 6100 return(NULL);
3dab1dad 6101 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6102 lastbr = br;
8ae10a67 6103 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6104 }
6105
c277df42
IZ
6106 if (have_branch || paren != ':') {
6107 /* Make a closing node, and hook it on the end. */
6108 switch (paren) {
6109 case ':':
830247a4 6110 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6111 break;
6112 case 1:
830247a4 6113 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6114 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6115 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6116 "Setting close paren #%"IVdf" to %d\n",
6117 (IV)parno, REG_NODE_NUM(ender)));
6118 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6119 if (RExC_nestroot == parno)
6120 RExC_nestroot = 0;
40d049e4 6121 }
fac92740
MJD
6122 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6123 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6124 break;
6125 case '<':
c277df42
IZ
6126 case ',':
6127 case '=':
6128 case '!':
c277df42 6129 *flagp &= ~HASWIDTH;
821b33a5
IZ
6130 /* FALL THROUGH */
6131 case '>':
830247a4 6132 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6133 break;
6134 case 0:
830247a4 6135 ender = reg_node(pRExC_state, END);
40d049e4
YO
6136 if (!SIZE_ONLY) {
6137 assert(!RExC_opend); /* there can only be one! */
6138 RExC_opend = ender;
6139 }
c277df42
IZ
6140 break;
6141 }
eaf3ca90 6142 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6143
9674d46a 6144 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6145 if (depth==1)
6146 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6147
c277df42 6148 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6149 for (br = ret; br; br = regnext(br)) {
6150 const U8 op = PL_regkind[OP(br)];
6151 if (op == BRANCH) {
07be1b83 6152 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6153 }
6154 else if (op == BRANCHJ) {
07be1b83 6155 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6156 }
c277df42
IZ
6157 }
6158 }
a0d0e21e 6159 }
c277df42
IZ
6160
6161 {
e1ec3a88
AL
6162 const char *p;
6163 static const char parens[] = "=!<,>";
c277df42
IZ
6164
6165 if (paren && (p = strchr(parens, paren))) {
eb160463 6166 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6167 int flag = (p - parens) > 1;
6168
6169 if (paren == '>')
6170 node = SUSPEND, flag = 0;
6bda09f9 6171 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6172 Set_Node_Cur_Length(ret);
6173 Set_Node_Offset(ret, parse_start + 1);
c277df42 6174 ret->flags = flag;
07be1b83 6175 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6176 }
a0d0e21e
LW
6177 }
6178
6179 /* Check for proper termination. */
ce3e6498 6180 if (paren) {
e2509266 6181 RExC_flags = oregflags;
830247a4
IZ
6182 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6183 RExC_parse = oregcomp_parse;
380a0633 6184 vFAIL("Unmatched (");
ce3e6498 6185 }
a0ed51b3 6186 }
830247a4
IZ
6187 else if (!paren && RExC_parse < RExC_end) {
6188 if (*RExC_parse == ')') {
6189 RExC_parse++;
380a0633 6190 vFAIL("Unmatched )");
a0ed51b3
LW
6191 }
6192 else
b45f050a 6193 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6194 /* NOTREACHED */
6195 }
594d7033
YO
6196 if (after_freeze)
6197 RExC_npar = after_freeze;
a0d0e21e 6198 return(ret);
a687059c
LW
6199}
6200
6201/*
6202 - regbranch - one alternative of an | operator
6203 *
6204 * Implements the concatenation operator.
6205 */
76e3520e 6206STATIC regnode *
3dab1dad 6207S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6208{
97aff369 6209 dVAR;
c277df42
IZ
6210 register regnode *ret;
6211 register regnode *chain = NULL;
6212 register regnode *latest;
6213 I32 flags = 0, c = 0;
3dab1dad
YO
6214 GET_RE_DEBUG_FLAGS_DECL;
6215 DEBUG_PARSE("brnc");
02daf0ab 6216
b81d288d 6217 if (first)
c277df42
IZ
6218 ret = NULL;
6219 else {
b81d288d 6220 if (!SIZE_ONLY && RExC_extralen)
830247a4 6221 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6222 else {
830247a4 6223 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6224 Set_Node_Length(ret, 1);
6225 }
c277df42
IZ
6226 }
6227
b81d288d 6228 if (!first && SIZE_ONLY)
830247a4 6229 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6230
c277df42 6231 *flagp = WORST; /* Tentatively. */
a0d0e21e 6232
830247a4
IZ
6233 RExC_parse--;
6234 nextchar(pRExC_state);
6235 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6236 flags &= ~TRYAGAIN;
3dab1dad 6237 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6238 if (latest == NULL) {
6239 if (flags & TRYAGAIN)
6240 continue;
6241 return(NULL);
a0ed51b3
LW
6242 }
6243 else if (ret == NULL)
c277df42 6244 ret = latest;
8ae10a67 6245 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6246 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6247 *flagp |= flags&SPSTART;
6248 else {
830247a4 6249 RExC_naughty++;
3dab1dad 6250 REGTAIL(pRExC_state, chain, latest);
a687059c 6251 }
a0d0e21e 6252 chain = latest;
c277df42
IZ
6253 c++;
6254 }
6255 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6256 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6257 if (ret == NULL)
6258 ret = chain;
6259 }
6260 if (c == 1) {
6261 *flagp |= flags&SIMPLE;
a0d0e21e 6262 }
a687059c 6263
d4c19fe8 6264 return ret;
a687059c
LW
6265}
6266
6267/*
6268 - regpiece - something followed by possible [*+?]
6269 *
6270 * Note that the branching code sequences used for ? and the general cases
6271 * of * and + are somewhat optimized: they use the same NOTHING node as
6272 * both the endmarker for their branch list and the body of the last branch.
6273 * It might seem that this node could be dispensed with entirely, but the
6274 * endmarker role is not redundant.
6275 */
76e3520e 6276STATIC regnode *
3dab1dad 6277S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6278{
97aff369 6279 dVAR;
c277df42 6280 register regnode *ret;
a0d0e21e
LW
6281 register char op;
6282 register char *next;
6283 I32 flags;
1df70142 6284 const char * const origparse = RExC_parse;
a0d0e21e 6285 I32 min;
c277df42 6286 I32 max = REG_INFTY;
fac92740 6287 char *parse_start;
10edeb5d 6288 const char *maxpos = NULL;
3dab1dad
YO
6289 GET_RE_DEBUG_FLAGS_DECL;
6290 DEBUG_PARSE("piec");
a0d0e21e 6291
3dab1dad 6292 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6293 if (ret == NULL) {
6294 if (flags & TRYAGAIN)
6295 *flagp |= TRYAGAIN;
6296 return(NULL);
6297 }
6298
830247a4 6299 op = *RExC_parse;
a0d0e21e 6300
830247a4 6301 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6302 maxpos = NULL;
fac92740 6303 parse_start = RExC_parse; /* MJD */
830247a4 6304 next = RExC_parse + 1;
a0d0e21e
LW
6305 while (isDIGIT(*next) || *next == ',') {
6306 if (*next == ',') {
6307 if (maxpos)
6308 break;
6309 else
6310 maxpos = next;
a687059c 6311 }
a0d0e21e
LW
6312 next++;
6313 }
6314 if (*next == '}') { /* got one */
6315 if (!maxpos)
6316 maxpos = next;
830247a4
IZ
6317 RExC_parse++;
6318 min = atoi(RExC_parse);
a0d0e21e
LW
6319 if (*maxpos == ',')
6320 maxpos++;
6321 else
830247a4 6322 maxpos = RExC_parse;
a0d0e21e
LW
6323 max = atoi(maxpos);
6324 if (!max && *maxpos != '0')
c277df42
IZ
6325 max = REG_INFTY; /* meaning "infinity" */
6326 else if (max >= REG_INFTY)
8615cb43 6327 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6328 RExC_parse = next;
6329 nextchar(pRExC_state);
a0d0e21e
LW
6330
6331 do_curly:
6332 if ((flags&SIMPLE)) {
830247a4 6333 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6334 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6335 Set_Node_Offset(ret, parse_start+1); /* MJD */
6336 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6337 }
6338 else {
3dab1dad 6339 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6340
6341 w->flags = 0;
3dab1dad 6342 REGTAIL(pRExC_state, ret, w);
830247a4 6343 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6344 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6345 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6346 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6347 }
6bda09f9 6348 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6349 /* MJD hk */
6350 Set_Node_Offset(ret, parse_start+1);
2af232bd 6351 Set_Node_Length(ret,
fac92740 6352 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6353
830247a4 6354 if (!SIZE_ONLY && RExC_extralen)
c277df42 6355 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6356 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6357 if (SIZE_ONLY)
830247a4
IZ
6358 RExC_whilem_seen++, RExC_extralen += 3;
6359 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6360 }
c277df42 6361 ret->flags = 0;
a0d0e21e
LW
6362
6363 if (min > 0)
821b33a5
IZ
6364 *flagp = WORST;
6365 if (max > 0)
6366 *flagp |= HASWIDTH;
a0d0e21e 6367 if (max && max < min)
8615cb43 6368 vFAIL("Can't do {n,m} with n > m");
c277df42 6369 if (!SIZE_ONLY) {
eb160463
GS
6370 ARG1_SET(ret, (U16)min);
6371 ARG2_SET(ret, (U16)max);
a687059c 6372 }
a687059c 6373
a0d0e21e 6374 goto nest_check;
a687059c 6375 }
a0d0e21e 6376 }
a687059c 6377
a0d0e21e
LW
6378 if (!ISMULT1(op)) {
6379 *flagp = flags;
a687059c 6380 return(ret);
a0d0e21e 6381 }
bb20fd44 6382
c277df42 6383#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6384
6385 /* if this is reinstated, don't forget to put this back into perldiag:
6386
6387 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6388
6389 (F) The part of the regexp subject to either the * or + quantifier
6390 could match an empty string. The {#} shows in the regular
6391 expression about where the problem was discovered.
6392
6393 */
6394
bb20fd44 6395 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6396 vFAIL("Regexp *+ operand could be empty");
b81d288d 6397#endif
bb20fd44 6398
fac92740 6399 parse_start = RExC_parse;
830247a4 6400 nextchar(pRExC_state);
a0d0e21e 6401
821b33a5 6402 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6403
6404 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6405 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6406 ret->flags = 0;
830247a4 6407 RExC_naughty += 4;
a0d0e21e
LW
6408 }
6409 else if (op == '*') {
6410 min = 0;
6411 goto do_curly;
a0ed51b3
LW
6412 }
6413 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6414 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6415 ret->flags = 0;
830247a4 6416 RExC_naughty += 3;
a0d0e21e
LW
6417 }
6418 else if (op == '+') {
6419 min = 1;
6420 goto do_curly;
a0ed51b3
LW
6421 }
6422 else if (op == '?') {
a0d0e21e
LW
6423 min = 0; max = 1;
6424 goto do_curly;
6425 }
6426 nest_check:
a3b492c3 6427 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 6428 vWARN3(RExC_parse,
b45f050a 6429 "%.*s matches null string many times",
afd78fd5 6430 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 6431 origparse);
a0d0e21e
LW
6432 }
6433
b9b4dddf 6434 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6435 nextchar(pRExC_state);
6bda09f9 6436 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6437 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6438 }
b9b4dddf
YO
6439#ifndef REG_ALLOW_MINMOD_SUSPEND
6440 else
6441#endif
6442 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6443 regnode *ender;
6444 nextchar(pRExC_state);
6445 ender = reg_node(pRExC_state, SUCCEED);
6446 REGTAIL(pRExC_state, ret, ender);
6447 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6448 ret->flags = 0;
6449 ender = reg_node(pRExC_state, TAIL);
6450 REGTAIL(pRExC_state, ret, ender);
6451 /*ret= ender;*/
6452 }
6453
6454 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6455 RExC_parse++;
b45f050a
JF
6456 vFAIL("Nested quantifiers");
6457 }
a0d0e21e
LW
6458
6459 return(ret);
a687059c
LW
6460}
6461
fc8cd66c
YO
6462
6463/* reg_namedseq(pRExC_state,UVp)
6464
6465 This is expected to be called by a parser routine that has
6466 recognized'\N' and needs to handle the rest. RExC_parse is
6467 expected to point at the first char following the N at the time
6468 of the call.
6469
6470 If valuep is non-null then it is assumed that we are parsing inside
6471 of a charclass definition and the first codepoint in the resolved
6472 string is returned via *valuep and the routine will return NULL.
6473 In this mode if a multichar string is returned from the charnames
6474 handler a warning will be issued, and only the first char in the
6475 sequence will be examined. If the string returned is zero length
6476 then the value of *valuep is undefined and NON-NULL will
6477 be returned to indicate failure. (This will NOT be a valid pointer
6478 to a regnode.)
6479
6480 If value is null then it is assumed that we are parsing normal text
6481 and inserts a new EXACT node into the program containing the resolved
6482 string and returns a pointer to the new node. If the string is
6483 zerolength a NOTHING node is emitted.
6484
6485 On success RExC_parse is set to the char following the endbrace.
6486 Parsing failures will generate a fatal errorvia vFAIL(...)
6487
6488 NOTE: We cache all results from the charnames handler locally in
6489 the RExC_charnames hash (created on first use) to prevent a charnames
6490 handler from playing silly-buggers and returning a short string and
6491 then a long string for a given pattern. Since the regexp program
6492 size is calculated during an initial parse this would result
6493 in a buffer overrun so we cache to prevent the charname result from
6494 changing during the course of the parse.
6495
6496 */
6497STATIC regnode *
6498S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6499{
6500 char * name; /* start of the content of the name */
6501 char * endbrace; /* endbrace following the name */
6502 SV *sv_str = NULL;
6503 SV *sv_name = NULL;
6504 STRLEN len; /* this has various purposes throughout the code */
6505 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6506 regnode *ret = NULL;
6507
6508 if (*RExC_parse != '{') {
6509 vFAIL("Missing braces on \\N{}");
6510 }
6511 name = RExC_parse+1;
6512 endbrace = strchr(RExC_parse, '}');
6513 if ( ! endbrace ) {
6514 RExC_parse++;
6515 vFAIL("Missing right brace on \\N{}");
6516 }
6517 RExC_parse = endbrace + 1;
6518
6519
6520 /* RExC_parse points at the beginning brace,
6521 endbrace points at the last */
6522 if ( name[0]=='U' && name[1]=='+' ) {
38a44b82 6523 /* its a "Unicode hex" notation {U+89AB} */
fc8cd66c
YO
6524 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6525 | PERL_SCAN_DISALLOW_PREFIX
6526 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6527 UV cp;
88c9ea1e 6528 char string;
196f1508 6529 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6530 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6531 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6532 cp = 0xFFFD;
6533 }
6534 if (cp > 0xff)
6535 RExC_utf8 = 1;
6536 if ( valuep ) {
6537 *valuep = cp;
6538 return NULL;
6539 }
6bdeddd2 6540 string = (char)cp;
5e7aa789 6541 sv_str= newSVpvn(&string, 1);
fc8cd66c
YO
6542 } else {
6543 /* fetch the charnames handler for this scope */
6544 HV * const table = GvHV(PL_hintgv);
6545 SV **cvp= table ?
6546 hv_fetchs(table, "charnames", FALSE) :
6547 NULL;
6548 SV *cv= cvp ? *cvp : NULL;
6549 HE *he_str;
6550 int count;
6551 /* create an SV with the name as argument */
6552 sv_name = newSVpvn(name, endbrace - name);
6553
6554 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6555 vFAIL2("Constant(\\N{%s}) unknown: "
6556 "(possibly a missing \"use charnames ...\")",
6557 SvPVX(sv_name));
6558 }
6559 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6560 vFAIL2("Constant(\\N{%s}): "
6561 "$^H{charnames} is not defined",SvPVX(sv_name));
6562 }
6563
6564
6565
6566 if (!RExC_charnames) {
6567 /* make sure our cache is allocated */
6568 RExC_charnames = newHV();
6bda09f9 6569 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
6570 }
6571 /* see if we have looked this one up before */
6572 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6573 if ( he_str ) {
6574 sv_str = HeVAL(he_str);
6575 cached = 1;
6576 } else {
6577 dSP ;
6578
6579 ENTER ;
6580 SAVETMPS ;
6581 PUSHMARK(SP) ;
6582
6583 XPUSHs(sv_name);
6584
6585 PUTBACK ;
6586
6587 count= call_sv(cv, G_SCALAR);
6588
6589 if (count == 1) { /* XXXX is this right? dmq */
6590 sv_str = POPs;
6591 SvREFCNT_inc_simple_void(sv_str);
6592 }
6593
6594 SPAGAIN ;
6595 PUTBACK ;
6596 FREETMPS ;
6597 LEAVE ;
6598
6599 if ( !sv_str || !SvOK(sv_str) ) {
6600 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6601 "did not return a defined value",SvPVX(sv_name));
6602 }
6603 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6604 cached = 1;
6605 }
6606 }
6607 if (valuep) {
6608 char *p = SvPV(sv_str, len);
6609 if (len) {
6610 STRLEN numlen = 1;
6611 if ( SvUTF8(sv_str) ) {
196f1508 6612 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6613 if (*valuep > 0x7F)
6614 RExC_utf8 = 1;
6615 /* XXXX
6616 We have to turn on utf8 for high bit chars otherwise
6617 we get failures with
6618
6619 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6620 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6621
6622 This is different from what \x{} would do with the same
6623 codepoint, where the condition is > 0xFF.
6624 - dmq
6625 */
6626
6627
6628 } else {
6629 *valuep = (UV)*p;
6630 /* warn if we havent used the whole string? */
6631 }
6632 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6633 vWARN2(RExC_parse,
6634 "Ignoring excess chars from \\N{%s} in character class",
6635 SvPVX(sv_name)
6636 );
6637 }
6638 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6639 vWARN2(RExC_parse,
6640 "Ignoring zero length \\N{%s} in character class",
6641 SvPVX(sv_name)
6642 );
6643 }
6644 if (sv_name)
6645 SvREFCNT_dec(sv_name);
6646 if (!cached)
6647 SvREFCNT_dec(sv_str);
6648 return len ? NULL : (regnode *)&len;
6649 } else if(SvCUR(sv_str)) {
6650
6651 char *s;
6652 char *p, *pend;
6653 STRLEN charlen = 1;
d008bc60 6654#ifdef DEBUGGING
fc8cd66c 6655 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6656#endif
fc8cd66c
YO
6657 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6658
6659 ret = reg_node(pRExC_state,
6660 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6661 s= STRING(ret);
6662
6663 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6664 sv_utf8_upgrade(sv_str);
6665 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6666 RExC_utf8= 1;
6667 }
6668
6669 p = SvPV(sv_str, len);
6670 pend = p + len;
6671 /* len is the length written, charlen is the size the char read */
6672 for ( len = 0; p < pend; p += charlen ) {
6673 if (UTF) {
196f1508 6674 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6675 if (FOLD) {
6676 STRLEN foldlen,numlen;
6677 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6678 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6679 /* Emit all the Unicode characters. */
6680
6681 for (foldbuf = tmpbuf;
6682 foldlen;
6683 foldlen -= numlen)
6684 {
6685 uvc = utf8_to_uvchr(foldbuf, &numlen);
6686 if (numlen > 0) {
6687 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6688 s += unilen;
6689 len += unilen;
6690 /* In EBCDIC the numlen
6691 * and unilen can differ. */
6692 foldbuf += numlen;
6693 if (numlen >= foldlen)
6694 break;
6695 }
6696 else
6697 break; /* "Can't happen." */
6698 }
6699 } else {
6700 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6701 if (unilen > 0) {
6702 s += unilen;
6703 len += unilen;
6704 }
6705 }
6706 } else {
6707 len++;
6708 REGC(*p, s++);
6709 }
6710 }
6711 if (SIZE_ONLY) {
6712 RExC_size += STR_SZ(len);
6713 } else {
6714 STR_LEN(ret) = len;
6715 RExC_emit += STR_SZ(len);
6716 }
6717 Set_Node_Cur_Length(ret); /* MJD */
6718 RExC_parse--;
6719 nextchar(pRExC_state);
6720 } else {
6721 ret = reg_node(pRExC_state,NOTHING);
6722 }
6723 if (!cached) {
6724 SvREFCNT_dec(sv_str);
6725 }
6726 if (sv_name) {
6727 SvREFCNT_dec(sv_name);
6728 }
6729 return ret;
6730
6731}
6732
6733
9e08bc66
TS
6734/*
6735 * reg_recode
6736 *
6737 * It returns the code point in utf8 for the value in *encp.
6738 * value: a code value in the source encoding
6739 * encp: a pointer to an Encode object
6740 *
6741 * If the result from Encode is not a single character,
6742 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6743 */
6744STATIC UV
6745S_reg_recode(pTHX_ const char value, SV **encp)
6746{
6747 STRLEN numlen = 1;
59cd0e26 6748 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 6749 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
6750 const STRLEN newlen = SvCUR(sv);
6751 UV uv = UNICODE_REPLACEMENT;
6752
6753 if (newlen)
6754 uv = SvUTF8(sv)
6755 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6756 : *(U8*)s;
6757
6758 if (!newlen || numlen != newlen) {
6759 uv = UNICODE_REPLACEMENT;
c86f7df5 6760 *encp = NULL;
9e08bc66
TS
6761 }
6762 return uv;
6763}
6764
fc8cd66c 6765
a687059c
LW
6766/*
6767 - regatom - the lowest level
ee9b8eae
YO
6768
6769 Try to identify anything special at the start of the pattern. If there
6770 is, then handle it as required. This may involve generating a single regop,
6771 such as for an assertion; or it may involve recursing, such as to
6772 handle a () structure.
6773
6774 If the string doesn't start with something special then we gobble up
6775 as much literal text as we can.
6776
6777 Once we have been able to handle whatever type of thing started the
6778 sequence, we return.
6779
6780 Note: we have to be careful with escapes, as they can be both literal
6781 and special, and in the case of \10 and friends can either, depending
6782 on context. Specifically there are two seperate switches for handling
6783 escape sequences, with the one for handling literal escapes requiring
6784 a dummy entry for all of the special escapes that are actually handled
6785 by the other.
6786*/
6787
76e3520e 6788STATIC regnode *
3dab1dad 6789S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6790{
97aff369 6791 dVAR;
cbbf8932 6792 register regnode *ret = NULL;
a0d0e21e 6793 I32 flags;
45948336 6794 char *parse_start = RExC_parse;
3dab1dad
YO
6795 GET_RE_DEBUG_FLAGS_DECL;
6796 DEBUG_PARSE("atom");
a0d0e21e
LW
6797 *flagp = WORST; /* Tentatively. */
6798
ee9b8eae 6799
a0d0e21e 6800tryagain:
f9a79580 6801 switch ((U8)*RExC_parse) {
a0d0e21e 6802 case '^':
830247a4
IZ
6803 RExC_seen_zerolen++;
6804 nextchar(pRExC_state);
bbe252da 6805 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6806 ret = reg_node(pRExC_state, MBOL);
bbe252da 6807 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6808 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6809 else
830247a4 6810 ret = reg_node(pRExC_state, BOL);
fac92740 6811 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6812 break;
6813 case '$':
830247a4 6814 nextchar(pRExC_state);
b81d288d 6815 if (*RExC_parse)
830247a4 6816 RExC_seen_zerolen++;
bbe252da 6817 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6818 ret = reg_node(pRExC_state, MEOL);
bbe252da 6819 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6820 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6821 else
830247a4 6822 ret = reg_node(pRExC_state, EOL);
fac92740 6823 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6824 break;
6825 case '.':
830247a4 6826 nextchar(pRExC_state);
bbe252da 6827 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6828 ret = reg_node(pRExC_state, SANY);
6829 else
6830 ret = reg_node(pRExC_state, REG_ANY);
6831 *flagp |= HASWIDTH|SIMPLE;
830247a4 6832 RExC_naughty++;
fac92740 6833 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6834 break;
6835 case '[':
b45f050a 6836 {
3dab1dad
YO
6837 char * const oregcomp_parse = ++RExC_parse;
6838 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6839 if (*RExC_parse != ']') {
6840 RExC_parse = oregcomp_parse;
b45f050a
JF
6841 vFAIL("Unmatched [");
6842 }
830247a4 6843 nextchar(pRExC_state);
a0d0e21e 6844 *flagp |= HASWIDTH|SIMPLE;
fac92740 6845 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6846 break;
b45f050a 6847 }
a0d0e21e 6848 case '(':
830247a4 6849 nextchar(pRExC_state);
3dab1dad 6850 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6851 if (ret == NULL) {
bf93d4cc 6852 if (flags & TRYAGAIN) {
830247a4 6853 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6854 /* Make parent create an empty node if needed. */
6855 *flagp |= TRYAGAIN;
6856 return(NULL);
6857 }
a0d0e21e 6858 goto tryagain;
bf93d4cc 6859 }
a0d0e21e
LW
6860 return(NULL);
6861 }
a3b492c3 6862 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
6863 break;
6864 case '|':
6865 case ')':
6866 if (flags & TRYAGAIN) {
6867 *flagp |= TRYAGAIN;
6868 return NULL;
6869 }
b45f050a 6870 vFAIL("Internal urp");
a0d0e21e
LW
6871 /* Supposed to be caught earlier. */
6872 break;
85afd4ae 6873 case '{':
830247a4
IZ
6874 if (!regcurly(RExC_parse)) {
6875 RExC_parse++;
85afd4ae
CS
6876 goto defchar;
6877 }
6878 /* FALL THROUGH */
a0d0e21e
LW
6879 case '?':
6880 case '+':
6881 case '*':
830247a4 6882 RExC_parse++;
b45f050a 6883 vFAIL("Quantifier follows nothing");
a0d0e21e 6884 break;
f9a79580
RGS
6885 case 0xDF:
6886 case 0xC3:
6887 case 0xCE:
a0a388a1 6888 do_foldchar:
56d400ed 6889 if (!LOC && FOLD) {
e64b1bd1 6890 U32 len,cp;
7cf3a6a3 6891 len=0; /* silence a spurious compiler warning */
56d400ed 6892 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
6893 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
6894 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
6895 ret = reganode(pRExC_state, FOLDCHAR, cp);
6896 Set_Node_Length(ret, 1); /* MJD */
6897 nextchar(pRExC_state); /* kill whitespace under /x */
6898 return ret;
6899 }
6900 }
6901 goto outer_default;
a0d0e21e 6902 case '\\':
ee9b8eae
YO
6903 /* Special Escapes
6904
6905 This switch handles escape sequences that resolve to some kind
6906 of special regop and not to literal text. Escape sequnces that
6907 resolve to literal text are handled below in the switch marked
6908 "Literal Escapes".
6909
6910 Every entry in this switch *must* have a corresponding entry
6911 in the literal escape switch. However, the opposite is not
6912 required, as the default for this switch is to jump to the
6913 literal text handling code.
6914 */
a0a388a1
YO
6915 switch ((U8)*++RExC_parse) {
6916 case 0xDF:
6917 case 0xC3:
6918 case 0xCE:
6919 goto do_foldchar;
ee9b8eae 6920 /* Special Escapes */
a0d0e21e 6921 case 'A':
830247a4
IZ
6922 RExC_seen_zerolen++;
6923 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6924 *flagp |= SIMPLE;
ee9b8eae 6925 goto finish_meta_pat;
a0d0e21e 6926 case 'G':
830247a4
IZ
6927 ret = reg_node(pRExC_state, GPOS);
6928 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6929 *flagp |= SIMPLE;
ee9b8eae
YO
6930 goto finish_meta_pat;
6931 case 'K':
6932 RExC_seen_zerolen++;
6933 ret = reg_node(pRExC_state, KEEPS);
6934 *flagp |= SIMPLE;
37923168
RGS
6935 /* XXX:dmq : disabling in-place substitution seems to
6936 * be necessary here to avoid cases of memory corruption, as
6937 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
6938 */
6939 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 6940 goto finish_meta_pat;
a0d0e21e 6941 case 'Z':
830247a4 6942 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6943 *flagp |= SIMPLE;
a1917ab9 6944 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6945 goto finish_meta_pat;
b85d18e9 6946 case 'z':
830247a4 6947 ret = reg_node(pRExC_state, EOS);
b85d18e9 6948 *flagp |= SIMPLE;
830247a4 6949 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6950 goto finish_meta_pat;
4a2d328f 6951 case 'C':
f33976b4
DB
6952 ret = reg_node(pRExC_state, CANY);
6953 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6954 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6955 goto finish_meta_pat;
a0ed51b3 6956 case 'X':
830247a4 6957 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6958 *flagp |= HASWIDTH;
ee9b8eae 6959 goto finish_meta_pat;
a0d0e21e 6960 case 'w':
eb160463 6961 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6962 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6963 goto finish_meta_pat;
a0d0e21e 6964 case 'W':
eb160463 6965 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6966 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6967 goto finish_meta_pat;
a0d0e21e 6968 case 'b':
830247a4
IZ
6969 RExC_seen_zerolen++;
6970 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6971 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6972 *flagp |= SIMPLE;
ee9b8eae 6973 goto finish_meta_pat;
a0d0e21e 6974 case 'B':
830247a4
IZ
6975 RExC_seen_zerolen++;
6976 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6977 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6978 *flagp |= SIMPLE;
ee9b8eae 6979 goto finish_meta_pat;
a0d0e21e 6980 case 's':
eb160463 6981 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6982 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6983 goto finish_meta_pat;
a0d0e21e 6984 case 'S':
eb160463 6985 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6986 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6987 goto finish_meta_pat;
a0d0e21e 6988 case 'd':
ffc61ed2 6989 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6990 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6991 goto finish_meta_pat;
a0d0e21e 6992 case 'D':
ffc61ed2 6993 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6994 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6995 goto finish_meta_pat;
e1d1eefb
YO
6996 case 'R':
6997 ret = reg_node(pRExC_state, LNBREAK);
6998 *flagp |= HASWIDTH|SIMPLE;
6999 goto finish_meta_pat;
7000 case 'h':
7001 ret = reg_node(pRExC_state, HORIZWS);
7002 *flagp |= HASWIDTH|SIMPLE;
7003 goto finish_meta_pat;
7004 case 'H':
7005 ret = reg_node(pRExC_state, NHORIZWS);
7006 *flagp |= HASWIDTH|SIMPLE;
7007 goto finish_meta_pat;
ee9b8eae 7008 case 'v':
e1d1eefb
YO
7009 ret = reg_node(pRExC_state, VERTWS);
7010 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7011 goto finish_meta_pat;
7012 case 'V':
e1d1eefb
YO
7013 ret = reg_node(pRExC_state, NVERTWS);
7014 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7015 finish_meta_pat:
830247a4 7016 nextchar(pRExC_state);
fac92740 7017 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7018 break;
a14b48bc
LW
7019 case 'p':
7020 case 'P':
3568d838 7021 {
3dab1dad 7022 char* const oldregxend = RExC_end;
d008bc60 7023#ifdef DEBUGGING
ccb2c380 7024 char* parse_start = RExC_parse - 2;
d008bc60 7025#endif
a14b48bc 7026
830247a4 7027 if (RExC_parse[1] == '{') {
3568d838 7028 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7029 RExC_end = strchr(RExC_parse, '}');
7030 if (!RExC_end) {
3dab1dad 7031 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7032 RExC_parse += 2;
7033 RExC_end = oldregxend;
0da60cf5 7034 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7035 }
830247a4 7036 RExC_end++;
a14b48bc 7037 }
af6f566e 7038 else {
830247a4 7039 RExC_end = RExC_parse + 2;
af6f566e
HS
7040 if (RExC_end > oldregxend)
7041 RExC_end = oldregxend;
7042 }
830247a4 7043 RExC_parse--;
a14b48bc 7044
3dab1dad 7045 ret = regclass(pRExC_state,depth+1);
a14b48bc 7046
830247a4
IZ
7047 RExC_end = oldregxend;
7048 RExC_parse--;
ccb2c380
MP
7049
7050 Set_Node_Offset(ret, parse_start + 2);
7051 Set_Node_Cur_Length(ret);
830247a4 7052 nextchar(pRExC_state);
a14b48bc
LW
7053 *flagp |= HASWIDTH|SIMPLE;
7054 }
7055 break;
fc8cd66c
YO
7056 case 'N':
7057 /* Handle \N{NAME} here and not below because it can be
7058 multicharacter. join_exact() will join them up later on.
7059 Also this makes sure that things like /\N{BLAH}+/ and
7060 \N{BLAH} being multi char Just Happen. dmq*/
7061 ++RExC_parse;
7062 ret= reg_namedseq(pRExC_state, NULL);
7063 break;
0a4db386 7064 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7065 parse_named_seq:
81714fb9
YO
7066 {
7067 char ch= RExC_parse[1];
1f1031fe
YO
7068 if (ch != '<' && ch != '\'' && ch != '{') {
7069 RExC_parse++;
7070 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7071 } else {
1f1031fe
YO
7072 /* this pretty much dupes the code for (?P=...) in reg(), if
7073 you change this make sure you change that */
81714fb9 7074 char* name_start = (RExC_parse += 2);
2eccd3b2 7075 U32 num = 0;
0a4db386
YO
7076 SV *sv_dat = reg_scan_name(pRExC_state,
7077 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7078 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7079 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7080 vFAIL2("Sequence %.3s... not terminated",parse_start);
7081
7082 if (!SIZE_ONLY) {
7083 num = add_data( pRExC_state, 1, "S" );
7084 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7085 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7086 }
7087
81714fb9
YO
7088 RExC_sawback = 1;
7089 ret = reganode(pRExC_state,
7090 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7091 num);
7092 *flagp |= HASWIDTH;
1f1031fe 7093
81714fb9
YO
7094 /* override incorrect value set in reganode MJD */
7095 Set_Node_Offset(ret, parse_start+1);
7096 Set_Node_Cur_Length(ret); /* MJD */
7097 nextchar(pRExC_state);
1f1031fe 7098
81714fb9
YO
7099 }
7100 break;
1f1031fe 7101 }
2bf803e2 7102 case 'g':
a0d0e21e
LW
7103 case '1': case '2': case '3': case '4':
7104 case '5': case '6': case '7': case '8': case '9':
7105 {
c74340f9 7106 I32 num;
2bf803e2
YO
7107 bool isg = *RExC_parse == 'g';
7108 bool isrel = 0;
7109 bool hasbrace = 0;
7110 if (isg) {
c74340f9 7111 RExC_parse++;
2bf803e2
YO
7112 if (*RExC_parse == '{') {
7113 RExC_parse++;
7114 hasbrace = 1;
7115 }
7116 if (*RExC_parse == '-') {
7117 RExC_parse++;
7118 isrel = 1;
7119 }
1f1031fe
YO
7120 if (hasbrace && !isDIGIT(*RExC_parse)) {
7121 if (isrel) RExC_parse--;
7122 RExC_parse -= 2;
7123 goto parse_named_seq;
7124 } }
c74340f9 7125 num = atoi(RExC_parse);
b72d83b2
RGS
7126 if (isg && num == 0)
7127 vFAIL("Reference to invalid group 0");
c74340f9 7128 if (isrel) {
5624f11d 7129 num = RExC_npar - num;
c74340f9
YO
7130 if (num < 1)
7131 vFAIL("Reference to nonexistent or unclosed group");
7132 }
2bf803e2 7133 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7134 goto defchar;
7135 else {
3dab1dad 7136 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7137 while (isDIGIT(*RExC_parse))
7138 RExC_parse++;
1f1031fe
YO
7139 if (parse_start == RExC_parse - 1)
7140 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7141 if (hasbrace) {
7142 if (*RExC_parse != '}')
7143 vFAIL("Unterminated \\g{...} pattern");
7144 RExC_parse++;
7145 }
c74340f9
YO
7146 if (!SIZE_ONLY) {
7147 if (num > (I32)RExC_rx->nparens)
7148 vFAIL("Reference to nonexistent group");
c74340f9 7149 }
830247a4 7150 RExC_sawback = 1;
eb160463
GS
7151 ret = reganode(pRExC_state,
7152 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7153 num);
a0d0e21e 7154 *flagp |= HASWIDTH;
2af232bd 7155
fac92740 7156 /* override incorrect value set in reganode MJD */
2af232bd 7157 Set_Node_Offset(ret, parse_start+1);
fac92740 7158 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7159 RExC_parse--;
7160 nextchar(pRExC_state);
a0d0e21e
LW
7161 }
7162 }
7163 break;
7164 case '\0':
830247a4 7165 if (RExC_parse >= RExC_end)
b45f050a 7166 FAIL("Trailing \\");
a0d0e21e
LW
7167 /* FALL THROUGH */
7168 default:
a0288114 7169 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7170 back into the quick-grab loop below */
45948336 7171 parse_start--;
a0d0e21e
LW
7172 goto defchar;
7173 }
7174 break;
4633a7c4
LW
7175
7176 case '#':
bbe252da 7177 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7178 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7179 goto tryagain;
7180 }
7181 /* FALL THROUGH */
7182
f9a79580
RGS
7183 default:
7184 outer_default:{
ba210ebe 7185 register STRLEN len;
58ae7d3f 7186 register UV ender;
a0d0e21e 7187 register char *p;
3dab1dad 7188 char *s;
80aecb99 7189 STRLEN foldlen;
89ebb4a3 7190 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7191
7192 parse_start = RExC_parse - 1;
a0d0e21e 7193
830247a4 7194 RExC_parse++;
a0d0e21e
LW
7195
7196 defchar:
58ae7d3f 7197 ender = 0;
eb160463
GS
7198 ret = reg_node(pRExC_state,
7199 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7200 s = STRING(ret);
830247a4
IZ
7201 for (len = 0, p = RExC_parse - 1;
7202 len < 127 && p < RExC_end;
a0d0e21e
LW
7203 len++)
7204 {
3dab1dad 7205 char * const oldp = p;
5b5a24f7 7206
bbe252da 7207 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7208 p = regwhite( pRExC_state, p );
f9a79580
RGS
7209 switch ((U8)*p) {
7210 case 0xDF:
7211 case 0xC3:
7212 case 0xCE:
56d400ed 7213 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7214 goto normal_default;
a0d0e21e
LW
7215 case '^':
7216 case '$':
7217 case '.':
7218 case '[':
7219 case '(':
7220 case ')':
7221 case '|':
7222 goto loopdone;
7223 case '\\':
ee9b8eae
YO
7224 /* Literal Escapes Switch
7225
7226 This switch is meant to handle escape sequences that
7227 resolve to a literal character.
7228
7229 Every escape sequence that represents something
7230 else, like an assertion or a char class, is handled
7231 in the switch marked 'Special Escapes' above in this
7232 routine, but also has an entry here as anything that
7233 isn't explicitly mentioned here will be treated as
7234 an unescaped equivalent literal.
7235 */
7236
a0a388a1 7237 switch ((U8)*++p) {
ee9b8eae 7238 /* These are all the special escapes. */
a0a388a1
YO
7239 case 0xDF:
7240 case 0xC3:
7241 case 0xCE:
7242 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7243 goto normal_default;
ee9b8eae
YO
7244 case 'A': /* Start assertion */
7245 case 'b': case 'B': /* Word-boundary assertion*/
7246 case 'C': /* Single char !DANGEROUS! */
7247 case 'd': case 'D': /* digit class */
7248 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7249 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7250 case 'k': case 'K': /* named backref, keep marker */
7251 case 'N': /* named char sequence */
38a44b82 7252 case 'p': case 'P': /* Unicode property */
e1d1eefb 7253 case 'R': /* LNBREAK */
ee9b8eae 7254 case 's': case 'S': /* space class */
e1d1eefb 7255 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7256 case 'w': case 'W': /* word class */
7257 case 'X': /* eXtended Unicode "combining character sequence" */
7258 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7259 --p;
7260 goto loopdone;
ee9b8eae
YO
7261
7262 /* Anything after here is an escape that resolves to a
7263 literal. (Except digits, which may or may not)
7264 */
a0d0e21e
LW
7265 case 'n':
7266 ender = '\n';
7267 p++;
a687059c 7268 break;
a0d0e21e
LW
7269 case 'r':
7270 ender = '\r';
7271 p++;
a687059c 7272 break;
a0d0e21e
LW
7273 case 't':
7274 ender = '\t';
7275 p++;
a687059c 7276 break;
a0d0e21e
LW
7277 case 'f':
7278 ender = '\f';
7279 p++;
a687059c 7280 break;
a0d0e21e 7281 case 'e':
c7f1f016 7282 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7283 p++;
a687059c 7284 break;
a0d0e21e 7285 case 'a':
c7f1f016 7286 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7287 p++;
a687059c 7288 break;
a0d0e21e 7289 case 'x':
a0ed51b3 7290 if (*++p == '{') {
1df70142 7291 char* const e = strchr(p, '}');
b81d288d 7292
b45f050a 7293 if (!e) {
830247a4 7294 RExC_parse = p + 1;
b45f050a
JF
7295 vFAIL("Missing right brace on \\x{}");
7296 }
de5f0749 7297 else {
a4c04bdc
NC
7298 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7299 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7300 STRLEN numlen = e - p - 1;
53305cf1 7301 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
7302 if (ender > 0xff)
7303 RExC_utf8 = 1;
a0ed51b3
LW
7304 p = e + 1;
7305 }
a0ed51b3
LW
7306 }
7307 else {
a4c04bdc 7308 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7309 STRLEN numlen = 2;
53305cf1 7310 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7311 p += numlen;
7312 }
9e08bc66
TS
7313 if (PL_encoding && ender < 0x100)
7314 goto recode_encoding;
a687059c 7315 break;
a0d0e21e
LW
7316 case 'c':
7317 p++;
bbce6d69 7318 ender = UCHARAT(p++);
7319 ender = toCTRL(ender);
a687059c 7320 break;
a0d0e21e
LW
7321 case '0': case '1': case '2': case '3':case '4':
7322 case '5': case '6': case '7': case '8':case '9':
7323 if (*p == '0' ||
830247a4 7324 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 7325 I32 flags = 0;
1df70142 7326 STRLEN numlen = 3;
53305cf1 7327 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
7328 p += numlen;
7329 }
7330 else {
7331 --p;
7332 goto loopdone;
a687059c 7333 }
9e08bc66
TS
7334 if (PL_encoding && ender < 0x100)
7335 goto recode_encoding;
7336 break;
7337 recode_encoding:
7338 {
7339 SV* enc = PL_encoding;
7340 ender = reg_recode((const char)(U8)ender, &enc);
7341 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7342 vWARN(p, "Invalid escape in the specified encoding");
7343 RExC_utf8 = 1;
7344 }
a687059c 7345 break;
a0d0e21e 7346 case '\0':
830247a4 7347 if (p >= RExC_end)
b45f050a 7348 FAIL("Trailing \\");
a687059c 7349 /* FALL THROUGH */
a0d0e21e 7350 default:
041457d9 7351 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 7352 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7353 goto normal_default;
a0d0e21e
LW
7354 }
7355 break;
a687059c 7356 default:
a0ed51b3 7357 normal_default:
fd400ab9 7358 if (UTF8_IS_START(*p) && UTF) {
1df70142 7359 STRLEN numlen;
5e12f4fb 7360 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7361 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7362 p += numlen;
7363 }
7364 else
7365 ender = *p++;
a0d0e21e 7366 break;
a687059c 7367 }
bcdf7404
YO
7368 if ( RExC_flags & RXf_PMf_EXTENDED)
7369 p = regwhite( pRExC_state, p );
60a8b682
JH
7370 if (UTF && FOLD) {
7371 /* Prime the casefolded buffer. */
ac7e0132 7372 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7373 }
bcdf7404 7374 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7375 if (len)
7376 p = oldp;
16ea2a2e 7377 else if (UTF) {
80aecb99 7378 if (FOLD) {
60a8b682 7379 /* Emit all the Unicode characters. */
1df70142 7380 STRLEN numlen;
80aecb99
JH
7381 for (foldbuf = tmpbuf;
7382 foldlen;
7383 foldlen -= numlen) {
7384 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7385 if (numlen > 0) {
71207a34 7386 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7387 s += unilen;
7388 len += unilen;
7389 /* In EBCDIC the numlen
7390 * and unilen can differ. */
9dc45d57 7391 foldbuf += numlen;
47654450
JH
7392 if (numlen >= foldlen)
7393 break;
9dc45d57
JH
7394 }
7395 else
7396 break; /* "Can't happen." */
80aecb99
JH
7397 }
7398 }
7399 else {
71207a34 7400 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7401 if (unilen > 0) {
0ebc6274
JH
7402 s += unilen;
7403 len += unilen;
9dc45d57 7404 }
80aecb99 7405 }
a0ed51b3 7406 }
a0d0e21e
LW
7407 else {
7408 len++;
eb160463 7409 REGC((char)ender, s++);
a0d0e21e
LW
7410 }
7411 break;
a687059c 7412 }
16ea2a2e 7413 if (UTF) {
80aecb99 7414 if (FOLD) {
60a8b682 7415 /* Emit all the Unicode characters. */
1df70142 7416 STRLEN numlen;
80aecb99
JH
7417 for (foldbuf = tmpbuf;
7418 foldlen;
7419 foldlen -= numlen) {
7420 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7421 if (numlen > 0) {
71207a34 7422 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7423 len += unilen;
7424 s += unilen;
7425 /* In EBCDIC the numlen
7426 * and unilen can differ. */
9dc45d57 7427 foldbuf += numlen;
47654450
JH
7428 if (numlen >= foldlen)
7429 break;
9dc45d57
JH
7430 }
7431 else
7432 break;
80aecb99
JH
7433 }
7434 }
7435 else {
71207a34 7436 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7437 if (unilen > 0) {
0ebc6274
JH
7438 s += unilen;
7439 len += unilen;
9dc45d57 7440 }
80aecb99
JH
7441 }
7442 len--;
a0ed51b3
LW
7443 }
7444 else
eb160463 7445 REGC((char)ender, s++);
a0d0e21e
LW
7446 }
7447 loopdone:
830247a4 7448 RExC_parse = p - 1;
fac92740 7449 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7450 nextchar(pRExC_state);
793db0cb
JH
7451 {
7452 /* len is STRLEN which is unsigned, need to copy to signed */
7453 IV iv = len;
7454 if (iv < 0)
7455 vFAIL("Internal disaster");
7456 }
a0d0e21e
LW
7457 if (len > 0)
7458 *flagp |= HASWIDTH;
090f7165 7459 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7460 *flagp |= SIMPLE;
3dab1dad 7461
cd439c50 7462 if (SIZE_ONLY)
830247a4 7463 RExC_size += STR_SZ(len);
3dab1dad
YO
7464 else {
7465 STR_LEN(ret) = len;
830247a4 7466 RExC_emit += STR_SZ(len);
07be1b83 7467 }
3dab1dad 7468 }
a0d0e21e
LW
7469 break;
7470 }
a687059c 7471
a0d0e21e 7472 return(ret);
a687059c
LW
7473}
7474
873ef191 7475STATIC char *
bcdf7404 7476S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7477{
bcdf7404 7478 const char *e = RExC_end;
5b5a24f7
CS
7479 while (p < e) {
7480 if (isSPACE(*p))
7481 ++p;
7482 else if (*p == '#') {
bcdf7404 7483 bool ended = 0;
5b5a24f7 7484 do {
bcdf7404
YO
7485 if (*p++ == '\n') {
7486 ended = 1;
7487 break;
7488 }
7489 } while (p < e);
7490 if (!ended)
7491 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7492 }
7493 else
7494 break;
7495 }
7496 return p;
7497}
7498
b8c5462f
JH
7499/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7500 Character classes ([:foo:]) can also be negated ([:^foo:]).
7501 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7502 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7503 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7504
7505#define POSIXCC_DONE(c) ((c) == ':')
7506#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7507#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7508
b8c5462f 7509STATIC I32
830247a4 7510S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7511{
97aff369 7512 dVAR;
936ed897 7513 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7514
830247a4 7515 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7516 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7517 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7518 const char c = UCHARAT(RExC_parse);
097eb12c 7519 char* const s = RExC_parse++;
b81d288d 7520
9a86a77b 7521 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7522 RExC_parse++;
7523 if (RExC_parse == RExC_end)
620e46c5 7524 /* Grandfather lone [:, [=, [. */
830247a4 7525 RExC_parse = s;
620e46c5 7526 else {
3dab1dad 7527 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7528 assert(*t == c);
7529
9a86a77b 7530 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7531 const char *posixcc = s + 1;
830247a4 7532 RExC_parse++; /* skip over the ending ] */
3dab1dad 7533
b8c5462f 7534 if (*s == ':') {
1df70142
AL
7535 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7536 const I32 skip = t - posixcc;
80916619
NC
7537
7538 /* Initially switch on the length of the name. */
7539 switch (skip) {
7540 case 4:
3dab1dad
YO
7541 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7542 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7543 break;
80916619
NC
7544 case 5:
7545 /* Names all of length 5. */
7546 /* alnum alpha ascii blank cntrl digit graph lower
7547 print punct space upper */
7548 /* Offset 4 gives the best switch position. */
7549 switch (posixcc[4]) {
7550 case 'a':
3dab1dad
YO
7551 if (memEQ(posixcc, "alph", 4)) /* alpha */
7552 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7553 break;
7554 case 'e':
3dab1dad
YO
7555 if (memEQ(posixcc, "spac", 4)) /* space */
7556 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7557 break;
7558 case 'h':
3dab1dad
YO
7559 if (memEQ(posixcc, "grap", 4)) /* graph */
7560 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7561 break;
7562 case 'i':
3dab1dad
YO
7563 if (memEQ(posixcc, "asci", 4)) /* ascii */
7564 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7565 break;
7566 case 'k':
3dab1dad
YO
7567 if (memEQ(posixcc, "blan", 4)) /* blank */
7568 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7569 break;
7570 case 'l':
3dab1dad
YO
7571 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7572 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7573 break;
7574 case 'm':
3dab1dad
YO
7575 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7576 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7577 break;
7578 case 'r':
3dab1dad
YO
7579 if (memEQ(posixcc, "lowe", 4)) /* lower */
7580 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7581 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7582 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7583 break;
7584 case 't':
3dab1dad
YO
7585 if (memEQ(posixcc, "digi", 4)) /* digit */
7586 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7587 else if (memEQ(posixcc, "prin", 4)) /* print */
7588 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7589 else if (memEQ(posixcc, "punc", 4)) /* punct */
7590 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7591 break;
b8c5462f
JH
7592 }
7593 break;
80916619 7594 case 6:
3dab1dad
YO
7595 if (memEQ(posixcc, "xdigit", 6))
7596 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7597 break;
7598 }
80916619
NC
7599
7600 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7601 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7602 t - s - 1, s + 1);
80916619
NC
7603 assert (posixcc[skip] == ':');
7604 assert (posixcc[skip+1] == ']');
b45f050a 7605 } else if (!SIZE_ONLY) {
b8c5462f 7606 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7607
830247a4 7608 /* adjust RExC_parse so the warning shows after
b45f050a 7609 the class closes */
9a86a77b 7610 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7611 RExC_parse++;
b45f050a
JF
7612 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7613 }
b8c5462f
JH
7614 } else {
7615 /* Maternal grandfather:
7616 * "[:" ending in ":" but not in ":]" */
830247a4 7617 RExC_parse = s;
767d463e 7618 }
620e46c5
JH
7619 }
7620 }
7621
b8c5462f
JH
7622 return namedclass;
7623}
7624
7625STATIC void
830247a4 7626S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7627{
97aff369 7628 dVAR;
3dab1dad 7629 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7630 const char *s = RExC_parse;
7631 const char c = *s++;
b8c5462f 7632
3dab1dad 7633 while (isALNUM(*s))
b8c5462f
JH
7634 s++;
7635 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7636 if (ckWARN(WARN_REGEXP))
7637 vWARN3(s+2,
7638 "POSIX syntax [%c %c] belongs inside character classes",
7639 c, c);
b45f050a
JF
7640
7641 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7642 if (POSIXCC_NOTYET(c)) {
830247a4 7643 /* adjust RExC_parse so the error shows after
b45f050a 7644 the class closes */
9a86a77b 7645 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7646 NOOP;
b45f050a
JF
7647 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7648 }
b8c5462f
JH
7649 }
7650 }
620e46c5
JH
7651}
7652
7f6f358c 7653
89836f1f
YO
7654#define _C_C_T_(NAME,TEST,WORD) \
7655ANYOF_##NAME: \
7656 if (LOC) \
7657 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7658 else { \
7659 for (value = 0; value < 256; value++) \
7660 if (TEST) \
7661 ANYOF_BITMAP_SET(ret, value); \
7662 } \
7663 yesno = '+'; \
7664 what = WORD; \
7665 break; \
7666case ANYOF_N##NAME: \
7667 if (LOC) \
7668 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7669 else { \
7670 for (value = 0; value < 256; value++) \
7671 if (!TEST) \
7672 ANYOF_BITMAP_SET(ret, value); \
7673 } \
7674 yesno = '!'; \
7675 what = WORD; \
7676 break
7677
e1d1eefb
YO
7678#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7679ANYOF_##NAME: \
7680 for (value = 0; value < 256; value++) \
7681 if (TEST) \
7682 ANYOF_BITMAP_SET(ret, value); \
7683 yesno = '+'; \
7684 what = WORD; \
7685 break; \
7686case ANYOF_N##NAME: \
7687 for (value = 0; value < 256; value++) \
7688 if (!TEST) \
7689 ANYOF_BITMAP_SET(ret, value); \
7690 yesno = '!'; \
7691 what = WORD; \
7692 break
89836f1f 7693
7f6f358c
YO
7694/*
7695 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7696 matches the pattern or if the pattern matches a single char only and
7697 that char is < 256 and we are case insensitive then we produce an
7698 EXACT node instead.
7f6f358c 7699*/
89836f1f 7700
76e3520e 7701STATIC regnode *
3dab1dad 7702S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7703{
97aff369 7704 dVAR;
9a86a77b 7705 register UV nextvalue;
3568d838 7706 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7707 register IV range = 0;
e1d1eefb 7708 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 7709 register regnode *ret;
ba210ebe 7710 STRLEN numlen;
ffc61ed2 7711 IV namedclass;
cbbf8932 7712 char *rangebegin = NULL;
936ed897 7713 bool need_class = 0;
c445ea15 7714 SV *listsv = NULL;
ffc61ed2 7715 UV n;
9e55ce06 7716 bool optimize_invert = TRUE;
cbbf8932 7717 AV* unicode_alternate = NULL;
1b2d223b
JH
7718#ifdef EBCDIC
7719 UV literal_endpoint = 0;
7720#endif
7f6f358c 7721 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7722
3dab1dad 7723 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7724 case we need to change the emitted regop to an EXACT. */
07be1b83 7725 const char * orig_parse = RExC_parse;
72f13be8 7726 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
7727#ifndef DEBUGGING
7728 PERL_UNUSED_ARG(depth);
7729#endif
72f13be8 7730
3dab1dad 7731 DEBUG_PARSE("clas");
7f6f358c
YO
7732
7733 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7734 ret = reganode(pRExC_state, ANYOF, 0);
7735
7736 if (!SIZE_ONLY)
7737 ANYOF_FLAGS(ret) = 0;
7738
9a86a77b 7739 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7740 RExC_naughty++;
7741 RExC_parse++;
7742 if (!SIZE_ONLY)
7743 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7744 }
a0d0e21e 7745
73060fc4 7746 if (SIZE_ONLY) {
830247a4 7747 RExC_size += ANYOF_SKIP;
73060fc4
JH
7748 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7749 }
936ed897 7750 else {
830247a4 7751 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7752 if (FOLD)
7753 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7754 if (LOC)
7755 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7756 ANYOF_BITMAP_ZERO(ret);
396482e1 7757 listsv = newSVpvs("# comment\n");
a0d0e21e 7758 }
b8c5462f 7759
9a86a77b
JH
7760 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7761
b938889d 7762 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7763 checkposixcc(pRExC_state);
b8c5462f 7764
f064b6ad
HS
7765 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7766 if (UCHARAT(RExC_parse) == ']')
7767 goto charclassloop;
ffc61ed2 7768
fc8cd66c 7769parseit:
9a86a77b 7770 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7771
7772 charclassloop:
7773
7774 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7775
73b437c8 7776 if (!range)
830247a4 7777 rangebegin = RExC_parse;
ffc61ed2 7778 if (UTF) {
5e12f4fb 7779 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7780 RExC_end - RExC_parse,
9f7f3913 7781 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7782 RExC_parse += numlen;
7783 }
7784 else
7785 value = UCHARAT(RExC_parse++);
7f6f358c 7786
9a86a77b
JH
7787 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7788 if (value == '[' && POSIXCC(nextvalue))
830247a4 7789 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7790 else if (value == '\\') {
ffc61ed2 7791 if (UTF) {
5e12f4fb 7792 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7793 RExC_end - RExC_parse,
9f7f3913 7794 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7795 RExC_parse += numlen;
7796 }
7797 else
7798 value = UCHARAT(RExC_parse++);
470c3474 7799 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7800 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7801 * be a problem later if we want switch on Unicode.
7802 * A similar issue a little bit later when switching on
7803 * namedclass. --jhi */
ffc61ed2 7804 switch ((I32)value) {
b8c5462f
JH
7805 case 'w': namedclass = ANYOF_ALNUM; break;
7806 case 'W': namedclass = ANYOF_NALNUM; break;
7807 case 's': namedclass = ANYOF_SPACE; break;
7808 case 'S': namedclass = ANYOF_NSPACE; break;
7809 case 'd': namedclass = ANYOF_DIGIT; break;
7810 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
7811 case 'v': namedclass = ANYOF_VERTWS; break;
7812 case 'V': namedclass = ANYOF_NVERTWS; break;
7813 case 'h': namedclass = ANYOF_HORIZWS; break;
7814 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
7815 case 'N': /* Handle \N{NAME} in class */
7816 {
7817 /* We only pay attention to the first char of
7818 multichar strings being returned. I kinda wonder
7819 if this makes sense as it does change the behaviour
7820 from earlier versions, OTOH that behaviour was broken
7821 as well. */
7822 UV v; /* value is register so we cant & it /grrr */
7823 if (reg_namedseq(pRExC_state, &v)) {
7824 goto parseit;
7825 }
7826 value= v;
7827 }
7828 break;
ffc61ed2
JH
7829 case 'p':
7830 case 'P':
3dab1dad
YO
7831 {
7832 char *e;
af6f566e 7833 if (RExC_parse >= RExC_end)
2a4859cd 7834 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7835 if (*RExC_parse == '{') {
1df70142 7836 const U8 c = (U8)value;
ffc61ed2
JH
7837 e = strchr(RExC_parse++, '}');
7838 if (!e)
0da60cf5 7839 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7840 while (isSPACE(UCHARAT(RExC_parse)))
7841 RExC_parse++;
7842 if (e == RExC_parse)
0da60cf5 7843 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7844 n = e - RExC_parse;
ab13f0c7
JH
7845 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7846 n--;
ffc61ed2
JH
7847 }
7848 else {
7849 e = RExC_parse;
7850 n = 1;
7851 }
7852 if (!SIZE_ONLY) {
ab13f0c7
JH
7853 if (UCHARAT(RExC_parse) == '^') {
7854 RExC_parse++;
7855 n--;
7856 value = value == 'p' ? 'P' : 'p'; /* toggle */
7857 while (isSPACE(UCHARAT(RExC_parse))) {
7858 RExC_parse++;
7859 n--;
7860 }
7861 }
097eb12c
AL
7862 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7863 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7864 }
7865 RExC_parse = e + 1;
7866 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7867 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7868 }
f81125e2 7869 break;
b8c5462f
JH
7870 case 'n': value = '\n'; break;
7871 case 'r': value = '\r'; break;
7872 case 't': value = '\t'; break;
7873 case 'f': value = '\f'; break;
7874 case 'b': value = '\b'; break;
c7f1f016
NIS
7875 case 'e': value = ASCII_TO_NATIVE('\033');break;
7876 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7877 case 'x':
ffc61ed2 7878 if (*RExC_parse == '{') {
a4c04bdc
NC
7879 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7880 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7881 char * const e = strchr(RExC_parse++, '}');
b81d288d 7882 if (!e)
ffc61ed2 7883 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7884
7885 numlen = e - RExC_parse;
7886 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7887 RExC_parse = e + 1;
7888 }
7889 else {
a4c04bdc 7890 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7891 numlen = 2;
7892 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7893 RExC_parse += numlen;
7894 }
9e08bc66
TS
7895 if (PL_encoding && value < 0x100)
7896 goto recode_encoding;
b8c5462f
JH
7897 break;
7898 case 'c':
830247a4 7899 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7900 value = toCTRL(value);
7901 break;
7902 case '0': case '1': case '2': case '3': case '4':
7903 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7904 {
7905 I32 flags = 0;
7906 numlen = 3;
7907 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7908 RExC_parse += numlen;
7909 if (PL_encoding && value < 0x100)
7910 goto recode_encoding;
7911 break;
7912 }
7913 recode_encoding:
7914 {
7915 SV* enc = PL_encoding;
7916 value = reg_recode((const char)(U8)value, &enc);
7917 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7918 vWARN(RExC_parse,
7919 "Invalid escape in the specified encoding");
7920 break;
7921 }
1028017a 7922 default:
041457d9 7923 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7924 vWARN2(RExC_parse,
7925 "Unrecognized escape \\%c in character class passed through",
7926 (int)value);
1028017a 7927 break;
b8c5462f 7928 }
ffc61ed2 7929 } /* end of \blah */
1b2d223b
JH
7930#ifdef EBCDIC
7931 else
7932 literal_endpoint++;
7933#endif
ffc61ed2
JH
7934
7935 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7936
7937 if (!SIZE_ONLY && !need_class)
936ed897 7938 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7939
936ed897 7940 need_class = 1;
ffc61ed2
JH
7941
7942 /* a bad range like a-\d, a-[:digit:] ? */
7943 if (range) {
73b437c8 7944 if (!SIZE_ONLY) {
afd78fd5 7945 if (ckWARN(WARN_REGEXP)) {
097eb12c 7946 const int w =
afd78fd5
JH
7947 RExC_parse >= rangebegin ?
7948 RExC_parse - rangebegin : 0;
830247a4 7949 vWARN4(RExC_parse,
b45f050a 7950 "False [] range \"%*.*s\"",
097eb12c 7951 w, w, rangebegin);
afd78fd5 7952 }
3568d838
JH
7953 if (prevvalue < 256) {
7954 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7955 ANYOF_BITMAP_SET(ret, '-');
7956 }
7957 else {
7958 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7959 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7960 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7961 }
b8c5462f 7962 }
ffc61ed2
JH
7963
7964 range = 0; /* this was not a true range */
73b437c8 7965 }
ffc61ed2 7966
89836f1f
YO
7967
7968
73b437c8 7969 if (!SIZE_ONLY) {
c49a72a9
NC
7970 const char *what = NULL;
7971 char yesno = 0;
7972
3568d838
JH
7973 if (namedclass > OOB_NAMEDCLASS)
7974 optimize_invert = FALSE;
e2962f66
JH
7975 /* Possible truncation here but in some 64-bit environments
7976 * the compiler gets heartburn about switch on 64-bit values.
7977 * A similar issue a little earlier when switching on value.
98f323fa 7978 * --jhi */
e2962f66 7979 switch ((I32)namedclass) {
89836f1f
YO
7980 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7981 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7982 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7983 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7984 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7985 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7986 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7987 case _C_C_T_(PRINT, isPRINT(value), "Print");
7988 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7989 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7990 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7991 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7992 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
7993 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
7994 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
7995 case ANYOF_ASCII:
7996 if (LOC)
936ed897 7997 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7998 else {
c7f1f016 7999#ifndef EBCDIC
1ba5c669
JH
8000 for (value = 0; value < 128; value++)
8001 ANYOF_BITMAP_SET(ret, value);
8002#else /* EBCDIC */
ffbc6a93 8003 for (value = 0; value < 256; value++) {
3a3c4447
JH
8004 if (isASCII(value))
8005 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8006 }
1ba5c669 8007#endif /* EBCDIC */
73b437c8 8008 }
c49a72a9
NC
8009 yesno = '+';
8010 what = "ASCII";
73b437c8
JH
8011 break;
8012 case ANYOF_NASCII:
8013 if (LOC)
936ed897 8014 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8015 else {
c7f1f016 8016#ifndef EBCDIC
1ba5c669
JH
8017 for (value = 128; value < 256; value++)
8018 ANYOF_BITMAP_SET(ret, value);
8019#else /* EBCDIC */
ffbc6a93 8020 for (value = 0; value < 256; value++) {
3a3c4447
JH
8021 if (!isASCII(value))
8022 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8023 }
1ba5c669 8024#endif /* EBCDIC */
73b437c8 8025 }
c49a72a9
NC
8026 yesno = '!';
8027 what = "ASCII";
89836f1f 8028 break;
ffc61ed2
JH
8029 case ANYOF_DIGIT:
8030 if (LOC)
8031 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8032 else {
8033 /* consecutive digits assumed */
8034 for (value = '0'; value <= '9'; value++)
8035 ANYOF_BITMAP_SET(ret, value);
8036 }
c49a72a9
NC
8037 yesno = '+';
8038 what = "Digit";
ffc61ed2
JH
8039 break;
8040 case ANYOF_NDIGIT:
8041 if (LOC)
8042 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8043 else {
8044 /* consecutive digits assumed */
8045 for (value = 0; value < '0'; value++)
8046 ANYOF_BITMAP_SET(ret, value);
8047 for (value = '9' + 1; value < 256; value++)
8048 ANYOF_BITMAP_SET(ret, value);
8049 }
c49a72a9
NC
8050 yesno = '!';
8051 what = "Digit";
89836f1f 8052 break;
f81125e2
JP
8053 case ANYOF_MAX:
8054 /* this is to handle \p and \P */
8055 break;
73b437c8 8056 default:
b45f050a 8057 vFAIL("Invalid [::] class");
73b437c8 8058 break;
b8c5462f 8059 }
c49a72a9
NC
8060 if (what) {
8061 /* Strings such as "+utf8::isWord\n" */
8062 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8063 }
b8c5462f 8064 if (LOC)
936ed897 8065 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 8066 continue;
a0d0e21e 8067 }
ffc61ed2
JH
8068 } /* end of namedclass \blah */
8069
a0d0e21e 8070 if (range) {
eb160463 8071 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8072 const int w = RExC_parse - rangebegin;
8073 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8074 range = 0; /* not a valid range */
73b437c8 8075 }
a0d0e21e
LW
8076 }
8077 else {
3568d838 8078 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8079 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8080 RExC_parse[1] != ']') {
8081 RExC_parse++;
ffc61ed2
JH
8082
8083 /* a bad range like \w-, [:word:]- ? */
8084 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8085 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8086 const int w =
afd78fd5
JH
8087 RExC_parse >= rangebegin ?
8088 RExC_parse - rangebegin : 0;
830247a4 8089 vWARN4(RExC_parse,
b45f050a 8090 "False [] range \"%*.*s\"",
097eb12c 8091 w, w, rangebegin);
afd78fd5 8092 }
73b437c8 8093 if (!SIZE_ONLY)
936ed897 8094 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8095 } else
ffc61ed2
JH
8096 range = 1; /* yeah, it's a range! */
8097 continue; /* but do it the next time */
a0d0e21e 8098 }
a687059c 8099 }
ffc61ed2 8100
93733859 8101 /* now is the next time */
07be1b83 8102 /*stored += (value - prevvalue + 1);*/
ae5c130c 8103 if (!SIZE_ONLY) {
3568d838 8104 if (prevvalue < 256) {
1df70142 8105 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8106 IV i;
3568d838 8107#ifdef EBCDIC
1b2d223b
JH
8108 /* In EBCDIC [\x89-\x91] should include
8109 * the \x8e but [i-j] should not. */
8110 if (literal_endpoint == 2 &&
8111 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8112 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8113 {
3568d838
JH
8114 if (isLOWER(prevvalue)) {
8115 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8116 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8117 stored++;
ffc61ed2 8118 ANYOF_BITMAP_SET(ret, i);
2670d666 8119 }
ffc61ed2 8120 } else {
3568d838 8121 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8122 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8123 stored++;
ffc61ed2 8124 ANYOF_BITMAP_SET(ret, i);
2670d666 8125 }
ffc61ed2 8126 }
8ada0baa 8127 }
ffc61ed2 8128 else
8ada0baa 8129#endif
07be1b83
YO
8130 for (i = prevvalue; i <= ceilvalue; i++) {
8131 if (!ANYOF_BITMAP_TEST(ret,i)) {
8132 stored++;
8133 ANYOF_BITMAP_SET(ret, i);
8134 }
8135 }
3568d838 8136 }
a5961de5 8137 if (value > 255 || UTF) {
1df70142
AL
8138 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8139 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8140 stored+=2; /* can't optimize this class */
ffc61ed2 8141 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8142 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8143 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8144 prevnatvalue, natvalue);
8145 }
8146 else if (prevnatvalue == natvalue) {
8147 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8148 if (FOLD) {
89ebb4a3 8149 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8150 STRLEN foldlen;
1df70142 8151 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8152
e294cc5d
JH
8153#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8154 if (RExC_precomp[0] == ':' &&
8155 RExC_precomp[1] == '[' &&
8156 (f == 0xDF || f == 0x92)) {
8157 f = NATIVE_TO_UNI(f);
8158 }
8159#endif
c840d2a2
JH
8160 /* If folding and foldable and a single
8161 * character, insert also the folded version
8162 * to the charclass. */
9e55ce06 8163 if (f != value) {
e294cc5d
JH
8164#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8165 if ((RExC_precomp[0] == ':' &&
8166 RExC_precomp[1] == '[' &&
8167 (f == 0xA2 &&
8168 (value == 0xFB05 || value == 0xFB06))) ?
8169 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8170 foldlen == (STRLEN)UNISKIP(f) )
8171#else
eb160463 8172 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8173#endif
9e55ce06
JH
8174 Perl_sv_catpvf(aTHX_ listsv,
8175 "%04"UVxf"\n", f);
8176 else {
8177 /* Any multicharacter foldings
8178 * require the following transform:
8179 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8180 * where E folds into "pq" and F folds
8181 * into "rst", all other characters
8182 * fold to single characters. We save
8183 * away these multicharacter foldings,
8184 * to be later saved as part of the
8185 * additional "s" data. */
8186 SV *sv;
8187
8188 if (!unicode_alternate)
8189 unicode_alternate = newAV();
740cce10
NC
8190 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8191 TRUE);
9e55ce06
JH
8192 av_push(unicode_alternate, sv);
8193 }
8194 }
254ba52a 8195
60a8b682
JH
8196 /* If folding and the value is one of the Greek
8197 * sigmas insert a few more sigmas to make the
8198 * folding rules of the sigmas to work right.
8199 * Note that not all the possible combinations
8200 * are handled here: some of them are handled
9e55ce06
JH
8201 * by the standard folding rules, and some of
8202 * them (literal or EXACTF cases) are handled
8203 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8204 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8205 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8206 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8207 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8208 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8209 }
8210 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8211 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8212 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8213 }
8214 }
ffc61ed2 8215 }
1b2d223b
JH
8216#ifdef EBCDIC
8217 literal_endpoint = 0;
8218#endif
8ada0baa 8219 }
ffc61ed2
JH
8220
8221 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8222 }
ffc61ed2 8223
936ed897 8224 if (need_class) {
4f66b38d 8225 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 8226 if (SIZE_ONLY)
830247a4 8227 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 8228 else
830247a4 8229 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 8230 }
ffc61ed2 8231
7f6f358c
YO
8232
8233 if (SIZE_ONLY)
8234 return ret;
8235 /****** !SIZE_ONLY AFTER HERE *********/
8236
3b57cd43 8237 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8238 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8239 ) {
8240 /* optimize single char class to an EXACT node
8241 but *only* when its not a UTF/high char */
07be1b83
YO
8242 const char * cur_parse= RExC_parse;
8243 RExC_emit = (regnode *)orig_emit;
8244 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8245 ret = reg_node(pRExC_state,
8246 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8247 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8248 *STRING(ret)= (char)value;
8249 STR_LEN(ret)= 1;
8250 RExC_emit += STR_SZ(1);
8251 return ret;
8252 }
ae5c130c 8253 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8254 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8255 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8256 ) {
a0ed51b3 8257 for (value = 0; value < 256; ++value) {
936ed897 8258 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8259 UV fold = PL_fold[value];
ffc61ed2
JH
8260
8261 if (fold != value)
8262 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8263 }
8264 }
936ed897 8265 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8266 }
ffc61ed2 8267
ae5c130c 8268 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8269 if (optimize_invert &&
ffc61ed2
JH
8270 /* If the only flag is inversion. */
8271 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8272 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8273 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8274 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8275 }
7f6f358c 8276 {
097eb12c 8277 AV * const av = newAV();
ffc61ed2 8278 SV *rv;
9e55ce06 8279 /* The 0th element stores the character class description
6a0407ee 8280 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8281 * to initialize the appropriate swash (which gets stored in
8282 * the 1st element), and also useful for dumping the regnode.
8283 * The 2nd element stores the multicharacter foldings,
6a0407ee 8284 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8285 av_store(av, 0, listsv);
8286 av_store(av, 1, NULL);
9e55ce06 8287 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 8288 rv = newRV_noinc((SV*)av);
19860706 8289 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8290 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8291 ARG_SET(ret, n);
a0ed51b3 8292 }
a0ed51b3
LW
8293 return ret;
8294}
89836f1f
YO
8295#undef _C_C_T_
8296
a0ed51b3 8297
bcdf7404
YO
8298/* reg_skipcomment()
8299
8300 Absorbs an /x style # comments from the input stream.
8301 Returns true if there is more text remaining in the stream.
8302 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8303 terminates the pattern without including a newline.
8304
8305 Note its the callers responsibility to ensure that we are
8306 actually in /x mode
8307
8308*/
8309
8310STATIC bool
8311S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8312{
8313 bool ended = 0;
8314 while (RExC_parse < RExC_end)
8315 if (*RExC_parse++ == '\n') {
8316 ended = 1;
8317 break;
8318 }
8319 if (!ended) {
8320 /* we ran off the end of the pattern without ending
8321 the comment, so we have to add an \n when wrapping */
8322 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8323 return 0;
8324 } else
8325 return 1;
8326}
8327
8328/* nextchar()
8329
8330 Advance that parse position, and optionall absorbs
8331 "whitespace" from the inputstream.
8332
8333 Without /x "whitespace" means (?#...) style comments only,
8334 with /x this means (?#...) and # comments and whitespace proper.
8335
8336 Returns the RExC_parse point from BEFORE the scan occurs.
8337
8338 This is the /x friendly way of saying RExC_parse++.
8339*/
8340
76e3520e 8341STATIC char*
830247a4 8342S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8343{
097eb12c 8344 char* const retval = RExC_parse++;
a0d0e21e 8345
4633a7c4 8346 for (;;) {
830247a4
IZ
8347 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8348 RExC_parse[2] == '#') {
e994fd66
AE
8349 while (*RExC_parse != ')') {
8350 if (RExC_parse == RExC_end)
8351 FAIL("Sequence (?#... not terminated");
830247a4 8352 RExC_parse++;
e994fd66 8353 }
830247a4 8354 RExC_parse++;
4633a7c4
LW
8355 continue;
8356 }
bbe252da 8357 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8358 if (isSPACE(*RExC_parse)) {
8359 RExC_parse++;
748a9306
LW
8360 continue;
8361 }
830247a4 8362 else if (*RExC_parse == '#') {
bcdf7404
YO
8363 if ( reg_skipcomment( pRExC_state ) )
8364 continue;
748a9306 8365 }
748a9306 8366 }
4633a7c4 8367 return retval;
a0d0e21e 8368 }
a687059c
LW
8369}
8370
8371/*
c277df42 8372- reg_node - emit a node
a0d0e21e 8373*/
76e3520e 8374STATIC regnode * /* Location. */
830247a4 8375S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8376{
97aff369 8377 dVAR;
c277df42 8378 register regnode *ptr;
504618e9 8379 regnode * const ret = RExC_emit;
07be1b83 8380 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8381
c277df42 8382 if (SIZE_ONLY) {
830247a4
IZ
8383 SIZE_ALIGN(RExC_size);
8384 RExC_size += 1;
a0d0e21e
LW
8385 return(ret);
8386 }
3b57cd43
YO
8387 if (RExC_emit >= RExC_emit_bound)
8388 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8389
c277df42 8390 NODE_ALIGN_FILL(ret);
a0d0e21e 8391 ptr = ret;
c277df42 8392 FILL_ADVANCE_NODE(ptr, op);
7122b237 8393#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8394 if (RExC_offsets) { /* MJD */
07be1b83 8395 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8396 "reg_node", __LINE__,
13d6edb4 8397 PL_reg_name[op],
07be1b83
YO
8398 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8399 ? "Overwriting end of array!\n" : "OK",
8400 (UV)(RExC_emit - RExC_emit_start),
8401 (UV)(RExC_parse - RExC_start),
8402 (UV)RExC_offsets[0]));
ccb2c380 8403 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8404 }
7122b237 8405#endif
830247a4 8406 RExC_emit = ptr;
a0d0e21e 8407 return(ret);
a687059c
LW
8408}
8409
8410/*
a0d0e21e
LW
8411- reganode - emit a node with an argument
8412*/
76e3520e 8413STATIC regnode * /* Location. */
830247a4 8414S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8415{
97aff369 8416 dVAR;
c277df42 8417 register regnode *ptr;
504618e9 8418 regnode * const ret = RExC_emit;
07be1b83 8419 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8420
c277df42 8421 if (SIZE_ONLY) {
830247a4
IZ
8422 SIZE_ALIGN(RExC_size);
8423 RExC_size += 2;
6bda09f9
YO
8424 /*
8425 We can't do this:
8426
8427 assert(2==regarglen[op]+1);
8428
8429 Anything larger than this has to allocate the extra amount.
8430 If we changed this to be:
8431
8432 RExC_size += (1 + regarglen[op]);
8433
8434 then it wouldn't matter. Its not clear what side effect
8435 might come from that so its not done so far.
8436 -- dmq
8437 */
a0d0e21e
LW
8438 return(ret);
8439 }
3b57cd43
YO
8440 if (RExC_emit >= RExC_emit_bound)
8441 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8442
c277df42 8443 NODE_ALIGN_FILL(ret);
a0d0e21e 8444 ptr = ret;
c277df42 8445 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8446#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8447 if (RExC_offsets) { /* MJD */
07be1b83 8448 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8449 "reganode",
ccb2c380 8450 __LINE__,
13d6edb4 8451 PL_reg_name[op],
07be1b83 8452 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8453 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8454 (UV)(RExC_emit - RExC_emit_start),
8455 (UV)(RExC_parse - RExC_start),
8456 (UV)RExC_offsets[0]));
ccb2c380 8457 Set_Cur_Node_Offset;
fac92740 8458 }
7122b237 8459#endif
830247a4 8460 RExC_emit = ptr;
a0d0e21e 8461 return(ret);
fe14fcc3
LW
8462}
8463
8464/*
cd439c50 8465- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8466*/
71207a34
AL
8467STATIC STRLEN
8468S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8469{
97aff369 8470 dVAR;
71207a34 8471 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8472}
8473
8474/*
a0d0e21e
LW
8475- reginsert - insert an operator in front of already-emitted operand
8476*
8477* Means relocating the operand.
8478*/
76e3520e 8479STATIC void
6bda09f9 8480S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8481{
97aff369 8482 dVAR;
c277df42
IZ
8483 register regnode *src;
8484 register regnode *dst;
8485 register regnode *place;
504618e9 8486 const int offset = regarglen[(U8)op];
6bda09f9 8487 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8488 GET_RE_DEBUG_FLAGS_DECL;
def51078 8489 PERL_UNUSED_ARG(depth);
22c35a8c 8490/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8491 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8492 if (SIZE_ONLY) {
6bda09f9 8493 RExC_size += size;
a0d0e21e
LW
8494 return;
8495 }
a687059c 8496
830247a4 8497 src = RExC_emit;
6bda09f9 8498 RExC_emit += size;
830247a4 8499 dst = RExC_emit;
40d049e4 8500 if (RExC_open_parens) {
6bda09f9 8501 int paren;
3b57cd43 8502 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 8503 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 8504 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 8505 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
8506 RExC_open_parens[paren] += size;
8507 } else {
3b57cd43 8508 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
8509 }
8510 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 8511 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
8512 RExC_close_parens[paren] += size;
8513 } else {
3b57cd43 8514 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
8515 }
8516 }
6bda09f9 8517 }
40d049e4 8518
fac92740 8519 while (src > opnd) {
c277df42 8520 StructCopy(--src, --dst, regnode);
7122b237 8521#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8522 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8523 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8524 "reg_insert",
ccb2c380 8525 __LINE__,
13d6edb4 8526 PL_reg_name[op],
07be1b83
YO
8527 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8528 ? "Overwriting end of array!\n" : "OK",
8529 (UV)(src - RExC_emit_start),
8530 (UV)(dst - RExC_emit_start),
8531 (UV)RExC_offsets[0]));
ccb2c380
MP
8532 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8533 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8534 }
7122b237 8535#endif
fac92740
MJD
8536 }
8537
a0d0e21e
LW
8538
8539 place = opnd; /* Op node, where operand used to be. */
7122b237 8540#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8541 if (RExC_offsets) { /* MJD */
07be1b83 8542 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8543 "reginsert",
ccb2c380 8544 __LINE__,
13d6edb4 8545 PL_reg_name[op],
07be1b83 8546 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8547 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8548 (UV)(place - RExC_emit_start),
8549 (UV)(RExC_parse - RExC_start),
786e8c11 8550 (UV)RExC_offsets[0]));
ccb2c380 8551 Set_Node_Offset(place, RExC_parse);
45948336 8552 Set_Node_Length(place, 1);
fac92740 8553 }
7122b237 8554#endif
c277df42
IZ
8555 src = NEXTOPER(place);
8556 FILL_ADVANCE_NODE(place, op);
8557 Zero(src, offset, regnode);
a687059c
LW
8558}
8559
8560/*
c277df42 8561- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8562- SEE ALSO: regtail_study
a0d0e21e 8563*/
097eb12c 8564/* TODO: All three parms should be const */
76e3520e 8565STATIC void
3dab1dad 8566S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8567{
97aff369 8568 dVAR;
c277df42 8569 register regnode *scan;
72f13be8 8570 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
8571#ifndef DEBUGGING
8572 PERL_UNUSED_ARG(depth);
8573#endif
a0d0e21e 8574
c277df42 8575 if (SIZE_ONLY)
a0d0e21e
LW
8576 return;
8577
8578 /* Find last node. */
8579 scan = p;
8580 for (;;) {
504618e9 8581 regnode * const temp = regnext(scan);
3dab1dad
YO
8582 DEBUG_PARSE_r({
8583 SV * const mysv=sv_newmortal();
8584 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8585 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8586 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8587 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8588 (temp == NULL ? "->" : ""),
13d6edb4 8589 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 8590 );
3dab1dad
YO
8591 });
8592 if (temp == NULL)
8593 break;
8594 scan = temp;
8595 }
8596
8597 if (reg_off_by_arg[OP(scan)]) {
8598 ARG_SET(scan, val - scan);
8599 }
8600 else {
8601 NEXT_OFF(scan) = val - scan;
8602 }
8603}
8604
07be1b83 8605#ifdef DEBUGGING
3dab1dad
YO
8606/*
8607- regtail_study - set the next-pointer at the end of a node chain of p to val.
8608- Look for optimizable sequences at the same time.
8609- currently only looks for EXACT chains.
07be1b83
YO
8610
8611This is expermental code. The idea is to use this routine to perform
8612in place optimizations on branches and groups as they are constructed,
8613with the long term intention of removing optimization from study_chunk so
8614that it is purely analytical.
8615
8616Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8617to control which is which.
8618
3dab1dad
YO
8619*/
8620/* TODO: All four parms should be const */
07be1b83 8621
3dab1dad
YO
8622STATIC U8
8623S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8624{
8625 dVAR;
8626 register regnode *scan;
07be1b83
YO
8627 U8 exact = PSEUDO;
8628#ifdef EXPERIMENTAL_INPLACESCAN
8629 I32 min = 0;
8630#endif
8631
3dab1dad
YO
8632 GET_RE_DEBUG_FLAGS_DECL;
8633
07be1b83 8634
3dab1dad
YO
8635 if (SIZE_ONLY)
8636 return exact;
8637
8638 /* Find last node. */
8639
8640 scan = p;
8641 for (;;) {
8642 regnode * const temp = regnext(scan);
07be1b83
YO
8643#ifdef EXPERIMENTAL_INPLACESCAN
8644 if (PL_regkind[OP(scan)] == EXACT)
8645 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8646 return EXACT;
8647#endif
3dab1dad
YO
8648 if ( exact ) {
8649 switch (OP(scan)) {
8650 case EXACT:
8651 case EXACTF:
8652 case EXACTFL:
8653 if( exact == PSEUDO )
8654 exact= OP(scan);
07be1b83
YO
8655 else if ( exact != OP(scan) )
8656 exact= 0;
3dab1dad
YO
8657 case NOTHING:
8658 break;
8659 default:
8660 exact= 0;
8661 }
8662 }
8663 DEBUG_PARSE_r({
8664 SV * const mysv=sv_newmortal();
8665 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8666 regprop(RExC_rx, mysv, scan);
eaf3ca90 8667 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8668 SvPV_nolen_const(mysv),
eaf3ca90 8669 REG_NODE_NUM(scan),
13d6edb4 8670 PL_reg_name[exact]);
3dab1dad 8671 });
a0d0e21e
LW
8672 if (temp == NULL)
8673 break;
8674 scan = temp;
8675 }
07be1b83
YO
8676 DEBUG_PARSE_r({
8677 SV * const mysv_val=sv_newmortal();
8678 DEBUG_PARSE_MSG("");
8679 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8680 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8681 SvPV_nolen_const(mysv_val),
8682 (IV)REG_NODE_NUM(val),
8683 (IV)(val - scan)
07be1b83
YO
8684 );
8685 });
c277df42
IZ
8686 if (reg_off_by_arg[OP(scan)]) {
8687 ARG_SET(scan, val - scan);
a0ed51b3
LW
8688 }
8689 else {
c277df42
IZ
8690 NEXT_OFF(scan) = val - scan;
8691 }
3dab1dad
YO
8692
8693 return exact;
a687059c 8694}
07be1b83 8695#endif
a687059c
LW
8696
8697/*
a687059c
LW
8698 - regcurly - a little FSA that accepts {\d+,?\d*}
8699 */
79072805 8700STATIC I32
5f66b61c 8701S_regcurly(register const char *s)
a687059c
LW
8702{
8703 if (*s++ != '{')
8704 return FALSE;
f0fcb552 8705 if (!isDIGIT(*s))
a687059c 8706 return FALSE;
f0fcb552 8707 while (isDIGIT(*s))
a687059c
LW
8708 s++;
8709 if (*s == ',')
8710 s++;
f0fcb552 8711 while (isDIGIT(*s))
a687059c
LW
8712 s++;
8713 if (*s != '}')
8714 return FALSE;
8715 return TRUE;
8716}
8717
a687059c
LW
8718
8719/*
fd181c75 8720 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 8721 */
f7819f85
A
8722#ifdef DEBUGGING
8723void
8724S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
8725 int bit;
8726 int set=0;
8727 for (bit=0; bit<32; bit++) {
8728 if (flags & (1<<bit)) {
8729 if (!set++ && lead)
8730 PerlIO_printf(Perl_debug_log, "%s",lead);
8731 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8732 }
8733 }
8734 if (lead) {
8735 if (set)
8736 PerlIO_printf(Perl_debug_log, "\n");
8737 else
8738 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8739 }
8740}
8741#endif
8742
a687059c 8743void
097eb12c 8744Perl_regdump(pTHX_ const regexp *r)
a687059c 8745{
35ff7856 8746#ifdef DEBUGGING
97aff369 8747 dVAR;
c445ea15 8748 SV * const sv = sv_newmortal();
ab3bbdeb 8749 SV *dsv= sv_newmortal();
f8fc2ecf 8750 RXi_GET_DECL(r,ri);
f7819f85 8751 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8752
f8fc2ecf 8753 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8754
8755 /* Header fields of interest. */
ab3bbdeb
YO
8756 if (r->anchored_substr) {
8757 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8758 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8759 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8760 "anchored %s%s at %"IVdf" ",
8761 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8762 (IV)r->anchored_offset);
ab3bbdeb
YO
8763 } else if (r->anchored_utf8) {
8764 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8765 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8766 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8767 "anchored utf8 %s%s at %"IVdf" ",
8768 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8769 (IV)r->anchored_offset);
ab3bbdeb
YO
8770 }
8771 if (r->float_substr) {
8772 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8773 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8774 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8775 "floating %s%s at %"IVdf"..%"UVuf" ",
8776 s, RE_SV_TAIL(r->float_substr),
7b0972df 8777 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8778 } else if (r->float_utf8) {
8779 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8780 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8781 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8782 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8783 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8784 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8785 }
33b8afdf 8786 if (r->check_substr || r->check_utf8)
b81d288d 8787 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8788 (const char *)
8789 (r->check_substr == r->float_substr
8790 && r->check_utf8 == r->float_utf8
8791 ? "(checking floating" : "(checking anchored"));
bbe252da 8792 if (r->extflags & RXf_NOSCAN)
c277df42 8793 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8794 if (r->extflags & RXf_CHECK_ALL)
c277df42 8795 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8796 if (r->check_substr || r->check_utf8)
c277df42
IZ
8797 PerlIO_printf(Perl_debug_log, ") ");
8798
f8fc2ecf
YO
8799 if (ri->regstclass) {
8800 regprop(r, sv, ri->regstclass);
1de06328 8801 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8802 }
bbe252da 8803 if (r->extflags & RXf_ANCH) {
774d564b 8804 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8805 if (r->extflags & RXf_ANCH_BOL)
774d564b 8806 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8807 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8808 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8809 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8810 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8811 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8812 PerlIO_printf(Perl_debug_log, "(GPOS)");
8813 PerlIO_putc(Perl_debug_log, ' ');
8814 }
bbe252da 8815 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8816 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8817 if (r->intflags & PREGf_SKIP)
760ac839 8818 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8819 if (r->intflags & PREGf_IMPLICIT)
760ac839 8820 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8821 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8822 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8823 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8824 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 8825 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 8826#else
96a5add6 8827 PERL_UNUSED_CONTEXT;
65e66c80 8828 PERL_UNUSED_ARG(r);
17c3b450 8829#endif /* DEBUGGING */
a687059c
LW
8830}
8831
8832/*
a0d0e21e
LW
8833- regprop - printable representation of opcode
8834*/
46fc3d4c 8835void
32fc9b6a 8836Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8837{
35ff7856 8838#ifdef DEBUGGING
97aff369 8839 dVAR;
9b155405 8840 register int k;
f8fc2ecf 8841 RXi_GET_DECL(prog,progi);
1de06328 8842 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8843
a0d0e21e 8844
54dc92de 8845 sv_setpvn(sv, "", 0);
8aa23a47 8846
03363afd 8847 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8848 /* It would be nice to FAIL() here, but this may be called from
8849 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8850 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 8851 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 8852
3dab1dad 8853 k = PL_regkind[OP(o)];
9b155405 8854
2a782b5b 8855 if (k == EXACT) {
f92a2122 8856 sv_catpvs(sv, " ");
ab3bbdeb
YO
8857 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8858 * is a crude hack but it may be the best for now since
8859 * we have no flag "this EXACTish node was UTF-8"
8860 * --jhi */
f92a2122
NC
8861 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
8862 PERL_PV_ESCAPE_UNI_DETECT |
8863 PERL_PV_PRETTY_ELLIPSES |
8864 PERL_PV_PRETTY_LTGT |
8865 PERL_PV_PRETTY_NOCLEAR
8866 );
bb263b4e 8867 } else if (k == TRIE) {
3dab1dad 8868 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8869 * progi->data isn't available here */
1de06328 8870 const char op = OP(o);
647f639f 8871 const U32 n = ARG(o);
1de06328 8872 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8873 (reg_ac_data *)progi->data->data[n] :
1de06328 8874 NULL;
3251b653
NC
8875 const reg_trie_data * const trie
8876 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 8877
13d6edb4 8878 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
8879 DEBUG_TRIE_COMPILE_r(
8880 Perl_sv_catpvf(aTHX_ sv,
8881 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8882 (UV)trie->startstate,
1e2e3d02 8883 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8884 (UV)trie->wordcount,
8885 (UV)trie->minlen,
8886 (UV)trie->maxlen,
8887 (UV)TRIE_CHARCOUNT(trie),
8888 (UV)trie->uniquecharcount
8889 )
8890 );
8891 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8892 int i;
8893 int rangestart = -1;
f46cb337 8894 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 8895 sv_catpvs(sv, "[");
1de06328
YO
8896 for (i = 0; i <= 256; i++) {
8897 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8898 if (rangestart == -1)
8899 rangestart = i;
8900 } else if (rangestart != -1) {
8901 if (i <= rangestart + 3)
8902 for (; rangestart < i; rangestart++)
8903 put_byte(sv, rangestart);
8904 else {
8905 put_byte(sv, rangestart);
8906 sv_catpvs(sv, "-");
8907 put_byte(sv, i - 1);
8908 }
8909 rangestart = -1;
8910 }
8911 }
f3a2811a 8912 sv_catpvs(sv, "]");
1de06328
YO
8913 }
8914
a3621e74 8915 } else if (k == CURLY) {
cb434fcc 8916 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8917 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8918 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8919 }
2c2d71f5
JH
8920 else if (k == WHILEM && o->flags) /* Ordinal/of */
8921 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 8922 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 8923 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 8924 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae
YO
8925 if ( k != REF || OP(o) < NREF) {
8926 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8927 SV **name= av_fetch(list, ARG(o), 0 );
8928 if (name)
8929 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8930 }
8931 else {
8932 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8933 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8934 I32 *nums=(I32*)SvPVX(sv_dat);
8935 SV **name= av_fetch(list, nums[0], 0 );
8936 I32 n;
8937 if (name) {
8938 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8939 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8940 (n ? "," : ""), (IV)nums[n]);
8941 }
8942 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 8943 }
1f1031fe 8944 }
ee9b8eae 8945 }
1f1031fe 8946 } else if (k == GOSUB)
6bda09f9 8947 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8948 else if (k == VERB) {
8949 if (!o->flags)
8950 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
be2597df 8951 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
e2e6a0f1 8952 } else if (k == LOGICAL)
04ebc1ab 8953 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 8954 else if (k == FOLDCHAR)
df44d732 8955 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
8956 else if (k == ANYOF) {
8957 int i, rangestart = -1;
2d03de9c 8958 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8959
8960 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8961 static const char * const anyofs[] = {
653099ff
GS
8962 "\\w",
8963 "\\W",
8964 "\\s",
8965 "\\S",
8966 "\\d",
8967 "\\D",
8968 "[:alnum:]",
8969 "[:^alnum:]",
8970 "[:alpha:]",
8971 "[:^alpha:]",
8972 "[:ascii:]",
8973 "[:^ascii:]",
8974 "[:ctrl:]",
8975 "[:^ctrl:]",
8976 "[:graph:]",
8977 "[:^graph:]",
8978 "[:lower:]",
8979 "[:^lower:]",
8980 "[:print:]",
8981 "[:^print:]",
8982 "[:punct:]",
8983 "[:^punct:]",
8984 "[:upper:]",
aaa51d5e 8985 "[:^upper:]",
653099ff 8986 "[:xdigit:]",
aaa51d5e
JF
8987 "[:^xdigit:]",
8988 "[:space:]",
8989 "[:^space:]",
8990 "[:blank:]",
8991 "[:^blank:]"
653099ff
GS
8992 };
8993
19860706 8994 if (flags & ANYOF_LOCALE)
396482e1 8995 sv_catpvs(sv, "{loc}");
19860706 8996 if (flags & ANYOF_FOLD)
396482e1 8997 sv_catpvs(sv, "{i}");
653099ff 8998 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8999 if (flags & ANYOF_INVERT)
396482e1 9000 sv_catpvs(sv, "^");
ffc61ed2
JH
9001 for (i = 0; i <= 256; i++) {
9002 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9003 if (rangestart == -1)
9004 rangestart = i;
9005 } else if (rangestart != -1) {
9006 if (i <= rangestart + 3)
9007 for (; rangestart < i; rangestart++)
653099ff 9008 put_byte(sv, rangestart);
ffc61ed2
JH
9009 else {
9010 put_byte(sv, rangestart);
396482e1 9011 sv_catpvs(sv, "-");
ffc61ed2 9012 put_byte(sv, i - 1);
653099ff 9013 }
ffc61ed2 9014 rangestart = -1;
653099ff 9015 }
847a199f 9016 }
ffc61ed2
JH
9017
9018 if (o->flags & ANYOF_CLASS)
bb7a0f54 9019 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
9020 if (ANYOF_CLASS_TEST(o,i))
9021 sv_catpv(sv, anyofs[i]);
9022
9023 if (flags & ANYOF_UNICODE)
396482e1 9024 sv_catpvs(sv, "{unicode}");
1aa99e6b 9025 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9026 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9027
9028 {
9029 SV *lv;
32fc9b6a 9030 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9031
ffc61ed2
JH
9032 if (lv) {
9033 if (sw) {
89ebb4a3 9034 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 9035
ffc61ed2 9036 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9037 uvchr_to_utf8(s, i);
ffc61ed2 9038
3568d838 9039 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9040 if (rangestart == -1)
9041 rangestart = i;
9042 } else if (rangestart != -1) {
ffc61ed2
JH
9043 if (i <= rangestart + 3)
9044 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9045 const U8 * const e = uvchr_to_utf8(s,rangestart);
9046 U8 *p;
9047 for(p = s; p < e; p++)
ffc61ed2
JH
9048 put_byte(sv, *p);
9049 }
9050 else {
2d03de9c
AL
9051 const U8 *e = uvchr_to_utf8(s,rangestart);
9052 U8 *p;
9053 for (p = s; p < e; p++)
ffc61ed2 9054 put_byte(sv, *p);
396482e1 9055 sv_catpvs(sv, "-");
2d03de9c
AL
9056 e = uvchr_to_utf8(s, i-1);
9057 for (p = s; p < e; p++)
1df70142 9058 put_byte(sv, *p);
ffc61ed2
JH
9059 }
9060 rangestart = -1;
9061 }
19860706 9062 }
ffc61ed2 9063
396482e1 9064 sv_catpvs(sv, "..."); /* et cetera */
19860706 9065 }
fde631ed 9066
ffc61ed2 9067 {
2e0de35c 9068 char *s = savesvpv(lv);
c445ea15 9069 char * const origs = s;
b81d288d 9070
3dab1dad
YO
9071 while (*s && *s != '\n')
9072 s++;
b81d288d 9073
ffc61ed2 9074 if (*s == '\n') {
2d03de9c 9075 const char * const t = ++s;
ffc61ed2
JH
9076
9077 while (*s) {
9078 if (*s == '\n')
9079 *s = ' ';
9080 s++;
9081 }
9082 if (s[-1] == ' ')
9083 s[-1] = 0;
9084
9085 sv_catpv(sv, t);
fde631ed 9086 }
b81d288d 9087
ffc61ed2 9088 Safefree(origs);
fde631ed
JH
9089 }
9090 }
653099ff 9091 }
ffc61ed2 9092
653099ff
GS
9093 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9094 }
9b155405 9095 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9096 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9097#else
96a5add6 9098 PERL_UNUSED_CONTEXT;
65e66c80
SP
9099 PERL_UNUSED_ARG(sv);
9100 PERL_UNUSED_ARG(o);
f9049ba1 9101 PERL_UNUSED_ARG(prog);
17c3b450 9102#endif /* DEBUGGING */
35ff7856 9103}
a687059c 9104
cad2e5aa 9105SV *
288b8c02 9106Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9107{ /* Assume that RE_INTUIT is set */
97aff369 9108 dVAR;
288b8c02 9109 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9110 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
9111 PERL_UNUSED_CONTEXT;
9112
a3621e74 9113 DEBUG_COMPILE_r(
cfd0369c 9114 {
2d03de9c 9115 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9116 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9117
9118 if (!PL_colorset) reginitcolors();
9119 PerlIO_printf(Perl_debug_log,
a0288114 9120 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9121 PL_colors[4],
9122 prog->check_substr ? "" : "utf8 ",
9123 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9124 s,
9125 PL_colors[1],
9126 (strlen(s) > 60 ? "..." : ""));
9127 } );
9128
33b8afdf 9129 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9130}
9131
84da74a7 9132/*
f8149455 9133 pregfree()
84da74a7 9134
f8149455
YO
9135 handles refcounting and freeing the perl core regexp structure. When
9136 it is necessary to actually free the structure the first thing it
9137 does is call the 'free' method of the regexp_engine associated to to
9138 the regexp, allowing the handling of the void *pprivate; member
9139 first. (This routine is not overridable by extensions, which is why
9140 the extensions free is called first.)
9141
9142 See regdupe and regdupe_internal if you change anything here.
84da74a7 9143*/
f8149455 9144#ifndef PERL_IN_XSUB_RE
2b69d0c2 9145void
84679df5 9146Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9147{
288b8c02
NC
9148 SvREFCNT_dec(r);
9149}
9150
9151void
9152Perl_pregfree2(pTHX_ REGEXP *rx)
9153{
27da23d5 9154 dVAR;
288b8c02 9155 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9156 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9157
28d8d7f4
YO
9158 if (r->mother_re) {
9159 ReREFCNT_dec(r->mother_re);
9160 } else {
288b8c02 9161 CALLREGFREE_PVT(rx); /* free the private data */
5daac39c
NC
9162 if (RXp_PAREN_NAMES(r))
9163 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9164 }
9165 if (r->substrs) {
9166 if (r->anchored_substr)
9167 SvREFCNT_dec(r->anchored_substr);
9168 if (r->anchored_utf8)
9169 SvREFCNT_dec(r->anchored_utf8);
9170 if (r->float_substr)
9171 SvREFCNT_dec(r->float_substr);
9172 if (r->float_utf8)
9173 SvREFCNT_dec(r->float_utf8);
9174 Safefree(r->substrs);
9175 }
288b8c02 9176 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9177#ifdef PERL_OLD_COPY_ON_WRITE
ed252734 9178 if (r->saved_copy)
28d8d7f4 9179 SvREFCNT_dec(r->saved_copy);
ed252734 9180#endif
f0ab9afb
NC
9181 Safefree(r->swap);
9182 Safefree(r->offs);
f8149455 9183}
28d8d7f4
YO
9184
9185/* reg_temp_copy()
9186
9187 This is a hacky workaround to the structural issue of match results
9188 being stored in the regexp structure which is in turn stored in
9189 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9190 could be PL_curpm in multiple contexts, and could require multiple
9191 result sets being associated with the pattern simultaneously, such
9192 as when doing a recursive match with (??{$qr})
9193
9194 The solution is to make a lightweight copy of the regexp structure
9195 when a qr// is returned from the code executed by (??{$qr}) this
9196 lightweight copy doesnt actually own any of its data except for
9197 the starp/end and the actual regexp structure itself.
9198
9199*/
9200
9201
84679df5 9202REGEXP *
288b8c02 9203Perl_reg_temp_copy (pTHX_ REGEXP *rx) {
d2f13c59 9204 REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02
NC
9205 struct regexp *ret = (struct regexp *)SvANY(ret_x);
9206 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9207 register const I32 npar = r->nparens+1;
288b8c02 9208 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9209 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9210 by pointing directly at the buffer, but flagging that the allocated
9211 space in the copy is zero. As we've just done a struct copy, it's now
9212 a case of zero-ing that, rather than copying the current length. */
9213 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9214 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
08e44740 9215 StructCopy(&(r->xpv_cur), &(ret->xpv_cur), struct regexp_allocated);
f7c278bf 9216 SvLEN_set(ret_x, 0);
f0ab9afb
NC
9217 Newx(ret->offs, npar, regexp_paren_pair);
9218 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9219 if (r->substrs) {
28d8d7f4 9220 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9221 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9222
9223 SvREFCNT_inc_void(ret->anchored_substr);
9224 SvREFCNT_inc_void(ret->anchored_utf8);
9225 SvREFCNT_inc_void(ret->float_substr);
9226 SvREFCNT_inc_void(ret->float_utf8);
9227
9228 /* check_substr and check_utf8, if non-NULL, point to either their
9229 anchored or float namesakes, and don't hold a second reference. */
486913e4 9230 }
288b8c02 9231 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9232#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9233 ret->saved_copy = NULL;
28d8d7f4 9234#endif
288b8c02 9235 ret->mother_re = rx;
28d8d7f4
YO
9236 ret->swap = NULL;
9237
288b8c02 9238 return ret_x;
28d8d7f4 9239}
f8149455
YO
9240#endif
9241
9242/* regfree_internal()
9243
9244 Free the private data in a regexp. This is overloadable by
9245 extensions. Perl takes care of the regexp structure in pregfree(),
9246 this covers the *pprivate pointer which technically perldoesnt
9247 know about, however of course we have to handle the
9248 regexp_internal structure when no extension is in use.
9249
9250 Note this is called before freeing anything in the regexp
9251 structure.
9252 */
9253
9254void
288b8c02 9255Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9256{
9257 dVAR;
288b8c02 9258 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9259 RXi_GET_DECL(r,ri);
9260 GET_RE_DEBUG_FLAGS_DECL;
9261
9262 DEBUG_COMPILE_r({
9263 if (!PL_colorset)
9264 reginitcolors();
9265 {
9266 SV *dsv= sv_newmortal();
3c8556c3 9267 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9268 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9269 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9270 PL_colors[4],PL_colors[5],s);
9271 }
9272 });
7122b237
YO
9273#ifdef RE_TRACK_PATTERN_OFFSETS
9274 if (ri->u.offsets)
9275 Safefree(ri->u.offsets); /* 20010421 MJD */
9276#endif
f8fc2ecf
YO
9277 if (ri->data) {
9278 int n = ri->data->count;
f3548bdc
DM
9279 PAD* new_comppad = NULL;
9280 PAD* old_comppad;
4026c95a 9281 PADOFFSET refcnt;
dfad63ad 9282
c277df42 9283 while (--n >= 0) {
261faec3 9284 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9285 switch (ri->data->what[n]) {
c277df42 9286 case 's':
81714fb9 9287 case 'S':
55eed653 9288 case 'u':
f8fc2ecf 9289 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 9290 break;
653099ff 9291 case 'f':
f8fc2ecf 9292 Safefree(ri->data->data[n]);
653099ff 9293 break;
dfad63ad 9294 case 'p':
f8fc2ecf 9295 new_comppad = (AV*)ri->data->data[n];
dfad63ad 9296 break;
c277df42 9297 case 'o':
dfad63ad 9298 if (new_comppad == NULL)
cea2e8a9 9299 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9300 PAD_SAVE_LOCAL(old_comppad,
9301 /* Watch out for global destruction's random ordering. */
c445ea15 9302 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9303 );
b34c0dd4 9304 OP_REFCNT_LOCK;
f8fc2ecf 9305 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9306 OP_REFCNT_UNLOCK;
9307 if (!refcnt)
f8fc2ecf 9308 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9309
f3548bdc 9310 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
9311 SvREFCNT_dec((SV*)new_comppad);
9312 new_comppad = NULL;
c277df42
IZ
9313 break;
9314 case 'n':
9e55ce06 9315 break;
07be1b83 9316 case 'T':
be8e71aa
YO
9317 { /* Aho Corasick add-on structure for a trie node.
9318 Used in stclass optimization only */
07be1b83 9319 U32 refcount;
f8fc2ecf 9320 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9321 OP_REFCNT_LOCK;
9322 refcount = --aho->refcount;
9323 OP_REFCNT_UNLOCK;
9324 if ( !refcount ) {
446bd890
NC
9325 PerlMemShared_free(aho->states);
9326 PerlMemShared_free(aho->fail);
446bd890
NC
9327 /* do this last!!!! */
9328 PerlMemShared_free(ri->data->data[n]);
9329 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9330 }
9331 }
9332 break;
a3621e74 9333 case 't':
07be1b83 9334 {
be8e71aa 9335 /* trie structure. */
07be1b83 9336 U32 refcount;
f8fc2ecf 9337 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9338 OP_REFCNT_LOCK;
9339 refcount = --trie->refcount;
9340 OP_REFCNT_UNLOCK;
9341 if ( !refcount ) {
446bd890 9342 PerlMemShared_free(trie->charmap);
446bd890
NC
9343 PerlMemShared_free(trie->states);
9344 PerlMemShared_free(trie->trans);
07be1b83 9345 if (trie->bitmap)
446bd890 9346 PerlMemShared_free(trie->bitmap);
07be1b83 9347 if (trie->wordlen)
446bd890 9348 PerlMemShared_free(trie->wordlen);
786e8c11 9349 if (trie->jump)
446bd890 9350 PerlMemShared_free(trie->jump);
786e8c11 9351 if (trie->nextword)
446bd890 9352 PerlMemShared_free(trie->nextword);
446bd890
NC
9353 /* do this last!!!! */
9354 PerlMemShared_free(ri->data->data[n]);
a3621e74 9355 }
07be1b83
YO
9356 }
9357 break;
c277df42 9358 default:
f8fc2ecf 9359 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9360 }
9361 }
f8fc2ecf
YO
9362 Safefree(ri->data->what);
9363 Safefree(ri->data);
a0d0e21e 9364 }
28d8d7f4 9365
f8fc2ecf 9366 Safefree(ri);
a687059c 9367}
c277df42 9368
84da74a7
YO
9369#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9370#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 9371#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
9372#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9373
9374/*
32cd70f6 9375 re_dup - duplicate a regexp.
84da74a7 9376
32cd70f6
NC
9377 This routine is expected to clone a given regexp structure. It is not
9378 compiler under USE_ITHREADS.
9379
f8149455
YO
9380 After all of the core data stored in struct regexp is duplicated
9381 the regexp_engine.dupe method is used to copy any private data
9382 stored in the *pprivate pointer. This allows extensions to handle
9383 any duplication it needs to do.
9384
9385 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9386*/
a3c0e9ca 9387#if defined(USE_ITHREADS)
f8149455 9388#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9389void
9390Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9391{
84da74a7 9392 dVAR;
a86a1ca7 9393 I32 npar;
288b8c02
NC
9394 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9395 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9396
84da74a7 9397 npar = r->nparens+1;
f0ab9afb
NC
9398 Newx(ret->offs, npar, regexp_paren_pair);
9399 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9400 if(ret->swap) {
28d8d7f4 9401 /* no need to copy these */
f0ab9afb 9402 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9403 }
84da74a7 9404
6057429f 9405 if (ret->substrs) {
32cd70f6
NC
9406 /* Do it this way to avoid reading from *r after the StructCopy().
9407 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9408 cache, it doesn't matter. */
9409 const bool anchored = r->check_substr == r->anchored_substr;
785a26d5 9410 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9411 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9412
32cd70f6
NC
9413 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9414 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9415 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9416 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9417
32cd70f6
NC
9418 /* check_substr and check_utf8, if non-NULL, point to either their
9419 anchored or float namesakes, and don't hold a second reference. */
9420
9421 if (ret->check_substr) {
9422 if (anchored) {
9423 assert(r->check_utf8 == r->anchored_utf8);
9424 ret->check_substr = ret->anchored_substr;
9425 ret->check_utf8 = ret->anchored_utf8;
9426 } else {
9427 assert(r->check_substr == r->float_substr);
9428 assert(r->check_utf8 == r->float_utf8);
9429 ret->check_substr = ret->float_substr;
9430 ret->check_utf8 = ret->float_utf8;
9431 }
9432 }
6057429f 9433 }
f8149455 9434
5daac39c 9435 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9436
6057429f 9437 if (ret->pprivate)
288b8c02 9438 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9439
288b8c02 9440 if (RX_MATCH_COPIED(dstr))
6057429f 9441 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9442 else
9443 ret->subbeg = NULL;
9444#ifdef PERL_OLD_COPY_ON_WRITE
9445 ret->saved_copy = NULL;
9446#endif
6057429f
NC
9447
9448 ret->mother_re = NULL;
9449 ret->gofs = 0;
9450 ret->seen_evals = 0;
f8149455
YO
9451}
9452#endif /* PERL_IN_XSUB_RE */
9453
9454/*
9455 regdupe_internal()
9456
9457 This is the internal complement to regdupe() which is used to copy
9458 the structure pointed to by the *pprivate pointer in the regexp.
9459 This is the core version of the extension overridable cloning hook.
9460 The regexp structure being duplicated will be copied by perl prior
9461 to this and will be provided as the regexp *r argument, however
9462 with the /old/ structures pprivate pointer value. Thus this routine
9463 may override any copying normally done by perl.
9464
9465 It returns a pointer to the new regexp_internal structure.
9466*/
9467
9468void *
288b8c02 9469Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
9470{
9471 dVAR;
288b8c02 9472 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9473 regexp_internal *reti;
9474 int len, npar;
9475 RXi_GET_DECL(r,ri);
9476
9477 npar = r->nparens+1;
7122b237 9478 len = ProgLen(ri);
f8149455
YO
9479
9480 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
9481 Copy(ri->program, reti->program, len+1, regnode);
9482
f8149455 9483
f8fc2ecf 9484 reti->regstclass = NULL;
bcdf7404 9485
f8fc2ecf 9486 if (ri->data) {
84da74a7 9487 struct reg_data *d;
f8fc2ecf 9488 const int count = ri->data->count;
84da74a7
YO
9489 int i;
9490
9491 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9492 char, struct reg_data);
9493 Newx(d->what, count, U8);
9494
9495 d->count = count;
9496 for (i = 0; i < count; i++) {
f8fc2ecf 9497 d->what[i] = ri->data->what[i];
84da74a7 9498 switch (d->what[i]) {
55eed653 9499 /* legal options are one of: sSfpontTu
84da74a7
YO
9500 see also regcomp.h and pregfree() */
9501 case 's':
81714fb9 9502 case 'S':
0536c0a7 9503 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 9504 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 9505 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 9506 break;
84da74a7
YO
9507 case 'f':
9508 /* This is cheating. */
9509 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 9510 StructCopy(ri->data->data[i], d->data[i],
84da74a7 9511 struct regnode_charclass_class);
f8fc2ecf 9512 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
9513 break;
9514 case 'o':
bbe252da
YO
9515 /* Compiled op trees are readonly and in shared memory,
9516 and can thus be shared without duplication. */
84da74a7 9517 OP_REFCNT_LOCK;
f8fc2ecf 9518 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
9519 OP_REFCNT_UNLOCK;
9520 break;
23eab42c
NC
9521 case 'T':
9522 /* Trie stclasses are readonly and can thus be shared
9523 * without duplication. We free the stclass in pregfree
9524 * when the corresponding reg_ac_data struct is freed.
9525 */
9526 reti->regstclass= ri->regstclass;
9527 /* Fall through */
84da74a7 9528 case 't':
84da74a7 9529 OP_REFCNT_LOCK;
0536c0a7 9530 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 9531 OP_REFCNT_UNLOCK;
0536c0a7
NC
9532 /* Fall through */
9533 case 'n':
9534 d->data[i] = ri->data->data[i];
84da74a7 9535 break;
84da74a7 9536 default:
f8fc2ecf 9537 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
9538 }
9539 }
9540
f8fc2ecf 9541 reti->data = d;
84da74a7
YO
9542 }
9543 else
f8fc2ecf 9544 reti->data = NULL;
84da74a7 9545
cde0cee5
YO
9546 reti->name_list_idx = ri->name_list_idx;
9547
7122b237
YO
9548#ifdef RE_TRACK_PATTERN_OFFSETS
9549 if (ri->u.offsets) {
9550 Newx(reti->u.offsets, 2*len+1, U32);
9551 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9552 }
9553#else
9554 SetProgLen(reti,len);
9555#endif
9556
f8149455 9557 return (void*)reti;
84da74a7 9558}
f8149455
YO
9559
9560#endif /* USE_ITHREADS */
84da74a7 9561
de8c5301
YO
9562/*
9563 reg_stringify()
9564
9565 converts a regexp embedded in a MAGIC struct to its stringified form,
9566 caching the converted form in the struct and returns the cached
9567 string.
9568
9569 If lp is nonnull then it is used to return the length of the
9570 resulting string
9571
9572 If flags is nonnull and the returned string contains UTF8 then
f8149455 9573 (*flags & 1) will be true.
de8c5301
YO
9574
9575 If haseval is nonnull then it is used to return whether the pattern
9576 contains evals.
9577
9578 Normally called via macro:
9579
f8149455 9580 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
9581
9582 And internally with
9583
f8149455 9584 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
9585
9586 See sv_2pv_flags() in sv.c for an example of internal usage.
9587
9588 */
f8149455 9589#ifndef PERL_IN_XSUB_RE
bcdf7404 9590
de8c5301
YO
9591char *
9592Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9593 dVAR;
84679df5 9594 const REGEXP * const re = (REGEXP *)mg->mg_obj;
de8c5301 9595 if (haseval)
07bc277f 9596 *haseval = RX_SEEN_EVALS(re);
de8c5301 9597 if (flags)
3c8556c3 9598 *flags = RX_UTF8(re) ? 1 : 0;
de8c5301 9599 if (lp)
866c78d1
NC
9600 *lp = RX_WRAPLEN(re);
9601 return RX_WRAPPED(re);
de8c5301
YO
9602}
9603
c277df42
IZ
9604/*
9605 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9606 */
9607regnode *
864dbfa3 9608Perl_regnext(pTHX_ register regnode *p)
c277df42 9609{
97aff369 9610 dVAR;
c277df42
IZ
9611 register I32 offset;
9612
f8fc2ecf 9613 if (!p)
c277df42
IZ
9614 return(NULL);
9615
9616 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9617 if (offset == 0)
9618 return(NULL);
9619
c277df42 9620 return(p+offset);
c277df42 9621}
76234dfb 9622#endif
c277df42 9623
01f988be 9624STATIC void
cea2e8a9 9625S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9626{
9627 va_list args;
9628 STRLEN l1 = strlen(pat1);
9629 STRLEN l2 = strlen(pat2);
9630 char buf[512];
06bf62c7 9631 SV *msv;
73d840c0 9632 const char *message;
c277df42
IZ
9633
9634 if (l1 > 510)
9635 l1 = 510;
9636 if (l1 + l2 > 510)
9637 l2 = 510 - l1;
9638 Copy(pat1, buf, l1 , char);
9639 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9640 buf[l1 + l2] = '\n';
9641 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9642#ifdef I_STDARG
9643 /* ANSI variant takes additional second argument */
c277df42 9644 va_start(args, pat2);
8736538c
AS
9645#else
9646 va_start(args);
9647#endif
5a844595 9648 msv = vmess(buf, &args);
c277df42 9649 va_end(args);
cfd0369c 9650 message = SvPV_const(msv,l1);
c277df42
IZ
9651 if (l1 > 512)
9652 l1 = 512;
9653 Copy(message, buf, l1 , char);
197cf9b9 9654 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9655 Perl_croak(aTHX_ "%s", buf);
c277df42 9656}
a0ed51b3
LW
9657
9658/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9659
76234dfb 9660#ifndef PERL_IN_XSUB_RE
a0ed51b3 9661void
864dbfa3 9662Perl_save_re_context(pTHX)
b81d288d 9663{
97aff369 9664 dVAR;
1ade1aa1
NC
9665
9666 struct re_save_state *state;
9667
9668 SAVEVPTR(PL_curcop);
9669 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9670
9671 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9672 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9673 SSPUSHINT(SAVEt_RE_STATE);
9674
46ab3289 9675 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9676
a0ed51b3 9677 PL_reg_start_tmp = 0;
a0ed51b3 9678 PL_reg_start_tmpl = 0;
c445ea15 9679 PL_reg_oldsaved = NULL;
a5db57d6 9680 PL_reg_oldsavedlen = 0;
a5db57d6 9681 PL_reg_maxiter = 0;
a5db57d6 9682 PL_reg_leftiter = 0;
c445ea15 9683 PL_reg_poscache = NULL;
a5db57d6 9684 PL_reg_poscache_size = 0;
1ade1aa1
NC
9685#ifdef PERL_OLD_COPY_ON_WRITE
9686 PL_nrs = NULL;
9687#endif
ada6e8a9 9688
c445ea15
AL
9689 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9690 if (PL_curpm) {
9691 const REGEXP * const rx = PM_GETRE(PL_curpm);
9692 if (rx) {
1df70142 9693 U32 i;
07bc277f 9694 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 9695 char digits[TYPE_CHARS(long)];
d9fad198 9696 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9697 GV *const *const gvp
9698 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9699
b37c2d43
AL
9700 if (gvp) {
9701 GV * const gv = *gvp;
9702 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9703 save_scalar(gv);
49f27e4b 9704 }
ada6e8a9
AMS
9705 }
9706 }
9707 }
a0ed51b3 9708}
76234dfb 9709#endif
51371543 9710
51371543 9711static void
acfe0abc 9712clear_re(pTHX_ void *r)
51371543 9713{
97aff369 9714 dVAR;
84679df5 9715 ReREFCNT_dec((REGEXP *)r);
51371543 9716}
ffbc6a93 9717
a28509cc
AL
9718#ifdef DEBUGGING
9719
9720STATIC void
9721S_put_byte(pTHX_ SV *sv, int c)
9722{
7fddd944
NC
9723 /* Our definition of isPRINT() ignores locales, so only bytes that are
9724 not part of UTF-8 are considered printable. I assume that the same
9725 holds for UTF-EBCDIC.
9726 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9727 which Wikipedia says:
9728
9729 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9730 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9731 identical, to the ASCII delete (DEL) or rubout control character.
9732 ) So the old condition can be simplified to !isPRINT(c) */
9733 if (!isPRINT(c))
a28509cc 9734 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 9735 else {
88c9ea1e 9736 const char string = c;
5e7aa789
NC
9737 if (c == '-' || c == ']' || c == '\\' || c == '^')
9738 sv_catpvs(sv, "\\");
9739 sv_catpvn(sv, &string, 1);
9740 }
a28509cc
AL
9741}
9742
786e8c11 9743
3dab1dad
YO
9744#define CLEAR_OPTSTART \
9745 if (optstart) STMT_START { \
70685ca0 9746 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9747 optstart=NULL; \
9748 } STMT_END
9749
786e8c11 9750#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9751
b5a2f8d8
NC
9752STATIC const regnode *
9753S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9754 const regnode *last, const regnode *plast,
9755 SV* sv, I32 indent, U32 depth)
a28509cc 9756{
97aff369 9757 dVAR;
786e8c11 9758 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9759 register const regnode *next;
3dab1dad 9760 const regnode *optstart= NULL;
1f1031fe 9761
f8fc2ecf 9762 RXi_GET_DECL(r,ri);
3dab1dad 9763 GET_RE_DEBUG_FLAGS_DECL;
1f1031fe 9764
786e8c11
YO
9765#ifdef DEBUG_DUMPUNTIL
9766 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9767 last ? last-start : 0,plast ? plast-start : 0);
9768#endif
9769
9770 if (plast && plast < last)
9771 last= plast;
9772
9773 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9774 /* While that wasn't END last time... */
a28509cc
AL
9775 NODE_ALIGN(node);
9776 op = OP(node);
de734bd5 9777 if (op == CLOSE || op == WHILEM)
786e8c11 9778 indent--;
b5a2f8d8 9779 next = regnext((regnode *)node);
1f1031fe 9780
a28509cc 9781 /* Where, what. */
8e11feef 9782 if (OP(node) == OPTIMIZED) {
e68ec53f 9783 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9784 optstart = node;
3dab1dad 9785 else
8e11feef 9786 goto after_print;
3dab1dad
YO
9787 } else
9788 CLEAR_OPTSTART;
1f1031fe 9789
32fc9b6a 9790 regprop(r, sv, node);
a28509cc 9791 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9792 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
9793
9794 if (OP(node) != OPTIMIZED) {
9795 if (next == NULL) /* Next ptr. */
9796 PerlIO_printf(Perl_debug_log, " (0)");
9797 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9798 PerlIO_printf(Perl_debug_log, " (FAIL)");
9799 else
9800 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9801 (void)PerlIO_putc(Perl_debug_log, '\n');
9802 }
9803
a28509cc
AL
9804 after_print:
9805 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9806 assert(next);
9807 {
9808 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9809 ? regnext((regnode *)next)
9810 : next);
be8e71aa
YO
9811 if (last && nnode > last)
9812 nnode = last;
786e8c11 9813 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9814 }
a28509cc
AL
9815 }
9816 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9817 assert(next);
786e8c11 9818 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9819 }
9820 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9821 const regnode *this_trie = node;
1de06328 9822 const char op = OP(node);
647f639f 9823 const U32 n = ARG(node);
1de06328 9824 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9825 (reg_ac_data *)ri->data->data[n] :
1de06328 9826 NULL;
3251b653
NC
9827 const reg_trie_data * const trie =
9828 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9829#ifdef DEBUGGING
9830 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9831#endif
786e8c11 9832 const regnode *nextbranch= NULL;
a28509cc 9833 I32 word_idx;
1de06328 9834 sv_setpvn(sv, "", 0);
786e8c11 9835 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9836 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9837
9838 PerlIO_printf(Perl_debug_log, "%*s%s ",
9839 (int)(2*(indent+3)), "",
9840 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9841 PL_colors[0], PL_colors[1],
9842 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 9843 PERL_PV_PRETTY_ELLIPSES |
7f69552c 9844 PERL_PV_PRETTY_LTGT
786e8c11
YO
9845 )
9846 : "???"
9847 );
9848 if (trie->jump) {
40d049e4 9849 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9850 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9851 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9852 if (dist) {
9853 if (!nextbranch)
24b23f37 9854 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9855 DUMPUNTIL(this_trie + dist, nextbranch);
9856 }
786e8c11
YO
9857 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9858 nextbranch= regnext((regnode *)nextbranch);
9859 } else {
9860 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9861 }
786e8c11
YO
9862 }
9863 if (last && next > last)
9864 node= last;
9865 else
9866 node= next;
a28509cc 9867 }
786e8c11
YO
9868 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9869 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9870 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9871 }
9872 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9873 assert(next);
786e8c11 9874 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9875 }
9876 else if ( op == PLUS || op == STAR) {
786e8c11 9877 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9878 }
9879 else if (op == ANYOF) {
9880 /* arglen 1 + class block */
9881 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9882 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9883 node = NEXTOPER(node);
9884 }
9885 else if (PL_regkind[(U8)op] == EXACT) {
9886 /* Literal string, where present. */
9887 node += NODE_SZ_STR(node) - 1;
9888 node = NEXTOPER(node);
9889 }
9890 else {
9891 node = NEXTOPER(node);
9892 node += regarglen[(U8)op];
9893 }
9894 if (op == CURLYX || op == OPEN)
786e8c11 9895 indent++;
a28509cc 9896 }
3dab1dad 9897 CLEAR_OPTSTART;
786e8c11 9898#ifdef DEBUG_DUMPUNTIL
70685ca0 9899 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9900#endif
1de06328 9901 return node;
a28509cc
AL
9902}
9903
9904#endif /* DEBUGGING */
9905
241d1a3b
NC
9906/*
9907 * Local variables:
9908 * c-indentation-style: bsd
9909 * c-basic-offset: 4
9910 * indent-tabs-mode: t
9911 * End:
9912 *
37442d52
RGS
9913 * ex: set ts=8 sts=4 sw=4 noet:
9914 */