This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta: Some Incompatible Changes clean-up
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
4624b182 145 I32 contains_locale;
bb3f3ed2 146 I32 override_recoding;
830247a4
IZ
147#if ADD_TO_REGEXEC
148 char *starttry; /* -Dr: where regtry was called. */
149#define RExC_starttry (pRExC_state->starttry)
150#endif
3dab1dad 151#ifdef DEBUGGING
be8e71aa 152 const char *lastparse;
3dab1dad 153 I32 lastnum;
1f1031fe 154 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
155#define RExC_lastparse (pRExC_state->lastparse)
156#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 157#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 158#endif
830247a4
IZ
159} RExC_state_t;
160
e2509266 161#define RExC_flags (pRExC_state->flags)
830247a4 162#define RExC_precomp (pRExC_state->precomp)
288b8c02 163#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 164#define RExC_rx (pRExC_state->rx)
f8fc2ecf 165#define RExC_rxi (pRExC_state->rxi)
fac92740 166#define RExC_start (pRExC_state->start)
830247a4
IZ
167#define RExC_end (pRExC_state->end)
168#define RExC_parse (pRExC_state->parse)
169#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
170#ifdef RE_TRACK_PATTERN_OFFSETS
171#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
172#endif
830247a4 173#define RExC_emit (pRExC_state->emit)
fac92740 174#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 175#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
176#define RExC_naughty (pRExC_state->naughty)
177#define RExC_sawback (pRExC_state->sawback)
178#define RExC_seen (pRExC_state->seen)
179#define RExC_size (pRExC_state->size)
180#define RExC_npar (pRExC_state->npar)
e2e6a0f1 181#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
182#define RExC_extralen (pRExC_state->extralen)
183#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
184#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 185#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 186#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 187#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
188#define RExC_open_parens (pRExC_state->open_parens)
189#define RExC_close_parens (pRExC_state->close_parens)
190#define RExC_opend (pRExC_state->opend)
81714fb9 191#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
192#define RExC_recurse (pRExC_state->recurse)
193#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 194#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 195#define RExC_contains_locale (pRExC_state->contains_locale)
bb3f3ed2 196#define RExC_override_recoding (pRExC_state->override_recoding)
830247a4 197
cde0cee5 198
a687059c
LW
199#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
200#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
201 ((*s) == '{' && regcurly(s)))
a687059c 202
35c8bce7
LW
203#ifdef SPSTART
204#undef SPSTART /* dratted cpp namespace... */
205#endif
a687059c
LW
206/*
207 * Flags to be passed up and down.
208 */
a687059c 209#define WORST 0 /* Worst case. */
a3b492c3 210#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
211
212/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 213 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 214#define SIMPLE 0x02
a3b492c3
YO
215#define SPSTART 0x04 /* Starts with * or +. */
216#define TRYAGAIN 0x08 /* Weeded out a declaration. */
217#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 218
3dab1dad
YO
219#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
220
07be1b83
YO
221/* whether trie related optimizations are enabled */
222#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
223#define TRIE_STUDY_OPT
786e8c11 224#define FULL_TRIE_STUDY
07be1b83
YO
225#define TRIE_STCLASS
226#endif
1de06328
YO
227
228
40d049e4
YO
229
230#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
231#define PBITVAL(paren) (1 << ((paren) & 7))
232#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
233#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
234#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
235
bbd61b5f
KW
236/* If not already in utf8, do a longjmp back to the beginning */
237#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
238#define REQUIRE_UTF8 STMT_START { \
239 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
240 } STMT_END
40d049e4 241
1de06328
YO
242/* About scan_data_t.
243
244 During optimisation we recurse through the regexp program performing
245 various inplace (keyhole style) optimisations. In addition study_chunk
246 and scan_commit populate this data structure with information about
247 what strings MUST appear in the pattern. We look for the longest
3b753521 248 string that must appear at a fixed location, and we look for the
1de06328
YO
249 longest string that may appear at a floating location. So for instance
250 in the pattern:
251
252 /FOO[xX]A.*B[xX]BAR/
253
254 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
255 strings (because they follow a .* construct). study_chunk will identify
256 both FOO and BAR as being the longest fixed and floating strings respectively.
257
258 The strings can be composites, for instance
259
260 /(f)(o)(o)/
261
262 will result in a composite fixed substring 'foo'.
263
264 For each string some basic information is maintained:
265
266 - offset or min_offset
267 This is the position the string must appear at, or not before.
268 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
269 characters must match before the string we are searching for.
270 Likewise when combined with minlenp and the length of the string it
1de06328
YO
271 tells us how many characters must appear after the string we have
272 found.
273
274 - max_offset
275 Only used for floating strings. This is the rightmost point that
3b753521 276 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
277 string can occur infinitely far to the right.
278
279 - minlenp
280 A pointer to the minimum length of the pattern that the string
281 was found inside. This is important as in the case of positive
282 lookahead or positive lookbehind we can have multiple patterns
283 involved. Consider
284
285 /(?=FOO).*F/
286
287 The minimum length of the pattern overall is 3, the minimum length
288 of the lookahead part is 3, but the minimum length of the part that
289 will actually match is 1. So 'FOO's minimum length is 3, but the
290 minimum length for the F is 1. This is important as the minimum length
291 is used to determine offsets in front of and behind the string being
292 looked for. Since strings can be composites this is the length of the
486ec47a 293 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
294 the length is calculated by study_chunk, so that the minimum lengths
295 are not known until the full pattern has been compiled, thus the
296 pointer to the value.
297
298 - lookbehind
299
300 In the case of lookbehind the string being searched for can be
301 offset past the start point of the final matching string.
302 If this value was just blithely removed from the min_offset it would
303 invalidate some of the calculations for how many chars must match
304 before or after (as they are derived from min_offset and minlen and
305 the length of the string being searched for).
306 When the final pattern is compiled and the data is moved from the
307 scan_data_t structure into the regexp structure the information
308 about lookbehind is factored in, with the information that would
309 have been lost precalculated in the end_shift field for the
310 associated string.
311
312 The fields pos_min and pos_delta are used to store the minimum offset
313 and the delta to the maximum offset at the current point in the pattern.
314
315*/
2c2d71f5
JH
316
317typedef struct scan_data_t {
1de06328
YO
318 /*I32 len_min; unused */
319 /*I32 len_delta; unused */
2c2d71f5
JH
320 I32 pos_min;
321 I32 pos_delta;
322 SV *last_found;
1de06328 323 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
324 I32 last_start_min;
325 I32 last_start_max;
1de06328
YO
326 SV **longest; /* Either &l_fixed, or &l_float. */
327 SV *longest_fixed; /* longest fixed string found in pattern */
328 I32 offset_fixed; /* offset where it starts */
486ec47a 329 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
330 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
331 SV *longest_float; /* longest floating string found in pattern */
332 I32 offset_float_min; /* earliest point in string it can appear */
333 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 334 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 335 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
336 I32 flags;
337 I32 whilem_c;
cb434fcc 338 I32 *last_closep;
653099ff 339 struct regnode_charclass_class *start_class;
2c2d71f5
JH
340} scan_data_t;
341
a687059c 342/*
e50aee73 343 * Forward declarations for pregcomp()'s friends.
a687059c 344 */
a0d0e21e 345
27da23d5 346static const scan_data_t zero_scan_data =
1de06328 347 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
348
349#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
350#define SF_BEFORE_SEOL 0x0001
351#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
352#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
353#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
354
09b7f37c
CB
355#ifdef NO_UNARY_PLUS
356# define SF_FIX_SHIFT_EOL (0+2)
357# define SF_FL_SHIFT_EOL (0+4)
358#else
359# define SF_FIX_SHIFT_EOL (+2)
360# define SF_FL_SHIFT_EOL (+4)
361#endif
c277df42
IZ
362
363#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
364#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
365
366#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
367#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
368#define SF_IS_INF 0x0040
369#define SF_HAS_PAR 0x0080
370#define SF_IN_PAR 0x0100
371#define SF_HAS_EVAL 0x0200
372#define SCF_DO_SUBSTR 0x0400
653099ff
GS
373#define SCF_DO_STCLASS_AND 0x0800
374#define SCF_DO_STCLASS_OR 0x1000
375#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 376#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 377
786e8c11 378#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 379#define SCF_SEEN_ACCEPT 0x8000
07be1b83 380
43fead97 381#define UTF cBOOL(RExC_utf8)
a62b1201
KW
382#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
383#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
384#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
385#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
386#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
387#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
388#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 389
43fead97 390#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 391
ffc61ed2 392#define OOB_UNICODE 12345678
93733859 393#define OOB_NAMEDCLASS -1
b8c5462f 394
a0ed51b3
LW
395#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
396#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
397
8615cb43 398
b45f050a
JF
399/* length of regex to show in messages that don't mark a position within */
400#define RegexLengthToShowInErrorMessages 127
401
402/*
403 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
404 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
405 * op/pragma/warn/regcomp.
406 */
7253e4e3
RK
407#define MARKER1 "<-- HERE" /* marker as it appears in the description */
408#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 409
7253e4e3 410#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
411
412/*
413 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
414 * arg. Show regex, up to a maximum length. If it's too long, chop and add
415 * "...".
416 */
58e23c8d 417#define _FAIL(code) STMT_START { \
bfed75c6 418 const char *ellipses = ""; \
ccb2c380
MP
419 IV len = RExC_end - RExC_precomp; \
420 \
421 if (!SIZE_ONLY) \
288b8c02 422 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
423 if (len > RegexLengthToShowInErrorMessages) { \
424 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
425 len = RegexLengthToShowInErrorMessages - 10; \
426 ellipses = "..."; \
427 } \
58e23c8d 428 code; \
ccb2c380 429} STMT_END
8615cb43 430
58e23c8d
YO
431#define FAIL(msg) _FAIL( \
432 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
433 msg, (int)len, RExC_precomp, ellipses))
434
435#define FAIL2(msg,arg) _FAIL( \
436 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
437 arg, (int)len, RExC_precomp, ellipses))
438
b45f050a 439/*
b45f050a
JF
440 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
441 */
ccb2c380 442#define Simple_vFAIL(m) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
445 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a
JF
447
448/*
449 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
450 */
ccb2c380
MP
451#define vFAIL(m) STMT_START { \
452 if (!SIZE_ONLY) \
288b8c02 453 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
454 Simple_vFAIL(m); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Like Simple_vFAIL(), but accepts two arguments.
459 */
ccb2c380 460#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 461 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
462 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
463 (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
b45f050a
JF
465
466/*
467 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
468 */
ccb2c380
MP
469#define vFAIL2(m,a1) STMT_START { \
470 if (!SIZE_ONLY) \
288b8c02 471 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
472 Simple_vFAIL2(m, a1); \
473} STMT_END
b45f050a
JF
474
475
476/*
477 * Like Simple_vFAIL(), but accepts three arguments.
478 */
ccb2c380 479#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 480 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
481 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
482 (int)offset, RExC_precomp, RExC_precomp + offset); \
483} STMT_END
b45f050a
JF
484
485/*
486 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
487 */
ccb2c380
MP
488#define vFAIL3(m,a1,a2) STMT_START { \
489 if (!SIZE_ONLY) \
288b8c02 490 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
491 Simple_vFAIL3(m, a1, a2); \
492} STMT_END
b45f050a
JF
493
494/*
495 * Like Simple_vFAIL(), but accepts four arguments.
496 */
ccb2c380 497#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 498 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
499 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
500 (int)offset, RExC_precomp, RExC_precomp + offset); \
501} STMT_END
b45f050a 502
668c081a 503#define ckWARNreg(loc,m) STMT_START { \
a28509cc 504 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
507} STMT_END
508
668c081a 509#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 510 const IV offset = loc - RExC_precomp; \
d1d15184 511 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
512 m REPORT_LOCATION, \
513 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
514} STMT_END
515
2335b3d3
KW
516#define ckWARN2regdep(loc,m, a1) STMT_START { \
517 const IV offset = loc - RExC_precomp; \
518 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
519 m REPORT_LOCATION, \
520 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
521} STMT_END
522
668c081a 523#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 524 const IV offset = loc - RExC_precomp; \
668c081a 525 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
526 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
527} STMT_END
528
529#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 530 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
531 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
532 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
533} STMT_END
534
668c081a
NC
535#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
536 const IV offset = loc - RExC_precomp; \
537 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
538 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
539} STMT_END
540
ccb2c380 541#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 542 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
543 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
544 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
545} STMT_END
546
668c081a
NC
547#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
548 const IV offset = loc - RExC_precomp; \
549 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
550 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
551} STMT_END
552
ccb2c380 553#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 554 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
555 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
557} STMT_END
9d1d55b5 558
8615cb43 559
cd439c50 560/* Allow for side effects in s */
ccb2c380
MP
561#define REGC(c,s) STMT_START { \
562 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
563} STMT_END
cd439c50 564
fac92740
MJD
565/* Macros for recording node offsets. 20001227 mjd@plover.com
566 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
567 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
568 * Element 0 holds the number n.
07be1b83 569 * Position is 1 indexed.
fac92740 570 */
7122b237
YO
571#ifndef RE_TRACK_PATTERN_OFFSETS
572#define Set_Node_Offset_To_R(node,byte)
573#define Set_Node_Offset(node,byte)
574#define Set_Cur_Node_Offset
575#define Set_Node_Length_To_R(node,len)
576#define Set_Node_Length(node,len)
577#define Set_Node_Cur_Length(node)
578#define Node_Offset(n)
579#define Node_Length(n)
580#define Set_Node_Offset_Length(node,offset,len)
581#define ProgLen(ri) ri->u.proglen
582#define SetProgLen(ri,x) ri->u.proglen = x
583#else
584#define ProgLen(ri) ri->u.offsets[0]
585#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
586#define Set_Node_Offset_To_R(node,byte) STMT_START { \
587 if (! SIZE_ONLY) { \
588 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 589 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 590 if((node) < 0) { \
551405c4 591 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
592 } else { \
593 RExC_offsets[2*(node)-1] = (byte); \
594 } \
595 } \
596} STMT_END
597
598#define Set_Node_Offset(node,byte) \
599 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
600#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
601
602#define Set_Node_Length_To_R(node,len) STMT_START { \
603 if (! SIZE_ONLY) { \
604 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 605 __LINE__, (int)(node), (int)(len))); \
ccb2c380 606 if((node) < 0) { \
551405c4 607 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
608 } else { \
609 RExC_offsets[2*(node)] = (len); \
610 } \
611 } \
612} STMT_END
613
614#define Set_Node_Length(node,len) \
615 Set_Node_Length_To_R((node)-RExC_emit_start, len)
616#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
617#define Set_Node_Cur_Length(node) \
618 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
619
620/* Get offsets and lengths */
621#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
622#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
623
07be1b83
YO
624#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
625 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
626 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
627} STMT_END
7122b237 628#endif
07be1b83
YO
629
630#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
631#define EXPERIMENTAL_INPLACESCAN
f427392e 632#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 633
304ee84b
YO
634#define DEBUG_STUDYDATA(str,data,depth) \
635DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 636 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
637 "%*s" str "Pos:%"IVdf"/%"IVdf \
638 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
639 (int)(depth)*2, "", \
640 (IV)((data)->pos_min), \
641 (IV)((data)->pos_delta), \
304ee84b 642 (UV)((data)->flags), \
1de06328 643 (IV)((data)->whilem_c), \
304ee84b
YO
644 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
645 is_inf ? "INF " : "" \
1de06328
YO
646 ); \
647 if ((data)->last_found) \
648 PerlIO_printf(Perl_debug_log, \
649 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
650 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
651 SvPVX_const((data)->last_found), \
652 (IV)((data)->last_end), \
653 (IV)((data)->last_start_min), \
654 (IV)((data)->last_start_max), \
655 ((data)->longest && \
656 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
657 SvPVX_const((data)->longest_fixed), \
658 (IV)((data)->offset_fixed), \
659 ((data)->longest && \
660 (data)->longest==&((data)->longest_float)) ? "*" : "", \
661 SvPVX_const((data)->longest_float), \
662 (IV)((data)->offset_float_min), \
663 (IV)((data)->offset_float_max) \
664 ); \
665 PerlIO_printf(Perl_debug_log,"\n"); \
666});
667
acfe0abc 668static void clear_re(pTHX_ void *r);
4327152a 669
653099ff 670/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 671 Update the longest found anchored substring and the longest found
653099ff
GS
672 floating substrings if needed. */
673
4327152a 674STATIC void
304ee84b 675S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 676{
e1ec3a88
AL
677 const STRLEN l = CHR_SVLEN(data->last_found);
678 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 679 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 680
7918f24d
NC
681 PERL_ARGS_ASSERT_SCAN_COMMIT;
682
c277df42 683 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 684 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
685 if (*data->longest == data->longest_fixed) {
686 data->offset_fixed = l ? data->last_start_min : data->pos_min;
687 if (data->flags & SF_BEFORE_EOL)
b81d288d 688 data->flags
c277df42
IZ
689 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
690 else
691 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
692 data->minlen_fixed=minlenp;
693 data->lookbehind_fixed=0;
a0ed51b3 694 }
304ee84b 695 else { /* *data->longest == data->longest_float */
c277df42 696 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
697 data->offset_float_max = (l
698 ? data->last_start_max
c277df42 699 : data->pos_min + data->pos_delta);
304ee84b 700 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 701 data->offset_float_max = I32_MAX;
c277df42 702 if (data->flags & SF_BEFORE_EOL)
b81d288d 703 data->flags
c277df42
IZ
704 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
705 else
706 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
707 data->minlen_float=minlenp;
708 data->lookbehind_float=0;
c277df42
IZ
709 }
710 }
711 SvCUR_set(data->last_found, 0);
0eda9292 712 {
a28509cc 713 SV * const sv = data->last_found;
097eb12c
AL
714 if (SvUTF8(sv) && SvMAGICAL(sv)) {
715 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
716 if (mg)
717 mg->mg_len = 0;
718 }
0eda9292 719 }
c277df42
IZ
720 data->last_end = -1;
721 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 722 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
723}
724
653099ff
GS
725/* Can match anything (initialization) */
726STATIC void
3fffb88a 727S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 728{
7918f24d
NC
729 PERL_ARGS_ASSERT_CL_ANYTHING;
730
f8bef550 731 ANYOF_BITMAP_SETALL(cl);
dd58aee1 732 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
05dd4d39
KW
733 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
734 /* Even though no bitmap is in use here, we need to set
735 * the flag below so an AND with a node that does have one
736 * doesn't lose that one. The flag should get cleared if
737 * the other one doesn't; and the code in regexec.c is
738 * structured so this being set when not needed does no
739 * harm. It seemed a little cleaner to set it here than do
740 * a special case in cl_and() */
741 |ANYOF_NONBITMAP_NON_UTF8;
3fffb88a
KW
742
743 /* If any portion of the regex is to operate under locale rules,
744 * initialization includes it. The reason this isn't done for all regexes
745 * is that the optimizer was written under the assumption that locale was
746 * all-or-nothing. Given the complexity and lack of documentation in the
747 * optimizer, and that there are inadequate test cases for locale, so many
748 * parts of it may not work properly, it is safest to avoid locale unless
749 * necessary. */
750 if (RExC_contains_locale) {
9d7a1e63 751 ANYOF_CLASS_SETALL(cl); /* /l uses class */
3fffb88a
KW
752 cl->flags |= ANYOF_LOCALE;
753 }
9d7a1e63
KW
754 else {
755 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
756 }
653099ff
GS
757}
758
759/* Can match anything (initialization) */
760STATIC int
5f66b61c 761S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
762{
763 int value;
764
7918f24d
NC
765 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
766
aaa51d5e 767 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
768 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
769 return 1;
1aa99e6b
IH
770 if (!(cl->flags & ANYOF_UNICODE_ALL))
771 return 0;
10edeb5d 772 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 773 return 0;
653099ff
GS
774 return 1;
775}
776
777/* Can match anything (initialization) */
778STATIC void
e755fd73 779S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 780{
7918f24d
NC
781 PERL_ARGS_ASSERT_CL_INIT;
782
8ecf7187 783 Zero(cl, 1, struct regnode_charclass_class);
653099ff 784 cl->type = ANYOF;
3fffb88a 785 cl_anything(pRExC_state, cl);
1411dba4 786 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
653099ff
GS
787}
788
1051e1c4
KW
789/* These two functions currently do the exact same thing */
790#define cl_init_zero S_cl_init
653099ff 791
dd58aee1
KW
792/* 'AND' a given class with another one. Can create false positives. 'cl'
793 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
794 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 795STATIC void
5f66b61c 796S_cl_and(struct regnode_charclass_class *cl,
a28509cc 797 const struct regnode_charclass_class *and_with)
653099ff 798{
7918f24d 799 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
800
801 assert(and_with->type == ANYOF);
1e6ade67 802
c6b76537 803 /* I (khw) am not sure all these restrictions are necessary XXX */
1e6ade67
KW
804 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
805 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 806 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
807 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
808 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
809 int i;
810
811 if (and_with->flags & ANYOF_INVERT)
812 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
813 cl->bitmap[i] &= ~and_with->bitmap[i];
814 else
815 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
816 cl->bitmap[i] &= and_with->bitmap[i];
817 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 818
c6b76537 819 if (and_with->flags & ANYOF_INVERT) {
8951c461 820
c6b76537
KW
821 /* Here, the and'ed node is inverted. Get the AND of the flags that
822 * aren't affected by the inversion. Those that are affected are
823 * handled individually below */
824 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
825 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
826 cl->flags |= affected_flags;
827
828 /* We currently don't know how to deal with things that aren't in the
829 * bitmap, but we know that the intersection is no greater than what
830 * is already in cl, so let there be false positives that get sorted
831 * out after the synthetic start class succeeds, and the node is
832 * matched for real. */
833
834 /* The inversion of these two flags indicate that the resulting
835 * intersection doesn't have them */
836 if (and_with->flags & ANYOF_UNICODE_ALL) {
4713bfe1
KW
837 cl->flags &= ~ANYOF_UNICODE_ALL;
838 }
c6b76537
KW
839 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
840 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 841 }
1aa99e6b 842 }
c6b76537 843 else { /* and'd node is not inverted */
137165a6 844 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
845
846 /* Here 'and_with' doesn't match anything outside the bitmap
847 * (except possibly ANYOF_UNICODE_ALL), which means the
848 * intersection can't either, except for ANYOF_UNICODE_ALL, in
849 * which case we don't know what the intersection is, but it's no
850 * greater than what cl already has, so can just leave it alone,
851 * with possible false positives */
852 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
853 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
871d0d1a 854 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 855 }
137165a6 856 }
c6b76537
KW
857 else if (! ANYOF_NONBITMAP(cl)) {
858
859 /* Here, 'and_with' does match something outside the bitmap, and cl
860 * doesn't have a list of things to match outside the bitmap. If
861 * cl can match all code points above 255, the intersection will
862 * be those above-255 code points that 'and_with' matches. There
863 * may be false positives from code points in 'and_with' that are
864 * outside the bitmap but below 256, but those get sorted out
865 * after the synthetic start class succeeds). If cl can't match
866 * all Unicode code points, it means here that it can't match *
867 * anything outside the bitmap, so we leave the bitmap empty */
868 if (cl->flags & ANYOF_UNICODE_ALL) {
869 ARG_SET(cl, ARG(and_with));
870 }
871 }
872 else {
873 /* Here, both 'and_with' and cl match something outside the
874 * bitmap. Currently we do not do the intersection, so just match
875 * whatever cl had at the beginning. */
876 }
877
878
879 /* Take the intersection of the two sets of flags */
880 cl->flags &= and_with->flags;
137165a6 881 }
653099ff
GS
882}
883
dd58aee1
KW
884/* 'OR' a given class with another one. Can create false positives. 'cl'
885 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
886 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 887STATIC void
3fffb88a 888S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 889{
7918f24d
NC
890 PERL_ARGS_ASSERT_CL_OR;
891
653099ff 892 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
893
894 /* Here, the or'd node is to be inverted. This means we take the
895 * complement of everything not in the bitmap, but currently we don't
896 * know what that is, so give up and match anything */
897 if (ANYOF_NONBITMAP(or_with)) {
3fffb88a 898 cl_anything(pRExC_state, cl);
c6b76537 899 }
653099ff
GS
900 /* We do not use
901 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
902 * <= (B1 | !B2) | (CL1 | !CL2)
903 * which is wasteful if CL2 is small, but we ignore CL2:
904 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
905 * XXXX Can we handle case-fold? Unclear:
906 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
907 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
908 */
c6b76537 909 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
910 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
911 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
912 int i;
913
914 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
915 cl->bitmap[i] |= ~or_with->bitmap[i];
916 } /* XXXX: logic is complicated otherwise */
917 else {
3fffb88a 918 cl_anything(pRExC_state, cl);
653099ff 919 }
c6b76537
KW
920
921 /* And, we can just take the union of the flags that aren't affected
922 * by the inversion */
923 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
924
925 /* For the remaining flags:
926 ANYOF_UNICODE_ALL and inverted means to not match anything above
927 255, which means that the union with cl should just be
928 what cl has in it, so can ignore this flag
929 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
930 is 127-255 to match them, but then invert that, so the
931 union with cl should just be what cl has in it, so can
932 ignore this flag
933 */
934 } else { /* 'or_with' is not inverted */
653099ff
GS
935 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
936 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
937 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
938 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
939 int i;
940
941 /* OR char bitmap and class bitmap separately */
942 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
943 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 944 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
945 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
946 cl->classflags[i] |= or_with->classflags[i];
947 cl->flags |= ANYOF_CLASS;
948 }
949 }
950 else { /* XXXX: logic is complicated, leave it along for a moment. */
3fffb88a 951 cl_anything(pRExC_state, cl);
653099ff 952 }
9826f543 953
c6b76537
KW
954 if (ANYOF_NONBITMAP(or_with)) {
955
956 /* Use the added node's outside-the-bit-map match if there isn't a
957 * conflict. If there is a conflict (both nodes match something
958 * outside the bitmap, but what they match outside is not the same
959 * pointer, and hence not easily compared until XXX we extend
960 * inversion lists this far), give up and allow the start class to
d94b1d13
KW
961 * match everything outside the bitmap. If that stuff is all above
962 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
c6b76537
KW
963 if (! ANYOF_NONBITMAP(cl)) {
964 ARG_SET(cl, ARG(or_with));
965 }
966 else if (ARG(cl) != ARG(or_with)) {
d94b1d13
KW
967
968 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
969 cl_anything(pRExC_state, cl);
970 }
971 else {
972 cl->flags |= ANYOF_UNICODE_ALL;
973 }
c6b76537 974 }
0b9668ee
KW
975
976 /* Take the union */
977 cl->flags |= or_with->flags;
c6b76537 978 }
1aa99e6b 979 }
653099ff
GS
980}
981
a3621e74
YO
982#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
983#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
984#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
985#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
986
3dab1dad
YO
987
988#ifdef DEBUGGING
07be1b83 989/*
2b8b4781
NC
990 dump_trie(trie,widecharmap,revcharmap)
991 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
992 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
993
994 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
995 The _interim_ variants are used for debugging the interim
996 tables that are used to generate the final compressed
997 representation which is what dump_trie expects.
998
486ec47a 999 Part of the reason for their existence is to provide a form
3dab1dad 1000 of documentation as to how the different representations function.
07be1b83
YO
1001
1002*/
3dab1dad
YO
1003
1004/*
3dab1dad
YO
1005 Dumps the final compressed table form of the trie to Perl_debug_log.
1006 Used for debugging make_trie().
1007*/
b9a59e08 1008
3dab1dad 1009STATIC void
2b8b4781
NC
1010S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1011 AV *revcharmap, U32 depth)
3dab1dad
YO
1012{
1013 U32 state;
ab3bbdeb 1014 SV *sv=sv_newmortal();
55eed653 1015 int colwidth= widecharmap ? 6 : 4;
2e64971a 1016 U16 word;
3dab1dad
YO
1017 GET_RE_DEBUG_FLAGS_DECL;
1018
7918f24d 1019 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1020
3dab1dad
YO
1021 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1022 (int)depth * 2 + 2,"",
1023 "Match","Base","Ofs" );
1024
1025 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1026 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1027 if ( tmp ) {
ab3bbdeb
YO
1028 PerlIO_printf( Perl_debug_log, "%*s",
1029 colwidth,
ddc5bc0f 1030 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1031 PL_colors[0], PL_colors[1],
1032 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1033 PERL_PV_ESCAPE_FIRSTCHAR
1034 )
1035 );
3dab1dad
YO
1036 }
1037 }
1038 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1039 (int)depth * 2 + 2,"");
1040
1041 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1042 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1043 PerlIO_printf( Perl_debug_log, "\n");
1044
1e2e3d02 1045 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1046 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1047
1048 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1049
1050 if ( trie->states[ state ].wordnum ) {
1051 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1052 } else {
1053 PerlIO_printf( Perl_debug_log, "%6s", "" );
1054 }
1055
1056 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1057
1058 if ( base ) {
1059 U32 ofs = 0;
1060
1061 while( ( base + ofs < trie->uniquecharcount ) ||
1062 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1063 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1064 ofs++;
1065
1066 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1067
1068 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1069 if ( ( base + ofs >= trie->uniquecharcount ) &&
1070 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1071 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1072 {
ab3bbdeb
YO
1073 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1074 colwidth,
3dab1dad
YO
1075 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1076 } else {
ab3bbdeb 1077 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1078 }
1079 }
1080
1081 PerlIO_printf( Perl_debug_log, "]");
1082
1083 }
1084 PerlIO_printf( Perl_debug_log, "\n" );
1085 }
2e64971a
DM
1086 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1087 for (word=1; word <= trie->wordcount; word++) {
1088 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1089 (int)word, (int)(trie->wordinfo[word].prev),
1090 (int)(trie->wordinfo[word].len));
1091 }
1092 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1093}
1094/*
3dab1dad
YO
1095 Dumps a fully constructed but uncompressed trie in list form.
1096 List tries normally only are used for construction when the number of
1097 possible chars (trie->uniquecharcount) is very high.
1098 Used for debugging make_trie().
1099*/
1100STATIC void
55eed653 1101S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1102 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1103 U32 depth)
3dab1dad
YO
1104{
1105 U32 state;
ab3bbdeb 1106 SV *sv=sv_newmortal();
55eed653 1107 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1108 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1109
1110 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1111
3dab1dad 1112 /* print out the table precompression. */
ab3bbdeb
YO
1113 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1114 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1115 "------:-----+-----------------\n" );
3dab1dad
YO
1116
1117 for( state=1 ; state < next_alloc ; state ++ ) {
1118 U16 charid;
1119
ab3bbdeb 1120 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1121 (int)depth * 2 + 2,"", (UV)state );
1122 if ( ! trie->states[ state ].wordnum ) {
1123 PerlIO_printf( Perl_debug_log, "%5s| ","");
1124 } else {
1125 PerlIO_printf( Perl_debug_log, "W%4x| ",
1126 trie->states[ state ].wordnum
1127 );
1128 }
1129 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1130 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1131 if ( tmp ) {
1132 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1133 colwidth,
ddc5bc0f 1134 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1135 PL_colors[0], PL_colors[1],
1136 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1137 PERL_PV_ESCAPE_FIRSTCHAR
1138 ) ,
1e2e3d02
YO
1139 TRIE_LIST_ITEM(state,charid).forid,
1140 (UV)TRIE_LIST_ITEM(state,charid).newstate
1141 );
1142 if (!(charid % 10))
664e119d
RGS
1143 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1144 (int)((depth * 2) + 14), "");
1e2e3d02 1145 }
ab3bbdeb
YO
1146 }
1147 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1148 }
1149}
1150
1151/*
3dab1dad
YO
1152 Dumps a fully constructed but uncompressed trie in table form.
1153 This is the normal DFA style state transition table, with a few
1154 twists to facilitate compression later.
1155 Used for debugging make_trie().
1156*/
1157STATIC void
55eed653 1158S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1159 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1160 U32 depth)
3dab1dad
YO
1161{
1162 U32 state;
1163 U16 charid;
ab3bbdeb 1164 SV *sv=sv_newmortal();
55eed653 1165 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1166 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1167
1168 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1169
1170 /*
1171 print out the table precompression so that we can do a visual check
1172 that they are identical.
1173 */
1174
1175 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1176
1177 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1178 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1179 if ( tmp ) {
ab3bbdeb
YO
1180 PerlIO_printf( Perl_debug_log, "%*s",
1181 colwidth,
ddc5bc0f 1182 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1183 PL_colors[0], PL_colors[1],
1184 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1185 PERL_PV_ESCAPE_FIRSTCHAR
1186 )
1187 );
3dab1dad
YO
1188 }
1189 }
1190
1191 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1192
1193 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1194 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1195 }
1196
1197 PerlIO_printf( Perl_debug_log, "\n" );
1198
1199 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1200
1201 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1202 (int)depth * 2 + 2,"",
1203 (UV)TRIE_NODENUM( state ) );
1204
1205 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1206 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1207 if (v)
1208 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1209 else
1210 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1211 }
1212 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1213 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1214 } else {
1215 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1216 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1217 }
1218 }
07be1b83 1219}
3dab1dad
YO
1220
1221#endif
1222
2e64971a 1223
786e8c11
YO
1224/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1225 startbranch: the first branch in the whole branch sequence
1226 first : start branch of sequence of branch-exact nodes.
1227 May be the same as startbranch
1228 last : Thing following the last branch.
1229 May be the same as tail.
1230 tail : item following the branch sequence
1231 count : words in the sequence
1232 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1233 depth : indent depth
3dab1dad 1234
786e8c11 1235Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1236
786e8c11
YO
1237A trie is an N'ary tree where the branches are determined by digital
1238decomposition of the key. IE, at the root node you look up the 1st character and
1239follow that branch repeat until you find the end of the branches. Nodes can be
1240marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1241
786e8c11 1242 /he|she|his|hers/
72f13be8 1243
786e8c11
YO
1244would convert into the following structure. Numbers represent states, letters
1245following numbers represent valid transitions on the letter from that state, if
1246the number is in square brackets it represents an accepting state, otherwise it
1247will be in parenthesis.
07be1b83 1248
786e8c11
YO
1249 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1250 | |
1251 | (2)
1252 | |
1253 (1) +-i->(6)-+-s->[7]
1254 |
1255 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1256
786e8c11
YO
1257 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1258
1259This shows that when matching against the string 'hers' we will begin at state 1
1260read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1261then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1262is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1263single traverse. We store a mapping from accepting to state to which word was
1264matched, and then when we have multiple possibilities we try to complete the
1265rest of the regex in the order in which they occured in the alternation.
1266
1267The only prior NFA like behaviour that would be changed by the TRIE support is
1268the silent ignoring of duplicate alternations which are of the form:
1269
1270 / (DUPE|DUPE) X? (?{ ... }) Y /x
1271
4b714af6 1272Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1273and without the optimisation. With the optimisations dupes will be silently
486ec47a 1274ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1275the following demonstrates:
1276
1277 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1278
1279which prints out 'word' three times, but
1280
1281 'words'=~/(word|word|word)(?{ print $1 })S/
1282
1283which doesnt print it out at all. This is due to other optimisations kicking in.
1284
1285Example of what happens on a structural level:
1286
486ec47a 1287The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1288
1289 1: CURLYM[1] {1,32767}(18)
1290 5: BRANCH(8)
1291 6: EXACT <ac>(16)
1292 8: BRANCH(11)
1293 9: EXACT <ad>(16)
1294 11: BRANCH(14)
1295 12: EXACT <ab>(16)
1296 16: SUCCEED(0)
1297 17: NOTHING(18)
1298 18: END(0)
1299
1300This would be optimizable with startbranch=5, first=5, last=16, tail=16
1301and should turn into:
1302
1303 1: CURLYM[1] {1,32767}(18)
1304 5: TRIE(16)
1305 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1306 <ac>
1307 <ad>
1308 <ab>
1309 16: SUCCEED(0)
1310 17: NOTHING(18)
1311 18: END(0)
1312
1313Cases where tail != last would be like /(?foo|bar)baz/:
1314
1315 1: BRANCH(4)
1316 2: EXACT <foo>(8)
1317 4: BRANCH(7)
1318 5: EXACT <bar>(8)
1319 7: TAIL(8)
1320 8: EXACT <baz>(10)
1321 10: END(0)
1322
1323which would be optimizable with startbranch=1, first=1, last=7, tail=8
1324and would end up looking like:
1325
1326 1: TRIE(8)
1327 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1328 <foo>
1329 <bar>
1330 7: TAIL(8)
1331 8: EXACT <baz>(10)
1332 10: END(0)
1333
1334 d = uvuni_to_utf8_flags(d, uv, 0);
1335
1336is the recommended Unicode-aware way of saying
1337
1338 *(d++) = uv;
1339*/
1340
1e2e3d02 1341#define TRIE_STORE_REVCHAR \
786e8c11 1342 STMT_START { \
73031816
NC
1343 if (UTF) { \
1344 SV *zlopp = newSV(2); \
88c9ea1e
CB
1345 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1346 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1347 SvCUR_set(zlopp, kapow - flrbbbbb); \
1348 SvPOK_on(zlopp); \
1349 SvUTF8_on(zlopp); \
1350 av_push(revcharmap, zlopp); \
1351 } else { \
6bdeddd2 1352 char ooooff = (char)uvc; \
73031816
NC
1353 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1354 } \
1355 } STMT_END
786e8c11
YO
1356
1357#define TRIE_READ_CHAR STMT_START { \
1358 wordlen++; \
1359 if ( UTF ) { \
1360 if ( folder ) { \
1361 if ( foldlen > 0 ) { \
1362 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1363 foldlen -= len; \
1364 scan += len; \
1365 len = 0; \
1366 } else { \
1367 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1368 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1369 foldlen -= UNISKIP( uvc ); \
1370 scan = foldbuf + UNISKIP( uvc ); \
1371 } \
1372 } else { \
1373 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1374 } \
1375 } else { \
1376 uvc = (U32)*uc; \
1377 len = 1; \
1378 } \
1379} STMT_END
1380
1381
1382
1383#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1384 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1385 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1386 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1387 } \
1388 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1389 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1390 TRIE_LIST_CUR( state )++; \
1391} STMT_END
07be1b83 1392
786e8c11
YO
1393#define TRIE_LIST_NEW(state) STMT_START { \
1394 Newxz( trie->states[ state ].trans.list, \
1395 4, reg_trie_trans_le ); \
1396 TRIE_LIST_CUR( state ) = 1; \
1397 TRIE_LIST_LEN( state ) = 4; \
1398} STMT_END
07be1b83 1399
786e8c11
YO
1400#define TRIE_HANDLE_WORD(state) STMT_START { \
1401 U16 dupe= trie->states[ state ].wordnum; \
1402 regnode * const noper_next = regnext( noper ); \
1403 \
786e8c11
YO
1404 DEBUG_r({ \
1405 /* store the word for dumping */ \
1406 SV* tmp; \
1407 if (OP(noper) != NOTHING) \
740cce10 1408 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1409 else \
740cce10 1410 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1411 av_push( trie_words, tmp ); \
786e8c11
YO
1412 }); \
1413 \
1414 curword++; \
2e64971a
DM
1415 trie->wordinfo[curword].prev = 0; \
1416 trie->wordinfo[curword].len = wordlen; \
1417 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1418 \
1419 if ( noper_next < tail ) { \
1420 if (!trie->jump) \
c944940b 1421 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1422 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1423 if (!jumper) \
1424 jumper = noper_next; \
1425 if (!nextbranch) \
1426 nextbranch= regnext(cur); \
1427 } \
1428 \
1429 if ( dupe ) { \
2e64971a
DM
1430 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1431 /* chain, so that when the bits of chain are later */\
1432 /* linked together, the dups appear in the chain */\
1433 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1434 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1435 } else { \
1436 /* we haven't inserted this word yet. */ \
1437 trie->states[ state ].wordnum = curword; \
1438 } \
1439} STMT_END
07be1b83 1440
3dab1dad 1441
786e8c11
YO
1442#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1443 ( ( base + charid >= ucharcount \
1444 && base + charid < ubound \
1445 && state == trie->trans[ base - ucharcount + charid ].check \
1446 && trie->trans[ base - ucharcount + charid ].next ) \
1447 ? trie->trans[ base - ucharcount + charid ].next \
1448 : ( state==1 ? special : 0 ) \
1449 )
3dab1dad 1450
786e8c11
YO
1451#define MADE_TRIE 1
1452#define MADE_JUMP_TRIE 2
1453#define MADE_EXACT_TRIE 4
3dab1dad 1454
a3621e74 1455STATIC I32
786e8c11 1456S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1457{
27da23d5 1458 dVAR;
a3621e74
YO
1459 /* first pass, loop through and scan words */
1460 reg_trie_data *trie;
55eed653 1461 HV *widecharmap = NULL;
2b8b4781 1462 AV *revcharmap = newAV();
a3621e74 1463 regnode *cur;
9f7f3913 1464 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1465 STRLEN len = 0;
1466 UV uvc = 0;
1467 U16 curword = 0;
1468 U32 next_alloc = 0;
786e8c11
YO
1469 regnode *jumper = NULL;
1470 regnode *nextbranch = NULL;
7f69552c 1471 regnode *convert = NULL;
2e64971a 1472 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1473 /* we just use folder as a flag in utf8 */
1e696034 1474 const U8 * folder = NULL;
a3621e74 1475
2b8b4781
NC
1476#ifdef DEBUGGING
1477 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1478 AV *trie_words = NULL;
1479 /* along with revcharmap, this only used during construction but both are
1480 * useful during debugging so we store them in the struct when debugging.
8e11feef 1481 */
2b8b4781
NC
1482#else
1483 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1484 STRLEN trie_charcount=0;
3dab1dad 1485#endif
2b8b4781 1486 SV *re_trie_maxbuff;
a3621e74 1487 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1488
1489 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1490#ifndef DEBUGGING
1491 PERL_UNUSED_ARG(depth);
1492#endif
a3621e74 1493
1e696034 1494 switch (flags) {
2f7f8cb1 1495 case EXACTFA:
1e696034
KW
1496 case EXACTFU: folder = PL_fold_latin1; break;
1497 case EXACTF: folder = PL_fold; break;
1498 case EXACTFL: folder = PL_fold_locale; break;
1499 }
1500
c944940b 1501 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1502 trie->refcount = 1;
3dab1dad 1503 trie->startstate = 1;
786e8c11 1504 trie->wordcount = word_count;
f8fc2ecf 1505 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1506 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1507 if (!(UTF && folder))
c944940b 1508 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1509 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1510 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1511
a3621e74 1512 DEBUG_r({
2b8b4781 1513 trie_words = newAV();
a3621e74 1514 });
a3621e74 1515
0111c4fd 1516 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1517 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1518 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1519 }
3dab1dad
YO
1520 DEBUG_OPTIMISE_r({
1521 PerlIO_printf( Perl_debug_log,
786e8c11 1522 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1523 (int)depth * 2 + 2, "",
1524 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1525 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1526 (int)depth);
3dab1dad 1527 });
7f69552c
YO
1528
1529 /* Find the node we are going to overwrite */
1530 if ( first == startbranch && OP( last ) != BRANCH ) {
1531 /* whole branch chain */
1532 convert = first;
1533 } else {
1534 /* branch sub-chain */
1535 convert = NEXTOPER( first );
1536 }
1537
a3621e74
YO
1538 /* -- First loop and Setup --
1539
1540 We first traverse the branches and scan each word to determine if it
1541 contains widechars, and how many unique chars there are, this is
1542 important as we have to build a table with at least as many columns as we
1543 have unique chars.
1544
1545 We use an array of integers to represent the character codes 0..255
38a44b82 1546 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1547 native representation of the character value as the key and IV's for the
1548 coded index.
1549
1550 *TODO* If we keep track of how many times each character is used we can
1551 remap the columns so that the table compression later on is more
3b753521 1552 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1553 middle and the least common are on the outside. IMO this would be better
1554 than a most to least common mapping as theres a decent chance the most
1555 common letter will share a node with the least common, meaning the node
486ec47a 1556 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1557 case is when we have the least common nodes twice.
1558
1559 */
1560
a3621e74 1561 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1562 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1563 const U8 *uc = (U8*)STRING( noper );
a28509cc 1564 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1565 STRLEN foldlen = 0;
1566 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1567 const U8 *scan = (U8*)NULL;
07be1b83 1568 U32 wordlen = 0; /* required init */
02daf0ab
YO
1569 STRLEN chars = 0;
1570 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1571
3dab1dad
YO
1572 if (OP(noper) == NOTHING) {
1573 trie->minlen= 0;
1574 continue;
1575 }
02daf0ab
YO
1576 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1577 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1578 regardless of encoding */
1579
a3621e74 1580 for ( ; uc < e ; uc += len ) {
3dab1dad 1581 TRIE_CHARCOUNT(trie)++;
a3621e74 1582 TRIE_READ_CHAR;
3dab1dad 1583 chars++;
a3621e74
YO
1584 if ( uvc < 256 ) {
1585 if ( !trie->charmap[ uvc ] ) {
1586 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1587 if ( folder )
1588 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1589 TRIE_STORE_REVCHAR;
a3621e74 1590 }
02daf0ab 1591 if ( set_bit ) {
62012aee
KW
1592 /* store the codepoint in the bitmap, and its folded
1593 * equivalent. */
02daf0ab 1594 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1595
1596 /* store the folded codepoint */
1597 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1598
1599 if ( !UTF ) {
1600 /* store first byte of utf8 representation of
acdf4139
KW
1601 variant codepoints */
1602 if (! UNI_IS_INVARIANT(uvc)) {
1603 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1604 }
1605 }
02daf0ab
YO
1606 set_bit = 0; /* We've done our bit :-) */
1607 }
a3621e74
YO
1608 } else {
1609 SV** svpp;
55eed653
NC
1610 if ( !widecharmap )
1611 widecharmap = newHV();
a3621e74 1612
55eed653 1613 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1614
1615 if ( !svpp )
e4584336 1616 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1617
1618 if ( !SvTRUE( *svpp ) ) {
1619 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1620 TRIE_STORE_REVCHAR;
a3621e74
YO
1621 }
1622 }
1623 }
3dab1dad
YO
1624 if( cur == first ) {
1625 trie->minlen=chars;
1626 trie->maxlen=chars;
1627 } else if (chars < trie->minlen) {
1628 trie->minlen=chars;
1629 } else if (chars > trie->maxlen) {
1630 trie->maxlen=chars;
1631 }
1632
a3621e74
YO
1633 } /* end first pass */
1634 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1635 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1636 (int)depth * 2 + 2,"",
55eed653 1637 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1638 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1639 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1640 );
a3621e74
YO
1641
1642 /*
1643 We now know what we are dealing with in terms of unique chars and
1644 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1645 representation using a flat table will take. If it's over a reasonable
1646 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1647 conservative but potentially much slower representation using an array
1648 of lists.
1649
1650 At the end we convert both representations into the same compressed
1651 form that will be used in regexec.c for matching with. The latter
1652 is a form that cannot be used to construct with but has memory
1653 properties similar to the list form and access properties similar
1654 to the table form making it both suitable for fast searches and
1655 small enough that its feasable to store for the duration of a program.
1656
1657 See the comment in the code where the compressed table is produced
1658 inplace from the flat tabe representation for an explanation of how
1659 the compression works.
1660
1661 */
1662
1663
2e64971a
DM
1664 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1665 prev_states[1] = 0;
1666
3dab1dad 1667 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1668 /*
1669 Second Pass -- Array Of Lists Representation
1670
1671 Each state will be represented by a list of charid:state records
1672 (reg_trie_trans_le) the first such element holds the CUR and LEN
1673 points of the allocated array. (See defines above).
1674
1675 We build the initial structure using the lists, and then convert
1676 it into the compressed table form which allows faster lookups
1677 (but cant be modified once converted).
a3621e74
YO
1678 */
1679
a3621e74
YO
1680 STRLEN transcount = 1;
1681
1e2e3d02
YO
1682 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1683 "%*sCompiling trie using list compiler\n",
1684 (int)depth * 2 + 2, ""));
446bd890 1685
c944940b
JH
1686 trie->states = (reg_trie_state *)
1687 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1688 sizeof(reg_trie_state) );
a3621e74
YO
1689 TRIE_LIST_NEW(1);
1690 next_alloc = 2;
1691
1692 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1693
c445ea15
AL
1694 regnode * const noper = NEXTOPER( cur );
1695 U8 *uc = (U8*)STRING( noper );
1696 const U8 * const e = uc + STR_LEN( noper );
1697 U32 state = 1; /* required init */
1698 U16 charid = 0; /* sanity init */
1699 U8 *scan = (U8*)NULL; /* sanity init */
1700 STRLEN foldlen = 0; /* required init */
07be1b83 1701 U32 wordlen = 0; /* required init */
c445ea15
AL
1702 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1703
3dab1dad 1704 if (OP(noper) != NOTHING) {
786e8c11 1705 for ( ; uc < e ; uc += len ) {
c445ea15 1706
786e8c11 1707 TRIE_READ_CHAR;
c445ea15 1708
786e8c11
YO
1709 if ( uvc < 256 ) {
1710 charid = trie->charmap[ uvc ];
c445ea15 1711 } else {
55eed653 1712 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1713 if ( !svpp ) {
1714 charid = 0;
1715 } else {
1716 charid=(U16)SvIV( *svpp );
1717 }
c445ea15 1718 }
786e8c11
YO
1719 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1720 if ( charid ) {
a3621e74 1721
786e8c11
YO
1722 U16 check;
1723 U32 newstate = 0;
a3621e74 1724
786e8c11
YO
1725 charid--;
1726 if ( !trie->states[ state ].trans.list ) {
1727 TRIE_LIST_NEW( state );
c445ea15 1728 }
786e8c11
YO
1729 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1730 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1731 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1732 break;
1733 }
1734 }
1735 if ( ! newstate ) {
1736 newstate = next_alloc++;
2e64971a 1737 prev_states[newstate] = state;
786e8c11
YO
1738 TRIE_LIST_PUSH( state, charid, newstate );
1739 transcount++;
1740 }
1741 state = newstate;
1742 } else {
1743 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1744 }
a28509cc 1745 }
c445ea15 1746 }
3dab1dad 1747 TRIE_HANDLE_WORD(state);
a3621e74
YO
1748
1749 } /* end second pass */
1750
1e2e3d02
YO
1751 /* next alloc is the NEXT state to be allocated */
1752 trie->statecount = next_alloc;
c944940b
JH
1753 trie->states = (reg_trie_state *)
1754 PerlMemShared_realloc( trie->states,
1755 next_alloc
1756 * sizeof(reg_trie_state) );
a3621e74 1757
3dab1dad 1758 /* and now dump it out before we compress it */
2b8b4781
NC
1759 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1760 revcharmap, next_alloc,
1761 depth+1)
1e2e3d02 1762 );
a3621e74 1763
c944940b
JH
1764 trie->trans = (reg_trie_trans *)
1765 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1766 {
1767 U32 state;
a3621e74
YO
1768 U32 tp = 0;
1769 U32 zp = 0;
1770
1771
1772 for( state=1 ; state < next_alloc ; state ++ ) {
1773 U32 base=0;
1774
1775 /*
1776 DEBUG_TRIE_COMPILE_MORE_r(
1777 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1778 );
1779 */
1780
1781 if (trie->states[state].trans.list) {
1782 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1783 U16 maxid=minid;
a28509cc 1784 U16 idx;
a3621e74
YO
1785
1786 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1787 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1788 if ( forid < minid ) {
1789 minid=forid;
1790 } else if ( forid > maxid ) {
1791 maxid=forid;
1792 }
a3621e74
YO
1793 }
1794 if ( transcount < tp + maxid - minid + 1) {
1795 transcount *= 2;
c944940b
JH
1796 trie->trans = (reg_trie_trans *)
1797 PerlMemShared_realloc( trie->trans,
446bd890
NC
1798 transcount
1799 * sizeof(reg_trie_trans) );
a3621e74
YO
1800 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1801 }
1802 base = trie->uniquecharcount + tp - minid;
1803 if ( maxid == minid ) {
1804 U32 set = 0;
1805 for ( ; zp < tp ; zp++ ) {
1806 if ( ! trie->trans[ zp ].next ) {
1807 base = trie->uniquecharcount + zp - minid;
1808 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1809 trie->trans[ zp ].check = state;
1810 set = 1;
1811 break;
1812 }
1813 }
1814 if ( !set ) {
1815 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1816 trie->trans[ tp ].check = state;
1817 tp++;
1818 zp = tp;
1819 }
1820 } else {
1821 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1822 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1823 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1824 trie->trans[ tid ].check = state;
1825 }
1826 tp += ( maxid - minid + 1 );
1827 }
1828 Safefree(trie->states[ state ].trans.list);
1829 }
1830 /*
1831 DEBUG_TRIE_COMPILE_MORE_r(
1832 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1833 );
1834 */
1835 trie->states[ state ].trans.base=base;
1836 }
cc601c31 1837 trie->lasttrans = tp + 1;
a3621e74
YO
1838 }
1839 } else {
1840 /*
1841 Second Pass -- Flat Table Representation.
1842
1843 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1844 We know that we will need Charcount+1 trans at most to store the data
1845 (one row per char at worst case) So we preallocate both structures
1846 assuming worst case.
1847
1848 We then construct the trie using only the .next slots of the entry
1849 structs.
1850
3b753521 1851 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1852 make compression both faster and easier by keeping track of how many non
1853 zero fields are in the node.
1854
1855 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1856 transition.
1857
1858 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1859 number representing the first entry of the node, and state as a
1860 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1861 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1862 are 2 entrys per node. eg:
1863
1864 A B A B
1865 1. 2 4 1. 3 7
1866 2. 0 3 3. 0 5
1867 3. 0 0 5. 0 0
1868 4. 0 0 7. 0 0
1869
1870 The table is internally in the right hand, idx form. However as we also
1871 have to deal with the states array which is indexed by nodenum we have to
1872 use TRIE_NODENUM() to convert.
1873
1874 */
1e2e3d02
YO
1875 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1876 "%*sCompiling trie using table compiler\n",
1877 (int)depth * 2 + 2, ""));
3dab1dad 1878
c944940b
JH
1879 trie->trans = (reg_trie_trans *)
1880 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1881 * trie->uniquecharcount + 1,
1882 sizeof(reg_trie_trans) );
1883 trie->states = (reg_trie_state *)
1884 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1885 sizeof(reg_trie_state) );
a3621e74
YO
1886 next_alloc = trie->uniquecharcount + 1;
1887
3dab1dad 1888
a3621e74
YO
1889 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1890
c445ea15 1891 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1892 const U8 *uc = (U8*)STRING( noper );
1893 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1894
1895 U32 state = 1; /* required init */
1896
1897 U16 charid = 0; /* sanity init */
1898 U32 accept_state = 0; /* sanity init */
1899 U8 *scan = (U8*)NULL; /* sanity init */
1900
1901 STRLEN foldlen = 0; /* required init */
07be1b83 1902 U32 wordlen = 0; /* required init */
a3621e74
YO
1903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1904
3dab1dad 1905 if ( OP(noper) != NOTHING ) {
786e8c11 1906 for ( ; uc < e ; uc += len ) {
a3621e74 1907
786e8c11 1908 TRIE_READ_CHAR;
a3621e74 1909
786e8c11
YO
1910 if ( uvc < 256 ) {
1911 charid = trie->charmap[ uvc ];
1912 } else {
55eed653 1913 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1914 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1915 }
786e8c11
YO
1916 if ( charid ) {
1917 charid--;
1918 if ( !trie->trans[ state + charid ].next ) {
1919 trie->trans[ state + charid ].next = next_alloc;
1920 trie->trans[ state ].check++;
2e64971a
DM
1921 prev_states[TRIE_NODENUM(next_alloc)]
1922 = TRIE_NODENUM(state);
786e8c11
YO
1923 next_alloc += trie->uniquecharcount;
1924 }
1925 state = trie->trans[ state + charid ].next;
1926 } else {
1927 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1928 }
1929 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1930 }
a3621e74 1931 }
3dab1dad
YO
1932 accept_state = TRIE_NODENUM( state );
1933 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1934
1935 } /* end second pass */
1936
3dab1dad 1937 /* and now dump it out before we compress it */
2b8b4781
NC
1938 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1939 revcharmap,
1940 next_alloc, depth+1));
a3621e74 1941
a3621e74
YO
1942 {
1943 /*
1944 * Inplace compress the table.*
1945
1946 For sparse data sets the table constructed by the trie algorithm will
1947 be mostly 0/FAIL transitions or to put it another way mostly empty.
1948 (Note that leaf nodes will not contain any transitions.)
1949
1950 This algorithm compresses the tables by eliminating most such
1951 transitions, at the cost of a modest bit of extra work during lookup:
1952
1953 - Each states[] entry contains a .base field which indicates the
1954 index in the state[] array wheres its transition data is stored.
1955
3b753521 1956 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1957
1958 - If .base is nonzero then charid is added to it to find an entry in
1959 the trans array.
1960
1961 -If trans[states[state].base+charid].check!=state then the
1962 transition is taken to be a 0/Fail transition. Thus if there are fail
1963 transitions at the front of the node then the .base offset will point
1964 somewhere inside the previous nodes data (or maybe even into a node
1965 even earlier), but the .check field determines if the transition is
1966 valid.
1967
786e8c11 1968 XXX - wrong maybe?
a3621e74 1969 The following process inplace converts the table to the compressed
3b753521 1970 table: We first do not compress the root node 1,and mark all its
a3621e74 1971 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1972 allows us to do a DFA construction from the compressed table later,
1973 and ensures that any .base pointers we calculate later are greater
1974 than 0.
a3621e74
YO
1975
1976 - We set 'pos' to indicate the first entry of the second node.
1977
1978 - We then iterate over the columns of the node, finding the first and
1979 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1980 and set the .check pointers accordingly, and advance pos
1981 appropriately and repreat for the next node. Note that when we copy
1982 the next pointers we have to convert them from the original
1983 NODEIDX form to NODENUM form as the former is not valid post
1984 compression.
1985
1986 - If a node has no transitions used we mark its base as 0 and do not
1987 advance the pos pointer.
1988
1989 - If a node only has one transition we use a second pointer into the
1990 structure to fill in allocated fail transitions from other states.
1991 This pointer is independent of the main pointer and scans forward
1992 looking for null transitions that are allocated to a state. When it
1993 finds one it writes the single transition into the "hole". If the
786e8c11 1994 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1995
1996 - Once compressed we can Renew/realloc the structures to release the
1997 excess space.
1998
1999 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2000 specifically Fig 3.47 and the associated pseudocode.
2001
2002 demq
2003 */
a3b680e6 2004 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2005 U32 state, charid;
a3621e74 2006 U32 pos = 0, zp=0;
1e2e3d02 2007 trie->statecount = laststate;
a3621e74
YO
2008
2009 for ( state = 1 ; state < laststate ; state++ ) {
2010 U8 flag = 0;
a28509cc
AL
2011 const U32 stateidx = TRIE_NODEIDX( state );
2012 const U32 o_used = trie->trans[ stateidx ].check;
2013 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2014 trie->trans[ stateidx ].check = 0;
2015
2016 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2017 if ( flag || trie->trans[ stateidx + charid ].next ) {
2018 if ( trie->trans[ stateidx + charid ].next ) {
2019 if (o_used == 1) {
2020 for ( ; zp < pos ; zp++ ) {
2021 if ( ! trie->trans[ zp ].next ) {
2022 break;
2023 }
2024 }
2025 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2026 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2027 trie->trans[ zp ].check = state;
2028 if ( ++zp > pos ) pos = zp;
2029 break;
2030 }
2031 used--;
2032 }
2033 if ( !flag ) {
2034 flag = 1;
2035 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2036 }
2037 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2038 trie->trans[ pos ].check = state;
2039 pos++;
2040 }
2041 }
2042 }
cc601c31 2043 trie->lasttrans = pos + 1;
c944940b
JH
2044 trie->states = (reg_trie_state *)
2045 PerlMemShared_realloc( trie->states, laststate
2046 * sizeof(reg_trie_state) );
a3621e74 2047 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2048 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2049 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2050 (int)depth * 2 + 2,"",
2051 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2052 (IV)next_alloc,
2053 (IV)pos,
a3621e74
YO
2054 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2055 );
2056
2057 } /* end table compress */
2058 }
1e2e3d02
YO
2059 DEBUG_TRIE_COMPILE_MORE_r(
2060 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2061 (int)depth * 2 + 2, "",
2062 (UV)trie->statecount,
2063 (UV)trie->lasttrans)
2064 );
cc601c31 2065 /* resize the trans array to remove unused space */
c944940b
JH
2066 trie->trans = (reg_trie_trans *)
2067 PerlMemShared_realloc( trie->trans, trie->lasttrans
2068 * sizeof(reg_trie_trans) );
a3621e74 2069
3b753521 2070 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2071 U8 nodetype =(U8)(flags & 0xFF);
2072 char *str=NULL;
786e8c11 2073
07be1b83 2074#ifdef DEBUGGING
e62cc96a 2075 regnode *optimize = NULL;
7122b237
YO
2076#ifdef RE_TRACK_PATTERN_OFFSETS
2077
b57a0404
JH
2078 U32 mjd_offset = 0;
2079 U32 mjd_nodelen = 0;
7122b237
YO
2080#endif /* RE_TRACK_PATTERN_OFFSETS */
2081#endif /* DEBUGGING */
a3621e74 2082 /*
3dab1dad
YO
2083 This means we convert either the first branch or the first Exact,
2084 depending on whether the thing following (in 'last') is a branch
2085 or not and whther first is the startbranch (ie is it a sub part of
2086 the alternation or is it the whole thing.)
3b753521 2087 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2088 the whole branch sequence, including the first.
a3621e74 2089 */
3dab1dad 2090 /* Find the node we are going to overwrite */
7f69552c 2091 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2092 /* branch sub-chain */
3dab1dad 2093 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2094#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2095 DEBUG_r({
2096 mjd_offset= Node_Offset((convert));
2097 mjd_nodelen= Node_Length((convert));
2098 });
7122b237 2099#endif
7f69552c 2100 /* whole branch chain */
7122b237
YO
2101 }
2102#ifdef RE_TRACK_PATTERN_OFFSETS
2103 else {
7f69552c
YO
2104 DEBUG_r({
2105 const regnode *nop = NEXTOPER( convert );
2106 mjd_offset= Node_Offset((nop));
2107 mjd_nodelen= Node_Length((nop));
2108 });
07be1b83
YO
2109 }
2110 DEBUG_OPTIMISE_r(
2111 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2112 (int)depth * 2 + 2, "",
786e8c11 2113 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2114 );
7122b237 2115#endif
3dab1dad
YO
2116 /* But first we check to see if there is a common prefix we can
2117 split out as an EXACT and put in front of the TRIE node. */
2118 trie->startstate= 1;
55eed653 2119 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2120 U32 state;
1e2e3d02 2121 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2122 U32 ofs = 0;
8e11feef
RGS
2123 I32 idx = -1;
2124 U32 count = 0;
2125 const U32 base = trie->states[ state ].trans.base;
a3621e74 2126
3dab1dad 2127 if ( trie->states[state].wordnum )
8e11feef 2128 count = 1;
a3621e74 2129
8e11feef 2130 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2131 if ( ( base + ofs >= trie->uniquecharcount ) &&
2132 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2133 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2134 {
3dab1dad 2135 if ( ++count > 1 ) {
2b8b4781 2136 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2137 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2138 if ( state == 1 ) break;
3dab1dad
YO
2139 if ( count == 2 ) {
2140 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2141 DEBUG_OPTIMISE_r(
8e11feef
RGS
2142 PerlIO_printf(Perl_debug_log,
2143 "%*sNew Start State=%"UVuf" Class: [",
2144 (int)depth * 2 + 2, "",
786e8c11 2145 (UV)state));
be8e71aa 2146 if (idx >= 0) {
2b8b4781 2147 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2148 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2149
3dab1dad 2150 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2151 if ( folder )
2152 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2153 DEBUG_OPTIMISE_r(
f1f66076 2154 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2155 );
8e11feef
RGS
2156 }
2157 }
2158 TRIE_BITMAP_SET(trie,*ch);
2159 if ( folder )
2160 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2161 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2162 }
2163 idx = ofs;
2164 }
3dab1dad
YO
2165 }
2166 if ( count == 1 ) {
2b8b4781 2167 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2168 STRLEN len;
2169 char *ch = SvPV( *tmp, len );
de734bd5
A
2170 DEBUG_OPTIMISE_r({
2171 SV *sv=sv_newmortal();
8e11feef
RGS
2172 PerlIO_printf( Perl_debug_log,
2173 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2174 (int)depth * 2 + 2, "",
de734bd5
A
2175 (UV)state, (UV)idx,
2176 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2177 PL_colors[0], PL_colors[1],
2178 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2179 PERL_PV_ESCAPE_FIRSTCHAR
2180 )
2181 );
2182 });
3dab1dad
YO
2183 if ( state==1 ) {
2184 OP( convert ) = nodetype;
2185 str=STRING(convert);
2186 STR_LEN(convert)=0;
2187 }
c490c714
YO
2188 STR_LEN(convert) += len;
2189 while (len--)
de734bd5 2190 *str++ = *ch++;
8e11feef 2191 } else {
f9049ba1 2192#ifdef DEBUGGING
8e11feef
RGS
2193 if (state>1)
2194 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2195#endif
8e11feef
RGS
2196 break;
2197 }
2198 }
2e64971a 2199 trie->prefixlen = (state-1);
3dab1dad 2200 if (str) {
8e11feef 2201 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2202 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2203 trie->startstate = state;
07be1b83
YO
2204 trie->minlen -= (state - 1);
2205 trie->maxlen -= (state - 1);
33809eae
JH
2206#ifdef DEBUGGING
2207 /* At least the UNICOS C compiler choked on this
2208 * being argument to DEBUG_r(), so let's just have
2209 * it right here. */
2210 if (
2211#ifdef PERL_EXT_RE_BUILD
2212 1
2213#else
2214 DEBUG_r_TEST
2215#endif
2216 ) {
2217 regnode *fix = convert;
2218 U32 word = trie->wordcount;
2219 mjd_nodelen++;
2220 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2221 while( ++fix < n ) {
2222 Set_Node_Offset_Length(fix, 0, 0);
2223 }
2224 while (word--) {
2225 SV ** const tmp = av_fetch( trie_words, word, 0 );
2226 if (tmp) {
2227 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2228 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2229 else
2230 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2231 }
2232 }
2233 }
2234#endif
8e11feef
RGS
2235 if (trie->maxlen) {
2236 convert = n;
2237 } else {
3dab1dad 2238 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2239 DEBUG_r(optimize= n);
3dab1dad
YO
2240 }
2241 }
2242 }
a5ca303d
YO
2243 if (!jumper)
2244 jumper = last;
3dab1dad 2245 if ( trie->maxlen ) {
8e11feef
RGS
2246 NEXT_OFF( convert ) = (U16)(tail - convert);
2247 ARG_SET( convert, data_slot );
786e8c11
YO
2248 /* Store the offset to the first unabsorbed branch in
2249 jump[0], which is otherwise unused by the jump logic.
2250 We use this when dumping a trie and during optimisation. */
2251 if (trie->jump)
7f69552c 2252 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2253
6c48061a
YO
2254 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2255 * and there is a bitmap
2256 * and the first "jump target" node we found leaves enough room
2257 * then convert the TRIE node into a TRIEC node, with the bitmap
2258 * embedded inline in the opcode - this is hypothetically faster.
2259 */
2260 if ( !trie->states[trie->startstate].wordnum
2261 && trie->bitmap
2262 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2263 {
2264 OP( convert ) = TRIEC;
2265 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2266 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2267 trie->bitmap= NULL;
2268 } else
2269 OP( convert ) = TRIE;
a3621e74 2270
3dab1dad
YO
2271 /* store the type in the flags */
2272 convert->flags = nodetype;
a5ca303d
YO
2273 DEBUG_r({
2274 optimize = convert
2275 + NODE_STEP_REGNODE
2276 + regarglen[ OP( convert ) ];
2277 });
2278 /* XXX We really should free up the resource in trie now,
2279 as we won't use them - (which resources?) dmq */
3dab1dad 2280 }
a3621e74 2281 /* needed for dumping*/
e62cc96a 2282 DEBUG_r(if (optimize) {
07be1b83 2283 regnode *opt = convert;
bcdf7404 2284
e62cc96a 2285 while ( ++opt < optimize) {
07be1b83
YO
2286 Set_Node_Offset_Length(opt,0,0);
2287 }
786e8c11
YO
2288 /*
2289 Try to clean up some of the debris left after the
2290 optimisation.
a3621e74 2291 */
786e8c11 2292 while( optimize < jumper ) {
07be1b83 2293 mjd_nodelen += Node_Length((optimize));
a3621e74 2294 OP( optimize ) = OPTIMIZED;
07be1b83 2295 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2296 optimize++;
2297 }
07be1b83 2298 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2299 });
2300 } /* end node insert */
2e64971a
DM
2301
2302 /* Finish populating the prev field of the wordinfo array. Walk back
2303 * from each accept state until we find another accept state, and if
2304 * so, point the first word's .prev field at the second word. If the
2305 * second already has a .prev field set, stop now. This will be the
2306 * case either if we've already processed that word's accept state,
3b753521
FN
2307 * or that state had multiple words, and the overspill words were
2308 * already linked up earlier.
2e64971a
DM
2309 */
2310 {
2311 U16 word;
2312 U32 state;
2313 U16 prev;
2314
2315 for (word=1; word <= trie->wordcount; word++) {
2316 prev = 0;
2317 if (trie->wordinfo[word].prev)
2318 continue;
2319 state = trie->wordinfo[word].accept;
2320 while (state) {
2321 state = prev_states[state];
2322 if (!state)
2323 break;
2324 prev = trie->states[state].wordnum;
2325 if (prev)
2326 break;
2327 }
2328 trie->wordinfo[word].prev = prev;
2329 }
2330 Safefree(prev_states);
2331 }
2332
2333
2334 /* and now dump out the compressed format */
2335 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2336
55eed653 2337 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2338#ifdef DEBUGGING
2339 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2340 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2341#else
2342 SvREFCNT_dec(revcharmap);
07be1b83 2343#endif
786e8c11
YO
2344 return trie->jump
2345 ? MADE_JUMP_TRIE
2346 : trie->startstate>1
2347 ? MADE_EXACT_TRIE
2348 : MADE_TRIE;
2349}
2350
2351STATIC void
2352S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2353{
3b753521 2354/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2355
2356 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2357 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2358 ISBN 0-201-10088-6
2359
2360 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2361 suffix of the current state's 'word' that is also a proper prefix of another word in our
2362 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2363 the DFA not to have to restart after its tried and failed a word at a given point, it
2364 simply continues as though it had been matching the other word in the first place.
2365 Consider
2366 'abcdgu'=~/abcdefg|cdgu/
2367 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2368 fail, which would bring us to the state representing 'd' in the second word where we would
2369 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2370 */
2371 /* add a fail transition */
3251b653
NC
2372 const U32 trie_offset = ARG(source);
2373 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2374 U32 *q;
2375 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2376 const U32 numstates = trie->statecount;
786e8c11
YO
2377 const U32 ubound = trie->lasttrans + ucharcount;
2378 U32 q_read = 0;
2379 U32 q_write = 0;
2380 U32 charid;
2381 U32 base = trie->states[ 1 ].trans.base;
2382 U32 *fail;
2383 reg_ac_data *aho;
2384 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2385 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2386
2387 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2388#ifndef DEBUGGING
2389 PERL_UNUSED_ARG(depth);
2390#endif
2391
2392
2393 ARG_SET( stclass, data_slot );
c944940b 2394 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2395 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2396 aho->trie=trie_offset;
446bd890
NC
2397 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2398 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2399 Newxz( q, numstates, U32);
c944940b 2400 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2401 aho->refcount = 1;
2402 fail = aho->fail;
2403 /* initialize fail[0..1] to be 1 so that we always have
2404 a valid final fail state */
2405 fail[ 0 ] = fail[ 1 ] = 1;
2406
2407 for ( charid = 0; charid < ucharcount ; charid++ ) {
2408 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2409 if ( newstate ) {
2410 q[ q_write ] = newstate;
2411 /* set to point at the root */
2412 fail[ q[ q_write++ ] ]=1;
2413 }
2414 }
2415 while ( q_read < q_write) {
2416 const U32 cur = q[ q_read++ % numstates ];
2417 base = trie->states[ cur ].trans.base;
2418
2419 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2420 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2421 if (ch_state) {
2422 U32 fail_state = cur;
2423 U32 fail_base;
2424 do {
2425 fail_state = fail[ fail_state ];
2426 fail_base = aho->states[ fail_state ].trans.base;
2427 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2428
2429 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2430 fail[ ch_state ] = fail_state;
2431 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2432 {
2433 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2434 }
2435 q[ q_write++ % numstates] = ch_state;
2436 }
2437 }
2438 }
2439 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2440 when we fail in state 1, this allows us to use the
2441 charclass scan to find a valid start char. This is based on the principle
2442 that theres a good chance the string being searched contains lots of stuff
2443 that cant be a start char.
2444 */
2445 fail[ 0 ] = fail[ 1 ] = 0;
2446 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2447 PerlIO_printf(Perl_debug_log,
2448 "%*sStclass Failtable (%"UVuf" states): 0",
2449 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2450 );
786e8c11
YO
2451 for( q_read=1; q_read<numstates; q_read++ ) {
2452 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2453 }
2454 PerlIO_printf(Perl_debug_log, "\n");
2455 });
2456 Safefree(q);
2457 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2458}
2459
786e8c11 2460
a3621e74 2461/*
5d1c421c
JH
2462 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2463 * These need to be revisited when a newer toolchain becomes available.
2464 */
2465#if defined(__sparc64__) && defined(__GNUC__)
2466# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2467# undef SPARC64_GCC_WORKAROUND
2468# define SPARC64_GCC_WORKAROUND 1
2469# endif
2470#endif
2471
07be1b83 2472#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2473 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2474 SV * const mysv=sv_newmortal(); \
2475 regnode *Next = regnext(scan); \
2476 regprop(RExC_rx, mysv, scan); \
7f69552c 2477 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2478 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2479 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2480 }});
07be1b83 2481
1de06328
YO
2482
2483
2484
2485
07be1b83
YO
2486#define JOIN_EXACT(scan,min,flags) \
2487 if (PL_regkind[OP(scan)] == EXACT) \
2488 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2489
be8e71aa 2490STATIC U32
07be1b83
YO
2491S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2492 /* Merge several consecutive EXACTish nodes into one. */
2493 regnode *n = regnext(scan);
2494 U32 stringok = 1;
2495 regnode *next = scan + NODE_SZ_STR(scan);
2496 U32 merged = 0;
2497 U32 stopnow = 0;
2498#ifdef DEBUGGING
2499 regnode *stop = scan;
72f13be8 2500 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2501#else
d47053eb
RGS
2502 PERL_UNUSED_ARG(depth);
2503#endif
7918f24d
NC
2504
2505 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2506#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2507 PERL_UNUSED_ARG(flags);
2508 PERL_UNUSED_ARG(val);
07be1b83 2509#endif
07be1b83
YO
2510 DEBUG_PEEP("join",scan,depth);
2511
2512 /* Skip NOTHING, merge EXACT*. */
2513 while (n &&
2514 ( PL_regkind[OP(n)] == NOTHING ||
2515 (stringok && (OP(n) == OP(scan))))
2516 && NEXT_OFF(n)
2517 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2518
2519 if (OP(n) == TAIL || n > next)
2520 stringok = 0;
2521 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2522 DEBUG_PEEP("skip:",n,depth);
2523 NEXT_OFF(scan) += NEXT_OFF(n);
2524 next = n + NODE_STEP_REGNODE;
2525#ifdef DEBUGGING
2526 if (stringok)
2527 stop = n;
2528#endif
2529 n = regnext(n);
2530 }
2531 else if (stringok) {
786e8c11 2532 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2533 regnode * const nnext = regnext(n);
2534
2535 DEBUG_PEEP("merg",n,depth);
2536
2537 merged++;
2538 if (oldl + STR_LEN(n) > U8_MAX)
2539 break;
2540 NEXT_OFF(scan) += NEXT_OFF(n);
2541 STR_LEN(scan) += STR_LEN(n);
2542 next = n + NODE_SZ_STR(n);
2543 /* Now we can overwrite *n : */
2544 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2545#ifdef DEBUGGING
2546 stop = next - 1;
2547#endif
2548 n = nnext;
2549 if (stopnow) break;
2550 }
2551
d47053eb
RGS
2552#ifdef EXPERIMENTAL_INPLACESCAN
2553 if (flags && !NEXT_OFF(n)) {
2554 DEBUG_PEEP("atch", val, depth);
2555 if (reg_off_by_arg[OP(n)]) {
2556 ARG_SET(n, val - n);
2557 }
2558 else {
2559 NEXT_OFF(n) = val - n;
2560 }
2561 stopnow = 1;
2562 }
07be1b83
YO
2563#endif
2564 }
ced7f090
KW
2565#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2566#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2567#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2568#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2569
2570 if (UTF
2f7f8cb1 2571 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2c2b7f86
KW
2572 && ( STR_LEN(scan) >= 6 ) )
2573 {
07be1b83
YO
2574 /*
2575 Two problematic code points in Unicode casefolding of EXACT nodes:
2576
2577 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2578 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2579
2580 which casefold to
2581
2582 Unicode UTF-8
2583
2584 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2585 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2586
2587 This means that in case-insensitive matching (or "loose matching",
2588 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2589 length of the above casefolded versions) can match a target string
2590 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2591 This would rather mess up the minimum length computation.
2592
2593 What we'll do is to look for the tail four bytes, and then peek
2594 at the preceding two bytes to see whether we need to decrease
2595 the minimum length by four (six minus two).
2596
2597 Thanks to the design of UTF-8, there cannot be false matches:
2598 A sequence of valid UTF-8 bytes cannot be a subsequence of
2599 another valid sequence of UTF-8 bytes.
2600
2601 */
2602 char * const s0 = STRING(scan), *s, *t;
2603 char * const s1 = s0 + STR_LEN(scan) - 1;
2604 char * const s2 = s1 - 4;
e294cc5d
JH
2605#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2606 const char t0[] = "\xaf\x49\xaf\x42";
2607#else
07be1b83 2608 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2609#endif
07be1b83
YO
2610 const char * const t1 = t0 + 3;
2611
2612 for (s = s0 + 2;
2613 s < s2 && (t = ninstr(s, s1, t0, t1));
2614 s = t + 4) {
e294cc5d
JH
2615#ifdef EBCDIC
2616 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2617 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2618#else
07be1b83
YO
2619 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2620 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2621#endif
07be1b83
YO
2622 *min -= 4;
2623 }
2624 }
2625
2626#ifdef DEBUGGING
2627 /* Allow dumping */
2628 n = scan + NODE_SZ_STR(scan);
2629 while (n <= stop) {
2630 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2631 OP(n) = OPTIMIZED;
2632 NEXT_OFF(n) = 0;
2633 }
2634 n++;
2635 }
2636#endif
2637 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2638 return stopnow;
2639}
2640
486ec47a 2641/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2642 Finds fixed substrings. */
2643
a0288114 2644/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2645 to the position after last scanned or to NULL. */
2646
40d049e4
YO
2647#define INIT_AND_WITHP \
2648 assert(!and_withp); \
2649 Newx(and_withp,1,struct regnode_charclass_class); \
2650 SAVEFREEPV(and_withp)
07be1b83 2651
b515a41d 2652/* this is a chain of data about sub patterns we are processing that
486ec47a 2653 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2654 we can simulate recursion without losing state. */
2655struct scan_frame;
2656typedef struct scan_frame {
2657 regnode *last; /* last node to process in this frame */
2658 regnode *next; /* next node to process when last is reached */
2659 struct scan_frame *prev; /*previous frame*/
2660 I32 stop; /* what stopparen do we use */
2661} scan_frame;
2662
304ee84b
YO
2663
2664#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2665
e1d1eefb
YO
2666#define CASE_SYNST_FNC(nAmE) \
2667case nAmE: \
2668 if (flags & SCF_DO_STCLASS_AND) { \
2669 for (value = 0; value < 256; value++) \
2670 if (!is_ ## nAmE ## _cp(value)) \
2671 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2672 } \
2673 else { \
2674 for (value = 0; value < 256; value++) \
2675 if (is_ ## nAmE ## _cp(value)) \
2676 ANYOF_BITMAP_SET(data->start_class, value); \
2677 } \
2678 break; \
2679case N ## nAmE: \
2680 if (flags & SCF_DO_STCLASS_AND) { \
2681 for (value = 0; value < 256; value++) \
2682 if (is_ ## nAmE ## _cp(value)) \
2683 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2684 } \
2685 else { \
2686 for (value = 0; value < 256; value++) \
2687 if (!is_ ## nAmE ## _cp(value)) \
2688 ANYOF_BITMAP_SET(data->start_class, value); \
2689 } \
2690 break
2691
2692
2693
76e3520e 2694STATIC I32
40d049e4 2695S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2696 I32 *minlenp, I32 *deltap,
40d049e4
YO
2697 regnode *last,
2698 scan_data_t *data,
2699 I32 stopparen,
2700 U8* recursed,
2701 struct regnode_charclass_class *and_withp,
2702 U32 flags, U32 depth)
c277df42
IZ
2703 /* scanp: Start here (read-write). */
2704 /* deltap: Write maxlen-minlen here. */
2705 /* last: Stop before this one. */
40d049e4
YO
2706 /* data: string data about the pattern */
2707 /* stopparen: treat close N as END */
2708 /* recursed: which subroutines have we recursed into */
2709 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2710{
97aff369 2711 dVAR;
c277df42
IZ
2712 I32 min = 0, pars = 0, code;
2713 regnode *scan = *scanp, *next;
2714 I32 delta = 0;
2715 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2716 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2717 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2718 scan_data_t data_fake;
a3621e74 2719 SV *re_trie_maxbuff = NULL;
786e8c11 2720 regnode *first_non_open = scan;
e2e6a0f1 2721 I32 stopmin = I32_MAX;
8aa23a47 2722 scan_frame *frame = NULL;
a3621e74 2723 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2724
7918f24d
NC
2725 PERL_ARGS_ASSERT_STUDY_CHUNK;
2726
13a24bad 2727#ifdef DEBUGGING
40d049e4 2728 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2729#endif
40d049e4 2730
786e8c11 2731 if ( depth == 0 ) {
40d049e4 2732 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2733 first_non_open=regnext(first_non_open);
2734 }
2735
b81d288d 2736
8aa23a47
YO
2737 fake_study_recurse:
2738 while ( scan && OP(scan) != END && scan < last ){
2739 /* Peephole optimizer: */
304ee84b 2740 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2741 DEBUG_PEEP("Peep",scan,depth);
2742 JOIN_EXACT(scan,&min,0);
2743
2744 /* Follow the next-chain of the current node and optimize
2745 away all the NOTHINGs from it. */
2746 if (OP(scan) != CURLYX) {
2747 const int max = (reg_off_by_arg[OP(scan)]
2748 ? I32_MAX
2749 /* I32 may be smaller than U16 on CRAYs! */
2750 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2751 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2752 int noff;
2753 regnode *n = scan;
2754
2755 /* Skip NOTHING and LONGJMP. */
2756 while ((n = regnext(n))
2757 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2758 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2759 && off + noff < max)
2760 off += noff;
2761 if (reg_off_by_arg[OP(scan)])
2762 ARG(scan) = off;
2763 else
2764 NEXT_OFF(scan) = off;
2765 }
a3621e74 2766
c277df42 2767
8aa23a47
YO
2768
2769 /* The principal pseudo-switch. Cannot be a switch, since we
2770 look into several different things. */
2771 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2772 || OP(scan) == IFTHEN) {
2773 next = regnext(scan);
2774 code = OP(scan);
2775 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2776
2777 if (OP(next) == code || code == IFTHEN) {
2778 /* NOTE - There is similar code to this block below for handling
2779 TRIE nodes on a re-study. If you change stuff here check there
2780 too. */
2781 I32 max1 = 0, min1 = I32_MAX, num = 0;
2782 struct regnode_charclass_class accum;
2783 regnode * const startbranch=scan;
2784
2785 if (flags & SCF_DO_SUBSTR)
304ee84b 2786 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 2787 if (flags & SCF_DO_STCLASS)
e755fd73 2788 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
2789
2790 while (OP(scan) == code) {
2791 I32 deltanext, minnext, f = 0, fake;
2792 struct regnode_charclass_class this_class;
2793
2794 num++;
2795 data_fake.flags = 0;
2796 if (data) {
2797 data_fake.whilem_c = data->whilem_c;
2798 data_fake.last_closep = data->last_closep;
2799 }
2800 else
2801 data_fake.last_closep = &fake;
58e23c8d
YO
2802
2803 data_fake.pos_delta = delta;
8aa23a47
YO
2804 next = regnext(scan);
2805 scan = NEXTOPER(scan);
2806 if (code != BRANCH)
c277df42 2807 scan = NEXTOPER(scan);
8aa23a47 2808 if (flags & SCF_DO_STCLASS) {
e755fd73 2809 cl_init(pRExC_state, &this_class);
8aa23a47
YO
2810 data_fake.start_class = &this_class;
2811 f = SCF_DO_STCLASS_AND;
58e23c8d 2812 }
8aa23a47
YO
2813 if (flags & SCF_WHILEM_VISITED_POS)
2814 f |= SCF_WHILEM_VISITED_POS;
2815
2816 /* we suppose the run is continuous, last=next...*/
2817 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2818 next, &data_fake,
2819 stopparen, recursed, NULL, f,depth+1);
2820 if (min1 > minnext)
2821 min1 = minnext;
2822 if (max1 < minnext + deltanext)
2823 max1 = minnext + deltanext;
2824 if (deltanext == I32_MAX)
2825 is_inf = is_inf_internal = 1;
2826 scan = next;
2827 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2828 pars++;
2829 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2830 if ( stopmin > minnext)
2831 stopmin = min + min1;
2832 flags &= ~SCF_DO_SUBSTR;
2833 if (data)
2834 data->flags |= SCF_SEEN_ACCEPT;
2835 }
2836 if (data) {
2837 if (data_fake.flags & SF_HAS_EVAL)
2838 data->flags |= SF_HAS_EVAL;
2839 data->whilem_c = data_fake.whilem_c;
3dab1dad 2840 }
8aa23a47 2841 if (flags & SCF_DO_STCLASS)
3fffb88a 2842 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
2843 }
2844 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2845 min1 = 0;
2846 if (flags & SCF_DO_SUBSTR) {
2847 data->pos_min += min1;
2848 data->pos_delta += max1 - min1;
2849 if (max1 != min1 || is_inf)
2850 data->longest = &(data->longest_float);
2851 }
2852 min += min1;
2853 delta += max1 - min1;
2854 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 2855 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
2856 if (min1) {
2857 cl_and(data->start_class, and_withp);
2858 flags &= ~SCF_DO_STCLASS;
653099ff 2859 }
8aa23a47
YO
2860 }
2861 else if (flags & SCF_DO_STCLASS_AND) {
2862 if (min1) {
2863 cl_and(data->start_class, &accum);
2864 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2865 }
8aa23a47
YO
2866 else {
2867 /* Switch to OR mode: cache the old value of
2868 * data->start_class */
2869 INIT_AND_WITHP;
2870 StructCopy(data->start_class, and_withp,
2871 struct regnode_charclass_class);
2872 flags &= ~SCF_DO_STCLASS_AND;
2873 StructCopy(&accum, data->start_class,
2874 struct regnode_charclass_class);
2875 flags |= SCF_DO_STCLASS_OR;
2876 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2877 }
8aa23a47 2878 }
a3621e74 2879
8aa23a47
YO
2880 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2881 /* demq.
a3621e74 2882
8aa23a47
YO
2883 Assuming this was/is a branch we are dealing with: 'scan' now
2884 points at the item that follows the branch sequence, whatever
2885 it is. We now start at the beginning of the sequence and look
2886 for subsequences of
a3621e74 2887
8aa23a47
YO
2888 BRANCH->EXACT=>x1
2889 BRANCH->EXACT=>x2
2890 tail
a3621e74 2891
8aa23a47 2892 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2893
486ec47a 2894 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2895 element into a trie and then add the subsequent branch exact
2896 strings to the trie.
a3621e74 2897
8aa23a47 2898 We have two cases
a3621e74 2899
3b753521 2900 1. patterns where the whole set of branches can be converted.
a3621e74 2901
8aa23a47 2902 2. patterns where only a subset can be converted.
a3621e74 2903
8aa23a47
YO
2904 In case 1 we can replace the whole set with a single regop
2905 for the trie. In case 2 we need to keep the start and end
3b753521 2906 branches so
a3621e74 2907
8aa23a47
YO
2908 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2909 becomes BRANCH TRIE; BRANCH X;
786e8c11 2910
8aa23a47
YO
2911 There is an additional case, that being where there is a
2912 common prefix, which gets split out into an EXACT like node
2913 preceding the TRIE node.
a3621e74 2914
8aa23a47
YO
2915 If x(1..n)==tail then we can do a simple trie, if not we make
2916 a "jump" trie, such that when we match the appropriate word
486ec47a 2917 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2918 a nested if into a case structure of sorts.
b515a41d 2919
8aa23a47
YO
2920 */
2921
2922 int made=0;
2923 if (!re_trie_maxbuff) {
2924 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2925 if (!SvIOK(re_trie_maxbuff))
2926 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2927 }
2928 if ( SvIV(re_trie_maxbuff)>=0 ) {
2929 regnode *cur;
2930 regnode *first = (regnode *)NULL;
2931 regnode *last = (regnode *)NULL;
2932 regnode *tail = scan;
2933 U8 optype = 0;
2934 U32 count=0;
a3621e74
YO
2935
2936#ifdef DEBUGGING
8aa23a47 2937 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2938#endif
8aa23a47
YO
2939 /* var tail is used because there may be a TAIL
2940 regop in the way. Ie, the exacts will point to the
2941 thing following the TAIL, but the last branch will
2942 point at the TAIL. So we advance tail. If we
2943 have nested (?:) we may have to move through several
2944 tails.
2945 */
2946
2947 while ( OP( tail ) == TAIL ) {
2948 /* this is the TAIL generated by (?:) */
2949 tail = regnext( tail );
2950 }
a3621e74 2951
8aa23a47
YO
2952
2953 DEBUG_OPTIMISE_r({
2954 regprop(RExC_rx, mysv, tail );
2955 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2956 (int)depth * 2 + 2, "",
2957 "Looking for TRIE'able sequences. Tail node is: ",
2958 SvPV_nolen_const( mysv )
2959 );
2960 });
2961
2962 /*
2963
2964 step through the branches, cur represents each
2965 branch, noper is the first thing to be matched
2966 as part of that branch and noper_next is the
2967 regnext() of that node. if noper is an EXACT
2968 and noper_next is the same as scan (our current
2969 position in the regex) then the EXACT branch is
2970 a possible optimization target. Once we have
486ec47a 2971 two or more consecutive such branches we can
8aa23a47
YO
2972 create a trie of the EXACT's contents and stich
2973 it in place. If the sequence represents all of
2974 the branches we eliminate the whole thing and
2975 replace it with a single TRIE. If it is a
2976 subsequence then we need to stitch it in. This
2977 means the first branch has to remain, and needs
2978 to be repointed at the item on the branch chain
2979 following the last branch optimized. This could
2980 be either a BRANCH, in which case the
2981 subsequence is internal, or it could be the
2982 item following the branch sequence in which
2983 case the subsequence is at the end.
2984
2985 */
2986
2987 /* dont use tail as the end marker for this traverse */
2988 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2989 regnode * const noper = NEXTOPER( cur );
b515a41d 2990#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2991 regnode * const noper_next = regnext( noper );
b515a41d
YO
2992#endif
2993
8aa23a47
YO
2994 DEBUG_OPTIMISE_r({
2995 regprop(RExC_rx, mysv, cur);
2996 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2997 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2998
2999 regprop(RExC_rx, mysv, noper);
3000 PerlIO_printf( Perl_debug_log, " -> %s",
3001 SvPV_nolen_const(mysv));
3002
3003 if ( noper_next ) {
3004 regprop(RExC_rx, mysv, noper_next );
3005 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3006 SvPV_nolen_const(mysv));
3007 }
3008 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3009 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3010 });
3011 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3012 : PL_regkind[ OP( noper ) ] == EXACT )
3013 || OP(noper) == NOTHING )
786e8c11 3014#ifdef NOJUMPTRIE
8aa23a47 3015 && noper_next == tail
786e8c11 3016#endif
8aa23a47
YO
3017 && count < U16_MAX)
3018 {
3019 count++;
3020 if ( !first || optype == NOTHING ) {
3021 if (!first) first = cur;
3022 optype = OP( noper );
3023 } else {
3024 last = cur;
3025 }
3026 } else {
a0a388a1 3027/*
0abd0d78
YO
3028 Currently we do not believe that the trie logic can
3029 handle case insensitive matching properly when the
3030 pattern is not unicode (thus forcing unicode semantics).
3031
3032 If/when this is fixed the following define can be swapped
3033 in below to fully enable trie logic.
3034
f0c16e54
KW
3035 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3036 not /aa
3037
a0a388a1 3038#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
3039
3040*/
f0c16e54 3041#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
0abd0d78 3042
a0a388a1 3043 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
3044 make_trie( pRExC_state,
3045 startbranch, first, cur, tail, count,
3046 optype, depth+1 );
3047 }
3048 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 3049#ifdef NOJUMPTRIE
8aa23a47 3050 && noper_next == tail
786e8c11 3051#endif
8aa23a47
YO
3052 ){
3053 count = 1;
3054 first = cur;
3055 optype = OP( noper );
3056 } else {
3057 count = 0;
3058 first = NULL;
3059 optype = 0;
3060 }
3061 last = NULL;
3062 }
3063 }
3064 DEBUG_OPTIMISE_r({
3065 regprop(RExC_rx, mysv, cur);
3066 PerlIO_printf( Perl_debug_log,
3067 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3068 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3069
3070 });
a0a388a1
YO
3071
3072 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 3073 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 3074#ifdef TRIE_STUDY_OPT
8aa23a47
YO
3075 if ( ((made == MADE_EXACT_TRIE &&
3076 startbranch == first)
3077 || ( first_non_open == first )) &&
3078 depth==0 ) {
3079 flags |= SCF_TRIE_RESTUDY;
3080 if ( startbranch == first
3081 && scan == tail )
3082 {
3083 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3084 }
3085 }
3dab1dad 3086#endif
8aa23a47
YO
3087 }
3088 }
3089
3090 } /* do trie */
3091
653099ff 3092 }
8aa23a47
YO
3093 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3094 scan = NEXTOPER(NEXTOPER(scan));
3095 } else /* single branch is optimized. */
3096 scan = NEXTOPER(scan);
3097 continue;
3098 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3099 scan_frame *newframe = NULL;
3100 I32 paren;
3101 regnode *start;
3102 regnode *end;
3103
3104 if (OP(scan) != SUSPEND) {
3105 /* set the pointer */
3106 if (OP(scan) == GOSUB) {
3107 paren = ARG(scan);
3108 RExC_recurse[ARG2L(scan)] = scan;
3109 start = RExC_open_parens[paren-1];
3110 end = RExC_close_parens[paren-1];
3111 } else {
3112 paren = 0;
f8fc2ecf 3113 start = RExC_rxi->program + 1;
8aa23a47
YO
3114 end = RExC_opend;
3115 }
3116 if (!recursed) {
3117 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3118 SAVEFREEPV(recursed);
3119 }
3120 if (!PAREN_TEST(recursed,paren+1)) {
3121 PAREN_SET(recursed,paren+1);
3122 Newx(newframe,1,scan_frame);
3123 } else {
3124 if (flags & SCF_DO_SUBSTR) {
304ee84b 3125 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3126 data->longest = &(data->longest_float);
3127 }
3128 is_inf = is_inf_internal = 1;
3129 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3130 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3131 flags &= ~SCF_DO_STCLASS;
3132 }
3133 } else {
3134 Newx(newframe,1,scan_frame);
3135 paren = stopparen;
3136 start = scan+2;
3137 end = regnext(scan);
3138 }
3139 if (newframe) {
3140 assert(start);
3141 assert(end);
3142 SAVEFREEPV(newframe);
3143 newframe->next = regnext(scan);
3144 newframe->last = last;
3145 newframe->stop = stopparen;
3146 newframe->prev = frame;
3147
3148 frame = newframe;
3149 scan = start;
3150 stopparen = paren;
3151 last = end;
3152
3153 continue;
3154 }
3155 }
3156 else if (OP(scan) == EXACT) {
3157 I32 l = STR_LEN(scan);
3158 UV uc;
3159 if (UTF) {
3160 const U8 * const s = (U8*)STRING(scan);
3161 l = utf8_length(s, s + l);
3162 uc = utf8_to_uvchr(s, NULL);
3163 } else {
3164 uc = *((U8*)STRING(scan));
3165 }
3166 min += l;
3167 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3168 /* The code below prefers earlier match for fixed
3169 offset, later match for variable offset. */
3170 if (data->last_end == -1) { /* Update the start info. */
3171 data->last_start_min = data->pos_min;
3172 data->last_start_max = is_inf
3173 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3174 }
8aa23a47
YO
3175 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3176 if (UTF)
3177 SvUTF8_on(data->last_found);
3178 {
3179 SV * const sv = data->last_found;
3180 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3181 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3182 if (mg && mg->mg_len >= 0)
3183 mg->mg_len += utf8_length((U8*)STRING(scan),
3184 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3185 }
8aa23a47
YO
3186 data->last_end = data->pos_min + l;
3187 data->pos_min += l; /* As in the first entry. */
3188 data->flags &= ~SF_BEFORE_EOL;
3189 }
3190 if (flags & SCF_DO_STCLASS_AND) {
3191 /* Check whether it is compatible with what we know already! */
3192 int compat = 1;
3193
54251c2e 3194
486ec47a 3195 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3196 * in the bitmp and either 1) its bit or its fold is set, or 2)
3197 * it's for a locale. Even if there isn't unicode semantics
3198 * here, at runtime there may be because of matching against a
3199 * utf8 string, so accept a possible false positive for
3200 * latin1-range folds */
8aa23a47
YO
3201 if (uc >= 0x100 ||
3202 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3203 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3204 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3205 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3206 )
d18bf9dc 3207 {
8aa23a47 3208 compat = 0;
d18bf9dc 3209 }
8aa23a47
YO
3210 ANYOF_CLASS_ZERO(data->start_class);
3211 ANYOF_BITMAP_ZERO(data->start_class);
3212 if (compat)
3213 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3214 else if (uc >= 0x100) {
3215 int i;
3216
3217 /* Some Unicode code points fold to the Latin1 range; as
3218 * XXX temporary code, instead of figuring out if this is
3219 * one, just assume it is and set all the start class bits
3220 * that could be some such above 255 code point's fold
3221 * which will generate fals positives. As the code
3222 * elsewhere that does compute the fold settles down, it
3223 * can be extracted out and re-used here */
3224 for (i = 0; i < 256; i++){
3225 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3226 ANYOF_BITMAP_SET(data->start_class, i);
3227 }
3228 }
3229 }
8aa23a47
YO
3230 data->start_class->flags &= ~ANYOF_EOS;
3231 if (uc < 0x100)
3232 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3233 }
3234 else if (flags & SCF_DO_STCLASS_OR) {
3235 /* false positive possible if the class is case-folded */
3236 if (uc < 0x100)
3237 ANYOF_BITMAP_SET(data->start_class, uc);
3238 else
3239 data->start_class->flags |= ANYOF_UNICODE_ALL;
3240 data->start_class->flags &= ~ANYOF_EOS;
3241 cl_and(data->start_class, and_withp);
3242 }
3243 flags &= ~SCF_DO_STCLASS;
3244 }
3245 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3246 I32 l = STR_LEN(scan);
3247 UV uc = *((U8*)STRING(scan));
3248
3249 /* Search for fixed substrings supports EXACT only. */
3250 if (flags & SCF_DO_SUBSTR) {
3251 assert(data);
304ee84b 3252 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3253 }
3254 if (UTF) {
3255 const U8 * const s = (U8 *)STRING(scan);
3256 l = utf8_length(s, s + l);
3257 uc = utf8_to_uvchr(s, NULL);
3258 }
3259 min += l;
3260 if (flags & SCF_DO_SUBSTR)
3261 data->pos_min += l;
3262 if (flags & SCF_DO_STCLASS_AND) {
3263 /* Check whether it is compatible with what we know already! */
3264 int compat = 1;
8aa23a47 3265 if (uc >= 0x100 ||
54251c2e
KW
3266 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3267 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3268 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3269 {
8aa23a47 3270 compat = 0;
54251c2e 3271 }
8aa23a47
YO
3272 ANYOF_CLASS_ZERO(data->start_class);
3273 ANYOF_BITMAP_ZERO(data->start_class);
3274 if (compat) {
3275 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3276 data->start_class->flags &= ~ANYOF_EOS;
39065660 3277 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3278 if (OP(scan) == EXACTFL) {
af302e7f
KW
3279 /* XXX This set is probably no longer necessary, and
3280 * probably wrong as LOCALE now is on in the initial
3281 * state */
8aa23a47 3282 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3283 }
3284 else {
3285
54251c2e
KW
3286 /* Also set the other member of the fold pair. In case
3287 * that unicode semantics is called for at runtime, use
3288 * the full latin1 fold. (Can't do this for locale,
3289 * because not known until runtime */
3290 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3291 }
653099ff 3292 }
d18bf9dc
KW
3293 else if (uc >= 0x100) {
3294 int i;
3295 for (i = 0; i < 256; i++){
3296 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3297 ANYOF_BITMAP_SET(data->start_class, i);
3298 }
3299 }
3300 }
8aa23a47
YO
3301 }
3302 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3303 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3304 /* false positive possible if the class is case-folded.
3305 Assume that the locale settings are the same... */
970c8436 3306 if (uc < 0x100) {
1aa99e6b 3307 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3308 if (OP(scan) != EXACTFL) {
3309
3310 /* And set the other member of the fold pair, but
3311 * can't do that in locale because not known until
3312 * run-time */
3313 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3314 PL_fold_latin1[uc]);
970c8436
KW
3315 }
3316 }
653099ff
GS
3317 data->start_class->flags &= ~ANYOF_EOS;
3318 }
8aa23a47 3319 cl_and(data->start_class, and_withp);
653099ff 3320 }
8aa23a47
YO
3321 flags &= ~SCF_DO_STCLASS;
3322 }
e52fc539 3323 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3324 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3325 I32 f = flags, pos_before = 0;
3326 regnode * const oscan = scan;
3327 struct regnode_charclass_class this_class;
3328 struct regnode_charclass_class *oclass = NULL;
3329 I32 next_is_eval = 0;
3330
3331 switch (PL_regkind[OP(scan)]) {
3332 case WHILEM: /* End of (?:...)* . */
3333 scan = NEXTOPER(scan);
3334 goto finish;
3335 case PLUS:
3336 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3337 next = NEXTOPER(scan);
3338 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3339 mincount = 1;
3340 maxcount = REG_INFTY;
3341 next = regnext(scan);
3342 scan = NEXTOPER(scan);
3343 goto do_curly;
3344 }
3345 }
3346 if (flags & SCF_DO_SUBSTR)
3347 data->pos_min++;
3348 min++;
3349 /* Fall through. */
3350 case STAR:
3351 if (flags & SCF_DO_STCLASS) {
3352 mincount = 0;
3353 maxcount = REG_INFTY;
3354 next = regnext(scan);
3355 scan = NEXTOPER(scan);
3356 goto do_curly;
3357 }
3358 is_inf = is_inf_internal = 1;
3359 scan = regnext(scan);
c277df42 3360 if (flags & SCF_DO_SUBSTR) {
304ee84b 3361 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3362 data->longest = &(data->longest_float);
c277df42 3363 }
8aa23a47
YO
3364 goto optimize_curly_tail;
3365 case CURLY:
3366 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3367 && (scan->flags == stopparen))
3368 {
3369 mincount = 1;
3370 maxcount = 1;
3371 } else {
3372 mincount = ARG1(scan);
3373 maxcount = ARG2(scan);
653099ff 3374 }
8aa23a47
YO
3375 next = regnext(scan);
3376 if (OP(scan) == CURLYX) {
3377 I32 lp = (data ? *(data->last_closep) : 0);
3378 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3379 }
8aa23a47
YO
3380 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3381 next_is_eval = (OP(scan) == EVAL);
3382 do_curly:
3383 if (flags & SCF_DO_SUBSTR) {
304ee84b 3384 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3385 pos_before = data->pos_min;
b45f050a 3386 }
8aa23a47
YO
3387 if (data) {
3388 fl = data->flags;
3389 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3390 if (is_inf)
3391 data->flags |= SF_IS_INF;
3392 }
3393 if (flags & SCF_DO_STCLASS) {
e755fd73 3394 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3395 oclass = data->start_class;
3396 data->start_class = &this_class;
3397 f |= SCF_DO_STCLASS_AND;
3398 f &= ~SCF_DO_STCLASS_OR;
3399 }
779bcb7d
NC
3400 /* Exclude from super-linear cache processing any {n,m}
3401 regops for which the combination of input pos and regex
3402 pos is not enough information to determine if a match
3403 will be possible.
3404
3405 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3406 regex pos at the \s*, the prospects for a match depend not
3407 only on the input position but also on how many (bar\s*)
3408 repeats into the {4,8} we are. */
3409 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3410 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3411
8aa23a47
YO
3412 /* This will finish on WHILEM, setting scan, or on NULL: */
3413 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3414 last, data, stopparen, recursed, NULL,
3415 (mincount == 0
3416 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3417
8aa23a47
YO
3418 if (flags & SCF_DO_STCLASS)
3419 data->start_class = oclass;
3420 if (mincount == 0 || minnext == 0) {
3421 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3422 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3423 }
3424 else if (flags & SCF_DO_STCLASS_AND) {
3425 /* Switch to OR mode: cache the old value of
3426 * data->start_class */
3427 INIT_AND_WITHP;
3428 StructCopy(data->start_class, and_withp,
3429 struct regnode_charclass_class);
3430 flags &= ~SCF_DO_STCLASS_AND;
3431 StructCopy(&this_class, data->start_class,
3432 struct regnode_charclass_class);
3433 flags |= SCF_DO_STCLASS_OR;
3434 data->start_class->flags |= ANYOF_EOS;
3435 }
3436 } else { /* Non-zero len */
3437 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3438 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3439 cl_and(data->start_class, and_withp);
3440 }
3441 else if (flags & SCF_DO_STCLASS_AND)
3442 cl_and(data->start_class, &this_class);
3443 flags &= ~SCF_DO_STCLASS;
3444 }
3445 if (!scan) /* It was not CURLYX, but CURLY. */
3446 scan = next;
3447 if ( /* ? quantifier ok, except for (?{ ... }) */
3448 (next_is_eval || !(mincount == 0 && maxcount == 1))
3449 && (minnext == 0) && (deltanext == 0)
3450 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3451 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3452 {
668c081a
NC
3453 ckWARNreg(RExC_parse,
3454 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3455 }
3456
3457 min += minnext * mincount;
3458 is_inf_internal |= ((maxcount == REG_INFTY
3459 && (minnext + deltanext) > 0)
3460 || deltanext == I32_MAX);
3461 is_inf |= is_inf_internal;
3462 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3463
3464 /* Try powerful optimization CURLYX => CURLYN. */
3465 if ( OP(oscan) == CURLYX && data
3466 && data->flags & SF_IN_PAR
3467 && !(data->flags & SF_HAS_EVAL)
3468 && !deltanext && minnext == 1 ) {
3469 /* Try to optimize to CURLYN. */
3470 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3471 regnode * const nxt1 = nxt;
497b47a8 3472#ifdef DEBUGGING
8aa23a47 3473 regnode *nxt2;
497b47a8 3474#endif
c277df42 3475
8aa23a47
YO
3476 /* Skip open. */
3477 nxt = regnext(nxt);
e52fc539 3478 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3479 && !(PL_regkind[OP(nxt)] == EXACT
3480 && STR_LEN(nxt) == 1))
3481 goto nogo;
497b47a8 3482#ifdef DEBUGGING
8aa23a47 3483 nxt2 = nxt;
497b47a8 3484#endif
8aa23a47
YO
3485 nxt = regnext(nxt);
3486 if (OP(nxt) != CLOSE)
3487 goto nogo;
3488 if (RExC_open_parens) {
3489 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3490 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3491 }
3492 /* Now we know that nxt2 is the only contents: */
3493 oscan->flags = (U8)ARG(nxt);
3494 OP(oscan) = CURLYN;
3495 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3496
c277df42 3497#ifdef DEBUGGING
8aa23a47 3498 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3499 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3500 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3501 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3502 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3503 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3504#endif
8aa23a47
YO
3505 }
3506 nogo:
3507
3508 /* Try optimization CURLYX => CURLYM. */
3509 if ( OP(oscan) == CURLYX && data
3510 && !(data->flags & SF_HAS_PAR)
3511 && !(data->flags & SF_HAS_EVAL)
3512 && !deltanext /* atom is fixed width */
3513 && minnext != 0 /* CURLYM can't handle zero width */
3514 ) {
3515 /* XXXX How to optimize if data == 0? */
3516 /* Optimize to a simpler form. */
3517 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3518 regnode *nxt2;
3519
3520 OP(oscan) = CURLYM;
3521 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3522 && (OP(nxt2) != WHILEM))
3523 nxt = nxt2;
3524 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3525 /* Need to optimize away parenths. */
b3c0965f 3526 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3527 /* Set the parenth number. */
3528 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3529
8aa23a47
YO
3530 oscan->flags = (U8)ARG(nxt);
3531 if (RExC_open_parens) {
3532 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3533 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3534 }
8aa23a47
YO
3535 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3536 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3537
c277df42 3538#ifdef DEBUGGING
8aa23a47
YO
3539 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3540 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3541 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3542 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3543#endif
c277df42 3544#if 0
8aa23a47
YO
3545 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3546 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3547 if (nnxt == nxt) {
3548 if (reg_off_by_arg[OP(nxt1)])
3549 ARG_SET(nxt1, nxt2 - nxt1);
3550 else if (nxt2 - nxt1 < U16_MAX)
3551 NEXT_OFF(nxt1) = nxt2 - nxt1;
3552 else
3553 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3554 }
8aa23a47 3555 nxt1 = nnxt;
c277df42 3556 }
5d1c421c 3557#endif
8aa23a47
YO
3558 /* Optimize again: */
3559 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3560 NULL, stopparen, recursed, NULL, 0,depth+1);
3561 }
3562 else
3563 oscan->flags = 0;
3564 }
3565 else if ((OP(oscan) == CURLYX)
3566 && (flags & SCF_WHILEM_VISITED_POS)
3567 /* See the comment on a similar expression above.
3b753521 3568 However, this time it's not a subexpression
8aa23a47
YO
3569 we care about, but the expression itself. */
3570 && (maxcount == REG_INFTY)
3571 && data && ++data->whilem_c < 16) {
3572 /* This stays as CURLYX, we can put the count/of pair. */
3573 /* Find WHILEM (as in regexec.c) */
3574 regnode *nxt = oscan + NEXT_OFF(oscan);
3575
3576 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3577 nxt += ARG(nxt);
3578 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3579 | (RExC_whilem_seen << 4)); /* On WHILEM */
3580 }
3581 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3582 pars++;
3583 if (flags & SCF_DO_SUBSTR) {
3584 SV *last_str = NULL;
3585 int counted = mincount != 0;
a0ed51b3 3586
8aa23a47
YO
3587 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3588#if defined(SPARC64_GCC_WORKAROUND)
3589 I32 b = 0;
3590 STRLEN l = 0;
3591 const char *s = NULL;
3592 I32 old = 0;
b515a41d 3593
8aa23a47
YO
3594 if (pos_before >= data->last_start_min)
3595 b = pos_before;
3596 else
3597 b = data->last_start_min;
b515a41d 3598
8aa23a47
YO
3599 l = 0;
3600 s = SvPV_const(data->last_found, l);
3601 old = b - data->last_start_min;
3602
3603#else
3604 I32 b = pos_before >= data->last_start_min
3605 ? pos_before : data->last_start_min;
3606 STRLEN l;
3607 const char * const s = SvPV_const(data->last_found, l);
3608 I32 old = b - data->last_start_min;
3609#endif
3610
3611 if (UTF)
3612 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3613 l -= old;
3614 /* Get the added string: */
740cce10 3615 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3616 if (deltanext == 0 && pos_before == b) {
3617 /* What was added is a constant string */
3618 if (mincount > 1) {
3619 SvGROW(last_str, (mincount * l) + 1);
3620 repeatcpy(SvPVX(last_str) + l,
3621 SvPVX_const(last_str), l, mincount - 1);
3622 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3623 /* Add additional parts. */
3624 SvCUR_set(data->last_found,
3625 SvCUR(data->last_found) - l);
3626 sv_catsv(data->last_found, last_str);
3627 {
3628 SV * sv = data->last_found;
3629 MAGIC *mg =
3630 SvUTF8(sv) && SvMAGICAL(sv) ?
3631 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3632 if (mg && mg->mg_len >= 0)
bd94e887 3633 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3634 }
8aa23a47 3635 data->last_end += l * (mincount - 1);
b515a41d 3636 }
8aa23a47
YO
3637 } else {
3638 /* start offset must point into the last copy */
3639 data->last_start_min += minnext * (mincount - 1);
3640 data->last_start_max += is_inf ? I32_MAX
3641 : (maxcount - 1) * (minnext + data->pos_delta);
3642 }
c277df42 3643 }
8aa23a47
YO
3644 /* It is counted once already... */
3645 data->pos_min += minnext * (mincount - counted);
3646 data->pos_delta += - counted * deltanext +
3647 (minnext + deltanext) * maxcount - minnext * mincount;
3648 if (mincount != maxcount) {
3649 /* Cannot extend fixed substrings found inside
3650 the group. */
304ee84b 3651 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3652 if (mincount && last_str) {
3653 SV * const sv = data->last_found;
3654 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3655 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3656
3657 if (mg)
3658 mg->mg_len = -1;
3659 sv_setsv(sv, last_str);
3660 data->last_end = data->pos_min;
3661 data->last_start_min =
3662 data->pos_min - CHR_SVLEN(last_str);
3663 data->last_start_max = is_inf
3664 ? I32_MAX
3665 : data->pos_min + data->pos_delta
3666 - CHR_SVLEN(last_str);
3667 }
3668 data->longest = &(data->longest_float);
3669 }
3670 SvREFCNT_dec(last_str);
c277df42 3671 }
8aa23a47
YO
3672 if (data && (fl & SF_HAS_EVAL))
3673 data->flags |= SF_HAS_EVAL;
3674 optimize_curly_tail:
3675 if (OP(oscan) != CURLYX) {
3676 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3677 && NEXT_OFF(next))
3678 NEXT_OFF(oscan) += NEXT_OFF(next);
3679 }
3680 continue;
f56b6394 3681 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3682 if (flags & SCF_DO_SUBSTR) {
304ee84b 3683 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3684 data->longest = &(data->longest_float);
3685 }
3686 is_inf = is_inf_internal = 1;
3687 if (flags & SCF_DO_STCLASS_OR)
3fffb88a 3688 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3689 flags &= ~SCF_DO_STCLASS;
3690 break;
c277df42 3691 }
8aa23a47 3692 }
e1d1eefb
YO
3693 else if (OP(scan) == LNBREAK) {
3694 if (flags & SCF_DO_STCLASS) {
3695 int value = 0;
3696 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3697 if (flags & SCF_DO_STCLASS_AND) {
3698 for (value = 0; value < 256; value++)
e64b1bd1 3699 if (!is_VERTWS_cp(value))
b9a59e08
KW
3700 ANYOF_BITMAP_CLEAR(data->start_class, value);
3701 }
3702 else {
e1d1eefb 3703 for (value = 0; value < 256; value++)
e64b1bd1 3704 if (is_VERTWS_cp(value))
b9a59e08
KW
3705 ANYOF_BITMAP_SET(data->start_class, value);
3706 }
e1d1eefb
YO
3707 if (flags & SCF_DO_STCLASS_OR)
3708 cl_and(data->start_class, and_withp);
3709 flags &= ~SCF_DO_STCLASS;
3710 }
3711 min += 1;
f9a79580 3712 delta += 1;
e1d1eefb
YO
3713 if (flags & SCF_DO_SUBSTR) {
3714 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3715 data->pos_min += 1;
f9a79580 3716 data->pos_delta += 1;
e1d1eefb
YO
3717 data->longest = &(data->longest_float);
3718 }
e1d1eefb 3719 }
f9a79580 3720 else if (OP(scan) == FOLDCHAR) {
ced7f090 3721 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3722 flags &= ~SCF_DO_STCLASS;
3723 min += 1;
3724 delta += d;
3725 if (flags & SCF_DO_SUBSTR) {
3726 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3727 data->pos_min += 1;
3728 data->pos_delta += d;
3729 data->longest = &(data->longest_float);
3730 }
3731 }
e52fc539 3732 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3733 int value = 0;
653099ff 3734
8aa23a47 3735 if (flags & SCF_DO_SUBSTR) {
304ee84b 3736 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3737 data->pos_min++;
3738 }
3739 min++;
3740 if (flags & SCF_DO_STCLASS) {
3741 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3742
8aa23a47
YO
3743 /* Some of the logic below assumes that switching
3744 locale on will only add false positives. */
3745 switch (PL_regkind[OP(scan)]) {
3746 case SANY:
3747 default:
3748 do_default:
3749 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3750 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3751 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3752 break;
3753 case REG_ANY:
3754 if (OP(scan) == SANY)
3755 goto do_default;
3756 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3757 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3758 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3fffb88a 3759 cl_anything(pRExC_state, data->start_class);
653099ff 3760 }
8aa23a47
YO
3761 if (flags & SCF_DO_STCLASS_AND || !value)
3762 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3763 break;
3764 case ANYOF:
3765 if (flags & SCF_DO_STCLASS_AND)
3766 cl_and(data->start_class,
3767 (struct regnode_charclass_class*)scan);
653099ff 3768 else
3fffb88a 3769 cl_or(pRExC_state, data->start_class,
8aa23a47
YO
3770 (struct regnode_charclass_class*)scan);
3771 break;
3772 case ALNUM:
3773 if (flags & SCF_DO_STCLASS_AND) {
3774 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3775 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 3776 if (OP(scan) == ALNUMU) {
a12cf05f
KW
3777 for (value = 0; value < 256; value++) {
3778 if (!isWORDCHAR_L1(value)) {
3779 ANYOF_BITMAP_CLEAR(data->start_class, value);
3780 }
3781 }
3782 } else {
3783 for (value = 0; value < 256; value++) {
3784 if (!isALNUM(value)) {
3785 ANYOF_BITMAP_CLEAR(data->start_class, value);
3786 }
3787 }
3788 }
8aa23a47 3789 }
653099ff 3790 }
8aa23a47
YO
3791 else {
3792 if (data->start_class->flags & ANYOF_LOCALE)
3793 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
af302e7f
KW
3794
3795 /* Even if under locale, set the bits for non-locale
3796 * in case it isn't a true locale-node. This will
3797 * create false positives if it truly is locale */
3798 if (OP(scan) == ALNUMU) {
a12cf05f
KW
3799 for (value = 0; value < 256; value++) {
3800 if (isWORDCHAR_L1(value)) {
3801 ANYOF_BITMAP_SET(data->start_class, value);
3802 }
3803 }
3804 } else {
3805 for (value = 0; value < 256; value++) {
3806 if (isALNUM(value)) {
3807 ANYOF_BITMAP_SET(data->start_class, value);
3808 }
3809 }
3810 }
8aa23a47
YO
3811 }
3812 break;
8aa23a47
YO
3813 case NALNUM:
3814 if (flags & SCF_DO_STCLASS_AND) {
3815 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3816 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 3817 if (OP(scan) == NALNUMU) {
a12cf05f
KW
3818 for (value = 0; value < 256; value++) {
3819 if (isWORDCHAR_L1(value)) {
3820 ANYOF_BITMAP_CLEAR(data->start_class, value);
3821 }
3822 }
3823 } else {
3824 for (value = 0; value < 256; value++) {
3825 if (isALNUM(value)) {
3826 ANYOF_BITMAP_CLEAR(data->start_class, value);
3827 }
3828 }
3829 }
653099ff
GS
3830 }
3831 }
8aa23a47
YO
3832 else {
3833 if (data->start_class->flags & ANYOF_LOCALE)
3834 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
af302e7f 3835
75950e1c
KW
3836 /* Even if under locale, set the bits for non-locale in
3837 * case it isn't a true locale-node. This will create
3838 * false positives if it truly is locale */
3839 if (OP(scan) == NALNUMU) {
3840 for (value = 0; value < 256; value++) {
3841 if (! isWORDCHAR_L1(value)) {
3842 ANYOF_BITMAP_SET(data->start_class, value);
3843 }
e9a9c1bc 3844 }
75950e1c
KW
3845 } else {
3846 for (value = 0; value < 256; value++) {
3847 if (! isALNUM(value)) {
3848 ANYOF_BITMAP_SET(data->start_class, value);
3849 }
3850 }
3851 }
653099ff 3852 }
8aa23a47 3853 break;
8aa23a47
YO
3854 case SPACE:
3855 if (flags & SCF_DO_STCLASS_AND) {
3856 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3857 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
980866de 3858 if (OP(scan) == SPACEU) {
a12cf05f
KW
3859 for (value = 0; value < 256; value++) {
3860 if (!isSPACE_L1(value)) {
3861 ANYOF_BITMAP_CLEAR(data->start_class, value);
3862 }
3863 }
3864 } else {
3865 for (value = 0; value < 256; value++) {
3866 if (!isSPACE(value)) {
3867 ANYOF_BITMAP_CLEAR(data->start_class, value);
3868 }
3869 }
3870 }
653099ff
GS
3871 }
3872 }
8aa23a47 3873 else {
a12cf05f 3874 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3875 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f 3876 }
af302e7f 3877 if (OP(scan) == SPACEU) {
a12cf05f
KW
3878 for (value = 0; value < 256; value++) {
3879 if (isSPACE_L1(value)) {
3880 ANYOF_BITMAP_SET(data->start_class, value);
3881 }
3882 }
3883 } else {
3884 for (value = 0; value < 256; value++) {
3885 if (isSPACE(value)) {
3886 ANYOF_BITMAP_SET(data->start_class, value);
3887 }
3888 }
8aa23a47 3889 }
653099ff 3890 }
8aa23a47 3891 break;
8aa23a47
YO
3892 case NSPACE:
3893 if (flags & SCF_DO_STCLASS_AND) {
3894 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3895 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
980866de 3896 if (OP(scan) == NSPACEU) {
a12cf05f
KW
3897 for (value = 0; value < 256; value++) {
3898 if (isSPACE_L1(value)) {
3899 ANYOF_BITMAP_CLEAR(data->start_class, value);
3900 }
3901 }
3902 } else {
3903 for (value = 0; value < 256; value++) {
3904 if (isSPACE(value)) {
3905 ANYOF_BITMAP_CLEAR(data->start_class, value);
3906 }
3907 }
3908 }
653099ff 3909 }
8aa23a47
YO
3910 }
3911 else {
3912 if (data->start_class->flags & ANYOF_LOCALE)
3913 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
af302e7f 3914 if (OP(scan) == NSPACEU) {
a12cf05f
KW
3915 for (value = 0; value < 256; value++) {
3916 if (!isSPACE_L1(value)) {
3917 ANYOF_BITMAP_SET(data->start_class, value);
3918 }
3919 }
3920 }
3921 else {
3922 for (value = 0; value < 256; value++) {
3923 if (!isSPACE(value)) {
3924 ANYOF_BITMAP_SET(data->start_class, value);
3925 }
3926 }
3927 }
653099ff 3928 }
8aa23a47 3929 break;
8aa23a47
YO
3930 case DIGIT:
3931 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 3932 if (!(data->start_class->flags & ANYOF_LOCALE)) {
bf3c5c06
KW
3933 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3934 for (value = 0; value < 256; value++)
3935 if (!isDIGIT(value))
3936 ANYOF_BITMAP_CLEAR(data->start_class, value);
bcc0256f 3937 }
8aa23a47
YO
3938 }
3939 else {
3940 if (data->start_class->flags & ANYOF_LOCALE)
3941 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
75950e1c
KW
3942 for (value = 0; value < 256; value++)
3943 if (isDIGIT(value))
3944 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3945 }
3946 break;
3947 case NDIGIT:
3948 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 3949 if (!(data->start_class->flags & ANYOF_LOCALE))
bf3c5c06 3950 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
8aa23a47
YO
3951 for (value = 0; value < 256; value++)
3952 if (isDIGIT(value))
3953 ANYOF_BITMAP_CLEAR(data->start_class, value);
3954 }
3955 else {
3956 if (data->start_class->flags & ANYOF_LOCALE)
3957 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
75950e1c
KW
3958 for (value = 0; value < 256; value++)
3959 if (!isDIGIT(value))
3960 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3961 }
8aa23a47 3962 break;
e1d1eefb
YO
3963 CASE_SYNST_FNC(VERTWS);
3964 CASE_SYNST_FNC(HORIZWS);
3965
8aa23a47
YO
3966 }
3967 if (flags & SCF_DO_STCLASS_OR)
3968 cl_and(data->start_class, and_withp);
3969 flags &= ~SCF_DO_STCLASS;
3970 }
3971 }
3972 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3973 data->flags |= (OP(scan) == MEOL
3974 ? SF_BEFORE_MEOL
3975 : SF_BEFORE_SEOL);
3976 }
3977 else if ( PL_regkind[OP(scan)] == BRANCHJ
3978 /* Lookbehind, or need to calculate parens/evals/stclass: */
3979 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3980 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3981 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3982 || OP(scan) == UNLESSM )
3983 {
3984 /* Negative Lookahead/lookbehind
3985 In this case we can't do fixed string optimisation.
3986 */
1de06328 3987
8aa23a47
YO
3988 I32 deltanext, minnext, fake = 0;
3989 regnode *nscan;
3990 struct regnode_charclass_class intrnl;
3991 int f = 0;
1de06328 3992
8aa23a47
YO
3993 data_fake.flags = 0;
3994 if (data) {
3995 data_fake.whilem_c = data->whilem_c;
3996 data_fake.last_closep = data->last_closep;
c277df42 3997 }
8aa23a47
YO
3998 else
3999 data_fake.last_closep = &fake;
58e23c8d 4000 data_fake.pos_delta = delta;
8aa23a47
YO
4001 if ( flags & SCF_DO_STCLASS && !scan->flags
4002 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4003 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4004 data_fake.start_class = &intrnl;
4005 f |= SCF_DO_STCLASS_AND;
4006 }
4007 if (flags & SCF_WHILEM_VISITED_POS)
4008 f |= SCF_WHILEM_VISITED_POS;
4009 next = regnext(scan);
4010 nscan = NEXTOPER(NEXTOPER(scan));
4011 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4012 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4013 if (scan->flags) {
4014 if (deltanext) {
58e23c8d 4015 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4016 }
4017 else if (minnext > (I32)U8_MAX) {
58e23c8d 4018 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4019 }
4020 scan->flags = (U8)minnext;
4021 }
4022 if (data) {
4023 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4024 pars++;
4025 if (data_fake.flags & SF_HAS_EVAL)
4026 data->flags |= SF_HAS_EVAL;
4027 data->whilem_c = data_fake.whilem_c;
4028 }
4029 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
4030 if (flags & SCF_DO_STCLASS_OR) {
4031 /* OR before, AND after: ideally we would recurse with
4032 * data_fake to get the AND applied by study of the
4033 * remainder of the pattern, and then derecurse;
4034 * *** HACK *** for now just treat as "no information".
4035 * See [perl #56690].
4036 */
e755fd73 4037 cl_init(pRExC_state, data->start_class);
906cdd2b
HS
4038 } else {
4039 /* AND before and after: combine and continue */
4040 const int was = (data->start_class->flags & ANYOF_EOS);
4041
4042 cl_and(data->start_class, &intrnl);
4043 if (was)
4044 data->start_class->flags |= ANYOF_EOS;
4045 }
8aa23a47 4046 }
cb434fcc 4047 }
8aa23a47
YO
4048#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4049 else {
4050 /* Positive Lookahead/lookbehind
4051 In this case we can do fixed string optimisation,
4052 but we must be careful about it. Note in the case of
4053 lookbehind the positions will be offset by the minimum
4054 length of the pattern, something we won't know about
4055 until after the recurse.
4056 */
4057 I32 deltanext, fake = 0;
4058 regnode *nscan;
4059 struct regnode_charclass_class intrnl;
4060 int f = 0;
4061 /* We use SAVEFREEPV so that when the full compile
4062 is finished perl will clean up the allocated
3b753521 4063 minlens when it's all done. This way we don't
8aa23a47
YO
4064 have to worry about freeing them when we know
4065 they wont be used, which would be a pain.
4066 */
4067 I32 *minnextp;
4068 Newx( minnextp, 1, I32 );
4069 SAVEFREEPV(minnextp);
4070
4071 if (data) {
4072 StructCopy(data, &data_fake, scan_data_t);
4073 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4074 f |= SCF_DO_SUBSTR;
4075 if (scan->flags)
304ee84b 4076 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
4077 data_fake.last_found=newSVsv(data->last_found);
4078 }
4079 }
4080 else
4081 data_fake.last_closep = &fake;
4082 data_fake.flags = 0;
58e23c8d 4083 data_fake.pos_delta = delta;
8aa23a47
YO
4084 if (is_inf)
4085 data_fake.flags |= SF_IS_INF;
4086 if ( flags & SCF_DO_STCLASS && !scan->flags
4087 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4088 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4089 data_fake.start_class = &intrnl;
4090 f |= SCF_DO_STCLASS_AND;
4091 }
4092 if (flags & SCF_WHILEM_VISITED_POS)
4093 f |= SCF_WHILEM_VISITED_POS;
4094 next = regnext(scan);
4095 nscan = NEXTOPER(NEXTOPER(scan));
4096
4097 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4098 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4099 if (scan->flags) {
4100 if (deltanext) {
58e23c8d 4101 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4102 }
4103 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 4104 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4105 }
4106 scan->flags = (U8)*minnextp;
4107 }
4108
4109 *minnextp += min;
4110
4111 if (f & SCF_DO_STCLASS_AND) {
4112 const int was = (data->start_class->flags & ANYOF_EOS);
4113
4114 cl_and(data->start_class, &intrnl);
4115 if (was)
4116 data->start_class->flags |= ANYOF_EOS;
4117 }
4118 if (data) {
4119 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4120 pars++;
4121 if (data_fake.flags & SF_HAS_EVAL)
4122 data->flags |= SF_HAS_EVAL;
4123 data->whilem_c = data_fake.whilem_c;
4124 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4125 if (RExC_rx->minlen<*minnextp)
4126 RExC_rx->minlen=*minnextp;
304ee84b 4127 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4128 SvREFCNT_dec(data_fake.last_found);
4129
4130 if ( data_fake.minlen_fixed != minlenp )
4131 {
4132 data->offset_fixed= data_fake.offset_fixed;
4133 data->minlen_fixed= data_fake.minlen_fixed;
4134 data->lookbehind_fixed+= scan->flags;
4135 }
4136 if ( data_fake.minlen_float != minlenp )
4137 {
4138 data->minlen_float= data_fake.minlen_float;
4139 data->offset_float_min=data_fake.offset_float_min;
4140 data->offset_float_max=data_fake.offset_float_max;
4141 data->lookbehind_float+= scan->flags;
4142 }
4143 }
4144 }
4145
4146
40d049e4 4147 }
8aa23a47
YO
4148#endif
4149 }
4150 else if (OP(scan) == OPEN) {
4151 if (stopparen != (I32)ARG(scan))
4152 pars++;
4153 }
4154 else if (OP(scan) == CLOSE) {
4155 if (stopparen == (I32)ARG(scan)) {
4156 break;
4157 }
4158 if ((I32)ARG(scan) == is_par) {
4159 next = regnext(scan);
b515a41d 4160
8aa23a47
YO
4161 if ( next && (OP(next) != WHILEM) && next < last)
4162 is_par = 0; /* Disable optimization */
40d049e4 4163 }
8aa23a47
YO
4164 if (data)
4165 *(data->last_closep) = ARG(scan);
4166 }
4167 else if (OP(scan) == EVAL) {
c277df42
IZ
4168 if (data)
4169 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4170 }
4171 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4172 if (flags & SCF_DO_SUBSTR) {
304ee84b 4173 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4174 flags &= ~SCF_DO_SUBSTR;
40d049e4 4175 }
8aa23a47
YO
4176 if (data && OP(scan)==ACCEPT) {
4177 data->flags |= SCF_SEEN_ACCEPT;
4178 if (stopmin > min)
4179 stopmin = min;
e2e6a0f1 4180 }
8aa23a47
YO
4181 }
4182 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4183 {
0f5d15d6 4184 if (flags & SCF_DO_SUBSTR) {
304ee84b 4185 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4186 data->longest = &(data->longest_float);
4187 }
4188 is_inf = is_inf_internal = 1;
653099ff 4189 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 4190 cl_anything(pRExC_state, data->start_class);
96776eda 4191 flags &= ~SCF_DO_STCLASS;
8aa23a47 4192 }
58e23c8d 4193 else if (OP(scan) == GPOS) {
bbe252da 4194 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4195 !(delta || is_inf || (data && data->pos_delta)))
4196 {
bbe252da
YO
4197 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4198 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4199 if (RExC_rx->gofs < (U32)min)
4200 RExC_rx->gofs = min;
4201 } else {
bbe252da 4202 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4203 RExC_rx->gofs = 0;
4204 }
4205 }
786e8c11 4206#ifdef TRIE_STUDY_OPT
40d049e4 4207#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4208 else if (PL_regkind[OP(scan)] == TRIE) {
4209 /* NOTE - There is similar code to this block above for handling
4210 BRANCH nodes on the initial study. If you change stuff here
4211 check there too. */
4212 regnode *trie_node= scan;
4213 regnode *tail= regnext(scan);
f8fc2ecf 4214 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4215 I32 max1 = 0, min1 = I32_MAX;
4216 struct regnode_charclass_class accum;
4217
4218 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4219 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47 4220 if (flags & SCF_DO_STCLASS)
e755fd73 4221 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
4222
4223 if (!trie->jump) {
4224 min1= trie->minlen;
4225 max1= trie->maxlen;
4226 } else {
4227 const regnode *nextbranch= NULL;
4228 U32 word;
4229
4230 for ( word=1 ; word <= trie->wordcount ; word++)
4231 {
4232 I32 deltanext=0, minnext=0, f = 0, fake;
4233 struct regnode_charclass_class this_class;
4234
4235 data_fake.flags = 0;
4236 if (data) {
4237 data_fake.whilem_c = data->whilem_c;
4238 data_fake.last_closep = data->last_closep;
4239 }
4240 else
4241 data_fake.last_closep = &fake;
58e23c8d 4242 data_fake.pos_delta = delta;
8aa23a47 4243 if (flags & SCF_DO_STCLASS) {
e755fd73 4244 cl_init(pRExC_state, &this_class);
8aa23a47
YO
4245 data_fake.start_class = &this_class;
4246 f = SCF_DO_STCLASS_AND;
4247 }
4248 if (flags & SCF_WHILEM_VISITED_POS)
4249 f |= SCF_WHILEM_VISITED_POS;
4250
4251 if (trie->jump[word]) {
4252 if (!nextbranch)
4253 nextbranch = trie_node + trie->jump[0];
4254 scan= trie_node + trie->jump[word];
4255 /* We go from the jump point to the branch that follows
4256 it. Note this means we need the vestigal unused branches
4257 even though they arent otherwise used.
4258 */
4259 minnext = study_chunk(pRExC_state, &scan, minlenp,
4260 &deltanext, (regnode *)nextbranch, &data_fake,
4261 stopparen, recursed, NULL, f,depth+1);
4262 }
4263 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4264 nextbranch= regnext((regnode*)nextbranch);
4265
4266 if (min1 > (I32)(minnext + trie->minlen))
4267 min1 = minnext + trie->minlen;
4268 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4269 max1 = minnext + deltanext + trie->maxlen;
4270 if (deltanext == I32_MAX)
4271 is_inf = is_inf_internal = 1;
4272
4273 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4274 pars++;
4275 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4276 if ( stopmin > min + min1)
4277 stopmin = min + min1;
4278 flags &= ~SCF_DO_SUBSTR;
4279 if (data)
4280 data->flags |= SCF_SEEN_ACCEPT;
4281 }
4282 if (data) {
4283 if (data_fake.flags & SF_HAS_EVAL)
4284 data->flags |= SF_HAS_EVAL;
4285 data->whilem_c = data_fake.whilem_c;
4286 }
4287 if (flags & SCF_DO_STCLASS)
3fffb88a 4288 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
4289 }
4290 }
4291 if (flags & SCF_DO_SUBSTR) {
4292 data->pos_min += min1;
4293 data->pos_delta += max1 - min1;
4294 if (max1 != min1 || is_inf)
4295 data->longest = &(data->longest_float);
4296 }
4297 min += min1;
4298 delta += max1 - min1;
4299 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 4300 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
4301 if (min1) {
4302 cl_and(data->start_class, and_withp);
4303 flags &= ~SCF_DO_STCLASS;
4304 }
4305 }
4306 else if (flags & SCF_DO_STCLASS_AND) {
4307 if (min1) {
4308 cl_and(data->start_class, &accum);
4309 flags &= ~SCF_DO_STCLASS;
4310 }
4311 else {
4312 /* Switch to OR mode: cache the old value of
4313 * data->start_class */
4314 INIT_AND_WITHP;
4315 StructCopy(data->start_class, and_withp,
4316 struct regnode_charclass_class);
4317 flags &= ~SCF_DO_STCLASS_AND;
4318 StructCopy(&accum, data->start_class,
4319 struct regnode_charclass_class);
4320 flags |= SCF_DO_STCLASS_OR;
4321 data->start_class->flags |= ANYOF_EOS;
4322 }
4323 }
4324 scan= tail;
4325 continue;
4326 }
786e8c11 4327#else
8aa23a47 4328 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4329 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4330 U8*bang=NULL;
4331
4332 min += trie->minlen;
4333 delta += (trie->maxlen - trie->minlen);
4334 flags &= ~SCF_DO_STCLASS; /* xxx */
4335 if (flags & SCF_DO_SUBSTR) {
304ee84b 4336 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4337 data->pos_min += trie->minlen;
4338 data->pos_delta += (trie->maxlen - trie->minlen);
4339 if (trie->maxlen != trie->minlen)
4340 data->longest = &(data->longest_float);
4341 }
4342 if (trie->jump) /* no more substrings -- for now /grr*/
4343 flags &= ~SCF_DO_SUBSTR;
b515a41d 4344 }
8aa23a47
YO
4345#endif /* old or new */
4346#endif /* TRIE_STUDY_OPT */
e1d1eefb 4347
8aa23a47
YO
4348 /* Else: zero-length, ignore. */
4349 scan = regnext(scan);
4350 }
4351 if (frame) {
4352 last = frame->last;
4353 scan = frame->next;
4354 stopparen = frame->stop;
4355 frame = frame->prev;
4356 goto fake_study_recurse;
c277df42
IZ
4357 }
4358
4359 finish:
8aa23a47 4360 assert(!frame);
304ee84b 4361 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4362
c277df42 4363 *scanp = scan;
aca2d497 4364 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4365 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4366 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4367 if (is_par > (I32)U8_MAX)
c277df42
IZ
4368 is_par = 0;
4369 if (is_par && pars==1 && data) {
4370 data->flags |= SF_IN_PAR;
4371 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4372 }
4373 else if (pars && data) {
c277df42
IZ
4374 data->flags |= SF_HAS_PAR;
4375 data->flags &= ~SF_IN_PAR;
4376 }
653099ff 4377 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4378 cl_and(data->start_class, and_withp);
786e8c11
YO
4379 if (flags & SCF_TRIE_RESTUDY)
4380 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4381
304ee84b 4382 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4383
e2e6a0f1 4384 return min < stopmin ? min : stopmin;
c277df42
IZ
4385}
4386
2eccd3b2
NC
4387STATIC U32
4388S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4389{
4a4e7719
NC
4390 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4391
7918f24d
NC
4392 PERL_ARGS_ASSERT_ADD_DATA;
4393
4a4e7719
NC
4394 Renewc(RExC_rxi->data,
4395 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4396 char, struct reg_data);
4397 if(count)
f8fc2ecf 4398 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4399 else
f8fc2ecf 4400 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4401 RExC_rxi->data->count = count + n;
4402 Copy(s, RExC_rxi->data->what + count, n, U8);
4403 return count;
c277df42
IZ
4404}
4405
f8149455 4406/*XXX: todo make this not included in a non debugging perl */
76234dfb 4407#ifndef PERL_IN_XSUB_RE
d88dccdf 4408void
864dbfa3 4409Perl_reginitcolors(pTHX)
d88dccdf 4410{
97aff369 4411 dVAR;
1df70142 4412 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4413 if (s) {
1df70142
AL
4414 char *t = savepv(s);
4415 int i = 0;
4416 PL_colors[0] = t;
d88dccdf 4417 while (++i < 6) {
1df70142
AL
4418 t = strchr(t, '\t');
4419 if (t) {
4420 *t = '\0';
4421 PL_colors[i] = ++t;
d88dccdf
IZ
4422 }
4423 else
1df70142 4424 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4425 }
4426 } else {
1df70142 4427 int i = 0;
b81d288d 4428 while (i < 6)
06b5626a 4429 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4430 }
4431 PL_colorset = 1;
4432}
76234dfb 4433#endif
8615cb43 4434
07be1b83 4435
786e8c11
YO
4436#ifdef TRIE_STUDY_OPT
4437#define CHECK_RESTUDY_GOTO \
4438 if ( \
4439 (data.flags & SCF_TRIE_RESTUDY) \
4440 && ! restudied++ \
4441 ) goto reStudy
4442#else
4443#define CHECK_RESTUDY_GOTO
4444#endif
f9f4320a 4445
a687059c 4446/*
e50aee73 4447 - pregcomp - compile a regular expression into internal code
a687059c
LW
4448 *
4449 * We can't allocate space until we know how big the compiled form will be,
4450 * but we can't compile it (and thus know how big it is) until we've got a
4451 * place to put the code. So we cheat: we compile it twice, once with code
4452 * generation turned off and size counting turned on, and once "for real".
4453 * This also means that we don't allocate space until we are sure that the
4454 * thing really will compile successfully, and we never have to move the
4455 * code and thus invalidate pointers into it. (Note that it has to be in
4456 * one piece because free() must be able to free it all.) [NB: not true in perl]
4457 *
4458 * Beware that the optimization-preparation code in here knows about some
4459 * of the structure of the compiled regexp. [I'll say.]
4460 */
b9b4dddf
YO
4461
4462
4463
f9f4320a 4464#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4465#define RE_ENGINE_PTR &PL_core_reg_engine
4466#else
f9f4320a
YO
4467extern const struct regexp_engine my_reg_engine;
4468#define RE_ENGINE_PTR &my_reg_engine
4469#endif
6d5c990f
RGS
4470
4471#ifndef PERL_IN_XSUB_RE
3ab4a224 4472REGEXP *
1593ad57 4473Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4474{
97aff369 4475 dVAR;
6d5c990f 4476 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4477
4478 PERL_ARGS_ASSERT_PREGCOMP;
4479
f9f4320a
YO
4480 /* Dispatch a request to compile a regexp to correct
4481 regexp engine. */
f9f4320a
YO
4482 if (table) {
4483 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4484 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4485 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4486 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4487 DEBUG_COMPILE_r({
8d8756e7 4488 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4489 SvIV(*ptr));
4490 });
3ab4a224 4491 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4492 }
b9b4dddf 4493 }
3ab4a224 4494 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4495}
6d5c990f 4496#endif
2a5d9b1d 4497
3ab4a224 4498REGEXP *
29b09c41 4499Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4500{
4501 dVAR;
288b8c02
NC
4502 REGEXP *rx;
4503 struct regexp *r;
f8fc2ecf 4504 register regexp_internal *ri;
3ab4a224 4505 STRLEN plen;
5d51ce98
KW
4506 char *exp;
4507 char* xend;
c277df42 4508 regnode *scan;
a0d0e21e 4509 I32 flags;
a0d0e21e 4510 I32 minlen = 0;
29b09c41 4511 U32 pm_flags;
e7f38d0f
YO
4512
4513 /* these are all flags - maybe they should be turned
4514 * into a single int with different bit masks */
4515 I32 sawlookahead = 0;
a0d0e21e
LW
4516 I32 sawplus = 0;
4517 I32 sawopen = 0;
29b09c41 4518 bool used_setjump = FALSE;
4624b182 4519 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
e7f38d0f 4520
bbd61b5f
KW
4521 U8 jump_ret = 0;
4522 dJMPENV;
2c2d71f5 4523 scan_data_t data;
830247a4 4524 RExC_state_t RExC_state;
be8e71aa 4525 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4526#ifdef TRIE_STUDY_OPT
5d51ce98 4527 int restudied;
07be1b83
YO
4528 RExC_state_t copyRExC_state;
4529#endif
2a5d9b1d 4530 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4531
4532 PERL_ARGS_ASSERT_RE_COMPILE;
4533
6d5c990f 4534 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4535
29b09c41 4536 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
e40e74fe 4537 RExC_uni_semantics = 0;
4624b182 4538 RExC_contains_locale = 0;
7b597bb8 4539
d6bd454d 4540 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4541 /* Longjmp back to here if have to switch in midstream to utf8 */
4542 if (! RExC_orig_utf8) {
4543 JMPENV_PUSH(jump_ret);
29b09c41 4544 used_setjump = TRUE;
bbd61b5f
KW
4545 }
4546
5d51ce98 4547 if (jump_ret == 0) { /* First time through */
29b09c41
KW
4548 exp = SvPV(pattern, plen);
4549 xend = exp + plen;
4550 /* ignore the utf8ness if the pattern is 0 length */
4551 if (plen == 0) {
4552 RExC_utf8 = RExC_orig_utf8 = 0;
4553 }
4554
5d51ce98
KW
4555 DEBUG_COMPILE_r({
4556 SV *dsv= sv_newmortal();
4557 RE_PV_QUOTED_DECL(s, RExC_utf8,
4558 dsv, exp, plen, 60);
4559 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4560 PL_colors[4],PL_colors[5],s);
4561 });
4562 }
4563 else { /* longjumped back */
bbd61b5f
KW
4564 STRLEN len = plen;
4565
5d51ce98
KW
4566 /* If the cause for the longjmp was other than changing to utf8, pop
4567 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4568 if (jump_ret != UTF8_LONGJMP) {
4569 JMPENV_POP;
4570 JMPENV_JUMP(jump_ret);
4571 }
4572
595598ee
KW
4573 GET_RE_DEBUG_FLAGS;
4574
bbd61b5f
KW
4575 /* It's possible to write a regexp in ascii that represents Unicode
4576 codepoints outside of the byte range, such as via \x{100}. If we
4577 detect such a sequence we have to convert the entire pattern to utf8
4578 and then recompile, as our sizing calculation will have been based
4579 on 1 byte == 1 character, but we will need to use utf8 to encode
4580 at least some part of the pattern, and therefore must convert the whole
4581 thing.
4582 -- dmq */
4583 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4584 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4585 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4586 xend = exp + len;
4587 RExC_orig_utf8 = RExC_utf8 = 1;
4588 SAVEFREEPV(exp);
4589 }
4590
5d51ce98
KW
4591#ifdef TRIE_STUDY_OPT
4592 restudied = 0;
4593#endif
4594
29b09c41 4595 pm_flags = orig_pm_flags;
a62b1201 4596
4624b182
KW
4597 if (initial_charset == REGEX_LOCALE_CHARSET) {
4598 RExC_contains_locale = 1;
4599 }
4600 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4601
4602 /* Set to use unicode semantics if the pattern is in utf8 and has the
4603 * 'depends' charset specified, as it means unicode when utf8 */
a62b1201 4604 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
29b09c41
KW
4605 }
4606
02daf0ab 4607 RExC_precomp = exp;
c737faaf 4608 RExC_flags = pm_flags;
830247a4 4609 RExC_sawback = 0;
bbce6d69 4610
830247a4 4611 RExC_seen = 0;
b57e4118 4612 RExC_in_lookbehind = 0;
830247a4
IZ
4613 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4614 RExC_seen_evals = 0;
4615 RExC_extralen = 0;
e2a7e165 4616 RExC_override_recoding = 0;
c277df42 4617
bbce6d69 4618 /* First pass: determine size, legality. */
830247a4 4619 RExC_parse = exp;
fac92740 4620 RExC_start = exp;
830247a4
IZ
4621 RExC_end = xend;
4622 RExC_naughty = 0;
4623 RExC_npar = 1;
e2e6a0f1 4624 RExC_nestroot = 0;
830247a4
IZ
4625 RExC_size = 0L;
4626 RExC_emit = &PL_regdummy;
4627 RExC_whilem_seen = 0;
40d049e4
YO
4628 RExC_open_parens = NULL;
4629 RExC_close_parens = NULL;
4630 RExC_opend = NULL;
81714fb9 4631 RExC_paren_names = NULL;
1f1031fe
YO
4632#ifdef DEBUGGING
4633 RExC_paren_name_list = NULL;
4634#endif
40d049e4
YO
4635 RExC_recurse = NULL;
4636 RExC_recurse_count = 0;
81714fb9 4637
85ddcde9
JH
4638#if 0 /* REGC() is (currently) a NOP at the first pass.
4639 * Clever compilers notice this and complain. --jhi */
830247a4 4640 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4641#endif
3dab1dad
YO
4642 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4643 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4644 RExC_precomp = NULL;
a0d0e21e
LW
4645 return(NULL);
4646 }
bbd61b5f 4647
29b09c41
KW
4648 /* Here, finished first pass. Get rid of any added setjmp */
4649 if (used_setjump) {
bbd61b5f 4650 JMPENV_POP;
02daf0ab 4651 }
e40e74fe 4652
07be1b83 4653 DEBUG_PARSE_r({
81714fb9
YO
4654 PerlIO_printf(Perl_debug_log,
4655 "Required size %"IVdf" nodes\n"
4656 "Starting second pass (creation)\n",
4657 (IV)RExC_size);
07be1b83
YO
4658 RExC_lastnum=0;
4659 RExC_lastparse=NULL;
4660 });
e40e74fe
KW
4661
4662 /* The first pass could have found things that force Unicode semantics */
4663 if ((RExC_utf8 || RExC_uni_semantics)
4664 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4665 {
4666 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4667 }
4668
c277df42
IZ
4669 /* Small enough for pointer-storage convention?
4670 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4671 if (RExC_size >= 0x10000L && RExC_extralen)
4672 RExC_size += RExC_extralen;
c277df42 4673 else
830247a4
IZ
4674 RExC_extralen = 0;
4675 if (RExC_whilem_seen > 15)
4676 RExC_whilem_seen = 15;
a0d0e21e 4677
f9f4320a
YO
4678 /* Allocate space and zero-initialize. Note, the two step process
4679 of zeroing when in debug mode, thus anything assigned has to
4680 happen after that */
d2f13c59 4681 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4682 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4683 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4684 char, regexp_internal);
4685 if ( r == NULL || ri == NULL )
b45f050a 4686 FAIL("Regexp out of space");
0f79a09d
GS
4687#ifdef DEBUGGING
4688 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4689 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4690#else
f8fc2ecf
YO
4691 /* bulk initialize base fields with 0. */
4692 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4693#endif
58e23c8d
YO
4694
4695 /* non-zero initialization begins here */
f8fc2ecf 4696 RXi_SET( r, ri );
f9f4320a 4697 r->engine= RE_ENGINE_PTR;
c737faaf 4698 r->extflags = pm_flags;
bcdf7404 4699 {
f7819f85 4700 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
a62b1201 4701 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
c5ea2ffa
KW
4702
4703 /* The caret is output if there are any defaults: if not all the STD
4704 * flags are set, or if no character set specifier is needed */
4705 bool has_default =
4706 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4707 || ! has_charset);
bcdf7404 4708 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4709 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4710 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4711 const char *fptr = STD_PAT_MODS; /*"msix"*/
4712 char *p;
fb85c044 4713 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4714 * on. If more precision is desired, we could do a population count of
4715 * the flags set. This could be done with a small lookup table, or by
4716 * shifting, masking and adding, or even, when available, assembly
4717 * language for a machine-language population count.
4718 * We never output a minus, as all those are defaults, so are
4719 * covered by the caret */
fb85c044 4720 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa 4721 + has_default /* If needs a caret */
a62b1201
KW
4722
4723 /* If needs a character set specifier */
4724 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
bcdf7404
YO
4725 + (sizeof(STD_PAT_MODS) - 1)
4726 + (sizeof("(?:)") - 1);
4727
c5ea2ffa 4728 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4729 SvPOK_on(rx);
8f6ae13c 4730 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4731 *p++='('; *p++='?';
9de15fec
KW
4732
4733 /* If a default, cover it using the caret */
c5ea2ffa 4734 if (has_default) {
85508812 4735 *p++= DEFAULT_PAT_MOD;
fb85c044 4736 }
c5ea2ffa 4737 if (has_charset) {
a62b1201
KW
4738 STRLEN len;
4739 const char* const name = get_regex_charset_name(r->extflags, &len);
4740 Copy(name, p, len, char);
4741 p += len;
9de15fec 4742 }
f7819f85
A
4743 if (has_p)
4744 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4745 {
bcdf7404 4746 char ch;
bcdf7404
YO
4747 while((ch = *fptr++)) {
4748 if(reganch & 1)
4749 *p++ = ch;
bcdf7404
YO
4750 reganch >>= 1;
4751 }
bcdf7404
YO
4752 }
4753
28d8d7f4 4754 *p++ = ':';
bb661a58 4755 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4756 assert ((RX_WRAPPED(rx) - p) < 16);
4757 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4758 p += plen;
bcdf7404 4759 if (has_runon)
28d8d7f4
YO
4760 *p++ = '\n';
4761 *p++ = ')';
4762 *p = 0;
fb85c044 4763 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4764 }
4765
bbe252da 4766 r->intflags = 0;
830247a4 4767 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4768
6bda09f9 4769 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4770 Newxz(RExC_open_parens, RExC_npar,regnode *);
4771 SAVEFREEPV(RExC_open_parens);
4772 Newxz(RExC_close_parens,RExC_npar,regnode *);
4773 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4774 }
4775
4776 /* Useful during FAIL. */
7122b237
YO
4777#ifdef RE_TRACK_PATTERN_OFFSETS
4778 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4779 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4780 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4781 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4782 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4783#endif
4784 SetProgLen(ri,RExC_size);
288b8c02 4785 RExC_rx_sv = rx;
830247a4 4786 RExC_rx = r;
f8fc2ecf 4787 RExC_rxi = ri;
bbce6d69 4788
4789 /* Second pass: emit code. */
c737faaf 4790 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4791 RExC_parse = exp;
4792 RExC_end = xend;
4793 RExC_naughty = 0;
4794 RExC_npar = 1;
f8fc2ecf
YO
4795 RExC_emit_start = ri->program;
4796 RExC_emit = ri->program;
3b57cd43
YO
4797 RExC_emit_bound = ri->program + RExC_size + 1;
4798
2cd61cdb 4799 /* Store the count of eval-groups for security checks: */
f8149455 4800 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4801 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4802 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4803 ReREFCNT_dec(rx);
a0d0e21e 4804 return(NULL);
80757612 4805 }
07be1b83
YO
4806 /* XXXX To minimize changes to RE engine we always allocate
4807 3-units-long substrs field. */
4808 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4809 if (RExC_recurse_count) {
4810 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4811 SAVEFREEPV(RExC_recurse);
4812 }
a0d0e21e 4813
07be1b83 4814reStudy:
e7f38d0f 4815 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4816 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4817
07be1b83 4818#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4819 if (!restudied) {
4820 StructCopy(&zero_scan_data, &data, scan_data_t);
4821 copyRExC_state = RExC_state;
4822 } else {
5d458dd8 4823 U32 seen=RExC_seen;
07be1b83 4824 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4825
4826 RExC_state = copyRExC_state;
4827 if (seen & REG_TOP_LEVEL_BRANCHES)
4828 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4829 else
4830 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4831 if (data.last_found) {
07be1b83 4832 SvREFCNT_dec(data.longest_fixed);
07be1b83 4833 SvREFCNT_dec(data.longest_float);
07be1b83 4834 SvREFCNT_dec(data.last_found);
1de06328 4835 }
40d049e4 4836 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4837 }
40d049e4
YO
4838#else
4839 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4840#endif
fc8cd66c 4841
a0d0e21e 4842 /* Dig out information for optimizations. */
f7819f85 4843 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4844 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4845
a0ed51b3 4846 if (UTF)
8f6ae13c 4847 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4848 ri->regstclass = NULL;
830247a4 4849 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4850 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4851 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4852
1de06328
YO
4853 /* testing for BRANCH here tells us whether there is "must appear"
4854 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4855 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4856 I32 fake;
c5254dd6 4857 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4858 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4859 int stclass_flag;
07be1b83 4860 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4861 regnode *first= scan;
4862 regnode *first_next= regnext(first);
639081d6
YO
4863 /*
4864 * Skip introductions and multiplicators >= 1
4865 * so that we can extract the 'meat' of the pattern that must
4866 * match in the large if() sequence following.
4867 * NOTE that EXACT is NOT covered here, as it is normally
4868 * picked up by the optimiser separately.
4869 *
4870 * This is unfortunate as the optimiser isnt handling lookahead
4871 * properly currently.
4872 *
4873 */
a0d0e21e 4874 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4875 /* An OR of *one* alternative - should not happen now. */
5339e136 4876 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4877 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4878 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4879 (OP(first) == PLUS) ||
4880 (OP(first) == MINMOD) ||
653099ff 4881 /* An {n,m} with n>0 */
5339e136
YO
4882 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4883 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4884 {
639081d6
YO
4885 /*
4886 * the only op that could be a regnode is PLUS, all the rest
4887 * will be regnode_1 or regnode_2.
4888 *
4889 */
a0d0e21e
LW
4890 if (OP(first) == PLUS)
4891 sawplus = 1;
4892 else
3dab1dad 4893 first += regarglen[OP(first)];
639081d6
YO
4894
4895 first = NEXTOPER(first);
5339e136 4896 first_next= regnext(first);
a687059c
LW
4897 }
4898
a0d0e21e
LW
4899 /* Starting-point info. */
4900 again:
786e8c11 4901 DEBUG_PEEP("first:",first,0);
07be1b83 4902 /* Ignore EXACT as we deal with it later. */
3dab1dad 4903 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4904 if (OP(first) == EXACT)
6f207bd3 4905 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 4906 else
f8fc2ecf 4907 ri->regstclass = first;
b3c9acc1 4908 }
07be1b83 4909#ifdef TRIE_STCLASS
786e8c11 4910 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4911 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4912 {
786e8c11 4913 regnode *trie_op;
07be1b83 4914 /* this can happen only on restudy */
786e8c11 4915 if ( OP(first) == TRIE ) {
c944940b 4916 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4917 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4918 StructCopy(first,trieop,struct regnode_1);
4919 trie_op=(regnode *)trieop;
4920 } else {
c944940b 4921 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4922 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4923 StructCopy(first,trieop,struct regnode_charclass);
4924 trie_op=(regnode *)trieop;
4925 }
1de06328 4926 OP(trie_op)+=2;
786e8c11 4927 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4928 ri->regstclass = trie_op;
07be1b83
YO
4929 }
4930#endif
e52fc539 4931 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4932 ri->regstclass = first;
3dab1dad
YO
4933 else if (PL_regkind[OP(first)] == BOUND ||
4934 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4935 ri->regstclass = first;
3dab1dad 4936 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4937 r->extflags |= (OP(first) == MBOL
4938 ? RXf_ANCH_MBOL
cad2e5aa 4939 : (OP(first) == SBOL
bbe252da
YO
4940 ? RXf_ANCH_SBOL
4941 : RXf_ANCH_BOL));
a0d0e21e 4942 first = NEXTOPER(first);
774d564b 4943 goto again;
4944 }
4945 else if (OP(first) == GPOS) {
bbe252da 4946 r->extflags |= RXf_ANCH_GPOS;
774d564b 4947 first = NEXTOPER(first);
4948 goto again;
a0d0e21e 4949 }
cf2a2b69
YO
4950 else if ((!sawopen || !RExC_sawback) &&
4951 (OP(first) == STAR &&
3dab1dad 4952 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4953 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4954 {
4955 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4956 const int type =
4957 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4958 ? RXf_ANCH_MBOL
4959 : RXf_ANCH_SBOL;
4960 r->extflags |= type;
4961 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4962 first = NEXTOPER(first);
774d564b 4963 goto again;
a0d0e21e 4964 }
e7f38d0f 4965 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4966 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4967 /* x+ must match at the 1st pos of run of x's */
bbe252da 4968 r->intflags |= PREGf_SKIP;
a0d0e21e 4969
c277df42 4970 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4971#ifdef TRIE_STUDY_OPT
81714fb9 4972 DEBUG_PARSE_r(
be8e71aa
YO
4973 if (!restudied)
4974 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4975 (IV)(first - scan + 1))
4976 );
4977#else
81714fb9 4978 DEBUG_PARSE_r(
be8e71aa
YO
4979 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4980 (IV)(first - scan + 1))
4981 );
4982#endif
4983
4984
a0d0e21e
LW
4985 /*
4986 * If there's something expensive in the r.e., find the
4987 * longest literal string that must appear and make it the
4988 * regmust. Resolve ties in favor of later strings, since
4989 * the regstart check works with the beginning of the r.e.
4990 * and avoiding duplication strengthens checking. Not a
4991 * strong reason, but sufficient in the absence of others.
4992 * [Now we resolve ties in favor of the earlier string if
c277df42 4993 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4994 * earlier string may buy us something the later one won't.]
4995 */
de8c5301 4996
396482e1
GA
4997 data.longest_fixed = newSVpvs("");
4998 data.longest_float = newSVpvs("");
4999 data.last_found = newSVpvs("");
c277df42
IZ
5000 data.longest = &(data.longest_fixed);
5001 first = scan;
f8fc2ecf 5002 if (!ri->regstclass) {
e755fd73 5003 cl_init(pRExC_state, &ch_class);
653099ff
GS
5004 data.start_class = &ch_class;
5005 stclass_flag = SCF_DO_STCLASS_AND;
5006 } else /* XXXX Check for BOUND? */
5007 stclass_flag = 0;
cb434fcc 5008 data.last_closep = &last_close;
de8c5301 5009
1de06328 5010 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
5011 &data, -1, NULL, NULL,
5012 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 5013
07be1b83 5014
786e8c11
YO
5015 CHECK_RESTUDY_GOTO;
5016
5017
830247a4 5018 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 5019 && data.last_start_min == 0 && data.last_end > 0
830247a4 5020 && !RExC_seen_zerolen
2bf803e2 5021 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
5022 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5023 r->extflags |= RXf_CHECK_ALL;
304ee84b 5024 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
5025 SvREFCNT_dec(data.last_found);
5026
1de06328
YO
5027 /* Note that code very similar to this but for anchored string
5028 follows immediately below, changes may need to be made to both.
5029 Be careful.
5030 */
a0ed51b3 5031 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 5032 if (longest_float_length
c277df42
IZ
5033 || (data.flags & SF_FL_BEFORE_EOL
5034 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 5035 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 5036 {
1182767e 5037 I32 t,ml;
cf93c79d 5038
1de06328 5039 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
5040 && data.offset_fixed == data.offset_float_min
5041 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5042 goto remove_float; /* As in (a)+. */
5043
1de06328
YO
5044 /* copy the information about the longest float from the reg_scan_data
5045 over to the program. */
33b8afdf
JH
5046 if (SvUTF8(data.longest_float)) {
5047 r->float_utf8 = data.longest_float;
c445ea15 5048 r->float_substr = NULL;
33b8afdf
JH
5049 } else {
5050 r->float_substr = data.longest_float;
c445ea15 5051 r->float_utf8 = NULL;
33b8afdf 5052 }
1de06328
YO
5053 /* float_end_shift is how many chars that must be matched that
5054 follow this item. We calculate it ahead of time as once the
5055 lookbehind offset is added in we lose the ability to correctly
5056 calculate it.*/
5057 ml = data.minlen_float ? *(data.minlen_float)
1182767e 5058 : (I32)longest_float_length;
1de06328
YO
5059 r->float_end_shift = ml - data.offset_float_min
5060 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5061 + data.lookbehind_float;
5062 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 5063 r->float_max_offset = data.offset_float_max;
1182767e 5064 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
5065 r->float_max_offset -= data.lookbehind_float;
5066
cf93c79d
IZ
5067 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5068 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 5069 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 5070 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
5071 }
5072 else {
aca2d497 5073 remove_float:
c445ea15 5074 r->float_substr = r->float_utf8 = NULL;
c277df42 5075 SvREFCNT_dec(data.longest_float);
c5254dd6 5076 longest_float_length = 0;
a0d0e21e 5077 }
c277df42 5078
1de06328
YO
5079 /* Note that code very similar to this but for floating string
5080 is immediately above, changes may need to be made to both.
5081 Be careful.
5082 */
a0ed51b3 5083 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 5084 if (longest_fixed_length
c277df42
IZ
5085 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5086 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 5087 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 5088 {
1182767e 5089 I32 t,ml;
cf93c79d 5090
1de06328
YO
5091 /* copy the information about the longest fixed
5092 from the reg_scan_data over to the program. */
33b8afdf
JH
5093 if (SvUTF8(data.longest_fixed)) {
5094 r->anchored_utf8 = data.longest_fixed;
c445ea15 5095 r->anchored_substr = NULL;
33b8afdf
JH
5096 } else {
5097 r->anchored_substr = data.longest_fixed;
c445ea15 5098 r->anchored_utf8 = NULL;
33b8afdf 5099 }
1de06328
YO
5100 /* fixed_end_shift is how many chars that must be matched that
5101 follow this item. We calculate it ahead of time as once the
5102 lookbehind offset is added in we lose the ability to correctly
5103 calculate it.*/
5104 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 5105 : (I32)longest_fixed_length;
1de06328
YO
5106 r->anchored_end_shift = ml - data.offset_fixed
5107 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5108 + data.lookbehind_fixed;
5109 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5110
cf93c79d
IZ
5111 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5112 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 5113 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 5114 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
5115 }
5116 else {
c445ea15 5117 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 5118 SvREFCNT_dec(data.longest_fixed);
c5254dd6 5119 longest_fixed_length = 0;
a0d0e21e 5120 }
f8fc2ecf
YO
5121 if (ri->regstclass
5122 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5123 ri->regstclass = NULL;
f4244008 5124
33b8afdf
JH
5125 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5126 && stclass_flag
653099ff 5127 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5128 && !cl_is_anything(data.start_class))
5129 {
2eccd3b2 5130 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 5131 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 5132
f8fc2ecf 5133 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5134 struct regnode_charclass_class);
5135 StructCopy(data.start_class,
f8fc2ecf 5136 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5137 struct regnode_charclass_class);
f8fc2ecf 5138 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5139 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5140 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5141 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5142 PerlIO_printf(Perl_debug_log,
a0288114 5143 "synthetic stclass \"%s\".\n",
3f7c398e 5144 SvPVX_const(sv));});
653099ff 5145 }
c277df42
IZ
5146
5147 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5148 if (longest_fixed_length > longest_float_length) {
1de06328 5149 r->check_end_shift = r->anchored_end_shift;
c277df42 5150 r->check_substr = r->anchored_substr;
33b8afdf 5151 r->check_utf8 = r->anchored_utf8;
c277df42 5152 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5153 if (r->extflags & RXf_ANCH_SINGLE)
5154 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5155 }
5156 else {
1de06328 5157 r->check_end_shift = r->float_end_shift;
c277df42 5158 r->check_substr = r->float_substr;
33b8afdf 5159 r->check_utf8 = r->float_utf8;
1de06328
YO
5160 r->check_offset_min = r->float_min_offset;
5161 r->check_offset_max = r->float_max_offset;
a0d0e21e 5162 }
30382c73
IZ
5163 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5164 This should be changed ASAP! */
bbe252da
YO
5165 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5166 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5167 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5168 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5169 }
1de06328
YO
5170 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5171 if ( (STRLEN)minlen < longest_float_length )
5172 minlen= longest_float_length;
5173 if ( (STRLEN)minlen < longest_fixed_length )
5174 minlen= longest_fixed_length;
5175 */
a0ed51b3
LW
5176 }
5177 else {
c277df42
IZ
5178 /* Several toplevels. Best we can is to set minlen. */
5179 I32 fake;
653099ff 5180 struct regnode_charclass_class ch_class;
cb434fcc 5181 I32 last_close = 0;
c277df42 5182
5d458dd8 5183 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5184
f8fc2ecf 5185 scan = ri->program + 1;
e755fd73 5186 cl_init(pRExC_state, &ch_class);
653099ff 5187 data.start_class = &ch_class;
cb434fcc 5188 data.last_closep = &last_close;
07be1b83 5189
de8c5301 5190
1de06328 5191 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5192 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5193
786e8c11 5194 CHECK_RESTUDY_GOTO;
07be1b83 5195
33b8afdf 5196 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5197 = r->float_substr = r->float_utf8 = NULL;
f4244008 5198
653099ff 5199 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5200 && !cl_is_anything(data.start_class))
5201 {
2eccd3b2 5202 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 5203 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 5204
f8fc2ecf 5205 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5206 struct regnode_charclass_class);
5207 StructCopy(data.start_class,
f8fc2ecf 5208 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5209 struct regnode_charclass_class);
f8fc2ecf 5210 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5211 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5212 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5213 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5214 PerlIO_printf(Perl_debug_log,
a0288114 5215 "synthetic stclass \"%s\".\n",
3f7c398e 5216 SvPVX_const(sv));});
653099ff 5217 }
a0d0e21e
LW
5218 }
5219
1de06328
YO
5220 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5221 the "real" pattern. */
cf9788e3
RGS
5222 DEBUG_OPTIMISE_r({
5223 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5224 (IV)minlen, (IV)r->minlen);
cf9788e3 5225 });
de8c5301 5226 r->minlenret = minlen;
1de06328
YO
5227 if (r->minlen < minlen)
5228 r->minlen = minlen;
5229
b81d288d 5230 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5231 r->extflags |= RXf_GPOS_SEEN;
830247a4 5232 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5233 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5234 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5235 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5236 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5237 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5238 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5239 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5240 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5241 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5242 if (RExC_paren_names)
85fbaab2 5243 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5244 else
5daac39c 5245 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5246
7bd1e614 5247#ifdef STUPID_PATTERN_CHECKS
5509d87a 5248 if (RX_PRELEN(rx) == 0)
640f820d 5249 r->extflags |= RXf_NULL;
5509d87a 5250 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5251 /* XXX: this should happen BEFORE we compile */
5252 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5253 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5254 r->extflags |= RXf_WHITE;
5509d87a 5255 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5256 r->extflags |= RXf_START_ONLY;
f1b875a0 5257#else
5509d87a 5258 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5259 /* XXX: this should happen BEFORE we compile */
5260 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5261 else {
5262 regnode *first = ri->program + 1;
39aa8307 5263 U8 fop = OP(first);
f6d9469c
DM
5264
5265 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
640f820d 5266 r->extflags |= RXf_NULL;
f6d9469c 5267 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
7bd1e614 5268 r->extflags |= RXf_START_ONLY;
f6d9469c
DM
5269 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5270 && OP(regnext(first)) == END)
7bd1e614
YO
5271 r->extflags |= RXf_WHITE;
5272 }
f1b875a0 5273#endif
1f1031fe
YO
5274#ifdef DEBUGGING
5275 if (RExC_paren_names) {
af534a04 5276 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5277 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5278 } else
1f1031fe 5279#endif
cde0cee5 5280 ri->name_list_idx = 0;
1f1031fe 5281
40d049e4
YO
5282 if (RExC_recurse_count) {
5283 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5284 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5285 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5286 }
5287 }
f0ab9afb 5288 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5289 /* assume we don't need to swap parens around before we match */
5290
be8e71aa
YO
5291 DEBUG_DUMP_r({
5292 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5293 regdump(r);
5294 });
7122b237
YO
5295#ifdef RE_TRACK_PATTERN_OFFSETS
5296 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5297 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5298 U32 i;
5299 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5300 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5301 for (i = 1; i <= len; i++) {
7122b237 5302 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5303 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5304 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5305 }
5306 PerlIO_printf(Perl_debug_log, "\n");
5307 });
7122b237 5308#endif
288b8c02 5309 return rx;
a687059c
LW
5310}
5311
f9f4320a 5312#undef RE_ENGINE_PTR
3dab1dad 5313
93b32b6d 5314
81714fb9 5315SV*
192b9cd1
AB
5316Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5317 const U32 flags)
5318{
7918f24d
NC
5319 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5320
192b9cd1
AB
5321 PERL_UNUSED_ARG(value);
5322
f1b875a0 5323 if (flags & RXapif_FETCH) {
192b9cd1 5324 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5325 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5326 Perl_croak_no_modify(aTHX);
192b9cd1 5327 return NULL;
f1b875a0 5328 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5329 return reg_named_buff_exists(rx, key, flags)
5330 ? &PL_sv_yes
5331 : &PL_sv_no;
f1b875a0 5332 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5333 return reg_named_buff_all(rx, flags);
f1b875a0 5334 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5335 return reg_named_buff_scalar(rx, flags);
5336 } else {
5337 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5338 return NULL;
5339 }
5340}
5341
5342SV*
5343Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5344 const U32 flags)
5345{
7918f24d 5346 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5347 PERL_UNUSED_ARG(lastkey);
5348
f1b875a0 5349 if (flags & RXapif_FIRSTKEY)
192b9cd1 5350 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5351 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5352 return reg_named_buff_nextkey(rx, flags);
5353 else {
5354 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5355 return NULL;
5356 }
5357}
5358
5359SV*
288b8c02
NC
5360Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5361 const U32 flags)
81714fb9 5362{
44a2ac75
YO
5363 AV *retarray = NULL;
5364 SV *ret;
288b8c02 5365 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5366
5367 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5368
f1b875a0 5369 if (flags & RXapif_ALL)
44a2ac75 5370 retarray=newAV();
93b32b6d 5371
5daac39c
NC
5372 if (rx && RXp_PAREN_NAMES(rx)) {
5373 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5374 if (he_str) {
5375 IV i;
5376 SV* sv_dat=HeVAL(he_str);
5377 I32 *nums=(I32*)SvPVX(sv_dat);
5378 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5379 if ((I32)(rx->nparens) >= nums[i]
5380 && rx->offs[nums[i]].start != -1
5381 && rx->offs[nums[i]].end != -1)
93b32b6d 5382 {
49d7dfbc 5383 ret = newSVpvs("");
288b8c02 5384 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5385 if (!retarray)
5386 return ret;
5387 } else {
5388 ret = newSVsv(&PL_sv_undef);
5389 }
ec83ea38 5390 if (retarray)
93b32b6d 5391 av_push(retarray, ret);
81714fb9 5392 }
93b32b6d 5393 if (retarray)
ad64d0ec 5394 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5395 }
5396 }
5397 return NULL;
5398}
5399
5400bool
288b8c02 5401Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5402 const U32 flags)
5403{
288b8c02 5404 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5405
5406 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5407
5daac39c 5408 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5409 if (flags & RXapif_ALL) {
5daac39c 5410 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5411 } else {
288b8c02 5412 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5413 if (sv) {
5414 SvREFCNT_dec(sv);
192b9cd1
AB
5415 return TRUE;
5416 } else {
5417 return FALSE;
5418 }
5419 }
5420 } else {
5421 return FALSE;
5422 }
5423}
5424
5425SV*
288b8c02 5426Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5427{
288b8c02 5428 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5429
5430 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5431
5daac39c
NC
5432 if ( rx && RXp_PAREN_NAMES(rx) ) {
5433 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5434
288b8c02 5435 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5436 } else {
5437 return FALSE;
5438 }
192b9cd1
AB
5439}
5440
5441SV*
288b8c02 5442Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5443{
288b8c02 5444 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5445 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5446
5447 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5448
5daac39c
NC
5449 if (rx && RXp_PAREN_NAMES(rx)) {
5450 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5451 HE *temphe;
5452 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5453 IV i;
5454 IV parno = 0;
5455 SV* sv_dat = HeVAL(temphe);
5456 I32 *nums = (I32*)SvPVX(sv_dat);
5457 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5458 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5459 rx->offs[nums[i]].start != -1 &&
5460 rx->offs[nums[i]].end != -1)
5461 {
5462 parno = nums[i];
5463 break;
5464 }
5465 }
f1b875a0 5466 if (parno || flags & RXapif_ALL) {
a663657d 5467 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5468 }
81714fb9
YO
5469 }
5470 }
44a2ac75
YO
5471 return NULL;
5472}
5473
192b9cd1 5474SV*
288b8c02 5475Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5476{
5477 SV *ret;
5478 AV *av;
5479 I32 length;
288b8c02 5480 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5481
7918f24d
NC
5482 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5483
5daac39c 5484 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5485 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5486 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5487 } else if (flags & RXapif_ONE) {
288b8c02 5488 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5489 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5490 length = av_len(av);
ec83ea38 5491 SvREFCNT_dec(ret);
192b9cd1
AB
5492 return newSViv(length + 1);
5493 } else {
5494 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5495 return NULL;
5496 }
5497 }
5498 return &PL_sv_undef;
5499}
5500
5501SV*
288b8c02 5502Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5503{
288b8c02 5504 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5505 AV *av = newAV();
5506
7918f24d
NC
5507 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5508
5daac39c
NC
5509 if (rx && RXp_PAREN_NAMES(rx)) {
5510 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5511 HE *temphe;
5512 (void)hv_iterinit(hv);
5513 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5514 IV i;
5515 IV parno = 0;
5516 SV* sv_dat = HeVAL(temphe);
5517 I32 *nums = (I32*)SvPVX(sv_dat);
5518 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5519 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5520 rx->offs[nums[i]].start != -1 &&
5521 rx->offs[nums[i]].end != -1)
5522 {
5523 parno = nums[i];
5524 break;
5525 }
5526 }
f1b875a0 5527 if (parno || flags & RXapif_ALL) {
a663657d 5528 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5529 }
5530 }
5531 }
5532
ad64d0ec 5533 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5534}
5535
49d7dfbc 5536void
288b8c02
NC
5537Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5538 SV * const sv)
44a2ac75 5539{
288b8c02 5540 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5541 char *s = NULL;
a9d504c3 5542 I32 i = 0;
44a2ac75 5543 I32 s1, t1;
7918f24d
NC
5544
5545 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5546
cde0cee5
YO
5547 if (!rx->subbeg) {
5548 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5549 return;
cde0cee5
YO
5550 }
5551 else
f1b875a0 5552 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5553 /* $` */
f0ab9afb 5554 i = rx->offs[0].start;
cde0cee5 5555 s = rx->subbeg;
44a2ac75
YO
5556 }
5557 else
f1b875a0 5558 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5559 /* $' */
f0ab9afb
NC
5560 s = rx->subbeg + rx->offs[0].end;
5561 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5562 }
5563 else
5564 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5565 (s1 = rx->offs[paren].start) != -1 &&
5566 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5567 {
5568 /* $& $1 ... */
5569 i = t1 - s1;
5570 s = rx->subbeg + s1;
cde0cee5
YO
5571 } else {
5572 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5573 return;
cde0cee5
YO
5574 }
5575 assert(rx->sublen >= (s - rx->subbeg) + i );
5576 if (i >= 0) {
5577 const int oldtainted = PL_tainted;
5578 TAINT_NOT;
5579 sv_setpvn(sv, s, i);
5580 PL_tainted = oldtainted;
5581 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5582 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5583 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5584 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5585 {
5586 SvUTF8_on(sv);
5587 }
5588 else
5589 SvUTF8_off(sv);
5590 if (PL_tainting) {
07bc277f 5591 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5592 if (SvTYPE(sv) >= SVt_PVMG) {
5593 MAGIC* const mg = SvMAGIC(sv);
5594 MAGIC* mgt;
5595 PL_tainted = 1;
5596 SvMAGIC_set(sv, mg->mg_moremagic);
5597 SvTAINT(sv);
5598 if ((mgt = SvMAGIC(sv))) {
5599 mg->mg_moremagic = mgt;
5600 SvMAGIC_set(sv, mg);
44a2ac75 5601 }
cde0cee5
YO
5602 } else {
5603 PL_tainted = 1;
5604 SvTAINT(sv);
5605 }
5606 } else
5607 SvTAINTED_off(sv);
44a2ac75 5608 }
81714fb9 5609 } else {
44a2ac75 5610 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5611 return;
81714fb9
YO
5612 }
5613}
93b32b6d 5614
2fdbfb4d
AB
5615void
5616Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5617 SV const * const value)
5618{
7918f24d
NC
5619 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5620
2fdbfb4d
AB
5621 PERL_UNUSED_ARG(rx);
5622 PERL_UNUSED_ARG(paren);
5623 PERL_UNUSED_ARG(value);
5624
5625 if (!PL_localizing)
6ad8f254 5626 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5627}
5628
5629I32
288b8c02 5630Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5631 const I32 paren)
5632{
288b8c02 5633 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5634 I32 i;
5635 I32 s1, t1;
5636
7918f24d
NC
5637 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5638
2fdbfb4d
AB
5639 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5640 switch (paren) {
192b9cd1 5641 /* $` / ${^PREMATCH} */
f1b875a0 5642 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5643 if (rx->offs[0].start != -1) {
5644 i = rx->offs[0].start;
5645 if (i > 0) {
5646 s1 = 0;
5647 t1 = i;
5648 goto getlen;
5649 }
5650 }
5651 return 0;
192b9cd1 5652 /* $' / ${^POSTMATCH} */
f1b875a0 5653 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5654 if (rx->offs[0].end != -1) {
5655 i = rx->sublen - rx->offs[0].end;
5656 if (i > 0) {
5657 s1 = rx->offs[0].end;
5658 t1 = rx->sublen;
5659 goto getlen;
5660 }
5661 }
5662 return 0;
192b9cd1
AB
5663 /* $& / ${^MATCH}, $1, $2, ... */
5664 default:
2fdbfb4d
AB
5665 if (paren <= (I32)rx->nparens &&
5666 (s1 = rx->offs[paren].start) != -1 &&
5667 (t1 = rx->offs[paren].end) != -1)
5668 {
5669 i = t1 - s1;
5670 goto getlen;
5671 } else {
5672 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5673 report_uninit((const SV *)sv);
2fdbfb4d
AB
5674 return 0;
5675 }
5676 }
5677 getlen:
07bc277f 5678 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5679 const char * const s = rx->subbeg + s1;
5680 const U8 *ep;
5681 STRLEN el;
5682
5683 i = t1 - s1;
5684 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5685 i = el;
5686 }
5687 return i;
5688}
5689
fe578d7f 5690SV*
49d7dfbc 5691Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5692{
7918f24d 5693 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5694 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5695 if (0)
5696 return NULL;
5697 else
5698 return newSVpvs("Regexp");
fe578d7f 5699}
0a4db386 5700
894be9b7 5701/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5702 * If flags is REG_RSN_RETURN_NULL returns null.
5703 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5704 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5705 * to the parsed name as looked up in the RExC_paren_names hash.
5706 * If there is an error throws a vFAIL().. type exception.
894be9b7 5707 */
0a4db386
YO
5708
5709#define REG_RSN_RETURN_NULL 0
5710#define REG_RSN_RETURN_NAME 1
5711#define REG_RSN_RETURN_DATA 2
5712
894be9b7 5713STATIC SV*
7918f24d
NC
5714S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5715{
894be9b7 5716 char *name_start = RExC_parse;
1f1031fe 5717
7918f24d
NC
5718 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5719
1f1031fe
YO
5720 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5721 /* skip IDFIRST by using do...while */
5722 if (UTF)
5723 do {
5724 RExC_parse += UTF8SKIP(RExC_parse);
5725 } while (isALNUM_utf8((U8*)RExC_parse));
5726 else
5727 do {
5728 RExC_parse++;
5729 } while (isALNUM(*RExC_parse));
894be9b7 5730 }
1f1031fe 5731
0a4db386 5732 if ( flags ) {
59cd0e26
NC
5733 SV* sv_name
5734 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5735 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5736 if ( flags == REG_RSN_RETURN_NAME)
5737 return sv_name;
5738 else if (flags==REG_RSN_RETURN_DATA) {
5739 HE *he_str = NULL;
5740 SV *sv_dat = NULL;
5741 if ( ! sv_name ) /* should not happen*/
5742 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5743 if (RExC_paren_names)
5744 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5745 if ( he_str )
5746 sv_dat = HeVAL(he_str);
5747 if ( ! sv_dat )
5748 vFAIL("Reference to nonexistent named group");
5749 return sv_dat;
5750 }
5751 else {
5752 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5753 }
5754 /* NOT REACHED */
894be9b7 5755 }
0a4db386 5756 return NULL;
894be9b7
YO
5757}
5758
3dab1dad
YO
5759#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5760 int rem=(int)(RExC_end - RExC_parse); \
5761 int cut; \
5762 int num; \
5763 int iscut=0; \
5764 if (rem>10) { \
5765 rem=10; \
5766 iscut=1; \
5767 } \
5768 cut=10-rem; \
5769 if (RExC_lastparse!=RExC_parse) \
5770 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5771 rem, RExC_parse, \
5772 cut + 4, \
5773 iscut ? "..." : "<" \
5774 ); \
5775 else \
5776 PerlIO_printf(Perl_debug_log,"%16s",""); \
5777 \
5778 if (SIZE_ONLY) \
3b57cd43 5779 num = RExC_size + 1; \
3dab1dad
YO
5780 else \
5781 num=REG_NODE_NUM(RExC_emit); \
5782 if (RExC_lastnum!=num) \
0a4db386 5783 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5784 else \
0a4db386 5785 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5786 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5787 (int)((depth*2)), "", \
3dab1dad
YO
5788 (funcname) \
5789 ); \
5790 RExC_lastnum=num; \
5791 RExC_lastparse=RExC_parse; \
5792})
5793
07be1b83
YO
5794
5795
3dab1dad
YO
5796#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5797 DEBUG_PARSE_MSG((funcname)); \
5798 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5799})
6bda09f9
YO
5800#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5801 DEBUG_PARSE_MSG((funcname)); \
5802 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5803})
d764b54e
KW
5804
5805/* This section of code defines the inversion list object and its methods. The
5806 * interfaces are highly subject to change, so as much as possible is static to
5807 * this file. An inversion list is here implemented as a malloc'd C array with
5808 * some added info. More will be coming when functionality is added later.
5809 *
5810 * Some of the methods should always be private to the implementation, and some
5811 * should eventually be made public */
5812
5813#define INVLIST_INITIAL_LEN 10
5814#define INVLIST_ARRAY_KEY "array"
5815#define INVLIST_MAX_KEY "max"
5816#define INVLIST_LEN_KEY "len"
5817
5818PERL_STATIC_INLINE UV*
5819S_invlist_array(pTHX_ HV* const invlist)
5820{
5821 /* Returns the pointer to the inversion list's array. Every time the
5822 * length changes, this needs to be called in case malloc or realloc moved
5823 * it */
5824
5825 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5826
5827 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5828
5829 if (list_ptr == NULL) {
5830 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5831 INVLIST_ARRAY_KEY);
5832 }
5833
5834 return INT2PTR(UV *, SvUV(*list_ptr));
5835}
5836
5837PERL_STATIC_INLINE void
5838S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5839{
5840 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5841
5842 /* Sets the array stored in the inversion list to the memory beginning with
5843 * the parameter */
5844
5845 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5846 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5847 INVLIST_ARRAY_KEY);
5848 }
5849}
5850
5851PERL_STATIC_INLINE UV
5852S_invlist_len(pTHX_ HV* const invlist)
5853{
5854 /* Returns the current number of elements in the inversion list's array */
5855
5856 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5857
5858 PERL_ARGS_ASSERT_INVLIST_LEN;
5859
5860 if (len_ptr == NULL) {
5861 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5862 INVLIST_LEN_KEY);
5863 }
5864
5865 return SvUV(*len_ptr);
5866}
5867
5868PERL_STATIC_INLINE UV
5869S_invlist_max(pTHX_ HV* const invlist)
5870{
5871 /* Returns the maximum number of elements storable in the inversion list's
5872 * array, without having to realloc() */
5873
5874 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5875
5876 PERL_ARGS_ASSERT_INVLIST_MAX;
5877
5878 if (max_ptr == NULL) {
5879 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5880 INVLIST_MAX_KEY);
5881 }
5882
5883 return SvUV(*max_ptr);
5884}
5885
5886PERL_STATIC_INLINE void
5887S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5888{
5889 /* Sets the current number of elements stored in the inversion list */
5890
5891 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5892
5893 if (len != 0 && len > invlist_max(invlist)) {
5894 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5895 }
5896
5897 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5898 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5899 INVLIST_LEN_KEY);
5900 }
5901}
5902
5903PERL_STATIC_INLINE void
5904S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5905{
5906
5907 /* Sets the maximum number of elements storable in the inversion list
5908 * without having to realloc() */
5909
5910 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5911
5912 if (max < invlist_len(invlist)) {
5913 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5914 }
5915
5916 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5917 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5918 INVLIST_LEN_KEY);
5919 }
5920}
5921
8d69a883 5922#ifndef PERL_IN_XSUB_RE
d764b54e
KW
5923HV*
5924Perl__new_invlist(pTHX_ IV initial_size)
5925{
5926
5927 /* Return a pointer to a newly constructed inversion list, with enough
5928 * space to store 'initial_size' elements. If that number is negative, a
5929 * system default is used instead */
5930
5931 HV* invlist = newHV();
5932 UV* list;
5933
5934 if (initial_size < 0) {
5935 initial_size = INVLIST_INITIAL_LEN;
5936 }
5937
5938 /* Allocate the initial space */
5939 Newx(list, initial_size, UV);
5940 invlist_set_array(invlist, list);
5941
5942 /* set_len has to come before set_max, as the latter inspects the len */
5943 invlist_set_len(invlist, 0);
5944 invlist_set_max(invlist, initial_size);
5945
5946 return invlist;
5947}
8d69a883 5948#endif
d764b54e
KW
5949
5950PERL_STATIC_INLINE void
5951S_invlist_destroy(pTHX_ HV* const invlist)
5952{
5953 /* Inversion list destructor */
5954
5955 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5956
5957 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5958
5959 if (list_ptr != NULL) {
b9d2ea5b
GG
5960 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5961 Safefree(list);
d764b54e
KW
5962 }
5963}
5964
5965STATIC void
5966S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5967{
5968 /* Change the maximum size of an inversion list (up or down) */
5969
5970 UV* orig_array;
5971 UV* array;
5972 const UV old_max = invlist_max(invlist);
5973
5974 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5975
5976 if (old_max == new_max) { /* If a no-op */
5977 return;
5978 }
5979
5980 array = orig_array = invlist_array(invlist);
5981 Renew(array, new_max, UV);
5982
5983 /* If the size change moved the list in memory, set the new one */
5984 if (array != orig_array) {
5985 invlist_set_array(invlist, array);
5986 }
5987
5988 invlist_set_max(invlist, new_max);
5989
5990}
5991
5992PERL_STATIC_INLINE void
5993S_invlist_trim(pTHX_ HV* const invlist)
5994{
5995 PERL_ARGS_ASSERT_INVLIST_TRIM;
5996
5997 /* Change the length of the inversion list to how many entries it currently
5998 * has */
5999
6000 invlist_extend(invlist, invlist_len(invlist));
6001}
6002
6003/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6004 * etc */
6005
6006#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6007
8d69a883 6008#ifndef PERL_IN_XSUB_RE
d764b54e 6009void
c2b03b8c 6010Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
d764b54e
KW
6011{
6012 /* Subject to change or removal. Append the range from 'start' to 'end' at
6013 * the end of the inversion list. The range must be above any existing
6014 * ones. */
6015
6016 UV* array = invlist_array(invlist);
6017 UV max = invlist_max(invlist);
6018 UV len = invlist_len(invlist);
6019
6020 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6021
6022 if (len > 0) {
6023
6024 /* Here, the existing list is non-empty. The current max entry in the
6025 * list is generally the first value not in the set, except when the
6026 * set extends to the end of permissible values, in which case it is
6027 * the first entry in that final set, and so this call is an attempt to
6028 * append out-of-order */
6029
6030 UV final_element = len - 1;
6031 if (array[final_element] > start
6032 || ELEMENT_IN_INVLIST_SET(final_element))
6033 {
6034 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6035 }
6036
6037 /* Here, it is a legal append. If the new range begins with the first
6038 * value not in the set, it is extending the set, so the new first
6039 * value not in the set is one greater than the newly extended range.
6040 * */
6041 if (array[final_element] == start) {
6042 if (end != UV_MAX) {
6043 array[final_element] = end + 1;
6044 }
6045 else {
6046 /* But if the end is the maximum representable on the machine,
6047 * just let the range that this would extend have no end */
6048 invlist_set_len(invlist, len - 1);
6049 }
6050 return;
6051 }
6052 }
6053
6054 /* Here the new range doesn't extend any existing set. Add it */
6055
6056 len += 2; /* Includes an element each for the start and end of range */
6057
6058 /* If overflows the existing space, extend, which may cause the array to be
6059 * moved */
6060 if (max < len) {
6061 invlist_extend(invlist, len);
6062 array = invlist_array(invlist);
6063 }
6064
6065 invlist_set_len(invlist, len);
6066
6067 /* The next item on the list starts the range, the one after that is
6068 * one past the new range. */
6069 array[len - 2] = start;
6070 if (end != UV_MAX) {
6071 array[len - 1] = end + 1;
6072 }
6073 else {
6074 /* But if the end is the maximum representable on the machine, just let
6075 * the range have no end */
6076 invlist_set_len(invlist, len - 1);
6077 }
6078}
8d69a883 6079#endif
d764b54e 6080
3097108b 6081STATIC HV*
d764b54e
KW
6082S_invlist_union(pTHX_ HV* const a, HV* const b)
6083{
6084 /* Return a new inversion list which is the union of two inversion lists.
6085 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6086 * Richard Gillam, published by Addison-Wesley, and explained at some
6087 * length there. The preface says to incorporate its examples into your
6088 * code at your own risk.
6089 *
6090 * The algorithm is like a merge sort.
6091 *
6092 * XXX A potential performance improvement is to keep track as we go along
6093 * if only one of the inputs contributes to the result, meaning the other
6094 * is a subset of that one. In that case, we can skip the final copy and
6095 * return the larger of the input lists */
6096
6097 UV* array_a = invlist_array(a); /* a's array */
6098 UV* array_b = invlist_array(b);
6099 UV len_a = invlist_len(a); /* length of a's array */
6100 UV len_b = invlist_len(b);
6101
6102 HV* u; /* the resulting union */
6103 UV* array_u;
6104 UV len_u;
6105
6106 UV i_a = 0; /* current index into a's array */
6107 UV i_b = 0;
6108 UV i_u = 0;
6109
6110 /* running count, as explained in the algorithm source book; items are
6111 * stopped accumulating and are output when the count changes to/from 0.
6112 * The count is incremented when we start a range that's in the set, and
6113 * decremented when we start a range that's not in the set. So its range
6114 * is 0 to 2. Only when the count is zero is something not in the set.
6115 */
6116 UV count = 0;
6117
6118 PERL_ARGS_ASSERT_INVLIST_UNION;
6119
6120 /* Size the union for the worst case: that the sets are completely
6121 * disjoint */
6122 u = _new_invlist(len_a + len_b);
6123 array_u = invlist_array(u);
6124
6125 /* Go through each list item by item, stopping when exhausted one of
6126 * them */
6127 while (i_a < len_a && i_b < len_b) {
6128 UV cp; /* The element to potentially add to the union's array */
6129 bool cp_in_set; /* is it in the the input list's set or not */
6130
6131 /* We need to take one or the other of the two inputs for the union.
6132 * Since we are merging two sorted lists, we take the smaller of the
6133 * next items. In case of a tie, we take the one that is in its set
6134 * first. If we took one not in the set first, it would decrement the
6135 * count, possibly to 0 which would cause it to be output as ending the
6136 * range, and the next time through we would take the same number, and
6137 * output it again as beginning the next range. By doing it the
6138 * opposite way, there is no possibility that the count will be
6139 * momentarily decremented to 0, and thus the two adjoining ranges will
6140 * be seamlessly merged. (In a tie and both are in the set or both not
6141 * in the set, it doesn't matter which we take first.) */
6142 if (array_a[i_a] < array_b[i_b]
6143 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6144 {
6145 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6146 cp= array_a[i_a++];
6147 }
6148 else {
6149 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6150 cp= array_b[i_b++];
6151 }
6152
6153 /* Here, have chosen which of the two inputs to look at. Only output
6154 * if the running count changes to/from 0, which marks the
6155 * beginning/end of a range in that's in the set */
6156 if (cp_in_set) {
6157 if (count == 0) {
6158 array_u[i_u++] = cp;
6159 }
6160 count++;
6161 }
6162 else {
6163 count--;
6164 if (count == 0) {
6165 array_u[i_u++] = cp;
6166 }
6167 }
6168 }
6169
6170 /* Here, we are finished going through at least one of the lists, which
6171 * means there is something remaining in at most one. We check if the list
6172 * that hasn't been exhausted is positioned such that we are in the middle
6173 * of a range in its set or not. (We are in the set if the next item in
6174 * the array marks the beginning of something not in the set) If in the
6175 * set, we decrement 'count'; if 0, there is potentially more to output.
6176 * There are four cases:
6177 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6178 * in the union is entirely from the non-exhausted set.
6179 * 2) Both were in their sets, count is 2. Nothing further should
6180 * be output, as everything that remains will be in the exhausted
6181 * list's set, hence in the union; decrementing to 1 but not 0 insures
6182 * that
6183 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6184 * Nothing further should be output because the union includes
6185 * everything from the exhausted set. Not decrementing insures that.
6186 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6187 * decrementing to 0 insures that we look at the remainder of the
6188 * non-exhausted set */
6189 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6190 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6191 {
6192 count--;
6193 }
6194
6195 /* The final length is what we've output so far, plus what else is about to
6196 * be output. (If 'count' is non-zero, then the input list we exhausted
6197 * has everything remaining up to the machine's limit in its set, and hence
6198 * in the union, so there will be no further output. */
6199 len_u = i_u;
6200 if (count == 0) {
6201 /* At most one of the subexpressions will be non-zero */
6202 len_u += (len_a - i_a) + (len_b - i_b);
6203 }
6204
6205 /* Set result to final length, which can change the pointer to array_u, so
6206 * re-find it */
6207 if (len_u != invlist_len(u)) {
6208 invlist_set_len(u, len_u);
6209 invlist_trim(u);
6210 array_u = invlist_array(u);
6211 }
6212
6213 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6214 * the other) ended with everything above it not in its set. That means
6215 * that the remaining part of the union is precisely the same as the
6216 * non-exhausted list, so can just copy it unchanged. (If both list were
6217 * exhausted at the same time, then the operations below will be both 0.)
6218 */
6219 if (count == 0) {
6220 IV copy_count; /* At most one will have a non-zero copy count */
6221 if ((copy_count = len_a - i_a) > 0) {
6222 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6223 }
6224 else if ((copy_count = len_b - i_b) > 0) {
6225 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6226 }
6227 }
6228
6229 return u;
6230}
6231
3097108b 6232STATIC HV*
d764b54e
KW
6233S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6234{
6235 /* Return the intersection of two inversion lists. The basis for this
6236 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6237 * by Addison-Wesley, and explained at some length there. The preface says
6238 * to incorporate its examples into your code at your own risk.
6239 *
6240 * The algorithm is like a merge sort, and is essentially the same as the
6241 * union above
6242 */
6243
6244 UV* array_a = invlist_array(a); /* a's array */
6245 UV* array_b = invlist_array(b);
6246 UV len_a = invlist_len(a); /* length of a's array */
6247 UV len_b = invlist_len(b);
6248
6249 HV* r; /* the resulting intersection */
6250 UV* array_r;
6251 UV len_r;
6252
6253 UV i_a = 0; /* current index into a's array */
6254 UV i_b = 0;
6255 UV i_r = 0;
6256
6257 /* running count, as explained in the algorithm source book; items are
6258 * stopped accumulating and are output when the count changes to/from 2.
6259 * The count is incremented when we start a range that's in the set, and
6260 * decremented when we start a range that's not in the set. So its range
6261 * is 0 to 2. Only when the count is 2 is something in the intersection.
6262 */
6263 UV count = 0;
6264
6265 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6266
6267 /* Size the intersection for the worst case: that the intersection ends up
6268 * fragmenting everything to be completely disjoint */
6269 r= _new_invlist(len_a + len_b);
6270 array_r = invlist_array(r);
6271
6272 /* Go through each list item by item, stopping when exhausted one of
6273 * them */
6274 while (i_a < len_a && i_b < len_b) {
6275 UV cp; /* The element to potentially add to the intersection's
6276 array */
6277 bool cp_in_set; /* Is it in the input list's set or not */
6278
6279 /* We need to take one or the other of the two inputs for the union.
6280 * Since we are merging two sorted lists, we take the smaller of the
6281 * next items. In case of a tie, we take the one that is not in its
6282 * set first (a difference from the union algorithm). If we took one
6283 * in the set first, it would increment the count, possibly to 2 which
6284 * would cause it to be output as starting a range in the intersection,
6285 * and the next time through we would take that same number, and output
6286 * it again as ending the set. By doing it the opposite of this, we
6287 * there is no possibility that the count will be momentarily
6288 * incremented to 2. (In a tie and both are in the set or both not in
6289 * the set, it doesn't matter which we take first.) */
6290 if (array_a[i_a] < array_b[i_b]
6291 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6292 {
6293 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6294 cp= array_a[i_a++];
6295 }
6296 else {
6297 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6298 cp= array_b[i_b++];
6299 }
6300
6301 /* Here, have chosen which of the two inputs to look at. Only output
6302 * if the running count changes to/from 2, which marks the
6303 * beginning/end of a range that's in the intersection */
6304 if (cp_in_set) {
6305 count++;
6306 if (count == 2) {
6307 array_r[i_r++] = cp;
6308 }
6309 }
6310 else {
6311 if (count == 2) {
6312 array_r[i_r++] = cp;
6313 }
6314 count--;
6315 }
6316 }
6317
6318 /* Here, we are finished going through at least one of the sets, which
6319 * means there is something remaining in at most one. See the comments in
6320 * the union code */
6321 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6322 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6323 {
6324 count--;
6325 }
6326
6327 /* The final length is what we've output so far plus what else is in the
6328 * intersection. Only one of the subexpressions below will be non-zero */
6329 len_r = i_r;
6330 if (count == 2) {
6331 len_r += (len_a - i_a) + (len_b - i_b);
6332 }
6333
6334 /* Set result to final length, which can change the pointer to array_r, so
6335 * re-find it */
6336 if (len_r != invlist_len(r)) {
6337 invlist_set_len(r, len_r);
6338 invlist_trim(r);
6339 array_r = invlist_array(r);
6340 }
6341
6342 /* Finish outputting any remaining */
6343 if (count == 2) { /* Only one of will have a non-zero copy count */
6344 IV copy_count;
6345 if ((copy_count = len_a - i_a) > 0) {
6346 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6347 }
6348 else if ((copy_count = len_b - i_b) > 0) {
6349 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6350 }
6351 }
6352
6353 return r;
6354}
6355
6356STATIC HV*
c52a3e71 6357S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
d764b54e
KW
6358{
6359 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6360 * set. A pointer to the inversion list is returned. This may actually be
c52a3e71
KW
6361 * a new list, in which case the passed in one has been destroyed. The
6362 * passed in inversion list can be NULL, in which case a new one is created
6363 * with just the one range in it */
d764b54e
KW
6364
6365 HV* range_invlist;
6366 HV* added_invlist;
c52a3e71 6367 UV len;
d764b54e 6368
c52a3e71
KW
6369 if (invlist == NULL) {
6370 invlist = _new_invlist(2);
6371 len = 0;
6372 }
6373 else {
6374 len = invlist_len(invlist);
6375 }
d764b54e
KW
6376
6377 /* If comes after the final entry, can just append it to the end */
6378 if (len == 0
6379 || start >= invlist_array(invlist)
6380 [invlist_len(invlist) - 1])
6381 {
6382 _append_range_to_invlist(invlist, start, end);
6383 return invlist;
6384 }
6385
6386 /* Here, can't just append things, create and return a new inversion list
6387 * which is the union of this range and the existing inversion list */
6388 range_invlist = _new_invlist(2);
6389 _append_range_to_invlist(range_invlist, start, end);
6390
6391 added_invlist = invlist_union(invlist, range_invlist);
6392
6393 /* The passed in list can be freed, as well as our temporary */
6394 invlist_destroy(range_invlist);
6395 if (invlist != added_invlist) {
6396 invlist_destroy(invlist);
6397 }
6398
6399 return added_invlist;
6400}
6401
c229b64c
KW
6402PERL_STATIC_INLINE HV*
6403S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6404 return add_range_to_invlist(invlist, cp, cp);
6405}
6406
d764b54e
KW
6407/* End of inversion list object */
6408
a687059c
LW
6409/*
6410 - reg - regular expression, i.e. main body or parenthesized thing
6411 *
6412 * Caller must absorb opening parenthesis.
6413 *
6414 * Combining parenthesis handling with the base level of regular expression
6415 * is a trifle forced, but the need to tie the tails of the branches to what
6416 * follows makes it hard to avoid.
6417 */
07be1b83
YO
6418#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6419#ifdef DEBUGGING
6420#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6421#else
6422#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6423#endif
3dab1dad 6424
76e3520e 6425STATIC regnode *
3dab1dad 6426S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 6427 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 6428{
27da23d5 6429 dVAR;
c277df42
IZ
6430 register regnode *ret; /* Will be the head of the group. */
6431 register regnode *br;
6432 register regnode *lastbr;
cbbf8932 6433 register regnode *ender = NULL;
a0d0e21e 6434 register I32 parno = 0;
cbbf8932 6435 I32 flags;
f7819f85 6436 U32 oregflags = RExC_flags;
6136c704
AL
6437 bool have_branch = 0;
6438 bool is_open = 0;
594d7033
YO
6439 I32 freeze_paren = 0;
6440 I32 after_freeze = 0;
9d1d55b5
JP
6441
6442 /* for (?g), (?gc), and (?o) warnings; warning
6443 about (?c) will warn about (?g) -- japhy */
6444
6136c704
AL
6445#define WASTED_O 0x01
6446#define WASTED_G 0x02
6447#define WASTED_C 0x04
6448#define WASTED_GC (0x02|0x04)
cbbf8932 6449 I32 wastedflags = 0x00;
9d1d55b5 6450
fac92740 6451 char * parse_start = RExC_parse; /* MJD */
a28509cc 6452 char * const oregcomp_parse = RExC_parse;
a0d0e21e 6453
3dab1dad 6454 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6455
6456 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
6457 DEBUG_PARSE("reg ");
6458
821b33a5 6459 *flagp = 0; /* Tentatively. */
a0d0e21e 6460
9d1d55b5 6461
a0d0e21e
LW
6462 /* Make an OPEN node, if parenthesized. */
6463 if (paren) {
e2e6a0f1
YO
6464 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6465 char *start_verb = RExC_parse;
6466 STRLEN verb_len = 0;
6467 char *start_arg = NULL;
6468 unsigned char op = 0;
6469 int argok = 1;
6470 int internal_argval = 0; /* internal_argval is only useful if !argok */
6471 while ( *RExC_parse && *RExC_parse != ')' ) {
6472 if ( *RExC_parse == ':' ) {
6473 start_arg = RExC_parse + 1;
6474 break;
6475 }
6476 RExC_parse++;
6477 }
6478 ++start_verb;
6479 verb_len = RExC_parse - start_verb;
6480 if ( start_arg ) {
6481 RExC_parse++;
6482 while ( *RExC_parse && *RExC_parse != ')' )
6483 RExC_parse++;
6484 if ( *RExC_parse != ')' )
6485 vFAIL("Unterminated verb pattern argument");
6486 if ( RExC_parse == start_arg )
6487 start_arg = NULL;
6488 } else {
6489 if ( *RExC_parse != ')' )
6490 vFAIL("Unterminated verb pattern");
6491 }
5d458dd8 6492
e2e6a0f1
YO
6493 switch ( *start_verb ) {
6494 case 'A': /* (*ACCEPT) */
568a785a 6495 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
6496 op = ACCEPT;
6497 internal_argval = RExC_nestroot;
6498 }
6499 break;
6500 case 'C': /* (*COMMIT) */
568a785a 6501 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 6502 op = COMMIT;
e2e6a0f1
YO
6503 break;
6504 case 'F': /* (*FAIL) */
568a785a 6505 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
6506 op = OPFAIL;
6507 argok = 0;
6508 }
6509 break;
5d458dd8
YO
6510 case ':': /* (*:NAME) */
6511 case 'M': /* (*MARK:NAME) */
568a785a 6512 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 6513 op = MARKPOINT;
5d458dd8
YO
6514 argok = -1;
6515 }
6516 break;
6517 case 'P': /* (*PRUNE) */
568a785a 6518 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 6519 op = PRUNE;
e2e6a0f1 6520 break;
5d458dd8 6521 case 'S': /* (*SKIP) */
568a785a 6522 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
6523 op = SKIP;
6524 break;
6525 case 'T': /* (*THEN) */
6526 /* [19:06] <TimToady> :: is then */
568a785a 6527 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
6528 op = CUTGROUP;
6529 RExC_seen |= REG_SEEN_CUTGROUP;
6530 }
e2e6a0f1
YO
6531 break;
6532 }
6533 if ( ! op ) {
6534 RExC_parse++;
6535 vFAIL3("Unknown verb pattern '%.*s'",
6536 verb_len, start_verb);
6537 }
6538 if ( argok ) {
6539 if ( start_arg && internal_argval ) {
6540 vFAIL3("Verb pattern '%.*s' may not have an argument",
6541 verb_len, start_verb);
6542 } else if ( argok < 0 && !start_arg ) {
6543 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6544 verb_len, start_verb);
6545 } else {
6546 ret = reganode(pRExC_state, op, internal_argval);
6547 if ( ! internal_argval && ! SIZE_ONLY ) {
6548 if (start_arg) {
6549 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6550 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6551 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
6552 ret->flags = 0;
6553 } else {
6554 ret->flags = 1;
6555 }
6556 }
6557 }
6558 if (!internal_argval)
6559 RExC_seen |= REG_SEEN_VERBARG;
6560 } else if ( start_arg ) {
6561 vFAIL3("Verb pattern '%.*s' may not have an argument",
6562 verb_len, start_verb);
6563 } else {
6564 ret = reg_node(pRExC_state, op);
6565 }
6566 nextchar(pRExC_state);
6567 return ret;
6568 } else
fac92740 6569 if (*RExC_parse == '?') { /* (?...) */
6136c704 6570 bool is_logical = 0;
a28509cc 6571 const char * const seqstart = RExC_parse;
fb85c044 6572 bool has_use_defaults = FALSE;
ca9dfc88 6573
830247a4
IZ
6574 RExC_parse++;
6575 paren = *RExC_parse++;
c277df42 6576 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 6577 switch (paren) {
894be9b7 6578
1f1031fe
YO
6579 case 'P': /* (?P...) variants for those used to PCRE/Python */
6580 paren = *RExC_parse++;
6581 if ( paren == '<') /* (?P<...>) named capture */
6582 goto named_capture;
6583 else if (paren == '>') { /* (?P>name) named recursion */
6584 goto named_recursion;
6585 }
6586 else if (paren == '=') { /* (?P=...) named backref */
6587 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6588 you change this make sure you change that */
6589 char* name_start = RExC_parse;
6590 U32 num = 0;
6591 SV *sv_dat = reg_scan_name(pRExC_state,
6592 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6593 if (RExC_parse == name_start || *RExC_parse != ')')
6594 vFAIL2("Sequence %.3s... not terminated",parse_start);
6595
6596 if (!SIZE_ONLY) {
6597 num = add_data( pRExC_state, 1, "S" );
6598 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6599 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
6600 }
6601 RExC_sawback = 1;
4444fd9f
KW
6602 ret = reganode(pRExC_state,
6603 ((! FOLD)
6604 ? NREF
2f7f8cb1
KW
6605 : (MORE_ASCII_RESTRICTED)
6606 ? NREFFA
6607 : (AT_LEAST_UNI_SEMANTICS)
6608 ? NREFFU
6609 : (LOC)
6610 ? NREFFL
6611 : NREFF),
4444fd9f 6612 num);
1f1031fe
YO
6613 *flagp |= HASWIDTH;
6614
6615 Set_Node_Offset(ret, parse_start+1);
6616 Set_Node_Cur_Length(ret); /* MJD */
6617
6618 nextchar(pRExC_state);
6619 return ret;
6620 }
57b84237
YO
6621 RExC_parse++;
6622 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6623 /*NOTREACHED*/
6624 case '<': /* (?<...) */
b81d288d 6625 if (*RExC_parse == '!')
c277df42 6626 paren = ',';
0a4db386 6627 else if (*RExC_parse != '=')
1f1031fe 6628 named_capture:
0a4db386 6629 { /* (?<...>) */
81714fb9 6630 char *name_start;
894be9b7 6631 SV *svname;
81714fb9
YO
6632 paren= '>';
6633 case '\'': /* (?'...') */
6634 name_start= RExC_parse;
0a4db386
YO
6635 svname = reg_scan_name(pRExC_state,
6636 SIZE_ONLY ? /* reverse test from the others */
6637 REG_RSN_RETURN_NAME :
6638 REG_RSN_RETURN_NULL);
57b84237
YO
6639 if (RExC_parse == name_start) {
6640 RExC_parse++;
6641 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6642 /*NOTREACHED*/
6643 }
81714fb9
YO
6644 if (*RExC_parse != paren)
6645 vFAIL2("Sequence (?%c... not terminated",
6646 paren=='>' ? '<' : paren);
6647 if (SIZE_ONLY) {
e62cc96a
YO
6648 HE *he_str;
6649 SV *sv_dat = NULL;
486ec47a 6650 if (!svname) /* shouldn't happen */
894be9b7
YO
6651 Perl_croak(aTHX_
6652 "panic: reg_scan_name returned NULL");
81714fb9
YO
6653 if (!RExC_paren_names) {
6654 RExC_paren_names= newHV();
ad64d0ec 6655 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
6656#ifdef DEBUGGING
6657 RExC_paren_name_list= newAV();
ad64d0ec 6658 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 6659#endif
81714fb9
YO
6660 }
6661 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 6662 if ( he_str )
81714fb9 6663 sv_dat = HeVAL(he_str);
e62cc96a 6664 if ( ! sv_dat ) {
81714fb9 6665 /* croak baby croak */
e62cc96a
YO
6666 Perl_croak(aTHX_
6667 "panic: paren_name hash element allocation failed");
6668 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
6669 /* (?|...) can mean we have dupes so scan to check
6670 its already been stored. Maybe a flag indicating
6671 we are inside such a construct would be useful,
6672 but the arrays are likely to be quite small, so
6673 for now we punt -- dmq */
6674 IV count = SvIV(sv_dat);
6675 I32 *pv = (I32*)SvPVX(sv_dat);
6676 IV i;
6677 for ( i = 0 ; i < count ; i++ ) {
6678 if ( pv[i] == RExC_npar ) {
6679 count = 0;
6680 break;
6681 }
6682 }
6683 if ( count ) {
6684 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6685 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6686 pv[count] = RExC_npar;
3a92e6ae 6687 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 6688 }
81714fb9
YO
6689 } else {
6690 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6691 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6692 SvIOK_on(sv_dat);
3ec35e0f 6693 SvIV_set(sv_dat, 1);
e62cc96a 6694 }
1f1031fe
YO
6695#ifdef DEBUGGING
6696 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6697 SvREFCNT_dec(svname);
6698#endif
e62cc96a 6699
81714fb9
YO
6700 /*sv_dump(sv_dat);*/
6701 }
6702 nextchar(pRExC_state);
6703 paren = 1;
6704 goto capturing_parens;
6705 }
6706 RExC_seen |= REG_SEEN_LOOKBEHIND;
b57e4118 6707 RExC_in_lookbehind++;
830247a4 6708 RExC_parse++;
fac92740 6709 case '=': /* (?=...) */
89c6a13e 6710 RExC_seen_zerolen++;
5c3fa2e7 6711 break;
fac92740 6712 case '!': /* (?!...) */
830247a4 6713 RExC_seen_zerolen++;
e2e6a0f1
YO
6714 if (*RExC_parse == ')') {
6715 ret=reg_node(pRExC_state, OPFAIL);
6716 nextchar(pRExC_state);
6717 return ret;
6718 }
594d7033
YO
6719 break;
6720 case '|': /* (?|...) */
6721 /* branch reset, behave like a (?:...) except that
6722 buffers in alternations share the same numbers */
6723 paren = ':';
6724 after_freeze = freeze_paren = RExC_npar;
6725 break;
fac92740
MJD
6726 case ':': /* (?:...) */
6727 case '>': /* (?>...) */
a0d0e21e 6728 break;
fac92740
MJD
6729 case '$': /* (?$...) */
6730 case '@': /* (?@...) */
8615cb43 6731 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 6732 break;
fac92740 6733 case '#': /* (?#...) */
830247a4
IZ
6734 while (*RExC_parse && *RExC_parse != ')')
6735 RExC_parse++;
6736 if (*RExC_parse != ')')
c277df42 6737 FAIL("Sequence (?#... not terminated");
830247a4 6738 nextchar(pRExC_state);
a0d0e21e
LW
6739 *flagp = TRYAGAIN;
6740 return NULL;
894be9b7
YO
6741 case '0' : /* (?0) */
6742 case 'R' : /* (?R) */
6743 if (*RExC_parse != ')')
6bda09f9 6744 FAIL("Sequence (?R) not terminated");
1a147d38 6745 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 6746 *flagp |= POSTPONED;
7f69552c
YO
6747 nextchar(pRExC_state);
6748 return ret;
6749 /*notreached*/
894be9b7
YO
6750 { /* named and numeric backreferences */
6751 I32 num;
894be9b7
YO
6752 case '&': /* (?&NAME) */
6753 parse_start = RExC_parse - 1;
1f1031fe 6754 named_recursion:
894be9b7 6755 {
0a4db386
YO
6756 SV *sv_dat = reg_scan_name(pRExC_state,
6757 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6758 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
6759 }
6760 goto gen_recurse_regop;
6761 /* NOT REACHED */
542fa716
YO
6762 case '+':
6763 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6764 RExC_parse++;
6765 vFAIL("Illegal pattern");
6766 }
6767 goto parse_recursion;
6768 /* NOT REACHED*/
6769 case '-': /* (?-1) */
6770 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6771 RExC_parse--; /* rewind to let it be handled later */
6772 goto parse_flags;
6773 }
6774 /*FALLTHROUGH */
6bda09f9
YO
6775 case '1': case '2': case '3': case '4': /* (?1) */
6776 case '5': case '6': case '7': case '8': case '9':
6777 RExC_parse--;
542fa716 6778 parse_recursion:
894be9b7
YO
6779 num = atoi(RExC_parse);
6780 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
6781 if (*RExC_parse == '-')
6782 RExC_parse++;
6bda09f9
YO
6783 while (isDIGIT(*RExC_parse))
6784 RExC_parse++;
6785 if (*RExC_parse!=')')
6786 vFAIL("Expecting close bracket");
894be9b7
YO
6787
6788 gen_recurse_regop:
542fa716
YO
6789 if ( paren == '-' ) {
6790 /*
6791 Diagram of capture buffer numbering.
6792 Top line is the normal capture buffer numbers
3b753521 6793 Bottom line is the negative indexing as from
542fa716
YO
6794 the X (the (?-2))
6795
6796 + 1 2 3 4 5 X 6 7
6797 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6798 - 5 4 3 2 1 X x x
6799
6800 */
6801 num = RExC_npar + num;
6802 if (num < 1) {
6803 RExC_parse++;
6804 vFAIL("Reference to nonexistent group");
6805 }
6806 } else if ( paren == '+' ) {
6807 num = RExC_npar + num - 1;
6808 }
6809
1a147d38 6810 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6811 if (!SIZE_ONLY) {
6812 if (num > (I32)RExC_rx->nparens) {
6813 RExC_parse++;
6814 vFAIL("Reference to nonexistent group");
6815 }
40d049e4 6816 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6817 RExC_emit++;
226de585 6818 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6819 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6820 } else {
6bda09f9 6821 RExC_size++;
6bda09f9 6822 }
0a4db386 6823 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6824 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6825 Set_Node_Offset(ret, parse_start); /* MJD */
6826
a3b492c3 6827 *flagp |= POSTPONED;
6bda09f9
YO
6828 nextchar(pRExC_state);
6829 return ret;
894be9b7
YO
6830 } /* named and numeric backreferences */
6831 /* NOT REACHED */
6832
fac92740 6833 case '?': /* (??...) */
6136c704 6834 is_logical = 1;
57b84237
YO
6835 if (*RExC_parse != '{') {
6836 RExC_parse++;
6837 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6838 /*NOTREACHED*/
6839 }
a3b492c3 6840 *flagp |= POSTPONED;
830247a4 6841 paren = *RExC_parse++;
0f5d15d6 6842 /* FALL THROUGH */
fac92740 6843 case '{': /* (?{...}) */
c277df42 6844 {
2eccd3b2
NC
6845 I32 count = 1;
6846 U32 n = 0;
c277df42 6847 char c;
830247a4 6848 char *s = RExC_parse;
c277df42 6849
830247a4
IZ
6850 RExC_seen_zerolen++;
6851 RExC_seen |= REG_SEEN_EVAL;
6852 while (count && (c = *RExC_parse)) {
6136c704
AL
6853 if (c == '\\') {
6854 if (RExC_parse[1])
6855 RExC_parse++;
6856 }
b81d288d 6857 else if (c == '{')
c277df42 6858 count++;
b81d288d 6859 else if (c == '}')
c277df42 6860 count--;
830247a4 6861 RExC_parse++;
c277df42 6862 }
6136c704 6863 if (*RExC_parse != ')') {
b81d288d 6864 RExC_parse = s;
b45f050a
JF
6865 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6866 }
c277df42 6867 if (!SIZE_ONLY) {
f3548bdc 6868 PAD *pad;
6136c704
AL
6869 OP_4tree *sop, *rop;
6870 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6871
569233ed
SB
6872 ENTER;
6873 Perl_save_re_context(aTHX);
d59a8b3e 6874 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
6875 sop->op_private |= OPpREFCOUNTED;
6876 /* re_dup will OpREFCNT_inc */
6877 OpREFCNT_set(sop, 1);
569233ed 6878 LEAVE;
c277df42 6879
830247a4 6880 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6881 RExC_rxi->data->data[n] = (void*)rop;
6882 RExC_rxi->data->data[n+1] = (void*)sop;
6883 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6884 SvREFCNT_dec(sv);
a0ed51b3 6885 }
e24b16f9 6886 else { /* First pass */
830247a4 6887 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6888 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6889 /* No compiled RE interpolated, has runtime
6890 components ===> unsafe. */
6891 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6892 if (PL_tainting && PL_tainted)
cc6b7395 6893 FAIL("Eval-group in insecure regular expression");
54df2634 6894#if PERL_VERSION > 8
923e4eb5 6895 if (IN_PERL_COMPILETIME)
b5c19bd7 6896 PL_cv_has_eval = 1;
54df2634 6897#endif
c277df42 6898 }
b5c19bd7 6899
830247a4 6900 nextchar(pRExC_state);
6136c704 6901 if (is_logical) {
830247a4 6902 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6903 if (!SIZE_ONLY)
6904 ret->flags = 2;
3dab1dad 6905 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6906 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6907 return ret;
6908 }
ccb2c380
MP
6909 ret = reganode(pRExC_state, EVAL, n);
6910 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6911 Set_Node_Offset(ret, parse_start);
6912 return ret;
c277df42 6913 }
fac92740 6914 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6915 {
0a4db386 6916 int is_define= 0;
fac92740 6917 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6918 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6919 || RExC_parse[1] == '<'
830247a4 6920 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6921 I32 flag;
6922
830247a4 6923 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6924 if (!SIZE_ONLY)
6925 ret->flags = 1;
3dab1dad 6926 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6927 goto insert_if;
b81d288d 6928 }
a0ed51b3 6929 }
0a4db386
YO
6930 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6931 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6932 {
6933 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6934 char *name_start= RExC_parse++;
2eccd3b2 6935 U32 num = 0;
0a4db386
YO
6936 SV *sv_dat=reg_scan_name(pRExC_state,
6937 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6938 if (RExC_parse == name_start || *RExC_parse != ch)
6939 vFAIL2("Sequence (?(%c... not terminated",
6940 (ch == '>' ? '<' : ch));
6941 RExC_parse++;
6942 if (!SIZE_ONLY) {
6943 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6944 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6945 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6946 }
6947 ret = reganode(pRExC_state,NGROUPP,num);
6948 goto insert_if_check_paren;
6949 }
6950 else if (RExC_parse[0] == 'D' &&
6951 RExC_parse[1] == 'E' &&
6952 RExC_parse[2] == 'F' &&
6953 RExC_parse[3] == 'I' &&
6954 RExC_parse[4] == 'N' &&
6955 RExC_parse[5] == 'E')
6956 {
6957 ret = reganode(pRExC_state,DEFINEP,0);
6958 RExC_parse +=6 ;
6959 is_define = 1;
6960 goto insert_if_check_paren;
6961 }
6962 else if (RExC_parse[0] == 'R') {
6963 RExC_parse++;
6964 parno = 0;
6965 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6966 parno = atoi(RExC_parse++);
6967 while (isDIGIT(*RExC_parse))
6968 RExC_parse++;
6969 } else if (RExC_parse[0] == '&') {
6970 SV *sv_dat;
6971 RExC_parse++;
6972 sv_dat = reg_scan_name(pRExC_state,
6973 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6974 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6975 }
1a147d38 6976 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6977 goto insert_if_check_paren;
6978 }
830247a4 6979 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6980 /* (?(1)...) */
6136c704 6981 char c;
830247a4 6982 parno = atoi(RExC_parse++);
c277df42 6983
830247a4
IZ
6984 while (isDIGIT(*RExC_parse))
6985 RExC_parse++;
fac92740 6986 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6987
0a4db386 6988 insert_if_check_paren:
830247a4 6989 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6990 vFAIL("Switch condition not recognized");
c277df42 6991 insert_if:
3dab1dad
YO
6992 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6993 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6994 if (br == NULL)
830247a4 6995 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6996 else
3dab1dad 6997 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6998 c = *nextchar(pRExC_state);
d1b80229
IZ
6999 if (flags&HASWIDTH)
7000 *flagp |= HASWIDTH;
c277df42 7001 if (c == '|') {
0a4db386
YO
7002 if (is_define)
7003 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 7004 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
7005 regbranch(pRExC_state, &flags, 1,depth+1);
7006 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
7007 if (flags&HASWIDTH)
7008 *flagp |= HASWIDTH;
830247a4 7009 c = *nextchar(pRExC_state);
a0ed51b3
LW
7010 }
7011 else
c277df42
IZ
7012 lastbr = NULL;
7013 if (c != ')')
8615cb43 7014 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 7015 ender = reg_node(pRExC_state, TAIL);
3dab1dad 7016 REGTAIL(pRExC_state, br, ender);
c277df42 7017 if (lastbr) {
3dab1dad
YO
7018 REGTAIL(pRExC_state, lastbr, ender);
7019 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
7020 }
7021 else
3dab1dad 7022 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
7023 RExC_size++; /* XXX WHY do we need this?!!
7024 For large programs it seems to be required
7025 but I can't figure out why. -- dmq*/
c277df42 7026 return ret;
a0ed51b3
LW
7027 }
7028 else {
830247a4 7029 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
7030 }
7031 }
1b1626e4 7032 case 0:
830247a4 7033 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 7034 vFAIL("Sequence (? incomplete");
1b1626e4 7035 break;
85508812
KW
7036 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
7037 that follow */
fb85c044
KW
7038 has_use_defaults = TRUE;
7039 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
e40e74fe
KW
7040 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7041 ? REGEX_UNICODE_CHARSET
7042 : REGEX_DEPENDS_CHARSET);
fb85c044 7043 goto parse_flags;
a0d0e21e 7044 default:
cde0cee5
YO
7045 --RExC_parse;
7046 parse_flags: /* (?i) */
7047 {
7048 U32 posflags = 0, negflags = 0;
7049 U32 *flagsp = &posflags;
9de15fec 7050 bool has_charset_modifier = 0;
295c2f7d
KW
7051 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7052 ? REGEX_UNICODE_CHARSET
7053 : REGEX_DEPENDS_CHARSET;
cde0cee5
YO
7054
7055 while (*RExC_parse) {
7056 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
7057 /* (?g), (?gc) and (?o) are useless here
7058 and must be globally applied -- japhy */
cde0cee5
YO
7059 switch (*RExC_parse) {
7060 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
7061 case LOCALE_PAT_MOD:
7062 if (has_charset_modifier || flagsp == &negflags) {
7063 goto fail_modifiers;
7064 }
a62b1201 7065 cs = REGEX_LOCALE_CHARSET;
9de15fec 7066 has_charset_modifier = 1;
4624b182 7067 RExC_contains_locale = 1;
9de15fec
KW
7068 break;
7069 case UNICODE_PAT_MOD:
7070 if (has_charset_modifier || flagsp == &negflags) {
7071 goto fail_modifiers;
7072 }
a62b1201 7073 cs = REGEX_UNICODE_CHARSET;
9de15fec
KW
7074 has_charset_modifier = 1;
7075 break;
cfaf538b
KW
7076 case ASCII_RESTRICT_PAT_MOD:
7077 if (has_charset_modifier || flagsp == &negflags) {
7078 goto fail_modifiers;
7079 }
2f7f8cb1
KW
7080 if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7081 /* Doubled modifier implies more restricted */
7082 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7083 RExC_parse++;
7084 }
7085 else {
7086 cs = REGEX_ASCII_RESTRICTED_CHARSET;
7087 }
cfaf538b
KW
7088 has_charset_modifier = 1;
7089 break;
50e91148 7090 case DEPENDS_PAT_MOD:
9de15fec
KW
7091 if (has_use_defaults
7092 || has_charset_modifier
7093 || flagsp == &negflags)
7094 {
7095 goto fail_modifiers;
7096 }
7b98bc43
KW
7097
7098 /* The dual charset means unicode semantics if the
7099 * pattern (or target, not known until runtime) are
e40e74fe
KW
7100 * utf8, or something in the pattern indicates unicode
7101 * semantics */
7102 cs = (RExC_utf8 || RExC_uni_semantics)
a62b1201
KW
7103 ? REGEX_UNICODE_CHARSET
7104 : REGEX_DEPENDS_CHARSET;
9de15fec
KW
7105 has_charset_modifier = 1;
7106 break;
f7819f85
A
7107 case ONCE_PAT_MOD: /* 'o' */
7108 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 7109 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 7110 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
7111 if (! (wastedflags & wflagbit) ) {
7112 wastedflags |= wflagbit;
7113 vWARN5(
7114 RExC_parse + 1,
7115 "Useless (%s%c) - %suse /%c modifier",
7116 flagsp == &negflags ? "?-" : "?",
7117 *RExC_parse,
7118 flagsp == &negflags ? "don't " : "",
7119 *RExC_parse
7120 );
7121 }
7122 }
cde0cee5
YO
7123 break;
7124
f7819f85 7125 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 7126 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
7127 if (! (wastedflags & WASTED_C) ) {
7128 wastedflags |= WASTED_GC;
9d1d55b5
JP
7129 vWARN3(
7130 RExC_parse + 1,
7131 "Useless (%sc) - %suse /gc modifier",
7132 flagsp == &negflags ? "?-" : "?",
7133 flagsp == &negflags ? "don't " : ""
7134 );
7135 }
7136 }
cde0cee5 7137 break;
f7819f85 7138 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 7139 if (flagsp == &negflags) {
668c081a
NC
7140 if (SIZE_ONLY)
7141 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
7142 } else {
7143 *flagsp |= RXf_PMf_KEEPCOPY;
7144 }
7145 break;
7146 case '-':
3b753521 7147 /* A flag is a default iff it is following a minus, so
fb85c044
KW
7148 * if there is a minus, it means will be trying to
7149 * re-specify a default which is an error */
7150 if (has_use_defaults || flagsp == &negflags) {
9de15fec 7151 fail_modifiers:
57b84237
YO
7152 RExC_parse++;
7153 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7154 /*NOTREACHED*/
7155 }
cde0cee5
YO
7156 flagsp = &negflags;
7157 wastedflags = 0; /* reset so (?g-c) warns twice */
7158 break;
7159 case ':':
7160 paren = ':';
7161 /*FALLTHROUGH*/
7162 case ')':
7163 RExC_flags |= posflags;
7164 RExC_flags &= ~negflags;
a62b1201 7165 set_regex_charset(&RExC_flags, cs);
f7819f85
A
7166 if (paren != ':') {
7167 oregflags |= posflags;
7168 oregflags &= ~negflags;
a62b1201 7169 set_regex_charset(&oregflags, cs);
f7819f85 7170 }
cde0cee5
YO
7171 nextchar(pRExC_state);
7172 if (paren != ':') {
7173 *flagp = TRYAGAIN;
7174 return NULL;
7175 } else {
7176 ret = NULL;
7177 goto parse_rest;
7178 }
7179 /*NOTREACHED*/
7180 default:
cde0cee5
YO
7181 RExC_parse++;
7182 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7183 /*NOTREACHED*/
7184 }
830247a4 7185 ++RExC_parse;
48c036b1 7186 }
cde0cee5 7187 }} /* one for the default block, one for the switch */
a0d0e21e 7188 }
fac92740 7189 else { /* (...) */
81714fb9 7190 capturing_parens:
830247a4
IZ
7191 parno = RExC_npar;
7192 RExC_npar++;
e2e6a0f1 7193
830247a4 7194 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
7195 if (!SIZE_ONLY ){
7196 if (!RExC_nestroot)
7197 RExC_nestroot = parno;
c009da3d
YO
7198 if (RExC_seen & REG_SEEN_RECURSE
7199 && !RExC_open_parens[parno-1])
7200 {
e2e6a0f1 7201 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
7202 "Setting open paren #%"IVdf" to %d\n",
7203 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
7204 RExC_open_parens[parno-1]= ret;
7205 }
6bda09f9 7206 }
fac92740
MJD
7207 Set_Node_Length(ret, 1); /* MJD */
7208 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 7209 is_open = 1;
a0d0e21e 7210 }
a0ed51b3 7211 }
fac92740 7212 else /* ! paren */
a0d0e21e 7213 ret = NULL;
cde0cee5
YO
7214
7215 parse_rest:
a0d0e21e 7216 /* Pick up the branches, linking them together. */
fac92740 7217 parse_start = RExC_parse; /* MJD */
3dab1dad 7218 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e 7219
fac92740 7220 /* branch_len = (paren != 0); */
2af232bd 7221
a0d0e21e
LW
7222 if (br == NULL)
7223 return(NULL);
830247a4
IZ
7224 if (*RExC_parse == '|') {
7225 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 7226 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 7227 }
fac92740 7228 else { /* MJD */
6bda09f9 7229 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
7230 Set_Node_Length(br, paren != 0);
7231 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7232 }
c277df42
IZ
7233 have_branch = 1;
7234 if (SIZE_ONLY)
830247a4 7235 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
7236 }
7237 else if (paren == ':') {
c277df42
IZ
7238 *flagp |= flags&SIMPLE;
7239 }
6136c704 7240 if (is_open) { /* Starts with OPEN. */
3dab1dad 7241 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
7242 }
7243 else if (paren != '?') /* Not Conditional */
a0d0e21e 7244 ret = br;
8ae10a67 7245 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 7246 lastbr = br;
830247a4
IZ
7247 while (*RExC_parse == '|') {
7248 if (!SIZE_ONLY && RExC_extralen) {
7249 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 7250 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
7251 }
7252 if (SIZE_ONLY)
830247a4
IZ
7253 RExC_extralen += 2; /* Account for LONGJMP. */
7254 nextchar(pRExC_state);
594d7033
YO
7255 if (freeze_paren) {
7256 if (RExC_npar > after_freeze)
7257 after_freeze = RExC_npar;
7258 RExC_npar = freeze_paren;
7259 }
3dab1dad 7260 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 7261
a687059c 7262 if (br == NULL)
a0d0e21e 7263 return(NULL);
3dab1dad 7264 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 7265 lastbr = br;
8ae10a67 7266 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
7267 }
7268
c277df42
IZ
7269 if (have_branch || paren != ':') {
7270 /* Make a closing node, and hook it on the end. */
7271 switch (paren) {
7272 case ':':
830247a4 7273 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
7274 break;
7275 case 1:
830247a4 7276 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
7277 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7278 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7279 "Setting close paren #%"IVdf" to %d\n",
7280 (IV)parno, REG_NODE_NUM(ender)));
7281 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
7282 if (RExC_nestroot == parno)
7283 RExC_nestroot = 0;
40d049e4 7284 }
fac92740
MJD
7285 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7286 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
7287 break;
7288 case '<':
c277df42
IZ
7289 case ',':
7290 case '=':
7291 case '!':
c277df42 7292 *flagp &= ~HASWIDTH;
821b33a5
IZ
7293 /* FALL THROUGH */
7294 case '>':
830247a4 7295 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
7296 break;
7297 case 0:
830247a4 7298 ender = reg_node(pRExC_state, END);
40d049e4
YO
7299 if (!SIZE_ONLY) {
7300 assert(!RExC_opend); /* there can only be one! */
7301 RExC_opend = ender;
7302 }
c277df42
IZ
7303 break;
7304 }
eaf3ca90 7305 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 7306
9674d46a 7307 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
7308 if (depth==1)
7309 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7310
c277df42 7311 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
7312 for (br = ret; br; br = regnext(br)) {
7313 const U8 op = PL_regkind[OP(br)];
7314 if (op == BRANCH) {
07be1b83 7315 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
7316 }
7317 else if (op == BRANCHJ) {
07be1b83 7318 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 7319 }
c277df42
IZ
7320 }
7321 }
a0d0e21e 7322 }
c277df42
IZ
7323
7324 {
e1ec3a88
AL
7325 const char *p;
7326 static const char parens[] = "=!<,>";
c277df42
IZ
7327
7328 if (paren && (p = strchr(parens, paren))) {
eb160463 7329 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
7330 int flag = (p - parens) > 1;
7331
7332 if (paren == '>')
7333 node = SUSPEND, flag = 0;
6bda09f9 7334 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
7335 Set_Node_Cur_Length(ret);
7336 Set_Node_Offset(ret, parse_start + 1);
c277df42 7337 ret->flags = flag;
07be1b83 7338 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 7339 }
a0d0e21e
LW
7340 }
7341
7342 /* Check for proper termination. */
ce3e6498 7343 if (paren) {
e2509266 7344 RExC_flags = oregflags;
830247a4
IZ
7345 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7346 RExC_parse = oregcomp_parse;
380a0633 7347 vFAIL("Unmatched (");
ce3e6498 7348 }
a0ed51b3 7349 }
830247a4
IZ
7350 else if (!paren && RExC_parse < RExC_end) {
7351 if (*RExC_parse == ')') {
7352 RExC_parse++;
380a0633 7353 vFAIL("Unmatched )");
a0ed51b3
LW
7354 }
7355 else
b45f050a 7356 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
7357 /* NOTREACHED */
7358 }
b57e4118
KW
7359
7360 if (RExC_in_lookbehind) {
7361 RExC_in_lookbehind--;
7362 }
fd4be6f0 7363 if (after_freeze > RExC_npar)
594d7033 7364 RExC_npar = after_freeze;
a0d0e21e 7365 return(ret);
a687059c
LW
7366}
7367
7368/*
7369 - regbranch - one alternative of an | operator
7370 *
7371 * Implements the concatenation operator.
7372 */
76e3520e 7373STATIC regnode *
3dab1dad 7374S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 7375{
97aff369 7376 dVAR;
c277df42
IZ
7377 register regnode *ret;
7378 register regnode *chain = NULL;
7379 register regnode *latest;
7380 I32 flags = 0, c = 0;
3dab1dad 7381 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7382
7383 PERL_ARGS_ASSERT_REGBRANCH;
7384
3dab1dad 7385 DEBUG_PARSE("brnc");
02daf0ab 7386
b81d288d 7387 if (first)
c277df42
IZ
7388 ret = NULL;
7389 else {
b81d288d 7390 if (!SIZE_ONLY && RExC_extralen)
830247a4 7391 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 7392 else {
830247a4 7393 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
7394 Set_Node_Length(ret, 1);
7395 }
c277df42
IZ
7396 }
7397
b81d288d 7398 if (!first && SIZE_ONLY)
830247a4 7399 RExC_extralen += 1; /* BRANCHJ */
b81d288d 7400
c277df42 7401 *flagp = WORST; /* Tentatively. */
a0d0e21e 7402
830247a4
IZ
7403 RExC_parse--;
7404 nextchar(pRExC_state);
7405 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 7406 flags &= ~TRYAGAIN;
3dab1dad 7407 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
7408 if (latest == NULL) {
7409 if (flags & TRYAGAIN)
7410 continue;
7411 return(NULL);
a0ed51b3
LW
7412 }
7413 else if (ret == NULL)
c277df42 7414 ret = latest;
8ae10a67 7415 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 7416 if (chain == NULL) /* First piece. */
a0d0e21e
LW
7417 *flagp |= flags&SPSTART;
7418 else {
830247a4 7419 RExC_naughty++;
3dab1dad 7420 REGTAIL(pRExC_state, chain, latest);
a687059c 7421 }
a0d0e21e 7422 chain = latest;
c277df42
IZ
7423 c++;
7424 }
7425 if (chain == NULL) { /* Loop ran zero times. */
830247a4 7426 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
7427 if (ret == NULL)
7428 ret = chain;
7429 }
7430 if (c == 1) {
7431 *flagp |= flags&SIMPLE;
a0d0e21e 7432 }
a687059c 7433
d4c19fe8 7434 return ret;
a687059c
LW
7435}
7436
7437/*
7438 - regpiece - something followed by possible [*+?]
7439 *
7440 * Note that the branching code sequences used for ? and the general cases
7441 * of * and + are somewhat optimized: they use the same NOTHING node as
7442 * both the endmarker for their branch list and the body of the last branch.
7443 * It might seem that this node could be dispensed with entirely, but the
7444 * endmarker role is not redundant.
7445 */
76e3520e 7446STATIC regnode *
3dab1dad 7447S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7448{
97aff369 7449 dVAR;
c277df42 7450 register regnode *ret;
a0d0e21e
LW
7451 register char op;
7452 register char *next;
7453 I32 flags;
1df70142 7454 const char * const origparse = RExC_parse;
a0d0e21e 7455 I32 min;
c277df42 7456 I32 max = REG_INFTY;
fac92740 7457 char *parse_start;
10edeb5d 7458 const char *maxpos = NULL;
3dab1dad 7459 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7460
7461 PERL_ARGS_ASSERT_REGPIECE;
7462
3dab1dad 7463 DEBUG_PARSE("piec");
a0d0e21e 7464
3dab1dad 7465 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
7466 if (ret == NULL) {
7467 if (flags & TRYAGAIN)
7468 *flagp |= TRYAGAIN;
7469 return(NULL);
7470 }
7471
830247a4 7472 op = *RExC_parse;
a0d0e21e 7473
830247a4 7474 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 7475 maxpos = NULL;
fac92740 7476 parse_start = RExC_parse; /* MJD */
830247a4 7477 next = RExC_parse + 1;
a0d0e21e
LW
7478 while (isDIGIT(*next) || *next == ',') {
7479 if (*next == ',') {
7480 if (maxpos)
7481 break;
7482 else
7483 maxpos = next;
a687059c 7484 }
a0d0e21e
LW
7485 next++;
7486 }
7487 if (*next == '}') { /* got one */
7488 if (!maxpos)
7489 maxpos = next;
830247a4
IZ
7490 RExC_parse++;
7491 min = atoi(RExC_parse);
a0d0e21e
LW
7492 if (*maxpos == ',')
7493 maxpos++;
7494 else
830247a4 7495 maxpos = RExC_parse;
a0d0e21e
LW
7496 max = atoi(maxpos);
7497 if (!max && *maxpos != '0')
c277df42
IZ
7498 max = REG_INFTY; /* meaning "infinity" */
7499 else if (max >= REG_INFTY)
8615cb43 7500 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
7501 RExC_parse = next;
7502 nextchar(pRExC_state);
a0d0e21e
LW
7503
7504 do_curly:
7505 if ((flags&SIMPLE)) {
830247a4 7506 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 7507 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
7508 Set_Node_Offset(ret, parse_start+1); /* MJD */
7509 Set_Node_Cur_Length(ret);
a0d0e21e
LW
7510 }
7511 else {
3dab1dad 7512 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
7513
7514 w->flags = 0;
3dab1dad 7515 REGTAIL(pRExC_state, ret, w);
830247a4 7516 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
7517 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7518 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
7519 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7520 }
6bda09f9 7521 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
7522 /* MJD hk */
7523 Set_Node_Offset(ret, parse_start+1);
2af232bd 7524 Set_Node_Length(ret,
fac92740 7525 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 7526
830247a4 7527 if (!SIZE_ONLY && RExC_extralen)
c277df42 7528 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 7529 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 7530 if (SIZE_ONLY)
830247a4
IZ
7531 RExC_whilem_seen++, RExC_extralen += 3;
7532 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 7533 }
c277df42 7534 ret->flags = 0;
a0d0e21e
LW
7535
7536 if (min > 0)
821b33a5
IZ
7537 *flagp = WORST;
7538 if (max > 0)
7539 *flagp |= HASWIDTH;
8fa23287 7540 if (max < min)
8615cb43 7541 vFAIL("Can't do {n,m} with n > m");
c277df42 7542 if (!SIZE_ONLY) {
eb160463
GS
7543 ARG1_SET(ret, (U16)min);
7544 ARG2_SET(ret, (U16)max);
a687059c 7545 }
a687059c 7546
a0d0e21e 7547 goto nest_check;
a687059c 7548 }
a0d0e21e 7549 }
a687059c 7550
a0d0e21e
LW
7551 if (!ISMULT1(op)) {
7552 *flagp = flags;
a687059c 7553 return(ret);
a0d0e21e 7554 }
bb20fd44 7555
c277df42 7556#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
7557
7558 /* if this is reinstated, don't forget to put this back into perldiag:
7559
7560 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7561
7562 (F) The part of the regexp subject to either the * or + quantifier
7563 could match an empty string. The {#} shows in the regular
7564 expression about where the problem was discovered.
7565
7566 */
7567
bb20fd44 7568 if (!(flags&HASWIDTH) && op != '?')
b45f050a 7569 vFAIL("Regexp *+ operand could be empty");
b81d288d 7570#endif
bb20fd44 7571
fac92740 7572 parse_start = RExC_parse;
830247a4 7573 nextchar(pRExC_state);
a0d0e21e 7574
821b33a5 7575 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
7576
7577 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 7578 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 7579 ret->flags = 0;
830247a4 7580 RExC_naughty += 4;
a0d0e21e
LW
7581 }
7582 else if (op == '*') {
7583 min = 0;
7584 goto do_curly;
a0ed51b3
LW
7585 }
7586 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 7587 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 7588 ret->flags = 0;
830247a4 7589 RExC_naughty += 3;
a0d0e21e
LW
7590 }
7591 else if (op == '+') {
7592 min = 1;
7593 goto do_curly;
a0ed51b3
LW
7594 }
7595 else if (op == '?') {
a0d0e21e
LW
7596 min = 0; max = 1;
7597 goto do_curly;
7598 }
7599 nest_check:
668c081a
NC
7600 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7601 ckWARN3reg(RExC_parse,
7602 "%.*s matches null string many times",
7603 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7604 origparse);
a0d0e21e
LW
7605 }
7606
b9b4dddf 7607 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 7608 nextchar(pRExC_state);
6bda09f9 7609 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 7610 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 7611 }
b9b4dddf
YO
7612#ifndef REG_ALLOW_MINMOD_SUSPEND
7613 else
7614#endif
7615 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7616 regnode *ender;
7617 nextchar(pRExC_state);
7618 ender = reg_node(pRExC_state, SUCCEED);
7619 REGTAIL(pRExC_state, ret, ender);
7620 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7621 ret->flags = 0;
7622 ender = reg_node(pRExC_state, TAIL);
7623 REGTAIL(pRExC_state, ret, ender);
7624 /*ret= ender;*/
7625 }
7626
7627 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 7628 RExC_parse++;
b45f050a
JF
7629 vFAIL("Nested quantifiers");
7630 }
a0d0e21e
LW
7631
7632 return(ret);
a687059c
LW
7633}
7634
fc8cd66c 7635
9d64099b 7636/* reg_namedseq(pRExC_state,UVp, UV depth)
fc8cd66c
YO
7637
7638 This is expected to be called by a parser routine that has
afefe6bf 7639 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
7640 expected to point at the first char following the N at the time
7641 of the call.
ff3f963a
KW
7642
7643 The \N may be inside (indicated by valuep not being NULL) or outside a
7644 character class.
7645
7646 \N may begin either a named sequence, or if outside a character class, mean
7647 to match a non-newline. For non single-quoted regexes, the tokenizer has
7648 attempted to decide which, and in the case of a named sequence converted it
7649 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7650 where c1... are the characters in the sequence. For single-quoted regexes,
7651 the tokenizer passes the \N sequence through unchanged; this code will not
7652 attempt to determine this nor expand those. The net effect is that if the
7653 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7654 signals that this \N occurrence means to match a non-newline.
7655
7656 Only the \N{U+...} form should occur in a character class, for the same
7657 reason that '.' inside a character class means to just match a period: it
7658 just doesn't make sense.
fc8cd66c
YO
7659
7660 If valuep is non-null then it is assumed that we are parsing inside
7661 of a charclass definition and the first codepoint in the resolved
7662 string is returned via *valuep and the routine will return NULL.
7663 In this mode if a multichar string is returned from the charnames
ff3f963a 7664 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
7665 sequence will be examined. If the string returned is zero length
7666 then the value of *valuep is undefined and NON-NULL will
7667 be returned to indicate failure. (This will NOT be a valid pointer
7668 to a regnode.)
7669
ff3f963a
KW
7670 If valuep is null then it is assumed that we are parsing normal text and a
7671 new EXACT node is inserted into the program containing the resolved string,
7672 and a pointer to the new node is returned. But if the string is zero length
7673 a NOTHING node is emitted instead.
afefe6bf 7674
fc8cd66c 7675 On success RExC_parse is set to the char following the endbrace.
ff3f963a 7676 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
7677 */
7678STATIC regnode *
9d64099b 7679S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
fc8cd66c 7680{
c3c41406 7681 char * endbrace; /* '}' following the name */
fc8cd66c 7682 regnode *ret = NULL;
c3c41406 7683 char* p;
ff3f963a
KW
7684
7685 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7686
7687 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
7688
7689 GET_RE_DEBUG_FLAGS;
c3c41406
KW
7690
7691 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7692 * modifier. The other meaning does not */
7693 p = (RExC_flags & RXf_PMf_EXTENDED)
7694 ? regwhite( pRExC_state, RExC_parse )
7695 : RExC_parse;
7918f24d 7696
ff3f963a 7697 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
7698 * [^\n]. The former is assumed when it can't be the latter. */
7699 if (*p != '{' || regcurly(p)) {
7700 RExC_parse = p;
ff3f963a 7701 if (valuep) {
afefe6bf 7702 /* no bare \N in a charclass */
ff3f963a
KW
7703 vFAIL("\\N in a character class must be a named character: \\N{...}");
7704 }
afefe6bf
RGS
7705 nextchar(pRExC_state);
7706 ret = reg_node(pRExC_state, REG_ANY);
7707 *flagp |= HASWIDTH|SIMPLE;
7708 RExC_naughty++;
7709 RExC_parse--;
7710 Set_Node_Length(ret, 1); /* MJD */
7711 return ret;
fc8cd66c 7712 }
a4893424 7713
c3c41406
KW
7714 /* Here, we have decided it should be a named sequence */
7715
7716 /* The test above made sure that the next real character is a '{', but
7717 * under the /x modifier, it could be separated by space (or a comment and
7718 * \n) and this is not allowed (for consistency with \x{...} and the
7719 * tokenizer handling of \N{NAME}). */
7720 if (*RExC_parse != '{') {
7721 vFAIL("Missing braces on \\N{}");
7722 }
7723
ff3f963a 7724 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
7725
7726 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7727 || ! (endbrace == RExC_parse /* nothing between the {} */
7728 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7729 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7730 {
7731 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7732 vFAIL("\\N{NAME} must be resolved by the lexer");
7733 }
7734
ff3f963a
KW
7735 if (endbrace == RExC_parse) { /* empty: \N{} */
7736 if (! valuep) {
7737 RExC_parse = endbrace + 1;
7738 return reg_node(pRExC_state,NOTHING);
a4893424 7739 }
fc8cd66c 7740
ff3f963a
KW
7741 if (SIZE_ONLY) {
7742 ckWARNreg(RExC_parse,
7743 "Ignoring zero length \\N{} in character class"
7744 );
7745 RExC_parse = endbrace + 1;
7746 }
7747 *valuep = 0;
7748 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7749 }
ff3f963a 7750
62fed28b 7751 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
7752 RExC_parse += 2; /* Skip past the 'U+' */
7753
7754 if (valuep) { /* In a bracketed char class */
7755 /* We only pay attention to the first char of
7756 multichar strings being returned. I kinda wonder
7757 if this makes sense as it does change the behaviour
7758 from earlier versions, OTOH that behaviour was broken
7759 as well. XXX Solution is to recharacterize as
7760 [rest-of-class]|multi1|multi2... */
7761
7762 STRLEN length_of_hex;
7763 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7764 | PERL_SCAN_DISALLOW_PREFIX
7765 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7766
37820adc
KW
7767 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7768 if (endchar < endbrace) {
ff3f963a
KW
7769 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7770 }
ff3f963a
KW
7771
7772 length_of_hex = (STRLEN)(endchar - RExC_parse);
7773 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7774
7775 /* The tokenizer should have guaranteed validity, but it's possible to
7776 * bypass it by using single quoting, so check */
c3c41406
KW
7777 if (length_of_hex == 0
7778 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7779 {
7780 RExC_parse += length_of_hex; /* Includes all the valid */
7781 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7782 ? UTF8SKIP(RExC_parse)
7783 : 1;
7784 /* Guard against malformed utf8 */
7785 if (RExC_parse >= endchar) RExC_parse = endchar;
7786 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7787 }
7788
7789 RExC_parse = endbrace + 1;
7790 if (endchar == endbrace) return NULL;
7791
7792 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7793 }
ff3f963a 7794 else { /* Not a char class */
e2a7e165
KW
7795
7796 /* What is done here is to convert this to a sub-pattern of the form
7797 * (?:\x{char1}\x{char2}...)
7798 * and then call reg recursively. That way, it retains its atomicness,
7799 * while not having to worry about special handling that some code
7800 * points may have. toke.c has converted the original Unicode values
7801 * to native, so that we can just pass on the hex values unchanged. We
7802 * do have to set a flag to keep recoding from happening in the
7803 * recursion */
7804
7805 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
7806 STRLEN len;
ff3f963a
KW
7807 char *endchar; /* Points to '.' or '}' ending cur char in the input
7808 stream */
e2a7e165
KW
7809 char *orig_end = RExC_end;
7810
7811 while (RExC_parse < endbrace) {
ff3f963a
KW
7812
7813 /* Code points are separated by dots. If none, there is only one
7814 * code point, and is terminated by the brace */
37820adc 7815 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a 7816
e2a7e165
KW
7817 /* Convert to notation the rest of the code understands */
7818 sv_catpv(substitute_parse, "\\x{");
7819 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
7820 sv_catpv(substitute_parse, "}");
ff3f963a
KW
7821
7822 /* Point to the beginning of the next character in the sequence. */
7823 RExC_parse = endchar + 1;
ff3f963a 7824 }
e2a7e165 7825 sv_catpv(substitute_parse, ")");
ff3f963a 7826
e2a7e165 7827 RExC_parse = SvPV(substitute_parse, len);
ff3f963a 7828
e2a7e165
KW
7829 /* Don't allow empty number */
7830 if (len < 8) {
7831 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a 7832 }
e2a7e165 7833 RExC_end = RExC_parse + len;
ff3f963a 7834
e2a7e165
KW
7835 /* The values are Unicode, and therefore not subject to recoding */
7836 RExC_override_recoding = 1;
7837
7838 ret = reg(pRExC_state, 1, flagp, depth+1);
7839
7840 RExC_parse = endbrace;
7841 RExC_end = orig_end;
7842 RExC_override_recoding = 0;
ff3f963a 7843
ff3f963a
KW
7844 nextchar(pRExC_state);
7845 }
7846
7847 return ret;
fc8cd66c
YO
7848}
7849
7850
9e08bc66
TS
7851/*
7852 * reg_recode
7853 *
7854 * It returns the code point in utf8 for the value in *encp.
7855 * value: a code value in the source encoding
7856 * encp: a pointer to an Encode object
7857 *
7858 * If the result from Encode is not a single character,
7859 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7860 */
7861STATIC UV
7862S_reg_recode(pTHX_ const char value, SV **encp)
7863{
7864 STRLEN numlen = 1;
59cd0e26 7865 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7866 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7867 const STRLEN newlen = SvCUR(sv);
7868 UV uv = UNICODE_REPLACEMENT;
7869
7918f24d
NC
7870 PERL_ARGS_ASSERT_REG_RECODE;
7871
9e08bc66
TS
7872 if (newlen)
7873 uv = SvUTF8(sv)
7874 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7875 : *(U8*)s;
7876
7877 if (!newlen || numlen != newlen) {
7878 uv = UNICODE_REPLACEMENT;
c86f7df5 7879 *encp = NULL;
9e08bc66
TS
7880 }
7881 return uv;
7882}
7883
fc8cd66c 7884
a687059c
LW
7885/*
7886 - regatom - the lowest level
ee9b8eae
YO
7887
7888 Try to identify anything special at the start of the pattern. If there
7889 is, then handle it as required. This may involve generating a single regop,
7890 such as for an assertion; or it may involve recursing, such as to
7891 handle a () structure.
7892
7893 If the string doesn't start with something special then we gobble up
7894 as much literal text as we can.
7895
7896 Once we have been able to handle whatever type of thing started the
7897 sequence, we return.
7898
7899 Note: we have to be careful with escapes, as they can be both literal
7900 and special, and in the case of \10 and friends can either, depending
486ec47a 7901 on context. Specifically there are two separate switches for handling
ee9b8eae
YO
7902 escape sequences, with the one for handling literal escapes requiring
7903 a dummy entry for all of the special escapes that are actually handled
7904 by the other.
7905*/
7906
76e3520e 7907STATIC regnode *
3dab1dad 7908S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7909{
97aff369 7910 dVAR;
cbbf8932 7911 register regnode *ret = NULL;
a0d0e21e 7912 I32 flags;
45948336 7913 char *parse_start = RExC_parse;
980866de 7914 U8 op;
3dab1dad
YO
7915 GET_RE_DEBUG_FLAGS_DECL;
7916 DEBUG_PARSE("atom");
a0d0e21e
LW
7917 *flagp = WORST; /* Tentatively. */
7918
7918f24d 7919 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7920
a0d0e21e 7921tryagain:
f9a79580 7922 switch ((U8)*RExC_parse) {
a0d0e21e 7923 case '^':
830247a4
IZ
7924 RExC_seen_zerolen++;
7925 nextchar(pRExC_state);
bbe252da 7926 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7927 ret = reg_node(pRExC_state, MBOL);
bbe252da 7928 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7929 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7930 else
830247a4 7931 ret = reg_node(pRExC_state, BOL);
fac92740 7932 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7933 break;
7934 case '$':
830247a4 7935 nextchar(pRExC_state);
b81d288d 7936 if (*RExC_parse)
830247a4 7937 RExC_seen_zerolen++;
bbe252da 7938 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7939 ret = reg_node(pRExC_state, MEOL);
bbe252da 7940 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7941 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7942 else
830247a4 7943 ret = reg_node(pRExC_state, EOL);
fac92740 7944 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7945 break;
7946 case '.':
830247a4 7947 nextchar(pRExC_state);
bbe252da 7948 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7949 ret = reg_node(pRExC_state, SANY);
7950 else
7951 ret = reg_node(pRExC_state, REG_ANY);
7952 *flagp |= HASWIDTH|SIMPLE;
830247a4 7953 RExC_naughty++;
fac92740 7954 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7955 break;
7956 case '[':
b45f050a 7957 {
3dab1dad
YO
7958 char * const oregcomp_parse = ++RExC_parse;
7959 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7960 if (*RExC_parse != ']') {
7961 RExC_parse = oregcomp_parse;
b45f050a
JF
7962 vFAIL("Unmatched [");
7963 }
830247a4 7964 nextchar(pRExC_state);
a0d0e21e 7965 *flagp |= HASWIDTH|SIMPLE;
fac92740 7966 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7967 break;
b45f050a 7968 }
a0d0e21e 7969 case '(':
830247a4 7970 nextchar(pRExC_state);
3dab1dad 7971 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7972 if (ret == NULL) {
bf93d4cc 7973 if (flags & TRYAGAIN) {
830247a4 7974 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7975 /* Make parent create an empty node if needed. */
7976 *flagp |= TRYAGAIN;
7977 return(NULL);
7978 }
a0d0e21e 7979 goto tryagain;
bf93d4cc 7980 }
a0d0e21e
LW
7981 return(NULL);
7982 }
a3b492c3 7983 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7984 break;
7985 case '|':
7986 case ')':
7987 if (flags & TRYAGAIN) {
7988 *flagp |= TRYAGAIN;
7989 return NULL;
7990 }
b45f050a 7991 vFAIL("Internal urp");
a0d0e21e
LW
7992 /* Supposed to be caught earlier. */
7993 break;
85afd4ae 7994 case '{':
830247a4
IZ
7995 if (!regcurly(RExC_parse)) {
7996 RExC_parse++;
85afd4ae
CS
7997 goto defchar;
7998 }
7999 /* FALL THROUGH */
a0d0e21e
LW
8000 case '?':
8001 case '+':
8002 case '*':
830247a4 8003 RExC_parse++;
b45f050a 8004 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
8005 break;
8006 case '\\':
ee9b8eae
YO
8007 /* Special Escapes
8008
8009 This switch handles escape sequences that resolve to some kind
8010 of special regop and not to literal text. Escape sequnces that
8011 resolve to literal text are handled below in the switch marked
8012 "Literal Escapes".
8013
8014 Every entry in this switch *must* have a corresponding entry
8015 in the literal escape switch. However, the opposite is not
8016 required, as the default for this switch is to jump to the
8017 literal text handling code.
8018 */
a0a388a1 8019 switch ((U8)*++RExC_parse) {
ee9b8eae 8020 /* Special Escapes */
a0d0e21e 8021 case 'A':
830247a4
IZ
8022 RExC_seen_zerolen++;
8023 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 8024 *flagp |= SIMPLE;
ee9b8eae 8025 goto finish_meta_pat;
a0d0e21e 8026 case 'G':
830247a4
IZ
8027 ret = reg_node(pRExC_state, GPOS);
8028 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 8029 *flagp |= SIMPLE;
ee9b8eae
YO
8030 goto finish_meta_pat;
8031 case 'K':
8032 RExC_seen_zerolen++;
8033 ret = reg_node(pRExC_state, KEEPS);
8034 *flagp |= SIMPLE;
37923168
RGS
8035 /* XXX:dmq : disabling in-place substitution seems to
8036 * be necessary here to avoid cases of memory corruption, as
8037 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8038 */
8039 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 8040 goto finish_meta_pat;
a0d0e21e 8041 case 'Z':
830247a4 8042 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 8043 *flagp |= SIMPLE;
a1917ab9 8044 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 8045 goto finish_meta_pat;
b85d18e9 8046 case 'z':
830247a4 8047 ret = reg_node(pRExC_state, EOS);
b85d18e9 8048 *flagp |= SIMPLE;
830247a4 8049 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 8050 goto finish_meta_pat;
4a2d328f 8051 case 'C':
f33976b4
DB
8052 ret = reg_node(pRExC_state, CANY);
8053 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 8054 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8055 goto finish_meta_pat;
a0ed51b3 8056 case 'X':
830247a4 8057 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 8058 *flagp |= HASWIDTH;
ee9b8eae 8059 goto finish_meta_pat;
a0d0e21e 8060 case 'w':
980866de
KW
8061 switch (get_regex_charset(RExC_flags)) {
8062 case REGEX_LOCALE_CHARSET:
8063 op = ALNUML;
8064 break;
8065 case REGEX_UNICODE_CHARSET:
8066 op = ALNUMU;
8067 break;
cfaf538b 8068 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8069 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8070 op = ALNUMA;
8071 break;
980866de
KW
8072 case REGEX_DEPENDS_CHARSET:
8073 op = ALNUM;
8074 break;
8075 default:
8076 goto bad_charset;
a12cf05f 8077 }
980866de 8078 ret = reg_node(pRExC_state, op);
a0d0e21e 8079 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8080 goto finish_meta_pat;
a0d0e21e 8081 case 'W':
980866de
KW
8082 switch (get_regex_charset(RExC_flags)) {
8083 case REGEX_LOCALE_CHARSET:
8084 op = NALNUML;
8085 break;
8086 case REGEX_UNICODE_CHARSET:
8087 op = NALNUMU;
8088 break;
cfaf538b 8089 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8090 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8091 op = NALNUMA;
8092 break;
980866de
KW
8093 case REGEX_DEPENDS_CHARSET:
8094 op = NALNUM;
8095 break;
8096 default:
8097 goto bad_charset;
a12cf05f 8098 }
980866de 8099 ret = reg_node(pRExC_state, op);
a0d0e21e 8100 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8101 goto finish_meta_pat;
a0d0e21e 8102 case 'b':
830247a4
IZ
8103 RExC_seen_zerolen++;
8104 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
8105 switch (get_regex_charset(RExC_flags)) {
8106 case REGEX_LOCALE_CHARSET:
8107 op = BOUNDL;
8108 break;
8109 case REGEX_UNICODE_CHARSET:
8110 op = BOUNDU;
8111 break;
cfaf538b 8112 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8113 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8114 op = BOUNDA;
8115 break;
63ac0dad
KW
8116 case REGEX_DEPENDS_CHARSET:
8117 op = BOUND;
8118 break;
8119 default:
8120 goto bad_charset;
a12cf05f 8121 }
63ac0dad 8122 ret = reg_node(pRExC_state, op);
b988e673 8123 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 8124 *flagp |= SIMPLE;
5024bc2d
KW
8125 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8126 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8127 }
ee9b8eae 8128 goto finish_meta_pat;
a0d0e21e 8129 case 'B':
830247a4
IZ
8130 RExC_seen_zerolen++;
8131 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
8132 switch (get_regex_charset(RExC_flags)) {
8133 case REGEX_LOCALE_CHARSET:
8134 op = NBOUNDL;
8135 break;
8136 case REGEX_UNICODE_CHARSET:
8137 op = NBOUNDU;
8138 break;
cfaf538b 8139 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8140 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8141 op = NBOUNDA;
8142 break;
63ac0dad
KW
8143 case REGEX_DEPENDS_CHARSET:
8144 op = NBOUND;
8145 break;
8146 default:
8147 goto bad_charset;
a12cf05f 8148 }
63ac0dad 8149 ret = reg_node(pRExC_state, op);
b988e673 8150 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 8151 *flagp |= SIMPLE;
5024bc2d
KW
8152 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8153 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8154 }
ee9b8eae 8155 goto finish_meta_pat;
a0d0e21e 8156 case 's':
980866de
KW
8157 switch (get_regex_charset(RExC_flags)) {
8158 case REGEX_LOCALE_CHARSET:
8159 op = SPACEL;
8160 break;
8161 case REGEX_UNICODE_CHARSET:
8162 op = SPACEU;
8163 break;
cfaf538b 8164 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8165 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8166 op = SPACEA;
8167 break;
980866de
KW
8168 case REGEX_DEPENDS_CHARSET:
8169 op = SPACE;
8170 break;
8171 default:
8172 goto bad_charset;
a12cf05f 8173 }
980866de 8174 ret = reg_node(pRExC_state, op);
a0d0e21e 8175 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8176 goto finish_meta_pat;
a0d0e21e 8177 case 'S':
980866de
KW
8178 switch (get_regex_charset(RExC_flags)) {
8179 case REGEX_LOCALE_CHARSET:
8180 op = NSPACEL;
8181 break;
8182 case REGEX_UNICODE_CHARSET:
8183 op = NSPACEU;
8184 break;
cfaf538b 8185 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8186 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8187 op = NSPACEA;
8188 break;
980866de
KW
8189 case REGEX_DEPENDS_CHARSET:
8190 op = NSPACE;
8191 break;
8192 default:
8193 goto bad_charset;
a12cf05f 8194 }
980866de 8195 ret = reg_node(pRExC_state, op);
a0d0e21e 8196 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8197 goto finish_meta_pat;
a0d0e21e 8198 case 'd':
56ae17b4
KW
8199 switch (get_regex_charset(RExC_flags)) {
8200 case REGEX_LOCALE_CHARSET:
8201 op = DIGITL;
8202 break;
cfaf538b 8203 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8204 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8205 op = DIGITA;
8206 break;
56ae17b4
KW
8207 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8208 case REGEX_UNICODE_CHARSET:
8209 op = DIGIT;
8210 break;
8211 default:
8212 goto bad_charset;
6ab9ea91 8213 }
56ae17b4 8214 ret = reg_node(pRExC_state, op);
a0d0e21e 8215 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8216 goto finish_meta_pat;
a0d0e21e 8217 case 'D':
56ae17b4
KW
8218 switch (get_regex_charset(RExC_flags)) {
8219 case REGEX_LOCALE_CHARSET:
8220 op = NDIGITL;
8221 break;
cfaf538b 8222 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8223 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8224 op = NDIGITA;
8225 break;
56ae17b4
KW
8226 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8227 case REGEX_UNICODE_CHARSET:
8228 op = NDIGIT;
8229 break;
8230 default:
8231 goto bad_charset;
6ab9ea91 8232 }
56ae17b4 8233 ret = reg_node(pRExC_state, op);
a0d0e21e 8234 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8235 goto finish_meta_pat;
e1d1eefb
YO
8236 case 'R':
8237 ret = reg_node(pRExC_state, LNBREAK);
8238 *flagp |= HASWIDTH|SIMPLE;
8239 goto finish_meta_pat;
8240 case 'h':
8241 ret = reg_node(pRExC_state, HORIZWS);
8242 *flagp |= HASWIDTH|SIMPLE;
8243 goto finish_meta_pat;
8244 case 'H':
8245 ret = reg_node(pRExC_state, NHORIZWS);
8246 *flagp |= HASWIDTH|SIMPLE;
8247 goto finish_meta_pat;
ee9b8eae 8248 case 'v':
e1d1eefb
YO
8249 ret = reg_node(pRExC_state, VERTWS);
8250 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
8251 goto finish_meta_pat;
8252 case 'V':
e1d1eefb
YO
8253 ret = reg_node(pRExC_state, NVERTWS);
8254 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8255 finish_meta_pat:
830247a4 8256 nextchar(pRExC_state);
fac92740 8257 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 8258 break;
a14b48bc
LW
8259 case 'p':
8260 case 'P':
3568d838 8261 {
3dab1dad 8262 char* const oldregxend = RExC_end;
d008bc60 8263#ifdef DEBUGGING
ccb2c380 8264 char* parse_start = RExC_parse - 2;
d008bc60 8265#endif
a14b48bc 8266
830247a4 8267 if (RExC_parse[1] == '{') {
3568d838 8268 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
8269 RExC_end = strchr(RExC_parse, '}');
8270 if (!RExC_end) {
3dab1dad 8271 const U8 c = (U8)*RExC_parse;
830247a4
IZ
8272 RExC_parse += 2;
8273 RExC_end = oldregxend;
0da60cf5 8274 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 8275 }
830247a4 8276 RExC_end++;
a14b48bc 8277 }
af6f566e 8278 else {
830247a4 8279 RExC_end = RExC_parse + 2;
af6f566e
HS
8280 if (RExC_end > oldregxend)
8281 RExC_end = oldregxend;
8282 }
830247a4 8283 RExC_parse--;
a14b48bc 8284
3dab1dad 8285 ret = regclass(pRExC_state,depth+1);
a14b48bc 8286
830247a4
IZ
8287 RExC_end = oldregxend;
8288 RExC_parse--;
ccb2c380
MP
8289
8290 Set_Node_Offset(ret, parse_start + 2);
8291 Set_Node_Cur_Length(ret);
830247a4 8292 nextchar(pRExC_state);
a14b48bc
LW
8293 *flagp |= HASWIDTH|SIMPLE;
8294 }
8295 break;
fc8cd66c 8296 case 'N':
afefe6bf 8297 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
8298 multicharacter. join_exact() will join them up later on.
8299 Also this makes sure that things like /\N{BLAH}+/ and
8300 \N{BLAH} being multi char Just Happen. dmq*/
8301 ++RExC_parse;
9d64099b 8302 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
fc8cd66c 8303 break;
0a4db386 8304 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 8305 parse_named_seq:
81714fb9
YO
8306 {
8307 char ch= RExC_parse[1];
1f1031fe
YO
8308 if (ch != '<' && ch != '\'' && ch != '{') {
8309 RExC_parse++;
8310 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 8311 } else {
1f1031fe
YO
8312 /* this pretty much dupes the code for (?P=...) in reg(), if
8313 you change this make sure you change that */
81714fb9 8314 char* name_start = (RExC_parse += 2);
2eccd3b2 8315 U32 num = 0;
0a4db386
YO
8316 SV *sv_dat = reg_scan_name(pRExC_state,
8317 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 8318 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 8319 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
8320 vFAIL2("Sequence %.3s... not terminated",parse_start);
8321
8322 if (!SIZE_ONLY) {
8323 num = add_data( pRExC_state, 1, "S" );
8324 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 8325 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
8326 }
8327
81714fb9
YO
8328 RExC_sawback = 1;
8329 ret = reganode(pRExC_state,
4444fd9f
KW
8330 ((! FOLD)
8331 ? NREF
2f7f8cb1
KW
8332 : (MORE_ASCII_RESTRICTED)
8333 ? NREFFA
8334 : (AT_LEAST_UNI_SEMANTICS)
8335 ? NREFFU
8336 : (LOC)
8337 ? NREFFL
8338 : NREFF),
4444fd9f 8339 num);
81714fb9 8340 *flagp |= HASWIDTH;
1f1031fe 8341
81714fb9
YO
8342 /* override incorrect value set in reganode MJD */
8343 Set_Node_Offset(ret, parse_start+1);
8344 Set_Node_Cur_Length(ret); /* MJD */
8345 nextchar(pRExC_state);
1f1031fe 8346
81714fb9
YO
8347 }
8348 break;
1f1031fe 8349 }
2bf803e2 8350 case 'g':
a0d0e21e
LW
8351 case '1': case '2': case '3': case '4':
8352 case '5': case '6': case '7': case '8': case '9':
8353 {
c74340f9 8354 I32 num;
2bf803e2
YO
8355 bool isg = *RExC_parse == 'g';
8356 bool isrel = 0;
8357 bool hasbrace = 0;
8358 if (isg) {
c74340f9 8359 RExC_parse++;
2bf803e2
YO
8360 if (*RExC_parse == '{') {
8361 RExC_parse++;
8362 hasbrace = 1;
8363 }
8364 if (*RExC_parse == '-') {
8365 RExC_parse++;
8366 isrel = 1;
8367 }
1f1031fe
YO
8368 if (hasbrace && !isDIGIT(*RExC_parse)) {
8369 if (isrel) RExC_parse--;
8370 RExC_parse -= 2;
8371 goto parse_named_seq;
8372 } }
c74340f9 8373 num = atoi(RExC_parse);
b72d83b2
RGS
8374 if (isg && num == 0)
8375 vFAIL("Reference to invalid group 0");
c74340f9 8376 if (isrel) {
5624f11d 8377 num = RExC_npar - num;
c74340f9
YO
8378 if (num < 1)
8379 vFAIL("Reference to nonexistent or unclosed group");
8380 }
2bf803e2 8381 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
8382 goto defchar;
8383 else {
3dab1dad 8384 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
8385 while (isDIGIT(*RExC_parse))
8386 RExC_parse++;
1f1031fe
YO
8387 if (parse_start == RExC_parse - 1)
8388 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
8389 if (hasbrace) {
8390 if (*RExC_parse != '}')
8391 vFAIL("Unterminated \\g{...} pattern");
8392 RExC_parse++;
8393 }
c74340f9
YO
8394 if (!SIZE_ONLY) {
8395 if (num > (I32)RExC_rx->nparens)
8396 vFAIL("Reference to nonexistent group");
c74340f9 8397 }
830247a4 8398 RExC_sawback = 1;
eb160463 8399 ret = reganode(pRExC_state,
4444fd9f
KW
8400 ((! FOLD)
8401 ? REF
2f7f8cb1
KW
8402 : (MORE_ASCII_RESTRICTED)
8403 ? REFFA
8404 : (AT_LEAST_UNI_SEMANTICS)
8405 ? REFFU
8406 : (LOC)
8407 ? REFFL
8408 : REFF),
4444fd9f 8409 num);
a0d0e21e 8410 *flagp |= HASWIDTH;
2af232bd 8411
fac92740 8412 /* override incorrect value set in reganode MJD */
2af232bd 8413 Set_Node_Offset(ret, parse_start+1);
fac92740 8414 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
8415 RExC_parse--;
8416 nextchar(pRExC_state);
a0d0e21e
LW
8417 }
8418 }
8419 break;
8420 case '\0':
830247a4 8421 if (RExC_parse >= RExC_end)
b45f050a 8422 FAIL("Trailing \\");
a0d0e21e
LW
8423 /* FALL THROUGH */
8424 default:
a0288114 8425 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 8426 back into the quick-grab loop below */
45948336 8427 parse_start--;
a0d0e21e
LW
8428 goto defchar;
8429 }
8430 break;
4633a7c4
LW
8431
8432 case '#':
bbe252da 8433 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 8434 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
8435 goto tryagain;
8436 }
8437 /* FALL THROUGH */
8438
f9a79580 8439 default:
561784a5
KW
8440
8441 parse_start = RExC_parse - 1;
8442
8443 RExC_parse++;
8444
8445 defchar: {
d669c36c 8446 typedef enum {
9bed422d
KW
8447 generic_char = 0,
8448 char_s,
d669c36c
KW
8449 upsilon_1,
8450 upsilon_2,
8451 iota_1,
8452 iota_2,
8453 } char_state;
9bed422d 8454 char_state latest_char_state = generic_char;
ba210ebe 8455 register STRLEN len;
58ae7d3f 8456 register UV ender;
a0d0e21e 8457 register char *p;
3dab1dad 8458 char *s;
80aecb99 8459 STRLEN foldlen;
89ebb4a3 8460 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6e326e84 8461 regnode * orig_emit;
f06dbbb7 8462
58ae7d3f 8463 ender = 0;
6e326e84
KW
8464 orig_emit = RExC_emit; /* Save the original output node position in
8465 case we need to output a different node
8466 type */
eb160463 8467 ret = reg_node(pRExC_state,
2c2b7f86
KW
8468 (U8) ((! FOLD) ? EXACT
8469 : (LOC)
8470 ? EXACTFL
2f7f8cb1
KW
8471 : (MORE_ASCII_RESTRICTED)
8472 ? EXACTFA
8473 : (AT_LEAST_UNI_SEMANTICS)
8474 ? EXACTFU
8475 : EXACTF)
2c2b7f86 8476 );
cd439c50 8477 s = STRING(ret);
830247a4
IZ
8478 for (len = 0, p = RExC_parse - 1;
8479 len < 127 && p < RExC_end;
a0d0e21e
LW
8480 len++)
8481 {
3dab1dad 8482 char * const oldp = p;
5b5a24f7 8483
bbe252da 8484 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 8485 p = regwhite( pRExC_state, p );
f9a79580 8486 switch ((U8)*p) {
a0d0e21e
LW
8487 case '^':
8488 case '$':
8489 case '.':
8490 case '[':
8491 case '(':
8492 case ')':
8493 case '|':
8494 goto loopdone;
8495 case '\\':
ee9b8eae
YO
8496 /* Literal Escapes Switch
8497
8498 This switch is meant to handle escape sequences that
8499 resolve to a literal character.
8500
8501 Every escape sequence that represents something
8502 else, like an assertion or a char class, is handled
8503 in the switch marked 'Special Escapes' above in this
8504 routine, but also has an entry here as anything that
8505 isn't explicitly mentioned here will be treated as
8506 an unescaped equivalent literal.
8507 */
8508
a0a388a1 8509 switch ((U8)*++p) {
ee9b8eae
YO
8510 /* These are all the special escapes. */
8511 case 'A': /* Start assertion */
8512 case 'b': case 'B': /* Word-boundary assertion*/
8513 case 'C': /* Single char !DANGEROUS! */
8514 case 'd': case 'D': /* digit class */
8515 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 8516 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
8517 case 'k': case 'K': /* named backref, keep marker */
8518 case 'N': /* named char sequence */
38a44b82 8519 case 'p': case 'P': /* Unicode property */
e1d1eefb 8520 case 'R': /* LNBREAK */
ee9b8eae 8521 case 's': case 'S': /* space class */
e1d1eefb 8522 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
8523 case 'w': case 'W': /* word class */
8524 case 'X': /* eXtended Unicode "combining character sequence" */
8525 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
8526 --p;
8527 goto loopdone;
ee9b8eae
YO
8528
8529 /* Anything after here is an escape that resolves to a
8530 literal. (Except digits, which may or may not)
8531 */
a0d0e21e
LW
8532 case 'n':
8533 ender = '\n';
8534 p++;
a687059c 8535 break;
a0d0e21e
LW
8536 case 'r':
8537 ender = '\r';
8538 p++;
a687059c 8539 break;
a0d0e21e
LW
8540 case 't':
8541 ender = '\t';
8542 p++;
a687059c 8543 break;
a0d0e21e
LW
8544 case 'f':
8545 ender = '\f';
8546 p++;
a687059c 8547 break;
a0d0e21e 8548 case 'e':
c7f1f016 8549 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 8550 p++;
a687059c 8551 break;
a0d0e21e 8552 case 'a':
c7f1f016 8553 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 8554 p++;
a687059c 8555 break;
f0a2b745
KW
8556 case 'o':
8557 {
8558 STRLEN brace_len = len;
00c0cb6d 8559 UV result;
454155d9
KW
8560 const char* error_msg;
8561
8562 bool valid = grok_bslash_o(p,
8563 &result,
8564 &brace_len,
8565 &error_msg,
8566 1);
8567 p += brace_len;
8568 if (! valid) {
8569 RExC_parse = p; /* going to die anyway; point
8570 to exact spot of failure */
f0a2b745
KW
8571 vFAIL(error_msg);
8572 }
00c0cb6d
DG
8573 else
8574 {
8575 ender = result;
8576 }
f0a2b745
KW
8577 if (PL_encoding && ender < 0x100) {
8578 goto recode_encoding;
8579 }
8580 if (ender > 0xff) {
62fed28b 8581 REQUIRE_UTF8;
f0a2b745
KW
8582 }
8583 break;
8584 }
a0d0e21e 8585 case 'x':
a0ed51b3 8586 if (*++p == '{') {
1df70142 8587 char* const e = strchr(p, '}');
b81d288d 8588
b45f050a 8589 if (!e) {
830247a4 8590 RExC_parse = p + 1;
b45f050a
JF
8591 vFAIL("Missing right brace on \\x{}");
8592 }
de5f0749 8593 else {
a4c04bdc
NC
8594 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8595 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 8596 STRLEN numlen = e - p - 1;
53305cf1 8597 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 8598 if (ender > 0xff)
62fed28b 8599 REQUIRE_UTF8;
a0ed51b3
LW
8600 p = e + 1;
8601 }
a0ed51b3
LW
8602 }
8603 else {
a4c04bdc 8604 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 8605 STRLEN numlen = 2;
53305cf1 8606 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
8607 p += numlen;
8608 }
9e08bc66
TS
8609 if (PL_encoding && ender < 0x100)
8610 goto recode_encoding;
a687059c 8611 break;
a0d0e21e
LW
8612 case 'c':
8613 p++;
17a3df4c 8614 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
a687059c 8615 break;
a0d0e21e
LW
8616 case '0': case '1': case '2': case '3':case '4':
8617 case '5': case '6': case '7': case '8':case '9':
8618 if (*p == '0' ||
ca67da41 8619 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
8620 {
8621 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 8622 STRLEN numlen = 3;
53305cf1 8623 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 8624 if (ender > 0xff) {
62fed28b 8625 REQUIRE_UTF8;
609122bd 8626 }
a0d0e21e
LW
8627 p += numlen;
8628 }
8629 else {
8630 --p;
8631 goto loopdone;
a687059c 8632 }
9e08bc66
TS
8633 if (PL_encoding && ender < 0x100)
8634 goto recode_encoding;
8635 break;
8636 recode_encoding:
e2a7e165 8637 if (! RExC_override_recoding) {
9e08bc66
TS
8638 SV* enc = PL_encoding;
8639 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
8640 if (!enc && SIZE_ONLY)
8641 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 8642 REQUIRE_UTF8;
9e08bc66 8643 }
a687059c 8644 break;
a0d0e21e 8645 case '\0':
830247a4 8646 if (p >= RExC_end)
b45f050a 8647 FAIL("Trailing \\");
a687059c 8648 /* FALL THROUGH */
a0d0e21e 8649 default:
216bfc0a
KW
8650 if (!SIZE_ONLY&& isALPHA(*p)) {
8651 /* Include any { following the alpha to emphasize
8652 * that it could be part of an escape at some point
8653 * in the future */
8654 int len = (*(p + 1) == '{') ? 2 : 1;
8655 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8656 }
a0ed51b3 8657 goto normal_default;
a0d0e21e
LW
8658 }
8659 break;
a687059c 8660 default:
a0ed51b3 8661 normal_default:
fd400ab9 8662 if (UTF8_IS_START(*p) && UTF) {
1df70142 8663 STRLEN numlen;
5e12f4fb 8664 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 8665 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
8666 p += numlen;
8667 }
8668 else
5b67c30a 8669 ender = (U8) *p++;
a0d0e21e 8670 break;
7e2509c1
KW
8671 } /* End of switch on the literal */
8672
6e326e84
KW
8673 /* Certain characters are problematic because their folded
8674 * length is so different from their original length that it
8675 * isn't handleable by the optimizer. They are therefore not
8676 * placed in an EXACTish node; and are here handled specially.
8677 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8678 * putting it in a special node keeps regexec from having to
8679 * deal with a non-utf8 multi-char fold */
2f7f8cb1 8680 if (FOLD
d669c36c 8681 && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
2f7f8cb1 8682 {
d669c36c
KW
8683 /* We look for either side of the fold. For example \xDF
8684 * folds to 'ss'. We look for both the single character
8685 * \xDF and the sequence 'ss'. When we find something that
8686 * could be one of those, we stop and flush whatever we
8687 * have output so far into the EXACTish node that was being
8688 * built. Then restore the input pointer to what it was.
8689 * regatom will return that EXACT node, and will be called
8690 * again, positioned so the first character is the one in
8691 * question, which we return in a different node type.
8692 * The multi-char folds are a sequence, so the occurrence
8693 * of the first character in that sequence doesn't
8694 * necessarily mean that what follows is the rest of the
8695 * sequence. We keep track of that with a state machine,
8696 * with the state being set to the latest character
8697 * processed before the current one. Most characters will
8698 * set the state to 0, but if one occurs that is part of a
8699 * potential tricky fold sequence, the state is set to that
8700 * character, and the next loop iteration sees if the state
8701 * should progress towards the final folded-from character,
8702 * or if it was a false alarm. If it turns out to be a
8703 * false alarm, the character(s) will be output in a new
8704 * EXACTish node, and join_exact() will later combine them.
8705 * In the case of the 'ss' sequence, which is more common
8706 * and more easily checked, some look-ahead is done to
8707 * save time by ruling-out some false alarms */
8708 switch (ender) {
8709 default:
9bed422d 8710 latest_char_state = generic_char;
d669c36c
KW
8711 break;
8712 case 's':
8713 case 'S':
8714 if (AT_LEAST_UNI_SEMANTICS) {
8715 if (latest_char_state == char_s) { /* 'ss' */
8716 ender = LATIN_SMALL_LETTER_SHARP_S;
8717 goto do_tricky;
8718 }
8719 else if (p < RExC_end) {
8720
8721 /* Look-ahead at the next character. If it
8722 * is also an s, we handle as a sharp s
8723 * tricky regnode. */
8724 if (*p == 's' || *p == 'S') {
8725
8726 /* But first flush anything in the
8727 * EXACTish buffer */
8728 if (len != 0) {
8729 p = oldp;
8730 goto loopdone;
8731 }
8732 p++; /* Account for swallowing this
8733 's' up */
8734 ender = LATIN_SMALL_LETTER_SHARP_S;
8735 goto do_tricky;
8736 }
8737 /* Here, the next character is not a
8738 * literal 's', but still could
8739 * evaluate to one if part of a \o{},
8740 * \x or \OCTAL-DIGIT. The minimum
8741 * length required for that is 4, eg
8742 * \x53 or \123 */
8743 else if (*p == '\\'
8744 && p < RExC_end - 4
8745 && (isDIGIT(*(p + 1))
8746 || *(p + 1) == 'x'
8747 || *(p + 1) == 'o' ))
8748 {
8749
8750 /* Here, it could be an 's', too much
8751 * bother to figure it out here. Flush
8752 * the buffer if any; when come back
8753 * here, set the state so know that the
8754 * previous char was an 's' */
8755 if (len != 0) {
9bed422d 8756 latest_char_state = generic_char;
d669c36c
KW
8757 p = oldp;
8758 goto loopdone;
8759 }
8760 latest_char_state = char_s;
8761 break;
8762 }
8763 }
8764 }
6e326e84 8765
d669c36c
KW
8766 /* Here, can't be an 'ss' sequence, or at least not
8767 * one that could fold to/from the sharp ss */
9bed422d 8768 latest_char_state = generic_char;
d669c36c
KW
8769 break;
8770 case 0x03C5: /* First char in upsilon series */
8771 if (p < RExC_end - 4) { /* Need >= 4 bytes left */
8772 latest_char_state = upsilon_1;
8773 if (len != 0) {
8774 p = oldp;
8775 goto loopdone;
8776 }
8777 }
8778 else {
9bed422d 8779 latest_char_state = generic_char;
d669c36c
KW
8780 }
8781 break;
8782 case 0x03B9: /* First char in iota series */
8783 if (p < RExC_end - 4) {
8784 latest_char_state = iota_1;
8785 if (len != 0) {
8786 p = oldp;
8787 goto loopdone;
8788 }
8789 }
8790 else {
9bed422d 8791 latest_char_state = generic_char;
d669c36c
KW
8792 }
8793 break;
8794 case 0x0308:
8795 if (latest_char_state == upsilon_1) {
8796 latest_char_state = upsilon_2;
8797 }
8798 else if (latest_char_state == iota_1) {
8799 latest_char_state = iota_2;
8800 }
8801 else {
9bed422d 8802 latest_char_state = generic_char;
d669c36c
KW
8803 }
8804 break;
8805 case 0x301:
8806 if (latest_char_state == upsilon_2) {
8807 ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
8808 goto do_tricky;
8809 }
8810 else if (latest_char_state == iota_2) {
8811 ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
8812 goto do_tricky;
8813 }
9bed422d 8814 latest_char_state = generic_char;
d669c36c 8815 break;
6e326e84 8816
d669c36c
KW
8817 /* These are the tricky fold characters. Flush any
8818 * buffer first. */
8819 case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
8820 case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
8821 case LATIN_SMALL_LETTER_SHARP_S:
8822 case LATIN_CAPITAL_LETTER_SHARP_S:
8823 case 0x1FD3:
8824 case 0x1FE3:
8825 if (len != 0) {
8826 p = oldp;
8827 goto loopdone;
8828 }
8829 /* FALL THROUGH */
8830 do_tricky: {
8831 char* const oldregxend = RExC_end;
8832 U8 tmpbuf[UTF8_MAXBYTES+1];
8833
8834 /* Here, we know we need to generate a special
8835 * regnode, and 'ender' contains the tricky
8836 * character. What's done is to pretend it's in a
8837 * [bracketed] class, and let the code that deals
8838 * with those handle it, as that code has all the
8839 * intelligence necessary. First save the current
8840 * parse state, get rid of the already allocated
8841 * but empty EXACT node that the ANYOFV node will
8842 * replace, and point the parse to a buffer which
8843 * we fill with the character we want the regclass
8844 * code to think is being parsed */
8845 RExC_emit = orig_emit;
8846 RExC_parse = (char *) tmpbuf;
8847 if (UTF) {
8848 U8 *d = uvchr_to_utf8(tmpbuf, ender);
8849 *d = '\0';
8850 RExC_end = (char *) d;
8851 }
8852 else {
8853 tmpbuf[0] = ender;
8854 tmpbuf[1] = '\0';
8855 RExC_end = RExC_parse + 1;
8856 }
6e326e84 8857
d669c36c
KW
8858 ret = regclass(pRExC_state,depth+1);
8859
8860 /* Here, have parsed the buffer. Reset the parse to
8861 * the actual input, and return */
8862 RExC_end = oldregxend;
8863 RExC_parse = p - 1;
6e326e84 8864
d669c36c
KW
8865 Set_Node_Offset(ret, RExC_parse);
8866 Set_Node_Cur_Length(ret);
8867 nextchar(pRExC_state);
8868 *flagp |= HASWIDTH|SIMPLE;
8869 return ret;
8870 }
6e326e84
KW
8871 }
8872 }
8873
bcdf7404
YO
8874 if ( RExC_flags & RXf_PMf_EXTENDED)
8875 p = regwhite( pRExC_state, p );
60a8b682 8876 if (UTF && FOLD) {
17580e7a
KW
8877 /* Prime the casefolded buffer. Locale rules, which apply
8878 * only to code points < 256, aren't known until execution,
8879 * so for them, just output the original character using
8880 * utf8 */
8881 if (LOC && ender < 256) {
8882 if (UNI_IS_INVARIANT(ender)) {
8883 *tmpbuf = (U8) ender;
8884 foldlen = 1;
8885 } else {
8886 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8887 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8888 foldlen = 2;
8889 }
8890 }
8891 else if (isASCII(ender)) { /* Note: Here can't also be LOC
8892 */
2f7f8cb1 8893 ender = toLOWER(ender);
cd64649c 8894 *tmpbuf = (U8) ender;
2f7f8cb1
KW
8895 foldlen = 1;
8896 }
17580e7a
KW
8897 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8898
8899 /* Locale and /aa require more selectivity about the
8900 * fold, so are handled below. Otherwise, here, just
8901 * use the fold */
2f7f8cb1
KW
8902 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8903 }
8904 else {
17580e7a
KW
8905 /* Under locale rules or /aa we are not to mix,
8906 * respectively, ords < 256 or ASCII with non-. So
8907 * reject folds that mix them, using only the
8908 * non-folded code point. So do the fold to a
8909 * temporary, and inspect each character in it. */
2f7f8cb1
KW
8910 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8911 U8* s = trialbuf;
8912 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8913 U8* e = s + foldlen;
8914 bool fold_ok = TRUE;
8915
8916 while (s < e) {
17580e7a
KW
8917 if (isASCII(*s)
8918 || (LOC && (UTF8_IS_INVARIANT(*s)
8919 || UTF8_IS_DOWNGRADEABLE_START(*s))))
8920 {
2f7f8cb1
KW
8921 fold_ok = FALSE;
8922 break;
8923 }
8924 s += UTF8SKIP(s);
8925 }
8926 if (fold_ok) {
8927 Copy(trialbuf, tmpbuf, foldlen, U8);
8928 ender = tmpender;
8929 }
8930 else {
8931 uvuni_to_utf8(tmpbuf, ender);
8932 foldlen = UNISKIP(ender);
8933 }
8934 }
60a8b682 8935 }
bcdf7404 8936 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
8937 if (len)
8938 p = oldp;
16ea2a2e 8939 else if (UTF) {
80aecb99 8940 if (FOLD) {
60a8b682 8941 /* Emit all the Unicode characters. */
1df70142 8942 STRLEN numlen;
80aecb99
JH
8943 for (foldbuf = tmpbuf;
8944 foldlen;
8945 foldlen -= numlen) {
8946 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 8947 if (numlen > 0) {
71207a34 8948 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
8949 s += unilen;
8950 len += unilen;
8951 /* In EBCDIC the numlen
8952 * and unilen can differ. */
9dc45d57 8953 foldbuf += numlen;
47654450
JH
8954 if (numlen >= foldlen)
8955 break;
9dc45d57
JH
8956 }
8957 else
8958 break; /* "Can't happen." */
80aecb99
JH
8959 }
8960 }
8961 else {
71207a34 8962 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 8963 if (unilen > 0) {
0ebc6274
JH
8964 s += unilen;
8965 len += unilen;
9dc45d57 8966 }
80aecb99 8967 }
a0ed51b3 8968 }
a0d0e21e
LW
8969 else {
8970 len++;
eb160463 8971 REGC((char)ender, s++);
a0d0e21e
LW
8972 }
8973 break;
a687059c 8974 }
16ea2a2e 8975 if (UTF) {
80aecb99 8976 if (FOLD) {
60a8b682 8977 /* Emit all the Unicode characters. */
1df70142 8978 STRLEN numlen;
80aecb99
JH
8979 for (foldbuf = tmpbuf;
8980 foldlen;
8981 foldlen -= numlen) {
8982 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 8983 if (numlen > 0) {
71207a34 8984 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
8985 len += unilen;
8986 s += unilen;
8987 /* In EBCDIC the numlen
8988 * and unilen can differ. */
9dc45d57 8989 foldbuf += numlen;
47654450
JH
8990 if (numlen >= foldlen)
8991 break;
9dc45d57
JH
8992 }
8993 else
8994 break;
80aecb99
JH
8995 }
8996 }
8997 else {
71207a34 8998 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 8999 if (unilen > 0) {
0ebc6274
JH
9000 s += unilen;
9001 len += unilen;
9dc45d57 9002 }
80aecb99
JH
9003 }
9004 len--;
a0ed51b3 9005 }
d669c36c 9006 else {
eb160463 9007 REGC((char)ender, s++);
d669c36c 9008 }
a0d0e21e 9009 }
7e2509c1
KW
9010 loopdone: /* Jumped to when encounters something that shouldn't be in
9011 the node */
830247a4 9012 RExC_parse = p - 1;
fac92740 9013 Set_Node_Cur_Length(ret); /* MJD */
830247a4 9014 nextchar(pRExC_state);
793db0cb
JH
9015 {
9016 /* len is STRLEN which is unsigned, need to copy to signed */
9017 IV iv = len;
9018 if (iv < 0)
9019 vFAIL("Internal disaster");
9020 }
a0d0e21e
LW
9021 if (len > 0)
9022 *flagp |= HASWIDTH;
090f7165 9023 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 9024 *flagp |= SIMPLE;
3dab1dad 9025
cd439c50 9026 if (SIZE_ONLY)
830247a4 9027 RExC_size += STR_SZ(len);
3dab1dad
YO
9028 else {
9029 STR_LEN(ret) = len;
830247a4 9030 RExC_emit += STR_SZ(len);
07be1b83 9031 }
3dab1dad 9032 }
a0d0e21e
LW
9033 break;
9034 }
a687059c 9035
a0d0e21e 9036 return(ret);
980866de
KW
9037
9038/* Jumped to when an unrecognized character set is encountered */
9039bad_charset:
9040 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9041 return(NULL);
a687059c
LW
9042}
9043
873ef191 9044STATIC char *
bcdf7404 9045S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 9046{
bcdf7404 9047 const char *e = RExC_end;
7918f24d
NC
9048
9049 PERL_ARGS_ASSERT_REGWHITE;
9050
5b5a24f7
CS
9051 while (p < e) {
9052 if (isSPACE(*p))
9053 ++p;
9054 else if (*p == '#') {
bcdf7404 9055 bool ended = 0;
5b5a24f7 9056 do {
bcdf7404
YO
9057 if (*p++ == '\n') {
9058 ended = 1;
9059 break;
9060 }
9061 } while (p < e);
9062 if (!ended)
9063 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
9064 }
9065 else
9066 break;
9067 }
9068 return p;
9069}
9070
b8c5462f
JH
9071/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9072 Character classes ([:foo:]) can also be negated ([:^foo:]).
9073 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9074 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 9075 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
9076
9077#define POSIXCC_DONE(c) ((c) == ':')
9078#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9079#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9080
b8c5462f 9081STATIC I32
830247a4 9082S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 9083{
97aff369 9084 dVAR;
936ed897 9085 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 9086
7918f24d
NC
9087 PERL_ARGS_ASSERT_REGPPOSIXCC;
9088
830247a4 9089 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 9090 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 9091 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 9092 const char c = UCHARAT(RExC_parse);
097eb12c 9093 char* const s = RExC_parse++;
b81d288d 9094
9a86a77b 9095 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
9096 RExC_parse++;
9097 if (RExC_parse == RExC_end)
620e46c5 9098 /* Grandfather lone [:, [=, [. */
830247a4 9099 RExC_parse = s;
620e46c5 9100 else {
3dab1dad 9101 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
9102 assert(*t == c);
9103
9a86a77b 9104 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 9105 const char *posixcc = s + 1;
830247a4 9106 RExC_parse++; /* skip over the ending ] */
3dab1dad 9107
b8c5462f 9108 if (*s == ':') {
1df70142
AL
9109 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9110 const I32 skip = t - posixcc;
80916619
NC
9111
9112 /* Initially switch on the length of the name. */
9113 switch (skip) {
9114 case 4:
3dab1dad
YO
9115 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9116 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 9117 break;
80916619
NC
9118 case 5:
9119 /* Names all of length 5. */
9120 /* alnum alpha ascii blank cntrl digit graph lower
9121 print punct space upper */
9122 /* Offset 4 gives the best switch position. */
9123 switch (posixcc[4]) {
9124 case 'a':
3dab1dad
YO
9125 if (memEQ(posixcc, "alph", 4)) /* alpha */
9126 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
9127 break;
9128 case 'e':
3dab1dad
YO
9129 if (memEQ(posixcc, "spac", 4)) /* space */
9130 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
9131 break;
9132 case 'h':
3dab1dad
YO
9133 if (memEQ(posixcc, "grap", 4)) /* graph */
9134 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
9135 break;
9136 case 'i':
3dab1dad
YO
9137 if (memEQ(posixcc, "asci", 4)) /* ascii */
9138 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
9139 break;
9140 case 'k':
3dab1dad
YO
9141 if (memEQ(posixcc, "blan", 4)) /* blank */
9142 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
9143 break;
9144 case 'l':
3dab1dad
YO
9145 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9146 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
9147 break;
9148 case 'm':
3dab1dad
YO
9149 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9150 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
9151 break;
9152 case 'r':
3dab1dad
YO
9153 if (memEQ(posixcc, "lowe", 4)) /* lower */
9154 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9155 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9156 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
9157 break;
9158 case 't':
3dab1dad
YO
9159 if (memEQ(posixcc, "digi", 4)) /* digit */
9160 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9161 else if (memEQ(posixcc, "prin", 4)) /* print */
9162 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9163 else if (memEQ(posixcc, "punc", 4)) /* punct */
9164 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 9165 break;
b8c5462f
JH
9166 }
9167 break;
80916619 9168 case 6:
3dab1dad
YO
9169 if (memEQ(posixcc, "xdigit", 6))
9170 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
9171 break;
9172 }
80916619
NC
9173
9174 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
9175 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9176 t - s - 1, s + 1);
80916619
NC
9177 assert (posixcc[skip] == ':');
9178 assert (posixcc[skip+1] == ']');
b45f050a 9179 } else if (!SIZE_ONLY) {
b8c5462f 9180 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 9181
830247a4 9182 /* adjust RExC_parse so the warning shows after
b45f050a 9183 the class closes */
9a86a77b 9184 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 9185 RExC_parse++;
b45f050a
JF
9186 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9187 }
b8c5462f
JH
9188 } else {
9189 /* Maternal grandfather:
9190 * "[:" ending in ":" but not in ":]" */
830247a4 9191 RExC_parse = s;
767d463e 9192 }
620e46c5
JH
9193 }
9194 }
9195
b8c5462f
JH
9196 return namedclass;
9197}
9198
9199STATIC void
830247a4 9200S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 9201{
97aff369 9202 dVAR;
7918f24d
NC
9203
9204 PERL_ARGS_ASSERT_CHECKPOSIXCC;
9205
3dab1dad 9206 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
9207 const char *s = RExC_parse;
9208 const char c = *s++;
b8c5462f 9209
3dab1dad 9210 while (isALNUM(*s))
b8c5462f
JH
9211 s++;
9212 if (*s && c == *s && s[1] == ']') {
668c081a
NC
9213 ckWARN3reg(s+2,
9214 "POSIX syntax [%c %c] belongs inside character classes",
9215 c, c);
b45f050a
JF
9216
9217 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 9218 if (POSIXCC_NOTYET(c)) {
830247a4 9219 /* adjust RExC_parse so the error shows after
b45f050a 9220 the class closes */
9a86a77b 9221 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 9222 NOOP;
b45f050a
JF
9223 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9224 }
b8c5462f
JH
9225 }
9226 }
620e46c5
JH
9227}
9228
003331de
KW
9229/* No locale test, and always Unicode semantics */
9230#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
9231ANYOF_##NAME: \
9232 for (value = 0; value < 256; value++) \
9233 if (TEST) \
5bfec14d 9234 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9235 yesno = '+'; \
9236 what = WORD; \
9237 break; \
9238case ANYOF_N##NAME: \
9239 for (value = 0; value < 256; value++) \
9240 if (!TEST) \
5bfec14d 9241 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9242 yesno = '!'; \
9243 what = WORD; \
e1d1eefb 9244 break
89836f1f 9245
a12cf05f
KW
9246/* Like the above, but there are differences if we are in uni-8-bit or not, so
9247 * there are two tests passed in, to use depending on that. There aren't any
9248 * cases where the label is different from the name, so no need for that
9249 * parameter */
f952827c 9250#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
003331de
KW
9251ANYOF_##NAME: \
9252 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
9253 else if (UNI_SEMANTICS) { \
9254 for (value = 0; value < 256; value++) { \
f952827c 9255 if (TEST_8(value)) stored += \
5bfec14d 9256 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9257 } \
9258 } \
9259 else { \
9260 for (value = 0; value < 128; value++) { \
f952827c 9261 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
4c9daa0a 9262 set_regclass_bit(pRExC_state, ret, \
5bfec14d 9263 (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9264 } \
9265 } \
9266 yesno = '+'; \
9267 what = WORD; \
9268 break; \
9269case ANYOF_N##NAME: \
9270 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
9271 else if (UNI_SEMANTICS) { \
9272 for (value = 0; value < 256; value++) { \
f952827c 9273 if (! TEST_8(value)) stored += \
5bfec14d 9274 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9275 } \
9276 } \
9277 else { \
9278 for (value = 0; value < 128; value++) { \
4c9daa0a 9279 if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
5bfec14d 9280 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
003331de 9281 } \
2f7f8cb1 9282 if (AT_LEAST_ASCII_RESTRICTED) { \
cfaf538b 9283 for (value = 128; value < 256; value++) { \
4c9daa0a 9284 stored += set_regclass_bit( \
5bfec14d 9285 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
cfaf538b 9286 } \
137165a6 9287 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
cfaf538b
KW
9288 } \
9289 else { \
9290 /* For a non-ut8 target string with DEPENDS semantics, all above \
9291 * ASCII Latin1 code points match the complement of any of the \
9292 * classes. But in utf8, they have their Unicode semantics, so \
9293 * can't just set them in the bitmap, or else regexec.c will think \
9294 * they matched when they shouldn't. */ \
137165a6 9295 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
cfaf538b 9296 } \
003331de
KW
9297 } \
9298 yesno = '!'; \
9299 what = WORD; \
a12cf05f
KW
9300 break
9301
2283d326 9302STATIC U8
2c6aa593 9303S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
9304{
9305
9306 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9307 * Locale folding is done at run-time, so this function should not be
9308 * called for nodes that are for locales.
9309 *
d50a4f90 9310 * This function sets the bit corresponding to the fold of the input
2283d326
KW
9311 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
9312 * 'F' is 'f'.
9313 *
d50a4f90
KW
9314 * It also knows about the characters that are in the bitmap that have
9315 * folds that are matchable only outside it, and sets the appropriate lists
9316 * and flags.
9317 *
9318 * It returns the number of bits that actually changed from 0 to 1 */
2283d326
KW
9319
9320 U8 stored = 0;
2283d326
KW
9321 U8 fold;
9322
4c9daa0a
KW
9323 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9324
cfaf538b 9325 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
2f7f8cb1 9326 : PL_fold[value];
2283d326
KW
9327
9328 /* It assumes the bit for 'value' has already been set */
9329 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9330 ANYOF_BITMAP_SET(node, fold);
9331 stored++;
9332 }
d50a4f90
KW
9333 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9334 /* Certain Latin1 characters have matches outside the bitmap. To get
9335 * here, 'value' is one of those characters. None of these matches is
9336 * valid for ASCII characters under /aa, which have been excluded by
9337 * the 'if' above. The matches fall into three categories:
9338 * 1) They are singly folded-to or -from an above 255 character, as
9339 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9340 * WITH DIAERESIS;
9341 * 2) They are part of a multi-char fold with another character in the
9342 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9343 * 3) They are part of a multi-char fold with a character not in the
9344 * bitmap, such as various ligatures.
9345 * We aren't dealing fully with multi-char folds, except we do deal
9346 * with the pattern containing a character that has a multi-char fold
9347 * (not so much the inverse).
9348 * For types 1) and 3), the matches only happen when the target string
9349 * is utf8; that's not true for 2), and we set a flag for it.
9350 *
9351 * The code below adds to the passed in inversion list the single fold
9352 * closures for 'value'. The values are hard-coded here so that an
9353 * innocent-looking character class, like /[ks]/i won't have to go out
9354 * to disk to find the possible matches. XXX It would be better to
9355 * generate these via regen, in case a new version of the Unicode
9356 * standard adds new mappings, though that is not really likely. */
9357 switch (value) {
9358 case 'k':
9359 case 'K':
9360 /* KELVIN SIGN */
9361 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9362 break;
9363 case 's':
9364 case 'S':
9365 /* LATIN SMALL LETTER LONG S */
9366 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9367 break;
9368 case MICRO_SIGN:
9369 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9370 GREEK_SMALL_LETTER_MU);
9371 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9372 GREEK_CAPITAL_LETTER_MU);
9373 break;
9374 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9375 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9376 /* ANGSTROM SIGN */
9377 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9378 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
9379 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9380 PL_fold_latin1[value]);
9381 }
9382 break;
9383 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9384 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9385 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9386 break;
9387 case LATIN_SMALL_LETTER_SHARP_S:
1d4120df
KW
9388 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9389 LATIN_CAPITAL_LETTER_SHARP_S);
d50a4f90 9390
419d8974 9391 /* Under /a, /d, and /u, this can match the two chars "ss" */
d50a4f90
KW
9392 if (! MORE_ASCII_RESTRICTED) {
9393 add_alternate(alternate_ptr, (U8 *) "ss", 2);
9394
419d8974
KW
9395 /* And under /u or /a, it can match even if the target is
9396 * not utf8 */
9397 if (AT_LEAST_UNI_SEMANTICS) {
d50a4f90
KW
9398 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9399 }
9400 }
9401 break;
9402 case 'F': case 'f':
9403 case 'I': case 'i':
9404 case 'L': case 'l':
9405 case 'T': case 't':
9406 /* These all are targets of multi-character folds, which can
9407 * occur with only non-Latin1 characters in the fold, so they
9408 * can match if the target string isn't UTF-8 */
9409 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9410 break;
9411 case 'A': case 'a':
9412 case 'H': case 'h':
9413 case 'J': case 'j':
9414 case 'N': case 'n':
9415 case 'W': case 'w':
9416 case 'Y': case 'y':
9417 /* These all are targets of multi-character folds, which occur
9418 * only with a non-Latin1 character as part of the fold, so
9419 * they can't match unless the target string is in UTF-8, so no
9420 * action here is necessary */
9421 break;
9422 default:
9423 /* Use deprecated warning to increase the chances of this
9424 * being output */
9425 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9426 break;
9427 }
9428 }
9429 else if (DEPENDS_SEMANTICS
f56b6394 9430 && ! isASCII(value)
d50a4f90
KW
9431 && PL_fold_latin1[value] != value)
9432 {
9433 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9434 * folds only when the target string is in UTF-8. We add the fold
9435 * here to the list of things to match outside the bitmap, which
9436 * won't be looked at unless it is UTF8 (or else if something else
9437 * says to look even if not utf8, but those things better not happen
9438 * under DEPENDS semantics. */
9439 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
2283d326
KW
9440 }
9441
9442 return stored;
9443}
9444
9445
9446PERL_STATIC_INLINE U8
2c6aa593 9447S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
9448{
9449 /* This inline function sets a bit in the bitmap if not already set, and if
9450 * appropriate, its fold, returning the number of bits that actually
9451 * changed from 0 to 1 */
9452
9453 U8 stored;
9454
4c9daa0a
KW
9455 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9456
2283d326
KW
9457 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9458 return 0;
9459 }
9460
9461 ANYOF_BITMAP_SET(node, value);
9462 stored = 1;
9463
9464 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
2c6aa593 9465 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
2283d326
KW
9466 }
9467
9468 return stored;
9469}
9470
c8453963
KW
9471STATIC void
9472S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9473{
9474 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9475 * alternate list, pointed to by 'alternate_ptr'. This is an array of
9476 * the multi-character folds of characters in the node */
9477 SV *sv;
9478
9479 PERL_ARGS_ASSERT_ADD_ALTERNATE;
9480
9481 if (! *alternate_ptr) {
9482 *alternate_ptr = newAV();
9483 }
9484 sv = newSVpvn_utf8((char*)string, len, TRUE);
9485 av_push(*alternate_ptr, sv);
9486 return;
9487}
9488
7f6f358c
YO
9489/*
9490 parse a class specification and produce either an ANYOF node that
ddad5e0b 9491 matches the pattern or perhaps will be optimized into an EXACTish node
679d1424
KW
9492 instead. The node contains a bit map for the first 256 characters, with the
9493 corresponding bit set if that character is in the list. For characters
9494 above 255, a range list is used */
89836f1f 9495
76e3520e 9496STATIC regnode *
3dab1dad 9497S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 9498{
97aff369 9499 dVAR;
9a86a77b 9500 register UV nextvalue;
3568d838 9501 register IV prevvalue = OOB_UNICODE;
ffc61ed2 9502 register IV range = 0;
e1d1eefb 9503 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 9504 register regnode *ret;
ba210ebe 9505 STRLEN numlen;
ffc61ed2 9506 IV namedclass;
cbbf8932 9507 char *rangebegin = NULL;
936ed897 9508 bool need_class = 0;
c445ea15 9509 SV *listsv = NULL;
137165a6
KW
9510 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9511 than just initialized. */
ffc61ed2 9512 UV n;
53742956
KW
9513
9514 /* code points this node matches that can't be stored in the bitmap */
56ca34ca 9515 HV* nonbitmap = NULL;
53742956
KW
9516
9517 /* The items that are to match that aren't stored in the bitmap, but are a
9518 * result of things that are stored there. This is the fold closure of
9519 * such a character, either because it has DEPENDS semantics and shouldn't
9520 * be matched unless the target string is utf8, or is a code point that is
9521 * too large for the bit map, as for example, the fold of the MICRO SIGN is
9522 * above 255. This all is solely for performance reasons. By having this
9523 * code know the outside-the-bitmap folds that the bitmapped characters are
9524 * involved with, we don't have to go out to disk to find the list of
9525 * matches, unless the character class includes code points that aren't
9526 * storable in the bit map. That means that a character class with an 's'
9527 * in it, for example, doesn't need to go out to disk to find everything
9528 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
9529 * empty unless there is something whose fold we don't know about, and will
9530 * have to go out to the disk to find. */
5bfec14d 9531 HV* l1_fold_invlist = NULL;
53742956
KW
9532
9533 /* List of multi-character folds that are matched by this node */
cbbf8932 9534 AV* unicode_alternate = NULL;
1b2d223b
JH
9535#ifdef EBCDIC
9536 UV literal_endpoint = 0;
9537#endif
ffc130aa 9538 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 9539
3dab1dad 9540 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 9541 case we need to change the emitted regop to an EXACT. */
07be1b83 9542 const char * orig_parse = RExC_parse;
72f13be8 9543 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9544
9545 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
9546#ifndef DEBUGGING
9547 PERL_UNUSED_ARG(depth);
9548#endif
72f13be8 9549
3dab1dad 9550 DEBUG_PARSE("clas");
7f6f358c
YO
9551
9552 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
9553 ret = reganode(pRExC_state, ANYOF, 0);
9554
56ca34ca
KW
9555
9556 if (!SIZE_ONLY) {
ffc61ed2 9557 ANYOF_FLAGS(ret) = 0;
56ca34ca 9558 }
ffc61ed2 9559
9a86a77b 9560 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
9561 RExC_naughty++;
9562 RExC_parse++;
9563 if (!SIZE_ONLY)
9564 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9565 }
a0d0e21e 9566
73060fc4 9567 if (SIZE_ONLY) {
830247a4 9568 RExC_size += ANYOF_SKIP;
73060fc4
JH
9569 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9570 }
936ed897 9571 else {
830247a4 9572 RExC_emit += ANYOF_SKIP;
3a15e693 9573 if (LOC) {
936ed897 9574 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693 9575 }
ffc61ed2 9576 ANYOF_BITMAP_ZERO(ret);
396482e1 9577 listsv = newSVpvs("# comment\n");
137165a6 9578 initial_listsv_len = SvCUR(listsv);
a0d0e21e 9579 }
b8c5462f 9580
9a86a77b
JH
9581 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9582
b938889d 9583 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 9584 checkposixcc(pRExC_state);
b8c5462f 9585
f064b6ad
HS
9586 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9587 if (UCHARAT(RExC_parse) == ']')
9588 goto charclassloop;
ffc61ed2 9589
fc8cd66c 9590parseit:
9a86a77b 9591 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
9592
9593 charclassloop:
9594
9595 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9596
73b437c8 9597 if (!range)
830247a4 9598 rangebegin = RExC_parse;
ffc61ed2 9599 if (UTF) {
5e12f4fb 9600 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 9601 RExC_end - RExC_parse,
9f7f3913 9602 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
9603 RExC_parse += numlen;
9604 }
9605 else
9606 value = UCHARAT(RExC_parse++);
7f6f358c 9607
9a86a77b
JH
9608 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9609 if (value == '[' && POSIXCC(nextvalue))
830247a4 9610 namedclass = regpposixcc(pRExC_state, value);
620e46c5 9611 else if (value == '\\') {
ffc61ed2 9612 if (UTF) {
5e12f4fb 9613 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 9614 RExC_end - RExC_parse,
9f7f3913 9615 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
9616 RExC_parse += numlen;
9617 }
9618 else
9619 value = UCHARAT(RExC_parse++);
470c3474 9620 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 9621 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
9622 * be a problem later if we want switch on Unicode.
9623 * A similar issue a little bit later when switching on
9624 * namedclass. --jhi */
ffc61ed2 9625 switch ((I32)value) {
b8c5462f
JH
9626 case 'w': namedclass = ANYOF_ALNUM; break;
9627 case 'W': namedclass = ANYOF_NALNUM; break;
9628 case 's': namedclass = ANYOF_SPACE; break;
9629 case 'S': namedclass = ANYOF_NSPACE; break;
9630 case 'd': namedclass = ANYOF_DIGIT; break;
9631 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
9632 case 'v': namedclass = ANYOF_VERTWS; break;
9633 case 'V': namedclass = ANYOF_NVERTWS; break;
9634 case 'h': namedclass = ANYOF_HORIZWS; break;
9635 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
9636 case 'N': /* Handle \N{NAME} in class */
9637 {
9638 /* We only pay attention to the first char of
9639 multichar strings being returned. I kinda wonder
9640 if this makes sense as it does change the behaviour
9641 from earlier versions, OTOH that behaviour was broken
9642 as well. */
9643 UV v; /* value is register so we cant & it /grrr */
9d64099b 9644 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
fc8cd66c
YO
9645 goto parseit;
9646 }
9647 value= v;
9648 }
9649 break;
ffc61ed2
JH
9650 case 'p':
9651 case 'P':
3dab1dad
YO
9652 {
9653 char *e;
af6f566e 9654 if (RExC_parse >= RExC_end)
2a4859cd 9655 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 9656 if (*RExC_parse == '{') {
1df70142 9657 const U8 c = (U8)value;
ffc61ed2
JH
9658 e = strchr(RExC_parse++, '}');
9659 if (!e)
0da60cf5 9660 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
9661 while (isSPACE(UCHARAT(RExC_parse)))
9662 RExC_parse++;
9663 if (e == RExC_parse)
0da60cf5 9664 vFAIL2("Empty \\%c{}", c);
ffc61ed2 9665 n = e - RExC_parse;
ab13f0c7
JH
9666 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9667 n--;
ffc61ed2
JH
9668 }
9669 else {
9670 e = RExC_parse;
9671 n = 1;
9672 }
ee410026 9673 if (!SIZE_ONLY) {
ab13f0c7
JH
9674 if (UCHARAT(RExC_parse) == '^') {
9675 RExC_parse++;
9676 n--;
9677 value = value == 'p' ? 'P' : 'p'; /* toggle */
9678 while (isSPACE(UCHARAT(RExC_parse))) {
9679 RExC_parse++;
9680 n--;
9681 }
9682 }
2f833f52
KW
9683
9684 /* Add the property name to the list. If /i matching, give
9685 * a different name which consists of the normal name
9686 * sandwiched between two underscores and '_i'. The design
9687 * is discussed in the commit message for this. */
9688 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9689 (value=='p' ? '+' : '!'),
9690 (FOLD) ? "__" : "",
9691 (int)n,
9692 RExC_parse,
9693 (FOLD) ? "_i" : ""
9694 );
ffc61ed2
JH
9695 }
9696 RExC_parse = e + 1;
08fc12dd
KW
9697
9698 /* The \p could match something in the Latin1 range, hence
9699 * something that isn't utf8 */
db8c82dd 9700 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
f81125e2 9701 namedclass = ANYOF_MAX; /* no official name, but it's named */
e40e74fe
KW
9702
9703 /* \p means they want Unicode semantics */
9704 RExC_uni_semantics = 1;
3dab1dad 9705 }
f81125e2 9706 break;
b8c5462f
JH
9707 case 'n': value = '\n'; break;
9708 case 'r': value = '\r'; break;
9709 case 't': value = '\t'; break;
9710 case 'f': value = '\f'; break;
9711 case 'b': value = '\b'; break;
c7f1f016
NIS
9712 case 'e': value = ASCII_TO_NATIVE('\033');break;
9713 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
9714 case 'o':
9715 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
9716 {
9717 const char* error_msg;
9718 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
9719 &value,
9720 &numlen,
454155d9
KW
9721 &error_msg,
9722 SIZE_ONLY);
9723 RExC_parse += numlen;
9724 if (! valid) {
9725 vFAIL(error_msg);
9726 }
f0a2b745 9727 }
f0a2b745
KW
9728 if (PL_encoding && value < 0x100) {
9729 goto recode_encoding;
9730 }
9731 break;
b8c5462f 9732 case 'x':
ffc61ed2 9733 if (*RExC_parse == '{') {
a4c04bdc
NC
9734 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9735 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 9736 char * const e = strchr(RExC_parse++, '}');
b81d288d 9737 if (!e)
ffc61ed2 9738 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
9739
9740 numlen = e - RExC_parse;
9741 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
9742 RExC_parse = e + 1;
9743 }
9744 else {
a4c04bdc 9745 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
9746 numlen = 2;
9747 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
9748 RExC_parse += numlen;
9749 }
9e08bc66
TS
9750 if (PL_encoding && value < 0x100)
9751 goto recode_encoding;
b8c5462f
JH
9752 break;
9753 case 'c':
17a3df4c 9754 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
b8c5462f
JH
9755 break;
9756 case '0': case '1': case '2': case '3': case '4':
c99e91e9 9757 case '5': case '6': case '7':
9e08bc66 9758 {
c99e91e9
KW
9759 /* Take 1-3 octal digits */
9760 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
9761 numlen = 3;
9762 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9763 RExC_parse += numlen;
9764 if (PL_encoding && value < 0x100)
9765 goto recode_encoding;
9766 break;
9767 }
9768 recode_encoding:
e2a7e165 9769 if (! RExC_override_recoding) {
9e08bc66
TS
9770 SV* enc = PL_encoding;
9771 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
9772 if (!enc && SIZE_ONLY)
9773 ckWARNreg(RExC_parse,
9774 "Invalid escape in the specified encoding");
9e08bc66
TS
9775 break;
9776 }
1028017a 9777 default:
c99e91e9
KW
9778 /* Allow \_ to not give an error */
9779 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
9780 ckWARN2reg(RExC_parse,
9781 "Unrecognized escape \\%c in character class passed through",
9782 (int)value);
c99e91e9 9783 }
1028017a 9784 break;
b8c5462f 9785 }
ffc61ed2 9786 } /* end of \blah */
1b2d223b
JH
9787#ifdef EBCDIC
9788 else
9789 literal_endpoint++;
9790#endif
ffc61ed2
JH
9791
9792 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9793
2c63ecad
KW
9794 /* What matches in a locale is not known until runtime, so need to
9795 * (one time per class) allocate extra space to pass to regexec.
9796 * The space will contain a bit for each named class that is to be
9797 * matched against. This isn't needed for \p{} and pseudo-classes,
9798 * as they are not affected by locale, and hence are dealt with
9799 * separately */
9800 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9801 need_class = 1;
9802 if (SIZE_ONLY) {
dd58aee1 9803 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
9804 }
9805 else {
dd58aee1 9806 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
9807 ANYOF_CLASS_ZERO(ret);
9808 }
9051cfd9 9809 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 9810 }
ffc61ed2 9811
d5788240 9812 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
1d791ab2
KW
9813 * literal, as is the character that began the false range, i.e.
9814 * the 'a' in the examples */
ffc61ed2 9815 if (range) {
73b437c8 9816 if (!SIZE_ONLY) {
668c081a
NC
9817 const int w =
9818 RExC_parse >= rangebegin ?
9819 RExC_parse - rangebegin : 0;
9820 ckWARN4reg(RExC_parse,
b45f050a 9821 "False [] range \"%*.*s\"",
097eb12c 9822 w, w, rangebegin);
668c081a 9823
1d791ab2 9824 stored +=
5bfec14d 9825 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
3568d838 9826 if (prevvalue < 256) {
2283d326 9827 stored +=
5bfec14d 9828 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
ffc61ed2
JH
9829 }
9830 else {
1d791ab2 9831 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
ffc61ed2 9832 }
b8c5462f 9833 }
ffc61ed2
JH
9834
9835 range = 0; /* this was not a true range */
73b437c8 9836 }
ffc61ed2 9837
89836f1f
YO
9838
9839
73b437c8 9840 if (!SIZE_ONLY) {
c49a72a9
NC
9841 const char *what = NULL;
9842 char yesno = 0;
9843
e2962f66
JH
9844 /* Possible truncation here but in some 64-bit environments
9845 * the compiler gets heartburn about switch on 64-bit values.
9846 * A similar issue a little earlier when switching on value.
98f323fa 9847 * --jhi */
e2962f66 9848 switch ((I32)namedclass) {
da7fcca4 9849
f952827c
KW
9850 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9851 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9852 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9853 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9854 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9855 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9856 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9857 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9858 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9859 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
a12cf05f 9860 /* \s, \w match all unicode if utf8. */
f952827c
KW
9861 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9862 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
f952827c 9863 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
e1d1eefb
YO
9864 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9865 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
9866 case ANYOF_ASCII:
9867 if (LOC)
936ed897 9868 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 9869 else {
1ba5c669 9870 for (value = 0; value < 128; value++)
2283d326 9871 stored +=
5bfec14d 9872 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
73b437c8 9873 }
c49a72a9 9874 yesno = '+';
ce1c68b2
KW
9875 what = NULL; /* Doesn't match outside ascii, so
9876 don't want to add +utf8:: */
73b437c8
JH
9877 break;
9878 case ANYOF_NASCII:
9879 if (LOC)
936ed897 9880 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 9881 else {
1ba5c669 9882 for (value = 128; value < 256; value++)
2283d326 9883 stored +=
5bfec14d 9884 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
73b437c8 9885 }
cfaf538b 9886 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
c49a72a9
NC
9887 yesno = '!';
9888 what = "ASCII";
89836f1f 9889 break;
ffc61ed2
JH
9890 case ANYOF_DIGIT:
9891 if (LOC)
9892 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9893 else {
9894 /* consecutive digits assumed */
9895 for (value = '0'; value <= '9'; value++)
2283d326 9896 stored +=
5bfec14d 9897 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
ffc61ed2 9898 }
c49a72a9 9899 yesno = '+';
779d7b58 9900 what = "Digit";
ffc61ed2
JH
9901 break;
9902 case ANYOF_NDIGIT:
9903 if (LOC)
9904 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9905 else {
9906 /* consecutive digits assumed */
9907 for (value = 0; value < '0'; value++)
2283d326 9908 stored +=
5bfec14d 9909 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
ffc61ed2 9910 for (value = '9' + 1; value < 256; value++)
2283d326 9911 stored +=
5bfec14d 9912 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
ffc61ed2 9913 }
c49a72a9 9914 yesno = '!';
779d7b58 9915 what = "Digit";
2f7f8cb1 9916 if (AT_LEAST_ASCII_RESTRICTED ) {
cfaf538b
KW
9917 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9918 }
89836f1f 9919 break;
f81125e2
JP
9920 case ANYOF_MAX:
9921 /* this is to handle \p and \P */
9922 break;
73b437c8 9923 default:
b45f050a 9924 vFAIL("Invalid [::] class");
73b437c8 9925 break;
b8c5462f 9926 }
2f7f8cb1 9927 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
c49a72a9
NC
9928 /* Strings such as "+utf8::isWord\n" */
9929 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ef87b810 9930 }
ce1c68b2 9931
73b437c8 9932 continue;
a0d0e21e 9933 }
ffc61ed2
JH
9934 } /* end of namedclass \blah */
9935
a0d0e21e 9936 if (range) {
eb160463 9937 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
9938 const int w = RExC_parse - rangebegin;
9939 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 9940 range = 0; /* not a valid range */
73b437c8 9941 }
a0d0e21e
LW
9942 }
9943 else {
3568d838 9944 prevvalue = value; /* save the beginning of the range */
646253b5
KW
9945 if (RExC_parse+1 < RExC_end
9946 && *RExC_parse == '-'
9947 && RExC_parse[1] != ']')
9948 {
830247a4 9949 RExC_parse++;
ffc61ed2
JH
9950
9951 /* a bad range like \w-, [:word:]- ? */
9952 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 9953 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 9954 const int w =
afd78fd5
JH
9955 RExC_parse >= rangebegin ?
9956 RExC_parse - rangebegin : 0;
830247a4 9957 vWARN4(RExC_parse,
b45f050a 9958 "False [] range \"%*.*s\"",
097eb12c 9959 w, w, rangebegin);
afd78fd5 9960 }
73b437c8 9961 if (!SIZE_ONLY)
2283d326 9962 stored +=
5bfec14d 9963 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
73b437c8 9964 } else
ffc61ed2
JH
9965 range = 1; /* yeah, it's a range! */
9966 continue; /* but do it the next time */
a0d0e21e 9967 }
a687059c 9968 }
ffc61ed2 9969
046c4055
KW
9970 /* non-Latin1 code point implies unicode semantics. Must be set in
9971 * pass1 so is there for the whole of pass 2 */
56ca34ca
KW
9972 if (value > 255) {
9973 RExC_uni_semantics = 1;
9974 }
9975
93733859 9976 /* now is the next time */
ae5c130c 9977 if (!SIZE_ONLY) {
3568d838 9978 if (prevvalue < 256) {
1df70142 9979 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 9980 IV i;
3568d838 9981#ifdef EBCDIC
1b2d223b
JH
9982 /* In EBCDIC [\x89-\x91] should include
9983 * the \x8e but [i-j] should not. */
9984 if (literal_endpoint == 2 &&
9985 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9986 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 9987 {
3568d838
JH
9988 if (isLOWER(prevvalue)) {
9989 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 9990 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 9991 stored +=
5bfec14d 9992 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 9993 }
ffc61ed2 9994 } else {
3568d838 9995 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 9996 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 9997 stored +=
5bfec14d 9998 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 9999 }
ffc61ed2 10000 }
8ada0baa 10001 }
ffc61ed2 10002 else
8ada0baa 10003#endif
07be1b83 10004 for (i = prevvalue; i <= ceilvalue; i++) {
5bfec14d 10005 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
07be1b83 10006 }
3568d838 10007 }
56ca34ca
KW
10008 if (value > 255) {
10009 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
10010 const UV natvalue = NATIVE_TO_UNI(value);
56ca34ca 10011 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
56ca34ca 10012 }
1b2d223b
JH
10013#ifdef EBCDIC
10014 literal_endpoint = 0;
10015#endif
8ada0baa 10016 }
ffc61ed2
JH
10017
10018 range = 0; /* this range (if it was one) is done now */
a0d0e21e 10019 }
ffc61ed2 10020
ffc61ed2 10021
7f6f358c
YO
10022
10023 if (SIZE_ONLY)
10024 return ret;
10025 /****** !SIZE_ONLY AFTER HERE *********/
10026
0c6e4288
KW
10027 /* If folding and there are code points above 255, we calculate all
10028 * characters that could fold to or from the ones already on the list */
10029 if (FOLD && nonbitmap) {
56ca34ca
KW
10030 UV i;
10031
93e5bb1c
KW
10032 HV* fold_intersection;
10033 UV* fold_list;
10034
10035 /* This is a list of all the characters that participate in folds
10036 * (except marks, etc in multi-char folds */
10037 if (! PL_utf8_foldable) {
10038 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10039 PL_utf8_foldable = _swash_to_invlist(swash);
10040 }
56ca34ca 10041
93e5bb1c
KW
10042 /* This is a hash that for a particular fold gives all characters
10043 * that are involved in it */
10044 if (! PL_utf8_foldclosures) {
10045
10046 /* If we were unable to find any folds, then we likely won't be
10047 * able to find the closures. So just create an empty list.
10048 * Folding will effectively be restricted to the non-Unicode rules
10049 * hard-coded into Perl. (This case happens legitimately during
10050 * compilation of Perl itself before the Unicode tables are
10051 * generated) */
10052 if (invlist_len(PL_utf8_foldable) == 0) {
10053 PL_utf8_foldclosures = _new_invlist(0);
10054 } else {
10055 /* If the folds haven't been read in, call a fold function
10056 * to force that */
10057 if (! PL_utf8_tofold) {
10058 U8 dummy[UTF8_MAXBYTES+1];
10059 STRLEN dummy_len;
10060 to_utf8_fold((U8*) "A", dummy, &dummy_len);
56ca34ca 10061 }
93e5bb1c 10062 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
56ca34ca 10063 }
93e5bb1c
KW
10064 }
10065
10066 /* Only the characters in this class that participate in folds need
10067 * be checked. Get the intersection of this class and all the
10068 * possible characters that are foldable. This can quickly narrow
10069 * down a large class */
10070 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10071
10072 /* Now look at the foldable characters in this class individually */
10073 fold_list = invlist_array(fold_intersection);
10074 for (i = 0; i < invlist_len(fold_intersection); i++) {
10075 UV j;
10076
10077 /* The next entry is the beginning of the range that is in the
10078 * class */
10079 UV start = fold_list[i++];
10080
56ca34ca 10081
93e5bb1c
KW
10082 /* The next entry is the beginning of the next range, which
10083 * isn't in the class, so the end of the current range is one
10084 * less than that */
10085 UV end = fold_list[i] - 1;
10086
10087 /* Look at every character in the range */
10088 for (j = start; j <= end; j++) {
10089
10090 /* Get its fold */
10091 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10092 STRLEN foldlen;
10093 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10094
10095 if (foldlen > (STRLEN)UNISKIP(f)) {
10096
10097 /* Any multicharacter foldings (disallowed in
10098 * lookbehind patterns) require the following
10099 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10100 * E folds into "pq" and F folds into "rst", all other
10101 * characters fold to single characters. We save away
10102 * these multicharacter foldings, to be later saved as
10103 * part of the additional "s" data. */
10104 if (! RExC_in_lookbehind) {
10105 U8* loc = foldbuf;
10106 U8* e = foldbuf + foldlen;
10107
10108 /* If any of the folded characters of this are in
10109 * the Latin1 range, tell the regex engine that
10110 * this can match a non-utf8 target string. The
10111 * only multi-byte fold whose source is in the
10112 * Latin1 range (U+00DF) applies only when the
10113 * target string is utf8, or under unicode rules */
10114 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10115 while (loc < e) {
10116
10117 /* Can't mix ascii with non- under /aa */
10118 if (MORE_ASCII_RESTRICTED
10119 && (isASCII(*loc) != isASCII(j)))
10120 {
10121 goto end_multi_fold;
10122 }
10123 if (UTF8_IS_INVARIANT(*loc)
10124 || UTF8_IS_DOWNGRADEABLE_START(*loc))
10125 {
10126 /* Can't mix above and below 256 under
10127 * LOC */
10128 if (LOC) {
2f7f8cb1
KW
10129 goto end_multi_fold;
10130 }
93e5bb1c
KW
10131 ANYOF_FLAGS(ret)
10132 |= ANYOF_NONBITMAP_NON_UTF8;
10133 break;
8e3094e5 10134 }
93e5bb1c 10135 loc += UTF8SKIP(loc);
8e3094e5 10136 }
56ca34ca 10137 }
17580e7a 10138
93e5bb1c
KW
10139 add_alternate(&unicode_alternate, foldbuf, foldlen);
10140 end_multi_fold: ;
10141 }
14e30abc
KW
10142
10143 /* This is special-cased, as it is the only letter which
10144 * has both a multi-fold and single-fold in Latin1. All
10145 * the other chars that have single and multi-folds are
10146 * always in utf8, and the utf8 folding algorithm catches
10147 * them */
10148 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
10149 stored += set_regclass_bit(pRExC_state,
10150 ret,
10151 LATIN_SMALL_LETTER_SHARP_S,
10152 &l1_fold_invlist, &unicode_alternate);
10153 }
93e5bb1c
KW
10154 }
10155 else {
10156 /* Single character fold. Add everything in its fold
10157 * closure to the list that this node should match */
10158 SV** listp;
10159
10160 /* The fold closures data structure is a hash with the
10161 * keys being every character that is folded to, like
10162 * 'k', and the values each an array of everything that
10163 * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
10164 if ((listp = hv_fetch(PL_utf8_foldclosures,
10165 (char *) foldbuf, foldlen, FALSE)))
10166 {
10167 AV* list = (AV*) *listp;
10168 IV k;
10169 for (k = 0; k <= av_len(list); k++) {
10170 SV** c_p = av_fetch(list, k, FALSE);
10171 UV c;
10172 if (c_p == NULL) {
10173 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10174 }
10175 c = SvUV(*c_p);
10176
10177 /* /aa doesn't allow folds between ASCII and
10178 * non-; /l doesn't allow them between above
10179 * and below 256 */
10180 if ((MORE_ASCII_RESTRICTED
10181 && (isASCII(c) != isASCII(j)))
10182 || (LOC && ((c < 256) != (j < 256))))
10183 {
10184 continue;
10185 }
56ca34ca 10186
93e5bb1c
KW
10187 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10188 stored += set_regclass_bit(pRExC_state,
10189 ret,
10190 (U8) c,
10191 &l1_fold_invlist, &unicode_alternate);
10192 }
10193 /* It may be that the code point is already
10194 * in this range or already in the bitmap,
10195 * in which case we need do nothing */
10196 else if ((c < start || c > end)
10197 && (c > 255
10198 || ! ANYOF_BITMAP_TEST(ret, c)))
10199 {
10200 nonbitmap = add_cp_to_invlist(nonbitmap, c);
56ca34ca
KW
10201 }
10202 }
10203 }
10204 }
10205 }
93e5bb1c
KW
10206 }
10207 invlist_destroy(fold_intersection);
56ca34ca
KW
10208 }
10209
53742956
KW
10210 /* Combine the two lists into one. */
10211 if (l1_fold_invlist) {
10212 if (nonbitmap) {
10213 nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10214 }
10215 else {
10216 nonbitmap = l1_fold_invlist;
10217 }
10218 }
10219
fb9bfbf7
KW
10220 /* Here, we have calculated what code points should be in the character
10221 * class. Now we can see about various optimizations. Fold calculation
10222 * needs to take place before inversion. Otherwise /[^k]/i would invert to
10223 * include K, which under /i would match k. */
10224
f56b6394
KW
10225 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
10226 * set the FOLD flag yet, so this this does optimize those. It doesn't
40c78556
KW
10227 * optimize locale. Doing so perhaps could be done as long as there is
10228 * nothing like \w in it; some thought also would have to be given to the
10229 * interaction with above 0x100 chars */
137165a6
KW
10230 if (! LOC
10231 && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10232 && ! unicode_alternate
10233 && ! nonbitmap
10234 && SvCUR(listsv) == initial_listsv_len)
10235 {
40c78556
KW
10236 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10237 ANYOF_BITMAP(ret)[value] ^= 0xFF;
10238 stored = 256 - stored;
10239
d5788240
KW
10240 /* The inversion means that everything above 255 is matched; and at the
10241 * same time we clear the invert flag */
137165a6 10242 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
40c78556
KW
10243 }
10244
0222889f
KW
10245 /* Folding in the bitmap is taken care of above, but not for locale (for
10246 * which we have to wait to see what folding is in effect at runtime), and
10247 * for things not in the bitmap. Set run-time fold flag for these */
53742956 10248 if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
0222889f 10249 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
f56b6394
KW
10250 }
10251
2786be71
KW
10252 /* A single character class can be "optimized" into an EXACTish node.
10253 * Note that since we don't currently count how many characters there are
10254 * outside the bitmap, we are XXX missing optimization possibilities for
10255 * them. This optimization can't happen unless this is a truly single
10256 * character class, which means that it can't be an inversion into a
10257 * many-character class, and there must be no possibility of there being
10258 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
10259 * \w, etc, so have to make a special test that they aren't present
10260 *
10261 * Similarly A 2-character class of the very special form like [bB] can be
10262 * optimized into an EXACTFish node, but only for non-locales, and for
10263 * characters which only have the two folds; so things like 'fF' and 'Ii'
10264 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10265 * FI'. */
137165a6 10266 if (! nonbitmap
53742956 10267 && ! unicode_alternate
137165a6
KW
10268 && SvCUR(listsv) == initial_listsv_len
10269 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
10270 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10271 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10272 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10273 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10274 /* If the latest code point has a fold whose
10275 * bit is set, it must be the only other one */
2dcac756 10276 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 10277 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
10278 {
10279 /* Note that the information needed to decide to do this optimization
10280 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
10281 * used EXACTish node takes less space than the calculated ANYOF node,
10282 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
10283 * than actually used, so this optimization doesn't gain us any space.
10284 * But an EXACT node is faster than an ANYOF node, and can be combined
10285 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
10286 * gains. The speed of executing an EXACTF is similar to an ANYOF
10287 * node, so the optimization advantage comes from the ability to join
10288 * it to adjacent EXACT nodes */
2786be71 10289
07be1b83 10290 const char * cur_parse= RExC_parse;
6da63e10 10291 U8 op;
07be1b83
YO
10292 RExC_emit = (regnode *)orig_emit;
10293 RExC_parse = (char *)orig_parse;
2786be71 10294
6da63e10
KW
10295 if (stored == 1) {
10296
10297 /* A locale node with one point can be folded; all the other cases
10298 * with folding will have two points, since we calculate them above
10299 */
39065660 10300 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
10301 op = EXACTFL;
10302 }
10303 else {
10304 op = EXACT;
10305 }
10306 } /* else 2 chars in the bit map: the folds of each other */
cfaf538b 10307 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
6da63e10
KW
10308
10309 /* To join adjacent nodes, they must be the exact EXACTish type.
10310 * Try to use the most likely type, by using EXACTFU if the regex
10311 * calls for them, or is required because the character is
10312 * non-ASCII */
10313 op = EXACTFU;
10314 }
10315 else { /* Otherwise, more likely to be EXACTF type */
10316 op = EXACTF;
10317 }
10318
10319 ret = reg_node(pRExC_state, op);
07be1b83 10320 RExC_parse = (char *)cur_parse;
2786be71
KW
10321 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10322 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10323 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10324 STR_LEN(ret)= 2;
10325 RExC_emit += STR_SZ(2);
10326 }
10327 else {
10328 *STRING(ret)= (char)value;
10329 STR_LEN(ret)= 1;
10330 RExC_emit += STR_SZ(1);
10331 }
ef8d46e8 10332 SvREFCNT_dec(listsv);
7f6f358c
YO
10333 return ret;
10334 }
ffc61ed2 10335
9e791dfe
KW
10336 if (nonbitmap) {
10337 UV* nonbitmap_array = invlist_array(nonbitmap);
10338 UV nonbitmap_len = invlist_len(nonbitmap);
10339 UV i;
10340
10341 /* Here have the full list of items to match that aren't in the
10342 * bitmap. Convert to the structure that the rest of the code is
10343 * expecting. XXX That rest of the code should convert to this
10344 * structure */
10345 for (i = 0; i < nonbitmap_len; i++) {
10346
10347 /* The next entry is the beginning of the range that is in the
10348 * class */
10349 UV start = nonbitmap_array[i++];
27340457 10350 UV end;
9e791dfe
KW
10351
10352 /* The next entry is the beginning of the next range, which isn't
10353 * in the class, so the end of the current range is one less than
27340457
KW
10354 * that. But if there is no next range, it means that the range
10355 * begun by 'start' extends to infinity, which for this platform
10356 * ends at UV_MAX */
10357 if (i == nonbitmap_len) {
10358 end = UV_MAX;
10359 }
10360 else {
10361 end = nonbitmap_array[i] - 1;
10362 }
10363
9e791dfe
KW
10364 if (start == end) {
10365 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10366 }
10367 else {
10368 /* The \t sets the whole range */
10369 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10370 /* XXX EBCDIC */
10371 start, end);
10372 }
10373 }
10374 invlist_destroy(nonbitmap);
10375 }
10376
137165a6
KW
10377 if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10378 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10379 SvREFCNT_dec(listsv);
10380 SvREFCNT_dec(unicode_alternate);
10381 }
10382 else {
10383
097eb12c 10384 AV * const av = newAV();
ffc61ed2 10385 SV *rv;
9e55ce06 10386 /* The 0th element stores the character class description
6a0407ee 10387 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
10388 * to initialize the appropriate swash (which gets stored in
10389 * the 1st element), and also useful for dumping the regnode.
10390 * The 2nd element stores the multicharacter foldings,
6a0407ee 10391 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
10392 av_store(av, 0, listsv);
10393 av_store(av, 1, NULL);
ad64d0ec 10394 av_store(av, 2, MUTABLE_SV(unicode_alternate));
c93d5d8b
KW
10395 if (unicode_alternate) { /* This node is variable length */
10396 OP(ret) = ANYOFV;
10397 }
ad64d0ec 10398 rv = newRV_noinc(MUTABLE_SV(av));
19860706 10399 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 10400 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 10401 ARG_SET(ret, n);
a0ed51b3 10402 }
a0ed51b3
LW
10403 return ret;
10404}
89836f1f
YO
10405#undef _C_C_T_
10406
a0ed51b3 10407
bcdf7404
YO
10408/* reg_skipcomment()
10409
10410 Absorbs an /x style # comments from the input stream.
10411 Returns true if there is more text remaining in the stream.
10412 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10413 terminates the pattern without including a newline.
10414
10415 Note its the callers responsibility to ensure that we are
10416 actually in /x mode
10417
10418*/
10419
10420STATIC bool
10421S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10422{
10423 bool ended = 0;
7918f24d
NC
10424
10425 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10426
bcdf7404
YO
10427 while (RExC_parse < RExC_end)
10428 if (*RExC_parse++ == '\n') {
10429 ended = 1;
10430 break;
10431 }
10432 if (!ended) {
10433 /* we ran off the end of the pattern without ending
10434 the comment, so we have to add an \n when wrapping */
10435 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10436 return 0;
10437 } else
10438 return 1;
10439}
10440
10441/* nextchar()
10442
3b753521 10443 Advances the parse position, and optionally absorbs
bcdf7404
YO
10444 "whitespace" from the inputstream.
10445
10446 Without /x "whitespace" means (?#...) style comments only,
10447 with /x this means (?#...) and # comments and whitespace proper.
10448
10449 Returns the RExC_parse point from BEFORE the scan occurs.
10450
10451 This is the /x friendly way of saying RExC_parse++.
10452*/
10453
76e3520e 10454STATIC char*
830247a4 10455S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 10456{
097eb12c 10457 char* const retval = RExC_parse++;
a0d0e21e 10458
7918f24d
NC
10459 PERL_ARGS_ASSERT_NEXTCHAR;
10460
4633a7c4 10461 for (;;) {
830247a4
IZ
10462 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10463 RExC_parse[2] == '#') {
e994fd66
AE
10464 while (*RExC_parse != ')') {
10465 if (RExC_parse == RExC_end)
10466 FAIL("Sequence (?#... not terminated");
830247a4 10467 RExC_parse++;
e994fd66 10468 }
830247a4 10469 RExC_parse++;
4633a7c4
LW
10470 continue;
10471 }
bbe252da 10472 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
10473 if (isSPACE(*RExC_parse)) {
10474 RExC_parse++;
748a9306
LW
10475 continue;
10476 }
830247a4 10477 else if (*RExC_parse == '#') {
bcdf7404
YO
10478 if ( reg_skipcomment( pRExC_state ) )
10479 continue;
748a9306 10480 }
748a9306 10481 }
4633a7c4 10482 return retval;
a0d0e21e 10483 }
a687059c
LW
10484}
10485
10486/*
c277df42 10487- reg_node - emit a node
a0d0e21e 10488*/
76e3520e 10489STATIC regnode * /* Location. */
830247a4 10490S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 10491{
97aff369 10492 dVAR;
c277df42 10493 register regnode *ptr;
504618e9 10494 regnode * const ret = RExC_emit;
07be1b83 10495 GET_RE_DEBUG_FLAGS_DECL;
a687059c 10496
7918f24d
NC
10497 PERL_ARGS_ASSERT_REG_NODE;
10498
c277df42 10499 if (SIZE_ONLY) {
830247a4
IZ
10500 SIZE_ALIGN(RExC_size);
10501 RExC_size += 1;
a0d0e21e
LW
10502 return(ret);
10503 }
3b57cd43
YO
10504 if (RExC_emit >= RExC_emit_bound)
10505 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10506
c277df42 10507 NODE_ALIGN_FILL(ret);
a0d0e21e 10508 ptr = ret;
c277df42 10509 FILL_ADVANCE_NODE(ptr, op);
7122b237 10510#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10511 if (RExC_offsets) { /* MJD */
07be1b83 10512 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 10513 "reg_node", __LINE__,
13d6edb4 10514 PL_reg_name[op],
07be1b83
YO
10515 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10516 ? "Overwriting end of array!\n" : "OK",
10517 (UV)(RExC_emit - RExC_emit_start),
10518 (UV)(RExC_parse - RExC_start),
10519 (UV)RExC_offsets[0]));
ccb2c380 10520 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 10521 }
7122b237 10522#endif
830247a4 10523 RExC_emit = ptr;
a0d0e21e 10524 return(ret);
a687059c
LW
10525}
10526
10527/*
a0d0e21e
LW
10528- reganode - emit a node with an argument
10529*/
76e3520e 10530STATIC regnode * /* Location. */
830247a4 10531S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 10532{
97aff369 10533 dVAR;
c277df42 10534 register regnode *ptr;
504618e9 10535 regnode * const ret = RExC_emit;
07be1b83 10536 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 10537
7918f24d
NC
10538 PERL_ARGS_ASSERT_REGANODE;
10539
c277df42 10540 if (SIZE_ONLY) {
830247a4
IZ
10541 SIZE_ALIGN(RExC_size);
10542 RExC_size += 2;
6bda09f9
YO
10543 /*
10544 We can't do this:
10545
10546 assert(2==regarglen[op]+1);
10547
10548 Anything larger than this has to allocate the extra amount.
10549 If we changed this to be:
10550
10551 RExC_size += (1 + regarglen[op]);
10552
10553 then it wouldn't matter. Its not clear what side effect
10554 might come from that so its not done so far.
10555 -- dmq
10556 */
a0d0e21e
LW
10557 return(ret);
10558 }
3b57cd43
YO
10559 if (RExC_emit >= RExC_emit_bound)
10560 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10561
c277df42 10562 NODE_ALIGN_FILL(ret);
a0d0e21e 10563 ptr = ret;
c277df42 10564 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 10565#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10566 if (RExC_offsets) { /* MJD */
07be1b83 10567 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 10568 "reganode",
ccb2c380 10569 __LINE__,
13d6edb4 10570 PL_reg_name[op],
07be1b83 10571 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 10572 "Overwriting end of array!\n" : "OK",
07be1b83
YO
10573 (UV)(RExC_emit - RExC_emit_start),
10574 (UV)(RExC_parse - RExC_start),
10575 (UV)RExC_offsets[0]));
ccb2c380 10576 Set_Cur_Node_Offset;
fac92740 10577 }
7122b237 10578#endif
830247a4 10579 RExC_emit = ptr;
a0d0e21e 10580 return(ret);
fe14fcc3
LW
10581}
10582
10583/*
cd439c50 10584- reguni - emit (if appropriate) a Unicode character
a0ed51b3 10585*/
71207a34
AL
10586STATIC STRLEN
10587S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 10588{
97aff369 10589 dVAR;
7918f24d
NC
10590
10591 PERL_ARGS_ASSERT_REGUNI;
10592
71207a34 10593 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
10594}
10595
10596/*
a0d0e21e
LW
10597- reginsert - insert an operator in front of already-emitted operand
10598*
10599* Means relocating the operand.
10600*/
76e3520e 10601STATIC void
6bda09f9 10602S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 10603{
97aff369 10604 dVAR;
c277df42
IZ
10605 register regnode *src;
10606 register regnode *dst;
10607 register regnode *place;
504618e9 10608 const int offset = regarglen[(U8)op];
6bda09f9 10609 const int size = NODE_STEP_REGNODE + offset;
07be1b83 10610 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10611
10612 PERL_ARGS_ASSERT_REGINSERT;
def51078 10613 PERL_UNUSED_ARG(depth);
22c35a8c 10614/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 10615 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 10616 if (SIZE_ONLY) {
6bda09f9 10617 RExC_size += size;
a0d0e21e
LW
10618 return;
10619 }
a687059c 10620
830247a4 10621 src = RExC_emit;
6bda09f9 10622 RExC_emit += size;
830247a4 10623 dst = RExC_emit;
40d049e4 10624 if (RExC_open_parens) {
6bda09f9 10625 int paren;
3b57cd43 10626 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 10627 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 10628 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 10629 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
10630 RExC_open_parens[paren] += size;
10631 } else {
3b57cd43 10632 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
10633 }
10634 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 10635 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
10636 RExC_close_parens[paren] += size;
10637 } else {
3b57cd43 10638 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
10639 }
10640 }
6bda09f9 10641 }
40d049e4 10642
fac92740 10643 while (src > opnd) {
c277df42 10644 StructCopy(--src, --dst, regnode);
7122b237 10645#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10646 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 10647 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 10648 "reg_insert",
ccb2c380 10649 __LINE__,
13d6edb4 10650 PL_reg_name[op],
07be1b83
YO
10651 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10652 ? "Overwriting end of array!\n" : "OK",
10653 (UV)(src - RExC_emit_start),
10654 (UV)(dst - RExC_emit_start),
10655 (UV)RExC_offsets[0]));
ccb2c380
MP
10656 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10657 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 10658 }
7122b237 10659#endif
fac92740
MJD
10660 }
10661
a0d0e21e
LW
10662
10663 place = opnd; /* Op node, where operand used to be. */
7122b237 10664#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10665 if (RExC_offsets) { /* MJD */
07be1b83 10666 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 10667 "reginsert",
ccb2c380 10668 __LINE__,
13d6edb4 10669 PL_reg_name[op],
07be1b83 10670 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 10671 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
10672 (UV)(place - RExC_emit_start),
10673 (UV)(RExC_parse - RExC_start),
786e8c11 10674 (UV)RExC_offsets[0]));
ccb2c380 10675 Set_Node_Offset(place, RExC_parse);
45948336 10676 Set_Node_Length(place, 1);
fac92740 10677 }
7122b237 10678#endif
c277df42
IZ
10679 src = NEXTOPER(place);
10680 FILL_ADVANCE_NODE(place, op);
10681 Zero(src, offset, regnode);
a687059c
LW
10682}
10683
10684/*
c277df42 10685- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 10686- SEE ALSO: regtail_study
a0d0e21e 10687*/
097eb12c 10688/* TODO: All three parms should be const */
76e3520e 10689STATIC void
3dab1dad 10690S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 10691{
97aff369 10692 dVAR;
c277df42 10693 register regnode *scan;
72f13be8 10694 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10695
10696 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
10697#ifndef DEBUGGING
10698 PERL_UNUSED_ARG(depth);
10699#endif
a0d0e21e 10700
c277df42 10701 if (SIZE_ONLY)
a0d0e21e
LW
10702 return;
10703
10704 /* Find last node. */
10705 scan = p;
10706 for (;;) {
504618e9 10707 regnode * const temp = regnext(scan);
3dab1dad
YO
10708 DEBUG_PARSE_r({
10709 SV * const mysv=sv_newmortal();
10710 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10711 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
10712 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10713 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10714 (temp == NULL ? "->" : ""),
13d6edb4 10715 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 10716 );
3dab1dad
YO
10717 });
10718 if (temp == NULL)
10719 break;
10720 scan = temp;
10721 }
10722
10723 if (reg_off_by_arg[OP(scan)]) {
10724 ARG_SET(scan, val - scan);
10725 }
10726 else {
10727 NEXT_OFF(scan) = val - scan;
10728 }
10729}
10730
07be1b83 10731#ifdef DEBUGGING
3dab1dad
YO
10732/*
10733- regtail_study - set the next-pointer at the end of a node chain of p to val.
10734- Look for optimizable sequences at the same time.
10735- currently only looks for EXACT chains.
07be1b83 10736
486ec47a 10737This is experimental code. The idea is to use this routine to perform
07be1b83
YO
10738in place optimizations on branches and groups as they are constructed,
10739with the long term intention of removing optimization from study_chunk so
10740that it is purely analytical.
10741
10742Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10743to control which is which.
10744
3dab1dad
YO
10745*/
10746/* TODO: All four parms should be const */
07be1b83 10747
3dab1dad
YO
10748STATIC U8
10749S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10750{
10751 dVAR;
10752 register regnode *scan;
07be1b83
YO
10753 U8 exact = PSEUDO;
10754#ifdef EXPERIMENTAL_INPLACESCAN
10755 I32 min = 0;
10756#endif
3dab1dad
YO
10757 GET_RE_DEBUG_FLAGS_DECL;
10758
7918f24d
NC
10759 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10760
07be1b83 10761
3dab1dad
YO
10762 if (SIZE_ONLY)
10763 return exact;
10764
10765 /* Find last node. */
10766
10767 scan = p;
10768 for (;;) {
10769 regnode * const temp = regnext(scan);
07be1b83
YO
10770#ifdef EXPERIMENTAL_INPLACESCAN
10771 if (PL_regkind[OP(scan)] == EXACT)
10772 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10773 return EXACT;
10774#endif
3dab1dad
YO
10775 if ( exact ) {
10776 switch (OP(scan)) {
10777 case EXACT:
10778 case EXACTF:
2f7f8cb1 10779 case EXACTFA:
2c2b7f86 10780 case EXACTFU:
3dab1dad
YO
10781 case EXACTFL:
10782 if( exact == PSEUDO )
10783 exact= OP(scan);
07be1b83
YO
10784 else if ( exact != OP(scan) )
10785 exact= 0;
3dab1dad
YO
10786 case NOTHING:
10787 break;
10788 default:
10789 exact= 0;
10790 }
10791 }
10792 DEBUG_PARSE_r({
10793 SV * const mysv=sv_newmortal();
10794 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10795 regprop(RExC_rx, mysv, scan);
eaf3ca90 10796 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 10797 SvPV_nolen_const(mysv),
eaf3ca90 10798 REG_NODE_NUM(scan),
13d6edb4 10799 PL_reg_name[exact]);
3dab1dad 10800 });
a0d0e21e
LW
10801 if (temp == NULL)
10802 break;
10803 scan = temp;
10804 }
07be1b83
YO
10805 DEBUG_PARSE_r({
10806 SV * const mysv_val=sv_newmortal();
10807 DEBUG_PARSE_MSG("");
10808 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
10809 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10810 SvPV_nolen_const(mysv_val),
10811 (IV)REG_NODE_NUM(val),
10812 (IV)(val - scan)
07be1b83
YO
10813 );
10814 });
c277df42
IZ
10815 if (reg_off_by_arg[OP(scan)]) {
10816 ARG_SET(scan, val - scan);
a0ed51b3
LW
10817 }
10818 else {
c277df42
IZ
10819 NEXT_OFF(scan) = val - scan;
10820 }
3dab1dad
YO
10821
10822 return exact;
a687059c 10823}
07be1b83 10824#endif
a687059c
LW
10825
10826/*
fd181c75 10827 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 10828 */
f7819f85 10829#ifdef DEBUGGING
c33269f7 10830static void
7918f24d
NC
10831S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10832{
f7819f85
A
10833 int bit;
10834 int set=0;
a62b1201 10835 regex_charset cs;
7918f24d 10836
f7819f85
A
10837 for (bit=0; bit<32; bit++) {
10838 if (flags & (1<<bit)) {
a62b1201
KW
10839 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10840 continue;
10841 }
f7819f85
A
10842 if (!set++ && lead)
10843 PerlIO_printf(Perl_debug_log, "%s",lead);
10844 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10845 }
10846 }
a62b1201
KW
10847 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10848 if (!set++ && lead) {
10849 PerlIO_printf(Perl_debug_log, "%s",lead);
10850 }
10851 switch (cs) {
10852 case REGEX_UNICODE_CHARSET:
10853 PerlIO_printf(Perl_debug_log, "UNICODE");
10854 break;
10855 case REGEX_LOCALE_CHARSET:
10856 PerlIO_printf(Perl_debug_log, "LOCALE");
10857 break;
cfaf538b
KW
10858 case REGEX_ASCII_RESTRICTED_CHARSET:
10859 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10860 break;
2f7f8cb1
KW
10861 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10862 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10863 break;
a62b1201
KW
10864 default:
10865 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10866 break;
10867 }
10868 }
f7819f85
A
10869 if (lead) {
10870 if (set)
10871 PerlIO_printf(Perl_debug_log, "\n");
10872 else
10873 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10874 }
10875}
10876#endif
10877
a687059c 10878void
097eb12c 10879Perl_regdump(pTHX_ const regexp *r)
a687059c 10880{
35ff7856 10881#ifdef DEBUGGING
97aff369 10882 dVAR;
c445ea15 10883 SV * const sv = sv_newmortal();
ab3bbdeb 10884 SV *dsv= sv_newmortal();
f8fc2ecf 10885 RXi_GET_DECL(r,ri);
f7819f85 10886 GET_RE_DEBUG_FLAGS_DECL;
a687059c 10887
7918f24d
NC
10888 PERL_ARGS_ASSERT_REGDUMP;
10889
f8fc2ecf 10890 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
10891
10892 /* Header fields of interest. */
ab3bbdeb
YO
10893 if (r->anchored_substr) {
10894 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10895 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 10896 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10897 "anchored %s%s at %"IVdf" ",
10898 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 10899 (IV)r->anchored_offset);
ab3bbdeb
YO
10900 } else if (r->anchored_utf8) {
10901 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10902 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 10903 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10904 "anchored utf8 %s%s at %"IVdf" ",
10905 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 10906 (IV)r->anchored_offset);
ab3bbdeb
YO
10907 }
10908 if (r->float_substr) {
10909 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10910 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 10911 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10912 "floating %s%s at %"IVdf"..%"UVuf" ",
10913 s, RE_SV_TAIL(r->float_substr),
7b0972df 10914 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
10915 } else if (r->float_utf8) {
10916 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10917 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 10918 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10919 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10920 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 10921 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 10922 }
33b8afdf 10923 if (r->check_substr || r->check_utf8)
b81d288d 10924 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
10925 (const char *)
10926 (r->check_substr == r->float_substr
10927 && r->check_utf8 == r->float_utf8
10928 ? "(checking floating" : "(checking anchored"));
bbe252da 10929 if (r->extflags & RXf_NOSCAN)
c277df42 10930 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 10931 if (r->extflags & RXf_CHECK_ALL)
c277df42 10932 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 10933 if (r->check_substr || r->check_utf8)
c277df42
IZ
10934 PerlIO_printf(Perl_debug_log, ") ");
10935
f8fc2ecf
YO
10936 if (ri->regstclass) {
10937 regprop(r, sv, ri->regstclass);
1de06328 10938 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 10939 }
bbe252da 10940 if (r->extflags & RXf_ANCH) {
774d564b 10941 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 10942 if (r->extflags & RXf_ANCH_BOL)
774d564b 10943 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 10944 if (r->extflags & RXf_ANCH_MBOL)
c277df42 10945 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 10946 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 10947 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 10948 if (r->extflags & RXf_ANCH_GPOS)
774d564b 10949 PerlIO_printf(Perl_debug_log, "(GPOS)");
10950 PerlIO_putc(Perl_debug_log, ' ');
10951 }
bbe252da 10952 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 10953 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 10954 if (r->intflags & PREGf_SKIP)
760ac839 10955 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 10956 if (r->intflags & PREGf_IMPLICIT)
760ac839 10957 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 10958 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 10959 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 10960 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 10961 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 10962 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 10963#else
7918f24d 10964 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 10965 PERL_UNUSED_CONTEXT;
65e66c80 10966 PERL_UNUSED_ARG(r);
17c3b450 10967#endif /* DEBUGGING */
a687059c
LW
10968}
10969
10970/*
a0d0e21e
LW
10971- regprop - printable representation of opcode
10972*/
3339dfd8
YO
10973#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10974STMT_START { \
10975 if (do_sep) { \
10976 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10977 if (flags & ANYOF_INVERT) \
10978 /*make sure the invert info is in each */ \
10979 sv_catpvs(sv, "^"); \
10980 do_sep = 0; \
10981 } \
10982} STMT_END
10983
46fc3d4c 10984void
32fc9b6a 10985Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 10986{
35ff7856 10987#ifdef DEBUGGING
97aff369 10988 dVAR;
9b155405 10989 register int k;
f8fc2ecf 10990 RXi_GET_DECL(prog,progi);
1de06328 10991 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 10992
7918f24d 10993 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 10994
76f68e9b 10995 sv_setpvs(sv, "");
8aa23a47 10996
03363afd 10997 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
10998 /* It would be nice to FAIL() here, but this may be called from
10999 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 11000 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 11001 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 11002
3dab1dad 11003 k = PL_regkind[OP(o)];
9b155405 11004
2a782b5b 11005 if (k == EXACT) {
f92a2122 11006 sv_catpvs(sv, " ");
ab3bbdeb
YO
11007 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
11008 * is a crude hack but it may be the best for now since
11009 * we have no flag "this EXACTish node was UTF-8"
11010 * --jhi */
f92a2122
NC
11011 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11012 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 11013 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
11014 PERL_PV_PRETTY_ELLIPSES |
11015 PERL_PV_PRETTY_LTGT |
11016 PERL_PV_PRETTY_NOCLEAR
11017 );
bb263b4e 11018 } else if (k == TRIE) {
3dab1dad 11019 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 11020 * progi->data isn't available here */
1de06328 11021 const char op = OP(o);
647f639f 11022 const U32 n = ARG(o);
1de06328 11023 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 11024 (reg_ac_data *)progi->data->data[n] :
1de06328 11025 NULL;
3251b653
NC
11026 const reg_trie_data * const trie
11027 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 11028
13d6edb4 11029 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
11030 DEBUG_TRIE_COMPILE_r(
11031 Perl_sv_catpvf(aTHX_ sv,
11032 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11033 (UV)trie->startstate,
1e2e3d02 11034 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
11035 (UV)trie->wordcount,
11036 (UV)trie->minlen,
11037 (UV)trie->maxlen,
11038 (UV)TRIE_CHARCOUNT(trie),
11039 (UV)trie->uniquecharcount
11040 )
11041 );
11042 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11043 int i;
11044 int rangestart = -1;
f46cb337 11045 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 11046 sv_catpvs(sv, "[");
1de06328
YO
11047 for (i = 0; i <= 256; i++) {
11048 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11049 if (rangestart == -1)
11050 rangestart = i;
11051 } else if (rangestart != -1) {
11052 if (i <= rangestart + 3)
11053 for (; rangestart < i; rangestart++)
11054 put_byte(sv, rangestart);
11055 else {
11056 put_byte(sv, rangestart);
11057 sv_catpvs(sv, "-");
11058 put_byte(sv, i - 1);
11059 }
11060 rangestart = -1;
11061 }
11062 }
f3a2811a 11063 sv_catpvs(sv, "]");
1de06328
YO
11064 }
11065
a3621e74 11066 } else if (k == CURLY) {
cb434fcc 11067 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
11068 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11069 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 11070 }
2c2d71f5
JH
11071 else if (k == WHILEM && o->flags) /* Ordinal/of */
11072 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 11073 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 11074 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 11075 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 11076 if ( k != REF || (OP(o) < NREF)) {
502c6561 11077 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
11078 SV **name= av_fetch(list, ARG(o), 0 );
11079 if (name)
11080 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11081 }
11082 else {
502c6561 11083 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 11084 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
11085 I32 *nums=(I32*)SvPVX(sv_dat);
11086 SV **name= av_fetch(list, nums[0], 0 );
11087 I32 n;
11088 if (name) {
11089 for ( n=0; n<SvIVX(sv_dat); n++ ) {
11090 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11091 (n ? "," : ""), (IV)nums[n]);
11092 }
11093 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 11094 }
1f1031fe 11095 }
ee9b8eae 11096 }
1f1031fe 11097 } else if (k == GOSUB)
6bda09f9 11098 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
11099 else if (k == VERB) {
11100 if (!o->flags)
11101 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 11102 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 11103 } else if (k == LOGICAL)
04ebc1ab 11104 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 11105 else if (k == FOLDCHAR)
df44d732 11106 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
11107 else if (k == ANYOF) {
11108 int i, rangestart = -1;
2d03de9c 11109 const U8 flags = ANYOF_FLAGS(o);
24d786f4 11110 int do_sep = 0;
0bd48802
AL
11111
11112 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11113 static const char * const anyofs[] = {
653099ff
GS
11114 "\\w",
11115 "\\W",
11116 "\\s",
11117 "\\S",
11118 "\\d",
11119 "\\D",
11120 "[:alnum:]",
11121 "[:^alnum:]",
11122 "[:alpha:]",
11123 "[:^alpha:]",
11124 "[:ascii:]",
11125 "[:^ascii:]",
24d786f4
YO
11126 "[:cntrl:]",
11127 "[:^cntrl:]",
653099ff
GS
11128 "[:graph:]",
11129 "[:^graph:]",
11130 "[:lower:]",
11131 "[:^lower:]",
11132 "[:print:]",
11133 "[:^print:]",
11134 "[:punct:]",
11135 "[:^punct:]",
11136 "[:upper:]",
aaa51d5e 11137 "[:^upper:]",
653099ff 11138 "[:xdigit:]",
aaa51d5e
JF
11139 "[:^xdigit:]",
11140 "[:space:]",
11141 "[:^space:]",
11142 "[:blank:]",
11143 "[:^blank:]"
653099ff
GS
11144 };
11145
19860706 11146 if (flags & ANYOF_LOCALE)
396482e1 11147 sv_catpvs(sv, "{loc}");
39065660 11148 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 11149 sv_catpvs(sv, "{i}");
653099ff 11150 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 11151 if (flags & ANYOF_INVERT)
396482e1 11152 sv_catpvs(sv, "^");
3339dfd8
YO
11153
11154 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
11155 for (i = 0; i <= 256; i++) {
11156 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11157 if (rangestart == -1)
11158 rangestart = i;
11159 } else if (rangestart != -1) {
11160 if (i <= rangestart + 3)
11161 for (; rangestart < i; rangestart++)
653099ff 11162 put_byte(sv, rangestart);
ffc61ed2
JH
11163 else {
11164 put_byte(sv, rangestart);
396482e1 11165 sv_catpvs(sv, "-");
ffc61ed2 11166 put_byte(sv, i - 1);
653099ff 11167 }
24d786f4 11168 do_sep = 1;
ffc61ed2 11169 rangestart = -1;
653099ff 11170 }
847a199f 11171 }
3339dfd8
YO
11172
11173 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
11174 /* output any special charclass tests (used entirely under use locale) */
11175 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 11176 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 11177 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 11178 sv_catpv(sv, anyofs[i]);
24d786f4
YO
11179 do_sep = 1;
11180 }
11181
3339dfd8
YO
11182 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11183
11454c59
KW
11184 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11185 sv_catpvs(sv, "{non-utf8-latin1-all}");
11186 }
11187
3339dfd8 11188 /* output information about the unicode matching */
ef87b810 11189 if (flags & ANYOF_UNICODE_ALL)
396482e1 11190 sv_catpvs(sv, "{unicode_all}");
137165a6 11191 else if (ANYOF_NONBITMAP(o))
ef87b810 11192 sv_catpvs(sv, "{unicode}");
f5ecd18d 11193 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 11194 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2 11195
1aa9930e 11196 if (ANYOF_NONBITMAP(o)) {
ffc61ed2 11197 SV *lv;
32fc9b6a 11198 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 11199
ffc61ed2
JH
11200 if (lv) {
11201 if (sw) {
89ebb4a3 11202 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 11203
ffc61ed2 11204 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 11205 uvchr_to_utf8(s, i);
ffc61ed2 11206
3568d838 11207 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
11208 if (rangestart == -1)
11209 rangestart = i;
11210 } else if (rangestart != -1) {
ffc61ed2
JH
11211 if (i <= rangestart + 3)
11212 for (; rangestart < i; rangestart++) {
2d03de9c
AL
11213 const U8 * const e = uvchr_to_utf8(s,rangestart);
11214 U8 *p;
11215 for(p = s; p < e; p++)
ffc61ed2
JH
11216 put_byte(sv, *p);
11217 }
11218 else {
2d03de9c
AL
11219 const U8 *e = uvchr_to_utf8(s,rangestart);
11220 U8 *p;
11221 for (p = s; p < e; p++)
ffc61ed2 11222 put_byte(sv, *p);
396482e1 11223 sv_catpvs(sv, "-");
2d03de9c
AL
11224 e = uvchr_to_utf8(s, i-1);
11225 for (p = s; p < e; p++)
1df70142 11226 put_byte(sv, *p);
ffc61ed2
JH
11227 }
11228 rangestart = -1;
11229 }
19860706 11230 }
ffc61ed2 11231
396482e1 11232 sv_catpvs(sv, "..."); /* et cetera */
19860706 11233 }
fde631ed 11234
ffc61ed2 11235 {
2e0de35c 11236 char *s = savesvpv(lv);
c445ea15 11237 char * const origs = s;
b81d288d 11238
3dab1dad
YO
11239 while (*s && *s != '\n')
11240 s++;
b81d288d 11241
ffc61ed2 11242 if (*s == '\n') {
2d03de9c 11243 const char * const t = ++s;
ffc61ed2
JH
11244
11245 while (*s) {
11246 if (*s == '\n')
11247 *s = ' ';
11248 s++;
11249 }
11250 if (s[-1] == ' ')
11251 s[-1] = 0;
11252
11253 sv_catpv(sv, t);
fde631ed 11254 }
b81d288d 11255
ffc61ed2 11256 Safefree(origs);
fde631ed
JH
11257 }
11258 }
653099ff 11259 }
ffc61ed2 11260
653099ff
GS
11261 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11262 }
9b155405 11263 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 11264 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 11265#else
96a5add6 11266 PERL_UNUSED_CONTEXT;
65e66c80
SP
11267 PERL_UNUSED_ARG(sv);
11268 PERL_UNUSED_ARG(o);
f9049ba1 11269 PERL_UNUSED_ARG(prog);
17c3b450 11270#endif /* DEBUGGING */
35ff7856 11271}
a687059c 11272
cad2e5aa 11273SV *
288b8c02 11274Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 11275{ /* Assume that RE_INTUIT is set */
97aff369 11276 dVAR;
288b8c02 11277 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 11278 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11279
11280 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
11281 PERL_UNUSED_CONTEXT;
11282
a3621e74 11283 DEBUG_COMPILE_r(
cfd0369c 11284 {
2d03de9c 11285 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 11286 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
11287
11288 if (!PL_colorset) reginitcolors();
11289 PerlIO_printf(Perl_debug_log,
a0288114 11290 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
11291 PL_colors[4],
11292 prog->check_substr ? "" : "utf8 ",
11293 PL_colors[5],PL_colors[0],
cad2e5aa
JH
11294 s,
11295 PL_colors[1],
11296 (strlen(s) > 60 ? "..." : ""));
11297 } );
11298
33b8afdf 11299 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
11300}
11301
84da74a7 11302/*
f8149455 11303 pregfree()
84da74a7 11304
f8149455
YO
11305 handles refcounting and freeing the perl core regexp structure. When
11306 it is necessary to actually free the structure the first thing it
3b753521 11307 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
11308 the regexp, allowing the handling of the void *pprivate; member
11309 first. (This routine is not overridable by extensions, which is why
11310 the extensions free is called first.)
11311
11312 See regdupe and regdupe_internal if you change anything here.
84da74a7 11313*/
f8149455 11314#ifndef PERL_IN_XSUB_RE
2b69d0c2 11315void
84679df5 11316Perl_pregfree(pTHX_ REGEXP *r)
a687059c 11317{
288b8c02
NC
11318 SvREFCNT_dec(r);
11319}
11320
11321void
11322Perl_pregfree2(pTHX_ REGEXP *rx)
11323{
27da23d5 11324 dVAR;
288b8c02 11325 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 11326 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 11327
7918f24d
NC
11328 PERL_ARGS_ASSERT_PREGFREE2;
11329
28d8d7f4
YO
11330 if (r->mother_re) {
11331 ReREFCNT_dec(r->mother_re);
11332 } else {
288b8c02 11333 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 11334 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
11335 }
11336 if (r->substrs) {
ef8d46e8
VP
11337 SvREFCNT_dec(r->anchored_substr);
11338 SvREFCNT_dec(r->anchored_utf8);
11339 SvREFCNT_dec(r->float_substr);
11340 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
11341 Safefree(r->substrs);
11342 }
288b8c02 11343 RX_MATCH_COPY_FREE(rx);
f8c7b90f 11344#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 11345 SvREFCNT_dec(r->saved_copy);
ed252734 11346#endif
f0ab9afb 11347 Safefree(r->offs);
f8149455 11348}
28d8d7f4
YO
11349
11350/* reg_temp_copy()
11351
11352 This is a hacky workaround to the structural issue of match results
11353 being stored in the regexp structure which is in turn stored in
11354 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11355 could be PL_curpm in multiple contexts, and could require multiple
11356 result sets being associated with the pattern simultaneously, such
11357 as when doing a recursive match with (??{$qr})
11358
11359 The solution is to make a lightweight copy of the regexp structure
11360 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 11361 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
11362 the starp/end and the actual regexp structure itself.
11363
11364*/
11365
11366
84679df5 11367REGEXP *
f0826785 11368Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 11369{
f0826785 11370 struct regexp *ret;
288b8c02 11371 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 11372 register const I32 npar = r->nparens+1;
7918f24d
NC
11373
11374 PERL_ARGS_ASSERT_REG_TEMP_COPY;
11375
f0826785
BM
11376 if (!ret_x)
11377 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11378 ret = (struct regexp *)SvANY(ret_x);
11379
288b8c02 11380 (void)ReREFCNT_inc(rx);
f7c278bf
NC
11381 /* We can take advantage of the existing "copied buffer" mechanism in SVs
11382 by pointing directly at the buffer, but flagging that the allocated
11383 space in the copy is zero. As we've just done a struct copy, it's now
11384 a case of zero-ing that, rather than copying the current length. */
11385 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 11386 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
11387 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11388 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 11389 SvLEN_set(ret_x, 0);
b9ad13ac 11390 SvSTASH_set(ret_x, NULL);
703c388d 11391 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
11392 Newx(ret->offs, npar, regexp_paren_pair);
11393 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 11394 if (r->substrs) {
28d8d7f4 11395 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
11396 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11397
11398 SvREFCNT_inc_void(ret->anchored_substr);
11399 SvREFCNT_inc_void(ret->anchored_utf8);
11400 SvREFCNT_inc_void(ret->float_substr);
11401 SvREFCNT_inc_void(ret->float_utf8);
11402
11403 /* check_substr and check_utf8, if non-NULL, point to either their
11404 anchored or float namesakes, and don't hold a second reference. */
486913e4 11405 }
288b8c02 11406 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 11407#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 11408 ret->saved_copy = NULL;
28d8d7f4 11409#endif
288b8c02 11410 ret->mother_re = rx;
28d8d7f4 11411
288b8c02 11412 return ret_x;
28d8d7f4 11413}
f8149455
YO
11414#endif
11415
11416/* regfree_internal()
11417
11418 Free the private data in a regexp. This is overloadable by
11419 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 11420 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
11421 know about, however of course we have to handle the
11422 regexp_internal structure when no extension is in use.
11423
11424 Note this is called before freeing anything in the regexp
11425 structure.
11426 */
11427
11428void
288b8c02 11429Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
11430{
11431 dVAR;
288b8c02 11432 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
11433 RXi_GET_DECL(r,ri);
11434 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11435
11436 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11437
f8149455
YO
11438 DEBUG_COMPILE_r({
11439 if (!PL_colorset)
11440 reginitcolors();
11441 {
11442 SV *dsv= sv_newmortal();
3c8556c3 11443 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 11444 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
11445 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11446 PL_colors[4],PL_colors[5],s);
11447 }
11448 });
7122b237
YO
11449#ifdef RE_TRACK_PATTERN_OFFSETS
11450 if (ri->u.offsets)
11451 Safefree(ri->u.offsets); /* 20010421 MJD */
11452#endif
f8fc2ecf
YO
11453 if (ri->data) {
11454 int n = ri->data->count;
f3548bdc
DM
11455 PAD* new_comppad = NULL;
11456 PAD* old_comppad;
4026c95a 11457 PADOFFSET refcnt;
dfad63ad 11458
c277df42 11459 while (--n >= 0) {
261faec3 11460 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 11461 switch (ri->data->what[n]) {
af534a04 11462 case 'a':
c277df42 11463 case 's':
81714fb9 11464 case 'S':
55eed653 11465 case 'u':
ad64d0ec 11466 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 11467 break;
653099ff 11468 case 'f':
f8fc2ecf 11469 Safefree(ri->data->data[n]);
653099ff 11470 break;
dfad63ad 11471 case 'p':
502c6561 11472 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 11473 break;
c277df42 11474 case 'o':
dfad63ad 11475 if (new_comppad == NULL)
cea2e8a9 11476 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
11477 PAD_SAVE_LOCAL(old_comppad,
11478 /* Watch out for global destruction's random ordering. */
c445ea15 11479 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 11480 );
b34c0dd4 11481 OP_REFCNT_LOCK;
f8fc2ecf 11482 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
11483 OP_REFCNT_UNLOCK;
11484 if (!refcnt)
f8fc2ecf 11485 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 11486
f3548bdc 11487 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 11488 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 11489 new_comppad = NULL;
c277df42
IZ
11490 break;
11491 case 'n':
9e55ce06 11492 break;
07be1b83 11493 case 'T':
be8e71aa
YO
11494 { /* Aho Corasick add-on structure for a trie node.
11495 Used in stclass optimization only */
07be1b83 11496 U32 refcount;
f8fc2ecf 11497 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
11498 OP_REFCNT_LOCK;
11499 refcount = --aho->refcount;
11500 OP_REFCNT_UNLOCK;
11501 if ( !refcount ) {
446bd890
NC
11502 PerlMemShared_free(aho->states);
11503 PerlMemShared_free(aho->fail);
446bd890
NC
11504 /* do this last!!!! */
11505 PerlMemShared_free(ri->data->data[n]);
11506 PerlMemShared_free(ri->regstclass);
07be1b83
YO
11507 }
11508 }
11509 break;
a3621e74 11510 case 't':
07be1b83 11511 {
be8e71aa 11512 /* trie structure. */
07be1b83 11513 U32 refcount;
f8fc2ecf 11514 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
11515 OP_REFCNT_LOCK;
11516 refcount = --trie->refcount;
11517 OP_REFCNT_UNLOCK;
11518 if ( !refcount ) {
446bd890 11519 PerlMemShared_free(trie->charmap);
446bd890
NC
11520 PerlMemShared_free(trie->states);
11521 PerlMemShared_free(trie->trans);
07be1b83 11522 if (trie->bitmap)
446bd890 11523 PerlMemShared_free(trie->bitmap);
786e8c11 11524 if (trie->jump)
446bd890 11525 PerlMemShared_free(trie->jump);
2e64971a 11526 PerlMemShared_free(trie->wordinfo);
446bd890
NC
11527 /* do this last!!!! */
11528 PerlMemShared_free(ri->data->data[n]);
a3621e74 11529 }
07be1b83
YO
11530 }
11531 break;
c277df42 11532 default:
f8fc2ecf 11533 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
11534 }
11535 }
f8fc2ecf
YO
11536 Safefree(ri->data->what);
11537 Safefree(ri->data);
a0d0e21e 11538 }
28d8d7f4 11539
f8fc2ecf 11540 Safefree(ri);
a687059c 11541}
c277df42 11542
a09252eb
NC
11543#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11544#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
11545#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11546
11547/*
32cd70f6 11548 re_dup - duplicate a regexp.
84da74a7 11549
8233f606
DM
11550 This routine is expected to clone a given regexp structure. It is only
11551 compiled under USE_ITHREADS.
32cd70f6 11552
f8149455
YO
11553 After all of the core data stored in struct regexp is duplicated
11554 the regexp_engine.dupe method is used to copy any private data
11555 stored in the *pprivate pointer. This allows extensions to handle
11556 any duplication it needs to do.
11557
11558 See pregfree() and regfree_internal() if you change anything here.
84da74a7 11559*/
a3c0e9ca 11560#if defined(USE_ITHREADS)
f8149455 11561#ifndef PERL_IN_XSUB_RE
288b8c02
NC
11562void
11563Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 11564{
84da74a7 11565 dVAR;
a86a1ca7 11566 I32 npar;
288b8c02
NC
11567 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11568 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 11569
7918f24d
NC
11570 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11571
84da74a7 11572 npar = r->nparens+1;
f0ab9afb
NC
11573 Newx(ret->offs, npar, regexp_paren_pair);
11574 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 11575 if(ret->swap) {
28d8d7f4 11576 /* no need to copy these */
f0ab9afb 11577 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 11578 }
84da74a7 11579
6057429f 11580 if (ret->substrs) {
32cd70f6
NC
11581 /* Do it this way to avoid reading from *r after the StructCopy().
11582 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11583 cache, it doesn't matter. */
66b1de87
NC
11584 const bool anchored = r->check_substr
11585 ? r->check_substr == r->anchored_substr
11586 : r->check_utf8 == r->anchored_utf8;
785a26d5 11587 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
11588 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11589
32cd70f6
NC
11590 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11591 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11592 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11593 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 11594
32cd70f6
NC
11595 /* check_substr and check_utf8, if non-NULL, point to either their
11596 anchored or float namesakes, and don't hold a second reference. */
11597
11598 if (ret->check_substr) {
11599 if (anchored) {
11600 assert(r->check_utf8 == r->anchored_utf8);
11601 ret->check_substr = ret->anchored_substr;
11602 ret->check_utf8 = ret->anchored_utf8;
11603 } else {
11604 assert(r->check_substr == r->float_substr);
11605 assert(r->check_utf8 == r->float_utf8);
11606 ret->check_substr = ret->float_substr;
11607 ret->check_utf8 = ret->float_utf8;
11608 }
66b1de87
NC
11609 } else if (ret->check_utf8) {
11610 if (anchored) {
11611 ret->check_utf8 = ret->anchored_utf8;
11612 } else {
11613 ret->check_utf8 = ret->float_utf8;
11614 }
32cd70f6 11615 }
6057429f 11616 }
f8149455 11617
5daac39c 11618 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 11619
6057429f 11620 if (ret->pprivate)
288b8c02 11621 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 11622
288b8c02 11623 if (RX_MATCH_COPIED(dstr))
6057429f 11624 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
11625 else
11626 ret->subbeg = NULL;
11627#ifdef PERL_OLD_COPY_ON_WRITE
11628 ret->saved_copy = NULL;
11629#endif
6057429f 11630
c2123ae3
NC
11631 if (ret->mother_re) {
11632 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11633 /* Our storage points directly to our mother regexp, but that's
11634 1: a buffer in a different thread
11635 2: something we no longer hold a reference on
11636 so we need to copy it locally. */
11637 /* Note we need to sue SvCUR() on our mother_re, because it, in
11638 turn, may well be pointing to its own mother_re. */
11639 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11640 SvCUR(ret->mother_re)+1));
11641 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11642 }
11643 ret->mother_re = NULL;
11644 }
6057429f 11645 ret->gofs = 0;
f8149455
YO
11646}
11647#endif /* PERL_IN_XSUB_RE */
11648
11649/*
11650 regdupe_internal()
11651
11652 This is the internal complement to regdupe() which is used to copy
11653 the structure pointed to by the *pprivate pointer in the regexp.
11654 This is the core version of the extension overridable cloning hook.
11655 The regexp structure being duplicated will be copied by perl prior
11656 to this and will be provided as the regexp *r argument, however
11657 with the /old/ structures pprivate pointer value. Thus this routine
11658 may override any copying normally done by perl.
11659
11660 It returns a pointer to the new regexp_internal structure.
11661*/
11662
11663void *
288b8c02 11664Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
11665{
11666 dVAR;
288b8c02 11667 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
11668 regexp_internal *reti;
11669 int len, npar;
11670 RXi_GET_DECL(r,ri);
7918f24d
NC
11671
11672 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
11673
11674 npar = r->nparens+1;
7122b237 11675 len = ProgLen(ri);
f8149455 11676
45cf4570 11677 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
11678 Copy(ri->program, reti->program, len+1, regnode);
11679
f8149455 11680
f8fc2ecf 11681 reti->regstclass = NULL;
bcdf7404 11682
f8fc2ecf 11683 if (ri->data) {
84da74a7 11684 struct reg_data *d;
f8fc2ecf 11685 const int count = ri->data->count;
84da74a7
YO
11686 int i;
11687
11688 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11689 char, struct reg_data);
11690 Newx(d->what, count, U8);
11691
11692 d->count = count;
11693 for (i = 0; i < count; i++) {
f8fc2ecf 11694 d->what[i] = ri->data->what[i];
84da74a7 11695 switch (d->what[i]) {
af534a04 11696 /* legal options are one of: sSfpontTua
84da74a7 11697 see also regcomp.h and pregfree() */
af534a04 11698 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 11699 case 's':
81714fb9 11700 case 'S':
0536c0a7 11701 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 11702 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 11703 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 11704 break;
84da74a7
YO
11705 case 'f':
11706 /* This is cheating. */
11707 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 11708 StructCopy(ri->data->data[i], d->data[i],
84da74a7 11709 struct regnode_charclass_class);
f8fc2ecf 11710 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
11711 break;
11712 case 'o':
bbe252da
YO
11713 /* Compiled op trees are readonly and in shared memory,
11714 and can thus be shared without duplication. */
84da74a7 11715 OP_REFCNT_LOCK;
f8fc2ecf 11716 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
11717 OP_REFCNT_UNLOCK;
11718 break;
23eab42c
NC
11719 case 'T':
11720 /* Trie stclasses are readonly and can thus be shared
11721 * without duplication. We free the stclass in pregfree
11722 * when the corresponding reg_ac_data struct is freed.
11723 */
11724 reti->regstclass= ri->regstclass;
11725 /* Fall through */
84da74a7 11726 case 't':
84da74a7 11727 OP_REFCNT_LOCK;
0536c0a7 11728 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 11729 OP_REFCNT_UNLOCK;
0536c0a7
NC
11730 /* Fall through */
11731 case 'n':
11732 d->data[i] = ri->data->data[i];
84da74a7 11733 break;
84da74a7 11734 default:
f8fc2ecf 11735 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
11736 }
11737 }
11738
f8fc2ecf 11739 reti->data = d;
84da74a7
YO
11740 }
11741 else
f8fc2ecf 11742 reti->data = NULL;
84da74a7 11743
cde0cee5
YO
11744 reti->name_list_idx = ri->name_list_idx;
11745
7122b237
YO
11746#ifdef RE_TRACK_PATTERN_OFFSETS
11747 if (ri->u.offsets) {
11748 Newx(reti->u.offsets, 2*len+1, U32);
11749 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11750 }
11751#else
11752 SetProgLen(reti,len);
11753#endif
11754
f8149455 11755 return (void*)reti;
84da74a7 11756}
f8149455
YO
11757
11758#endif /* USE_ITHREADS */
84da74a7 11759
f8149455 11760#ifndef PERL_IN_XSUB_RE
bcdf7404 11761
c277df42
IZ
11762/*
11763 - regnext - dig the "next" pointer out of a node
c277df42
IZ
11764 */
11765regnode *
864dbfa3 11766Perl_regnext(pTHX_ register regnode *p)
c277df42 11767{
97aff369 11768 dVAR;
c277df42
IZ
11769 register I32 offset;
11770
f8fc2ecf 11771 if (!p)
c277df42
IZ
11772 return(NULL);
11773
35db910f
KW
11774 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11775 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11776 }
11777
c277df42
IZ
11778 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11779 if (offset == 0)
11780 return(NULL);
11781
c277df42 11782 return(p+offset);
c277df42 11783}
76234dfb 11784#endif
c277df42 11785
01f988be 11786STATIC void
cea2e8a9 11787S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
11788{
11789 va_list args;
11790 STRLEN l1 = strlen(pat1);
11791 STRLEN l2 = strlen(pat2);
11792 char buf[512];
06bf62c7 11793 SV *msv;
73d840c0 11794 const char *message;
c277df42 11795
7918f24d
NC
11796 PERL_ARGS_ASSERT_RE_CROAK2;
11797
c277df42
IZ
11798 if (l1 > 510)
11799 l1 = 510;
11800 if (l1 + l2 > 510)
11801 l2 = 510 - l1;
11802 Copy(pat1, buf, l1 , char);
11803 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
11804 buf[l1 + l2] = '\n';
11805 buf[l1 + l2 + 1] = '\0';
8736538c
AS
11806#ifdef I_STDARG
11807 /* ANSI variant takes additional second argument */
c277df42 11808 va_start(args, pat2);
8736538c
AS
11809#else
11810 va_start(args);
11811#endif
5a844595 11812 msv = vmess(buf, &args);
c277df42 11813 va_end(args);
cfd0369c 11814 message = SvPV_const(msv,l1);
c277df42
IZ
11815 if (l1 > 512)
11816 l1 = 512;
11817 Copy(message, buf, l1 , char);
197cf9b9 11818 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 11819 Perl_croak(aTHX_ "%s", buf);
c277df42 11820}
a0ed51b3
LW
11821
11822/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11823
76234dfb 11824#ifndef PERL_IN_XSUB_RE
a0ed51b3 11825void
864dbfa3 11826Perl_save_re_context(pTHX)
b81d288d 11827{
97aff369 11828 dVAR;
1ade1aa1
NC
11829
11830 struct re_save_state *state;
11831
11832 SAVEVPTR(PL_curcop);
11833 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11834
11835 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11836 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 11837 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 11838
46ab3289 11839 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 11840
a0ed51b3 11841 PL_reg_start_tmp = 0;
a0ed51b3 11842 PL_reg_start_tmpl = 0;
c445ea15 11843 PL_reg_oldsaved = NULL;
a5db57d6 11844 PL_reg_oldsavedlen = 0;
a5db57d6 11845 PL_reg_maxiter = 0;
a5db57d6 11846 PL_reg_leftiter = 0;
c445ea15 11847 PL_reg_poscache = NULL;
a5db57d6 11848 PL_reg_poscache_size = 0;
1ade1aa1
NC
11849#ifdef PERL_OLD_COPY_ON_WRITE
11850 PL_nrs = NULL;
11851#endif
ada6e8a9 11852
c445ea15
AL
11853 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11854 if (PL_curpm) {
11855 const REGEXP * const rx = PM_GETRE(PL_curpm);
11856 if (rx) {
1df70142 11857 U32 i;
07bc277f 11858 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 11859 char digits[TYPE_CHARS(long)];
d9fad198 11860 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
11861 GV *const *const gvp
11862 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11863
b37c2d43
AL
11864 if (gvp) {
11865 GV * const gv = *gvp;
11866 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11867 save_scalar(gv);
49f27e4b 11868 }
ada6e8a9
AMS
11869 }
11870 }
11871 }
a0ed51b3 11872}
76234dfb 11873#endif
51371543 11874
51371543 11875static void
acfe0abc 11876clear_re(pTHX_ void *r)
51371543 11877{
97aff369 11878 dVAR;
84679df5 11879 ReREFCNT_dec((REGEXP *)r);
51371543 11880}
ffbc6a93 11881
a28509cc
AL
11882#ifdef DEBUGGING
11883
11884STATIC void
11885S_put_byte(pTHX_ SV *sv, int c)
11886{
7918f24d
NC
11887 PERL_ARGS_ASSERT_PUT_BYTE;
11888
7fddd944
NC
11889 /* Our definition of isPRINT() ignores locales, so only bytes that are
11890 not part of UTF-8 are considered printable. I assume that the same
11891 holds for UTF-EBCDIC.
11892 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11893 which Wikipedia says:
11894
11895 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11896 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11897 identical, to the ASCII delete (DEL) or rubout control character.
11898 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
11899 if (!isPRINT(c)) {
11900 if (c < 256) {
11901 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11902 }
11903 else {
11904 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11905 }
11906 }
5e7aa789 11907 else {
88c9ea1e 11908 const char string = c;
5e7aa789
NC
11909 if (c == '-' || c == ']' || c == '\\' || c == '^')
11910 sv_catpvs(sv, "\\");
11911 sv_catpvn(sv, &string, 1);
11912 }
a28509cc
AL
11913}
11914
786e8c11 11915
3dab1dad
YO
11916#define CLEAR_OPTSTART \
11917 if (optstart) STMT_START { \
70685ca0 11918 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
11919 optstart=NULL; \
11920 } STMT_END
11921
786e8c11 11922#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 11923
b5a2f8d8
NC
11924STATIC const regnode *
11925S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
11926 const regnode *last, const regnode *plast,
11927 SV* sv, I32 indent, U32 depth)
a28509cc 11928{
97aff369 11929 dVAR;
786e8c11 11930 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 11931 register const regnode *next;
3dab1dad 11932 const regnode *optstart= NULL;
1f1031fe 11933
f8fc2ecf 11934 RXi_GET_DECL(r,ri);
3dab1dad 11935 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11936
11937 PERL_ARGS_ASSERT_DUMPUNTIL;
11938
786e8c11
YO
11939#ifdef DEBUG_DUMPUNTIL
11940 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11941 last ? last-start : 0,plast ? plast-start : 0);
11942#endif
11943
11944 if (plast && plast < last)
11945 last= plast;
11946
11947 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 11948 /* While that wasn't END last time... */
a28509cc
AL
11949 NODE_ALIGN(node);
11950 op = OP(node);
de734bd5 11951 if (op == CLOSE || op == WHILEM)
786e8c11 11952 indent--;
b5a2f8d8 11953 next = regnext((regnode *)node);
1f1031fe 11954
a28509cc 11955 /* Where, what. */
8e11feef 11956 if (OP(node) == OPTIMIZED) {
e68ec53f 11957 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 11958 optstart = node;
3dab1dad 11959 else
8e11feef 11960 goto after_print;
3dab1dad
YO
11961 } else
11962 CLEAR_OPTSTART;
1f1031fe 11963
32fc9b6a 11964 regprop(r, sv, node);
a28509cc 11965 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 11966 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
11967
11968 if (OP(node) != OPTIMIZED) {
11969 if (next == NULL) /* Next ptr. */
11970 PerlIO_printf(Perl_debug_log, " (0)");
11971 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11972 PerlIO_printf(Perl_debug_log, " (FAIL)");
11973 else
11974 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11975 (void)PerlIO_putc(Perl_debug_log, '\n');
11976 }
11977
a28509cc
AL
11978 after_print:
11979 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
11980 assert(next);
11981 {
11982 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
11983 ? regnext((regnode *)next)
11984 : next);
be8e71aa
YO
11985 if (last && nnode > last)
11986 nnode = last;
786e8c11 11987 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 11988 }
a28509cc
AL
11989 }
11990 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 11991 assert(next);
786e8c11 11992 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
11993 }
11994 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 11995 const regnode *this_trie = node;
1de06328 11996 const char op = OP(node);
647f639f 11997 const U32 n = ARG(node);
1de06328 11998 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 11999 (reg_ac_data *)ri->data->data[n] :
1de06328 12000 NULL;
3251b653
NC
12001 const reg_trie_data * const trie =
12002 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 12003#ifdef DEBUGGING
502c6561 12004 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 12005#endif
786e8c11 12006 const regnode *nextbranch= NULL;
a28509cc 12007 I32 word_idx;
76f68e9b 12008 sv_setpvs(sv, "");
786e8c11 12009 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 12010 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
12011
12012 PerlIO_printf(Perl_debug_log, "%*s%s ",
12013 (int)(2*(indent+3)), "",
12014 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
12015 PL_colors[0], PL_colors[1],
12016 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 12017 PERL_PV_PRETTY_ELLIPSES |
7f69552c 12018 PERL_PV_PRETTY_LTGT
786e8c11
YO
12019 )
12020 : "???"
12021 );
12022 if (trie->jump) {
40d049e4 12023 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
12024 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12025 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
12026 if (dist) {
12027 if (!nextbranch)
24b23f37 12028 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
12029 DUMPUNTIL(this_trie + dist, nextbranch);
12030 }
786e8c11
YO
12031 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12032 nextbranch= regnext((regnode *)nextbranch);
12033 } else {
12034 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 12035 }
786e8c11
YO
12036 }
12037 if (last && next > last)
12038 node= last;
12039 else
12040 node= next;
a28509cc 12041 }
786e8c11
YO
12042 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
12043 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12044 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
12045 }
12046 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 12047 assert(next);
786e8c11 12048 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
12049 }
12050 else if ( op == PLUS || op == STAR) {
786e8c11 12051 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc 12052 }
f56b6394 12053 else if (PL_regkind[(U8)op] == ANYOF) {
a28509cc 12054 /* arglen 1 + class block */
4a3ee7a8 12055 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
12056 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12057 node = NEXTOPER(node);
12058 }
12059 else if (PL_regkind[(U8)op] == EXACT) {
12060 /* Literal string, where present. */
12061 node += NODE_SZ_STR(node) - 1;
12062 node = NEXTOPER(node);
12063 }
12064 else {
12065 node = NEXTOPER(node);
12066 node += regarglen[(U8)op];
12067 }
12068 if (op == CURLYX || op == OPEN)
786e8c11 12069 indent++;
a28509cc 12070 }
3dab1dad 12071 CLEAR_OPTSTART;
786e8c11 12072#ifdef DEBUG_DUMPUNTIL
70685ca0 12073 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 12074#endif
1de06328 12075 return node;
a28509cc
AL
12076}
12077
12078#endif /* DEBUGGING */
12079
241d1a3b
NC
12080/*
12081 * Local variables:
12082 * c-indentation-style: bsd
12083 * c-basic-offset: 4
12084 * indent-tabs-mode: t
12085 * End:
12086 *
37442d52
RGS
12087 * ex: set ts=8 sts=4 sw=4 noet:
12088 */