This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
In make_ext.pl, only fudge the Makefile.PL timestamp when generating it.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d 88#include "dquote_static.c"
370b8f2f
TC
89#ifndef PERL_IN_XSUB_RE
90# include "charclass_invlists.h"
91#endif
04e98a4d 92
94dc5c2d
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
94
d4cce5f1 95#ifdef op
11343788 96#undef op
d4cce5f1 97#endif /* op */
11343788 98
fe14fcc3 99#ifdef MSDOS
7e4e8c89 100# if defined(BUGGY_MSC6)
fe14fcc3 101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 102# pragma optimize("a",off)
fe14fcc3 103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
104# pragma optimize("w",on )
105# endif /* BUGGY_MSC6 */
fe14fcc3
LW
106#endif /* MSDOS */
107
a687059c
LW
108#ifndef STATIC
109#define STATIC static
110#endif
111
830247a4 112typedef struct RExC_state_t {
e2509266 113 U32 flags; /* are we folding, multilining? */
830247a4 114 char *precomp; /* uncompiled string. */
288b8c02 115 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
116 regexp *rx; /* perl core regexp structure */
117 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 123 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 124 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
125 I32 naughty; /* How bad is this pattern? */
126 I32 sawback; /* Did we see \1, ...? */
127 U32 seen;
128 I32 size; /* Code size. */
c74340f9
YO
129 I32 npar; /* Capture buffer count, (OPEN). */
130 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 131 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
132 I32 extralen;
133 I32 seen_zerolen;
134 I32 seen_evals;
40d049e4
YO
135 regnode **open_parens; /* pointers to open parens */
136 regnode **close_parens; /* pointers to close parens */
137 regnode *opend; /* END node in program */
02daf0ab
YO
138 I32 utf8; /* whether the pattern is utf8 or not */
139 I32 orig_utf8; /* whether the pattern was originally in utf8 */
140 /* XXX use this for future optimisation of case
141 * where pattern must be upgraded to utf8. */
e40e74fe
KW
142 I32 uni_semantics; /* If a d charset modifier should use unicode
143 rules, even if the pattern is not in
144 utf8 */
81714fb9 145 HV *paren_names; /* Paren names */
1f1031fe 146
40d049e4
YO
147 regnode **recurse; /* Recurse regops */
148 I32 recurse_count; /* Number of recurse regops */
b57e4118 149 I32 in_lookbehind;
4624b182 150 I32 contains_locale;
bb3f3ed2 151 I32 override_recoding;
830247a4
IZ
152#if ADD_TO_REGEXEC
153 char *starttry; /* -Dr: where regtry was called. */
154#define RExC_starttry (pRExC_state->starttry)
155#endif
3dab1dad 156#ifdef DEBUGGING
be8e71aa 157 const char *lastparse;
3dab1dad 158 I32 lastnum;
1f1031fe 159 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
160#define RExC_lastparse (pRExC_state->lastparse)
161#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 162#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 163#endif
830247a4
IZ
164} RExC_state_t;
165
e2509266 166#define RExC_flags (pRExC_state->flags)
830247a4 167#define RExC_precomp (pRExC_state->precomp)
288b8c02 168#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 169#define RExC_rx (pRExC_state->rx)
f8fc2ecf 170#define RExC_rxi (pRExC_state->rxi)
fac92740 171#define RExC_start (pRExC_state->start)
830247a4
IZ
172#define RExC_end (pRExC_state->end)
173#define RExC_parse (pRExC_state->parse)
174#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
175#ifdef RE_TRACK_PATTERN_OFFSETS
176#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
177#endif
830247a4 178#define RExC_emit (pRExC_state->emit)
fac92740 179#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 180#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
181#define RExC_naughty (pRExC_state->naughty)
182#define RExC_sawback (pRExC_state->sawback)
183#define RExC_seen (pRExC_state->seen)
184#define RExC_size (pRExC_state->size)
185#define RExC_npar (pRExC_state->npar)
e2e6a0f1 186#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
187#define RExC_extralen (pRExC_state->extralen)
188#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
189#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 190#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 191#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 192#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
193#define RExC_open_parens (pRExC_state->open_parens)
194#define RExC_close_parens (pRExC_state->close_parens)
195#define RExC_opend (pRExC_state->opend)
81714fb9 196#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
197#define RExC_recurse (pRExC_state->recurse)
198#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 199#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 200#define RExC_contains_locale (pRExC_state->contains_locale)
bb3f3ed2 201#define RExC_override_recoding (pRExC_state->override_recoding)
830247a4 202
cde0cee5 203
a687059c
LW
204#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
205#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
206 ((*s) == '{' && regcurly(s)))
a687059c 207
35c8bce7
LW
208#ifdef SPSTART
209#undef SPSTART /* dratted cpp namespace... */
210#endif
a687059c
LW
211/*
212 * Flags to be passed up and down.
213 */
a687059c 214#define WORST 0 /* Worst case. */
a3b492c3 215#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
216
217/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
cf8c372d
KW
218 * character, and if utf8, must be invariant. Note that this is not the same
219 * thing as REGNODE_SIMPLE */
fda99bee 220#define SIMPLE 0x02
a3b492c3
YO
221#define SPSTART 0x04 /* Starts with * or +. */
222#define TRYAGAIN 0x08 /* Weeded out a declaration. */
223#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 224
3dab1dad
YO
225#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
226
07be1b83
YO
227/* whether trie related optimizations are enabled */
228#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
229#define TRIE_STUDY_OPT
786e8c11 230#define FULL_TRIE_STUDY
07be1b83
YO
231#define TRIE_STCLASS
232#endif
1de06328
YO
233
234
40d049e4
YO
235
236#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
237#define PBITVAL(paren) (1 << ((paren) & 7))
238#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
239#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
240#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
241
bbd61b5f
KW
242/* If not already in utf8, do a longjmp back to the beginning */
243#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
244#define REQUIRE_UTF8 STMT_START { \
245 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
246 } STMT_END
40d049e4 247
1de06328
YO
248/* About scan_data_t.
249
250 During optimisation we recurse through the regexp program performing
251 various inplace (keyhole style) optimisations. In addition study_chunk
252 and scan_commit populate this data structure with information about
253 what strings MUST appear in the pattern. We look for the longest
3b753521 254 string that must appear at a fixed location, and we look for the
1de06328
YO
255 longest string that may appear at a floating location. So for instance
256 in the pattern:
257
258 /FOO[xX]A.*B[xX]BAR/
259
260 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
261 strings (because they follow a .* construct). study_chunk will identify
262 both FOO and BAR as being the longest fixed and floating strings respectively.
263
264 The strings can be composites, for instance
265
266 /(f)(o)(o)/
267
268 will result in a composite fixed substring 'foo'.
269
270 For each string some basic information is maintained:
271
272 - offset or min_offset
273 This is the position the string must appear at, or not before.
274 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
275 characters must match before the string we are searching for.
276 Likewise when combined with minlenp and the length of the string it
1de06328
YO
277 tells us how many characters must appear after the string we have
278 found.
279
280 - max_offset
281 Only used for floating strings. This is the rightmost point that
3b753521 282 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
283 string can occur infinitely far to the right.
284
285 - minlenp
286 A pointer to the minimum length of the pattern that the string
287 was found inside. This is important as in the case of positive
288 lookahead or positive lookbehind we can have multiple patterns
289 involved. Consider
290
291 /(?=FOO).*F/
292
293 The minimum length of the pattern overall is 3, the minimum length
294 of the lookahead part is 3, but the minimum length of the part that
295 will actually match is 1. So 'FOO's minimum length is 3, but the
296 minimum length for the F is 1. This is important as the minimum length
297 is used to determine offsets in front of and behind the string being
298 looked for. Since strings can be composites this is the length of the
486ec47a 299 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
300 the length is calculated by study_chunk, so that the minimum lengths
301 are not known until the full pattern has been compiled, thus the
302 pointer to the value.
303
304 - lookbehind
305
306 In the case of lookbehind the string being searched for can be
307 offset past the start point of the final matching string.
308 If this value was just blithely removed from the min_offset it would
309 invalidate some of the calculations for how many chars must match
310 before or after (as they are derived from min_offset and minlen and
311 the length of the string being searched for).
312 When the final pattern is compiled and the data is moved from the
313 scan_data_t structure into the regexp structure the information
314 about lookbehind is factored in, with the information that would
315 have been lost precalculated in the end_shift field for the
316 associated string.
317
318 The fields pos_min and pos_delta are used to store the minimum offset
319 and the delta to the maximum offset at the current point in the pattern.
320
321*/
2c2d71f5
JH
322
323typedef struct scan_data_t {
1de06328
YO
324 /*I32 len_min; unused */
325 /*I32 len_delta; unused */
2c2d71f5
JH
326 I32 pos_min;
327 I32 pos_delta;
328 SV *last_found;
1de06328 329 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
330 I32 last_start_min;
331 I32 last_start_max;
1de06328
YO
332 SV **longest; /* Either &l_fixed, or &l_float. */
333 SV *longest_fixed; /* longest fixed string found in pattern */
334 I32 offset_fixed; /* offset where it starts */
486ec47a 335 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
336 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
337 SV *longest_float; /* longest floating string found in pattern */
338 I32 offset_float_min; /* earliest point in string it can appear */
339 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 340 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 341 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
342 I32 flags;
343 I32 whilem_c;
cb434fcc 344 I32 *last_closep;
653099ff 345 struct regnode_charclass_class *start_class;
2c2d71f5
JH
346} scan_data_t;
347
a687059c 348/*
e50aee73 349 * Forward declarations for pregcomp()'s friends.
a687059c 350 */
a0d0e21e 351
27da23d5 352static const scan_data_t zero_scan_data =
1de06328 353 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
354
355#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
356#define SF_BEFORE_SEOL 0x0001
357#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
358#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
359#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
360
09b7f37c
CB
361#ifdef NO_UNARY_PLUS
362# define SF_FIX_SHIFT_EOL (0+2)
363# define SF_FL_SHIFT_EOL (0+4)
364#else
365# define SF_FIX_SHIFT_EOL (+2)
366# define SF_FL_SHIFT_EOL (+4)
367#endif
c277df42
IZ
368
369#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
370#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
371
372#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
373#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
374#define SF_IS_INF 0x0040
375#define SF_HAS_PAR 0x0080
376#define SF_IN_PAR 0x0100
377#define SF_HAS_EVAL 0x0200
378#define SCF_DO_SUBSTR 0x0400
653099ff
GS
379#define SCF_DO_STCLASS_AND 0x0800
380#define SCF_DO_STCLASS_OR 0x1000
381#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 382#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 383
786e8c11 384#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 385#define SCF_SEEN_ACCEPT 0x8000
07be1b83 386
43fead97 387#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
388
389/* The enums for all these are ordered so things work out correctly */
a62b1201 390#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
cfaf538b 391#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
00b27cfc 392#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
393#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
394#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
395#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
396#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 397
43fead97 398#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 399
ffc61ed2 400#define OOB_UNICODE 12345678
93733859 401#define OOB_NAMEDCLASS -1
b8c5462f 402
a0ed51b3
LW
403#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
404#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
405
8615cb43 406
b45f050a
JF
407/* length of regex to show in messages that don't mark a position within */
408#define RegexLengthToShowInErrorMessages 127
409
410/*
411 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
412 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
413 * op/pragma/warn/regcomp.
414 */
7253e4e3
RK
415#define MARKER1 "<-- HERE" /* marker as it appears in the description */
416#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 417
7253e4e3 418#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
419
420/*
421 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
422 * arg. Show regex, up to a maximum length. If it's too long, chop and add
423 * "...".
424 */
58e23c8d 425#define _FAIL(code) STMT_START { \
bfed75c6 426 const char *ellipses = ""; \
ccb2c380
MP
427 IV len = RExC_end - RExC_precomp; \
428 \
429 if (!SIZE_ONLY) \
288b8c02 430 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
431 if (len > RegexLengthToShowInErrorMessages) { \
432 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
433 len = RegexLengthToShowInErrorMessages - 10; \
434 ellipses = "..."; \
435 } \
58e23c8d 436 code; \
ccb2c380 437} STMT_END
8615cb43 438
58e23c8d
YO
439#define FAIL(msg) _FAIL( \
440 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
441 msg, (int)len, RExC_precomp, ellipses))
442
443#define FAIL2(msg,arg) _FAIL( \
444 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
445 arg, (int)len, RExC_precomp, ellipses))
446
b45f050a 447/*
b45f050a
JF
448 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
449 */
ccb2c380 450#define Simple_vFAIL(m) STMT_START { \
a28509cc 451 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
452 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
b45f050a
JF
455
456/*
457 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
458 */
ccb2c380
MP
459#define vFAIL(m) STMT_START { \
460 if (!SIZE_ONLY) \
288b8c02 461 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
462 Simple_vFAIL(m); \
463} STMT_END
b45f050a
JF
464
465/*
466 * Like Simple_vFAIL(), but accepts two arguments.
467 */
ccb2c380 468#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 469 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
470 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
471 (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
b45f050a
JF
473
474/*
475 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
476 */
ccb2c380
MP
477#define vFAIL2(m,a1) STMT_START { \
478 if (!SIZE_ONLY) \
288b8c02 479 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
480 Simple_vFAIL2(m, a1); \
481} STMT_END
b45f050a
JF
482
483
484/*
485 * Like Simple_vFAIL(), but accepts three arguments.
486 */
ccb2c380 487#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 488 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
489 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
b45f050a
JF
492
493/*
494 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
495 */
ccb2c380
MP
496#define vFAIL3(m,a1,a2) STMT_START { \
497 if (!SIZE_ONLY) \
288b8c02 498 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
499 Simple_vFAIL3(m, a1, a2); \
500} STMT_END
b45f050a
JF
501
502/*
503 * Like Simple_vFAIL(), but accepts four arguments.
504 */
ccb2c380 505#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 506 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
507 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
508 (int)offset, RExC_precomp, RExC_precomp + offset); \
509} STMT_END
b45f050a 510
668c081a 511#define ckWARNreg(loc,m) STMT_START { \
a28509cc 512 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
513 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
515} STMT_END
516
668c081a 517#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 518 const IV offset = loc - RExC_precomp; \
d1d15184 519 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
520 m REPORT_LOCATION, \
521 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
522} STMT_END
523
2335b3d3
KW
524#define ckWARN2regdep(loc,m, a1) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
527 m REPORT_LOCATION, \
528 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
529} STMT_END
530
668c081a 531#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 532 const IV offset = loc - RExC_precomp; \
668c081a 533 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
534 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
535} STMT_END
536
537#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 538 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
539 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
540 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
541} STMT_END
542
668c081a
NC
543#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
547} STMT_END
548
ccb2c380 549#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 550 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
551 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
553} STMT_END
554
668c081a
NC
555#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
556 const IV offset = loc - RExC_precomp; \
557 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
558 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
559} STMT_END
560
ccb2c380 561#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 562 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
563 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
564 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
565} STMT_END
9d1d55b5 566
8615cb43 567
cd439c50 568/* Allow for side effects in s */
ccb2c380
MP
569#define REGC(c,s) STMT_START { \
570 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
571} STMT_END
cd439c50 572
fac92740
MJD
573/* Macros for recording node offsets. 20001227 mjd@plover.com
574 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
575 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
576 * Element 0 holds the number n.
07be1b83 577 * Position is 1 indexed.
fac92740 578 */
7122b237
YO
579#ifndef RE_TRACK_PATTERN_OFFSETS
580#define Set_Node_Offset_To_R(node,byte)
581#define Set_Node_Offset(node,byte)
582#define Set_Cur_Node_Offset
583#define Set_Node_Length_To_R(node,len)
584#define Set_Node_Length(node,len)
585#define Set_Node_Cur_Length(node)
586#define Node_Offset(n)
587#define Node_Length(n)
588#define Set_Node_Offset_Length(node,offset,len)
589#define ProgLen(ri) ri->u.proglen
590#define SetProgLen(ri,x) ri->u.proglen = x
591#else
592#define ProgLen(ri) ri->u.offsets[0]
593#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
594#define Set_Node_Offset_To_R(node,byte) STMT_START { \
595 if (! SIZE_ONLY) { \
596 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 597 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 598 if((node) < 0) { \
551405c4 599 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
600 } else { \
601 RExC_offsets[2*(node)-1] = (byte); \
602 } \
603 } \
604} STMT_END
605
606#define Set_Node_Offset(node,byte) \
607 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
608#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
609
610#define Set_Node_Length_To_R(node,len) STMT_START { \
611 if (! SIZE_ONLY) { \
612 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 613 __LINE__, (int)(node), (int)(len))); \
ccb2c380 614 if((node) < 0) { \
551405c4 615 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
616 } else { \
617 RExC_offsets[2*(node)] = (len); \
618 } \
619 } \
620} STMT_END
621
622#define Set_Node_Length(node,len) \
623 Set_Node_Length_To_R((node)-RExC_emit_start, len)
624#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
625#define Set_Node_Cur_Length(node) \
626 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
627
628/* Get offsets and lengths */
629#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
630#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
631
07be1b83
YO
632#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
633 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
634 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
635} STMT_END
7122b237 636#endif
07be1b83
YO
637
638#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
639#define EXPERIMENTAL_INPLACESCAN
f427392e 640#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 641
304ee84b
YO
642#define DEBUG_STUDYDATA(str,data,depth) \
643DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 644 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
645 "%*s" str "Pos:%"IVdf"/%"IVdf \
646 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
647 (int)(depth)*2, "", \
648 (IV)((data)->pos_min), \
649 (IV)((data)->pos_delta), \
304ee84b 650 (UV)((data)->flags), \
1de06328 651 (IV)((data)->whilem_c), \
304ee84b
YO
652 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
653 is_inf ? "INF " : "" \
1de06328
YO
654 ); \
655 if ((data)->last_found) \
656 PerlIO_printf(Perl_debug_log, \
657 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
658 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
659 SvPVX_const((data)->last_found), \
660 (IV)((data)->last_end), \
661 (IV)((data)->last_start_min), \
662 (IV)((data)->last_start_max), \
663 ((data)->longest && \
664 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
665 SvPVX_const((data)->longest_fixed), \
666 (IV)((data)->offset_fixed), \
667 ((data)->longest && \
668 (data)->longest==&((data)->longest_float)) ? "*" : "", \
669 SvPVX_const((data)->longest_float), \
670 (IV)((data)->offset_float_min), \
671 (IV)((data)->offset_float_max) \
672 ); \
673 PerlIO_printf(Perl_debug_log,"\n"); \
674});
675
acfe0abc 676static void clear_re(pTHX_ void *r);
4327152a 677
653099ff 678/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 679 Update the longest found anchored substring and the longest found
653099ff
GS
680 floating substrings if needed. */
681
4327152a 682STATIC void
304ee84b 683S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 684{
e1ec3a88
AL
685 const STRLEN l = CHR_SVLEN(data->last_found);
686 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 687 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 688
7918f24d
NC
689 PERL_ARGS_ASSERT_SCAN_COMMIT;
690
c277df42 691 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 692 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
693 if (*data->longest == data->longest_fixed) {
694 data->offset_fixed = l ? data->last_start_min : data->pos_min;
695 if (data->flags & SF_BEFORE_EOL)
b81d288d 696 data->flags
c277df42
IZ
697 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
698 else
699 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 700 data->minlen_fixed=minlenp;
1de06328 701 data->lookbehind_fixed=0;
a0ed51b3 702 }
304ee84b 703 else { /* *data->longest == data->longest_float */
c277df42 704 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
705 data->offset_float_max = (l
706 ? data->last_start_max
c277df42 707 : data->pos_min + data->pos_delta);
304ee84b 708 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 709 data->offset_float_max = I32_MAX;
c277df42 710 if (data->flags & SF_BEFORE_EOL)
b81d288d 711 data->flags
c277df42
IZ
712 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
713 else
714 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
715 data->minlen_float=minlenp;
716 data->lookbehind_float=0;
c277df42
IZ
717 }
718 }
719 SvCUR_set(data->last_found, 0);
0eda9292 720 {
a28509cc 721 SV * const sv = data->last_found;
097eb12c
AL
722 if (SvUTF8(sv) && SvMAGICAL(sv)) {
723 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
724 if (mg)
725 mg->mg_len = 0;
726 }
0eda9292 727 }
c277df42
IZ
728 data->last_end = -1;
729 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 730 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
731}
732
653099ff
GS
733/* Can match anything (initialization) */
734STATIC void
3fffb88a 735S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 736{
7918f24d
NC
737 PERL_ARGS_ASSERT_CL_ANYTHING;
738
f8bef550 739 ANYOF_BITMAP_SETALL(cl);
dd58aee1 740 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
3ad98780 741 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
3fffb88a
KW
742
743 /* If any portion of the regex is to operate under locale rules,
744 * initialization includes it. The reason this isn't done for all regexes
745 * is that the optimizer was written under the assumption that locale was
746 * all-or-nothing. Given the complexity and lack of documentation in the
747 * optimizer, and that there are inadequate test cases for locale, so many
748 * parts of it may not work properly, it is safest to avoid locale unless
749 * necessary. */
750 if (RExC_contains_locale) {
9d7a1e63 751 ANYOF_CLASS_SETALL(cl); /* /l uses class */
3fffb88a
KW
752 cl->flags |= ANYOF_LOCALE;
753 }
9d7a1e63
KW
754 else {
755 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
756 }
653099ff
GS
757}
758
759/* Can match anything (initialization) */
760STATIC int
5f66b61c 761S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
762{
763 int value;
764
7918f24d
NC
765 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
766
aaa51d5e 767 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
768 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
769 return 1;
1aa99e6b
IH
770 if (!(cl->flags & ANYOF_UNICODE_ALL))
771 return 0;
10edeb5d 772 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 773 return 0;
653099ff
GS
774 return 1;
775}
776
777/* Can match anything (initialization) */
778STATIC void
e755fd73 779S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 780{
7918f24d
NC
781 PERL_ARGS_ASSERT_CL_INIT;
782
8ecf7187 783 Zero(cl, 1, struct regnode_charclass_class);
653099ff 784 cl->type = ANYOF;
3fffb88a 785 cl_anything(pRExC_state, cl);
1411dba4 786 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
653099ff
GS
787}
788
1051e1c4
KW
789/* These two functions currently do the exact same thing */
790#define cl_init_zero S_cl_init
653099ff 791
dd58aee1
KW
792/* 'AND' a given class with another one. Can create false positives. 'cl'
793 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
794 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 795STATIC void
5f66b61c 796S_cl_and(struct regnode_charclass_class *cl,
a28509cc 797 const struct regnode_charclass_class *and_with)
653099ff 798{
7918f24d 799 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
800
801 assert(and_with->type == ANYOF);
1e6ade67 802
c6b76537 803 /* I (khw) am not sure all these restrictions are necessary XXX */
1e6ade67
KW
804 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
805 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 806 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
807 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
808 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
809 int i;
810
811 if (and_with->flags & ANYOF_INVERT)
812 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
813 cl->bitmap[i] &= ~and_with->bitmap[i];
814 else
815 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
816 cl->bitmap[i] &= and_with->bitmap[i];
817 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 818
c6b76537 819 if (and_with->flags & ANYOF_INVERT) {
8951c461 820
c6b76537
KW
821 /* Here, the and'ed node is inverted. Get the AND of the flags that
822 * aren't affected by the inversion. Those that are affected are
823 * handled individually below */
824 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
825 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
826 cl->flags |= affected_flags;
827
828 /* We currently don't know how to deal with things that aren't in the
829 * bitmap, but we know that the intersection is no greater than what
830 * is already in cl, so let there be false positives that get sorted
831 * out after the synthetic start class succeeds, and the node is
832 * matched for real. */
833
834 /* The inversion of these two flags indicate that the resulting
835 * intersection doesn't have them */
836 if (and_with->flags & ANYOF_UNICODE_ALL) {
4713bfe1
KW
837 cl->flags &= ~ANYOF_UNICODE_ALL;
838 }
c6b76537
KW
839 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
840 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 841 }
1aa99e6b 842 }
c6b76537 843 else { /* and'd node is not inverted */
3ad98780
KW
844 U8 outside_bitmap_but_not_utf8; /* Temp variable */
845
137165a6 846 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
847
848 /* Here 'and_with' doesn't match anything outside the bitmap
849 * (except possibly ANYOF_UNICODE_ALL), which means the
850 * intersection can't either, except for ANYOF_UNICODE_ALL, in
851 * which case we don't know what the intersection is, but it's no
852 * greater than what cl already has, so can just leave it alone,
853 * with possible false positives */
854 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
855 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
871d0d1a 856 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 857 }
137165a6 858 }
c6b76537
KW
859 else if (! ANYOF_NONBITMAP(cl)) {
860
861 /* Here, 'and_with' does match something outside the bitmap, and cl
862 * doesn't have a list of things to match outside the bitmap. If
863 * cl can match all code points above 255, the intersection will
3ad98780
KW
864 * be those above-255 code points that 'and_with' matches. If cl
865 * can't match all Unicode code points, it means that it can't
866 * match anything outside the bitmap (since the 'if' that got us
867 * into this block tested for that), so we leave the bitmap empty.
868 */
c6b76537
KW
869 if (cl->flags & ANYOF_UNICODE_ALL) {
870 ARG_SET(cl, ARG(and_with));
3ad98780
KW
871
872 /* and_with's ARG may match things that don't require UTF8.
873 * And now cl's will too, in spite of this being an 'and'. See
874 * the comments below about the kludge */
875 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
c6b76537
KW
876 }
877 }
878 else {
879 /* Here, both 'and_with' and cl match something outside the
880 * bitmap. Currently we do not do the intersection, so just match
881 * whatever cl had at the beginning. */
882 }
883
884
3ad98780
KW
885 /* Take the intersection of the two sets of flags. However, the
886 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
887 * kludge around the fact that this flag is not treated like the others
888 * which are initialized in cl_anything(). The way the optimizer works
889 * is that the synthetic start class (SSC) is initialized to match
890 * anything, and then the first time a real node is encountered, its
891 * values are AND'd with the SSC's with the result being the values of
892 * the real node. However, there are paths through the optimizer where
893 * the AND never gets called, so those initialized bits are set
894 * inappropriately, which is not usually a big deal, as they just cause
895 * false positives in the SSC, which will just mean a probably
896 * imperceptible slow down in execution. However this bit has a
897 * higher false positive consequence in that it can cause utf8.pm,
898 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
899 * bigger slowdown and also causes significant extra memory to be used.
900 * In order to prevent this, the code now takes a different tack. The
901 * bit isn't set unless some part of the regular expression needs it,
902 * but once set it won't get cleared. This means that these extra
903 * modules won't get loaded unless there was some path through the
904 * pattern that would have required them anyway, and so any false
905 * positives that occur by not ANDing them out when they could be
906 * aren't as severe as they would be if we treated this bit like all
907 * the others */
908 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
909 & ANYOF_NONBITMAP_NON_UTF8;
c6b76537 910 cl->flags &= and_with->flags;
3ad98780 911 cl->flags |= outside_bitmap_but_not_utf8;
137165a6 912 }
653099ff
GS
913}
914
dd58aee1
KW
915/* 'OR' a given class with another one. Can create false positives. 'cl'
916 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
917 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 918STATIC void
3fffb88a 919S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 920{
7918f24d
NC
921 PERL_ARGS_ASSERT_CL_OR;
922
653099ff 923 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
924
925 /* Here, the or'd node is to be inverted. This means we take the
926 * complement of everything not in the bitmap, but currently we don't
927 * know what that is, so give up and match anything */
928 if (ANYOF_NONBITMAP(or_with)) {
3fffb88a 929 cl_anything(pRExC_state, cl);
c6b76537 930 }
653099ff
GS
931 /* We do not use
932 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
933 * <= (B1 | !B2) | (CL1 | !CL2)
934 * which is wasteful if CL2 is small, but we ignore CL2:
935 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
936 * XXXX Can we handle case-fold? Unclear:
937 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
938 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
939 */
c6b76537 940 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
941 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
942 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
943 int i;
944
945 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
946 cl->bitmap[i] |= ~or_with->bitmap[i];
947 } /* XXXX: logic is complicated otherwise */
948 else {
3fffb88a 949 cl_anything(pRExC_state, cl);
653099ff 950 }
c6b76537
KW
951
952 /* And, we can just take the union of the flags that aren't affected
953 * by the inversion */
954 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
955
956 /* For the remaining flags:
957 ANYOF_UNICODE_ALL and inverted means to not match anything above
958 255, which means that the union with cl should just be
959 what cl has in it, so can ignore this flag
960 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
961 is 127-255 to match them, but then invert that, so the
962 union with cl should just be what cl has in it, so can
963 ignore this flag
964 */
965 } else { /* 'or_with' is not inverted */
653099ff
GS
966 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
967 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
968 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
969 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
970 int i;
971
972 /* OR char bitmap and class bitmap separately */
973 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
974 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 975 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
976 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
977 cl->classflags[i] |= or_with->classflags[i];
978 cl->flags |= ANYOF_CLASS;
979 }
980 }
981 else { /* XXXX: logic is complicated, leave it along for a moment. */
3fffb88a 982 cl_anything(pRExC_state, cl);
653099ff 983 }
9826f543 984
c6b76537
KW
985 if (ANYOF_NONBITMAP(or_with)) {
986
987 /* Use the added node's outside-the-bit-map match if there isn't a
988 * conflict. If there is a conflict (both nodes match something
989 * outside the bitmap, but what they match outside is not the same
990 * pointer, and hence not easily compared until XXX we extend
991 * inversion lists this far), give up and allow the start class to
d94b1d13
KW
992 * match everything outside the bitmap. If that stuff is all above
993 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
c6b76537
KW
994 if (! ANYOF_NONBITMAP(cl)) {
995 ARG_SET(cl, ARG(or_with));
996 }
997 else if (ARG(cl) != ARG(or_with)) {
d94b1d13
KW
998
999 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1000 cl_anything(pRExC_state, cl);
1001 }
1002 else {
1003 cl->flags |= ANYOF_UNICODE_ALL;
1004 }
c6b76537 1005 }
4c34a693 1006 }
0b9668ee
KW
1007
1008 /* Take the union */
1009 cl->flags |= or_with->flags;
1aa99e6b 1010 }
653099ff
GS
1011}
1012
a3621e74
YO
1013#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1014#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1015#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1016#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1017
3dab1dad
YO
1018
1019#ifdef DEBUGGING
07be1b83 1020/*
2b8b4781
NC
1021 dump_trie(trie,widecharmap,revcharmap)
1022 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1023 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1024
1025 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1026 The _interim_ variants are used for debugging the interim
1027 tables that are used to generate the final compressed
1028 representation which is what dump_trie expects.
1029
486ec47a 1030 Part of the reason for their existence is to provide a form
3dab1dad 1031 of documentation as to how the different representations function.
07be1b83
YO
1032
1033*/
3dab1dad
YO
1034
1035/*
3dab1dad
YO
1036 Dumps the final compressed table form of the trie to Perl_debug_log.
1037 Used for debugging make_trie().
1038*/
b9a59e08 1039
3dab1dad 1040STATIC void
2b8b4781
NC
1041S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1042 AV *revcharmap, U32 depth)
3dab1dad
YO
1043{
1044 U32 state;
ab3bbdeb 1045 SV *sv=sv_newmortal();
55eed653 1046 int colwidth= widecharmap ? 6 : 4;
2e64971a 1047 U16 word;
3dab1dad
YO
1048 GET_RE_DEBUG_FLAGS_DECL;
1049
7918f24d 1050 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1051
3dab1dad
YO
1052 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1053 (int)depth * 2 + 2,"",
1054 "Match","Base","Ofs" );
1055
1056 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1057 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1058 if ( tmp ) {
ab3bbdeb
YO
1059 PerlIO_printf( Perl_debug_log, "%*s",
1060 colwidth,
ddc5bc0f 1061 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1062 PL_colors[0], PL_colors[1],
1063 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1064 PERL_PV_ESCAPE_FIRSTCHAR
1065 )
1066 );
3dab1dad
YO
1067 }
1068 }
1069 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1070 (int)depth * 2 + 2,"");
1071
1072 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1073 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1074 PerlIO_printf( Perl_debug_log, "\n");
1075
1e2e3d02 1076 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1077 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1078
1079 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1080
1081 if ( trie->states[ state ].wordnum ) {
1082 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1083 } else {
1084 PerlIO_printf( Perl_debug_log, "%6s", "" );
1085 }
1086
1087 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1088
1089 if ( base ) {
1090 U32 ofs = 0;
1091
1092 while( ( base + ofs < trie->uniquecharcount ) ||
1093 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1094 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1095 ofs++;
1096
1097 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1098
1099 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1100 if ( ( base + ofs >= trie->uniquecharcount ) &&
1101 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1102 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1103 {
ab3bbdeb
YO
1104 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1105 colwidth,
3dab1dad
YO
1106 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1107 } else {
ab3bbdeb 1108 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1109 }
1110 }
1111
1112 PerlIO_printf( Perl_debug_log, "]");
1113
1114 }
1115 PerlIO_printf( Perl_debug_log, "\n" );
1116 }
2e64971a
DM
1117 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1118 for (word=1; word <= trie->wordcount; word++) {
1119 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1120 (int)word, (int)(trie->wordinfo[word].prev),
1121 (int)(trie->wordinfo[word].len));
1122 }
1123 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1124}
1125/*
3dab1dad
YO
1126 Dumps a fully constructed but uncompressed trie in list form.
1127 List tries normally only are used for construction when the number of
1128 possible chars (trie->uniquecharcount) is very high.
1129 Used for debugging make_trie().
1130*/
1131STATIC void
55eed653 1132S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1133 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1134 U32 depth)
3dab1dad
YO
1135{
1136 U32 state;
ab3bbdeb 1137 SV *sv=sv_newmortal();
55eed653 1138 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1139 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1140
1141 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1142
3dab1dad 1143 /* print out the table precompression. */
ab3bbdeb
YO
1144 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1145 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1146 "------:-----+-----------------\n" );
3dab1dad
YO
1147
1148 for( state=1 ; state < next_alloc ; state ++ ) {
1149 U16 charid;
1150
ab3bbdeb 1151 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1152 (int)depth * 2 + 2,"", (UV)state );
1153 if ( ! trie->states[ state ].wordnum ) {
1154 PerlIO_printf( Perl_debug_log, "%5s| ","");
1155 } else {
1156 PerlIO_printf( Perl_debug_log, "W%4x| ",
1157 trie->states[ state ].wordnum
1158 );
1159 }
1160 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1161 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1162 if ( tmp ) {
1163 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1164 colwidth,
ddc5bc0f 1165 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1166 PL_colors[0], PL_colors[1],
1167 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1168 PERL_PV_ESCAPE_FIRSTCHAR
1169 ) ,
1e2e3d02
YO
1170 TRIE_LIST_ITEM(state,charid).forid,
1171 (UV)TRIE_LIST_ITEM(state,charid).newstate
1172 );
1173 if (!(charid % 10))
664e119d
RGS
1174 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1175 (int)((depth * 2) + 14), "");
1e2e3d02 1176 }
ab3bbdeb
YO
1177 }
1178 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1179 }
1180}
1181
1182/*
3dab1dad
YO
1183 Dumps a fully constructed but uncompressed trie in table form.
1184 This is the normal DFA style state transition table, with a few
1185 twists to facilitate compression later.
1186 Used for debugging make_trie().
1187*/
1188STATIC void
55eed653 1189S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1190 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1191 U32 depth)
3dab1dad
YO
1192{
1193 U32 state;
1194 U16 charid;
ab3bbdeb 1195 SV *sv=sv_newmortal();
55eed653 1196 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1197 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1198
1199 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1200
1201 /*
1202 print out the table precompression so that we can do a visual check
1203 that they are identical.
1204 */
1205
1206 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1207
1208 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1209 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1210 if ( tmp ) {
ab3bbdeb
YO
1211 PerlIO_printf( Perl_debug_log, "%*s",
1212 colwidth,
ddc5bc0f 1213 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1214 PL_colors[0], PL_colors[1],
1215 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1216 PERL_PV_ESCAPE_FIRSTCHAR
1217 )
1218 );
3dab1dad
YO
1219 }
1220 }
1221
1222 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1223
1224 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1225 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1226 }
1227
1228 PerlIO_printf( Perl_debug_log, "\n" );
1229
1230 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1231
1232 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1233 (int)depth * 2 + 2,"",
1234 (UV)TRIE_NODENUM( state ) );
1235
1236 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1237 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1238 if (v)
1239 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1240 else
1241 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1242 }
1243 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1244 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1245 } else {
1246 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1247 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1248 }
1249 }
07be1b83 1250}
3dab1dad
YO
1251
1252#endif
1253
2e64971a 1254
786e8c11
YO
1255/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1256 startbranch: the first branch in the whole branch sequence
1257 first : start branch of sequence of branch-exact nodes.
1258 May be the same as startbranch
1259 last : Thing following the last branch.
1260 May be the same as tail.
1261 tail : item following the branch sequence
1262 count : words in the sequence
1263 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1264 depth : indent depth
3dab1dad 1265
786e8c11 1266Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1267
786e8c11
YO
1268A trie is an N'ary tree where the branches are determined by digital
1269decomposition of the key. IE, at the root node you look up the 1st character and
1270follow that branch repeat until you find the end of the branches. Nodes can be
1271marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1272
786e8c11 1273 /he|she|his|hers/
72f13be8 1274
786e8c11
YO
1275would convert into the following structure. Numbers represent states, letters
1276following numbers represent valid transitions on the letter from that state, if
1277the number is in square brackets it represents an accepting state, otherwise it
1278will be in parenthesis.
07be1b83 1279
786e8c11
YO
1280 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1281 | |
1282 | (2)
1283 | |
1284 (1) +-i->(6)-+-s->[7]
1285 |
1286 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1287
786e8c11
YO
1288 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1289
1290This shows that when matching against the string 'hers' we will begin at state 1
1291read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1292then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1293is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1294single traverse. We store a mapping from accepting to state to which word was
1295matched, and then when we have multiple possibilities we try to complete the
1296rest of the regex in the order in which they occured in the alternation.
1297
1298The only prior NFA like behaviour that would be changed by the TRIE support is
1299the silent ignoring of duplicate alternations which are of the form:
1300
1301 / (DUPE|DUPE) X? (?{ ... }) Y /x
1302
4b714af6 1303Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1304and without the optimisation. With the optimisations dupes will be silently
486ec47a 1305ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1306the following demonstrates:
1307
1308 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1309
1310which prints out 'word' three times, but
1311
1312 'words'=~/(word|word|word)(?{ print $1 })S/
1313
1314which doesnt print it out at all. This is due to other optimisations kicking in.
1315
1316Example of what happens on a structural level:
1317
486ec47a 1318The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1319
1320 1: CURLYM[1] {1,32767}(18)
1321 5: BRANCH(8)
1322 6: EXACT <ac>(16)
1323 8: BRANCH(11)
1324 9: EXACT <ad>(16)
1325 11: BRANCH(14)
1326 12: EXACT <ab>(16)
1327 16: SUCCEED(0)
1328 17: NOTHING(18)
1329 18: END(0)
1330
1331This would be optimizable with startbranch=5, first=5, last=16, tail=16
1332and should turn into:
1333
1334 1: CURLYM[1] {1,32767}(18)
1335 5: TRIE(16)
1336 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1337 <ac>
1338 <ad>
1339 <ab>
1340 16: SUCCEED(0)
1341 17: NOTHING(18)
1342 18: END(0)
1343
1344Cases where tail != last would be like /(?foo|bar)baz/:
1345
1346 1: BRANCH(4)
1347 2: EXACT <foo>(8)
1348 4: BRANCH(7)
1349 5: EXACT <bar>(8)
1350 7: TAIL(8)
1351 8: EXACT <baz>(10)
1352 10: END(0)
1353
1354which would be optimizable with startbranch=1, first=1, last=7, tail=8
1355and would end up looking like:
1356
1357 1: TRIE(8)
1358 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1359 <foo>
1360 <bar>
1361 7: TAIL(8)
1362 8: EXACT <baz>(10)
1363 10: END(0)
1364
1365 d = uvuni_to_utf8_flags(d, uv, 0);
1366
1367is the recommended Unicode-aware way of saying
1368
1369 *(d++) = uv;
1370*/
1371
fab2782b 1372#define TRIE_STORE_REVCHAR(val) \
786e8c11 1373 STMT_START { \
73031816 1374 if (UTF) { \
fab2782b 1375 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1376 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
fab2782b 1377 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
73031816
NC
1378 SvCUR_set(zlopp, kapow - flrbbbbb); \
1379 SvPOK_on(zlopp); \
1380 SvUTF8_on(zlopp); \
1381 av_push(revcharmap, zlopp); \
1382 } else { \
fab2782b 1383 char ooooff = (char)val; \
73031816
NC
1384 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1385 } \
1386 } STMT_END
786e8c11 1387
fab2782b
YO
1388#define TRIE_READ_CHAR STMT_START { \
1389 wordlen++; \
1390 if ( UTF ) { \
1391 /* if it is UTF then it is either already folded, or does not need folding */ \
1392 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1393 } \
1394 else if (folder == PL_fold_latin1) { \
1395 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1396 if ( foldlen > 0 ) { \
1397 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1398 foldlen -= len; \
1399 scan += len; \
1400 len = 0; \
1401 } else { \
1402 len = 1; \
1403 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1404 skiplen = UNISKIP(uvc); \
1405 foldlen -= skiplen; \
1406 scan = foldbuf + skiplen; \
1407 } \
1408 } else { \
1409 /* raw data, will be folded later if needed */ \
1410 uvc = (U32)*uc; \
1411 len = 1; \
1412 } \
786e8c11
YO
1413} STMT_END
1414
1415
1416
1417#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1418 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1419 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1420 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1421 } \
1422 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1423 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1424 TRIE_LIST_CUR( state )++; \
1425} STMT_END
07be1b83 1426
786e8c11
YO
1427#define TRIE_LIST_NEW(state) STMT_START { \
1428 Newxz( trie->states[ state ].trans.list, \
1429 4, reg_trie_trans_le ); \
1430 TRIE_LIST_CUR( state ) = 1; \
1431 TRIE_LIST_LEN( state ) = 4; \
1432} STMT_END
07be1b83 1433
786e8c11
YO
1434#define TRIE_HANDLE_WORD(state) STMT_START { \
1435 U16 dupe= trie->states[ state ].wordnum; \
1436 regnode * const noper_next = regnext( noper ); \
1437 \
786e8c11
YO
1438 DEBUG_r({ \
1439 /* store the word for dumping */ \
1440 SV* tmp; \
1441 if (OP(noper) != NOTHING) \
740cce10 1442 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1443 else \
740cce10 1444 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1445 av_push( trie_words, tmp ); \
786e8c11
YO
1446 }); \
1447 \
1448 curword++; \
2e64971a
DM
1449 trie->wordinfo[curword].prev = 0; \
1450 trie->wordinfo[curword].len = wordlen; \
1451 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1452 \
1453 if ( noper_next < tail ) { \
1454 if (!trie->jump) \
c944940b 1455 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1456 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1457 if (!jumper) \
1458 jumper = noper_next; \
1459 if (!nextbranch) \
1460 nextbranch= regnext(cur); \
1461 } \
1462 \
1463 if ( dupe ) { \
2e64971a
DM
1464 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1465 /* chain, so that when the bits of chain are later */\
1466 /* linked together, the dups appear in the chain */\
1467 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1468 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1469 } else { \
1470 /* we haven't inserted this word yet. */ \
1471 trie->states[ state ].wordnum = curword; \
1472 } \
1473} STMT_END
07be1b83 1474
3dab1dad 1475
786e8c11
YO
1476#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1477 ( ( base + charid >= ucharcount \
1478 && base + charid < ubound \
1479 && state == trie->trans[ base - ucharcount + charid ].check \
1480 && trie->trans[ base - ucharcount + charid ].next ) \
1481 ? trie->trans[ base - ucharcount + charid ].next \
1482 : ( state==1 ? special : 0 ) \
1483 )
3dab1dad 1484
786e8c11
YO
1485#define MADE_TRIE 1
1486#define MADE_JUMP_TRIE 2
1487#define MADE_EXACT_TRIE 4
3dab1dad 1488
a3621e74 1489STATIC I32
786e8c11 1490S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1491{
27da23d5 1492 dVAR;
a3621e74
YO
1493 /* first pass, loop through and scan words */
1494 reg_trie_data *trie;
55eed653 1495 HV *widecharmap = NULL;
2b8b4781 1496 AV *revcharmap = newAV();
a3621e74 1497 regnode *cur;
9f7f3913 1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1499 STRLEN len = 0;
1500 UV uvc = 0;
1501 U16 curword = 0;
1502 U32 next_alloc = 0;
786e8c11
YO
1503 regnode *jumper = NULL;
1504 regnode *nextbranch = NULL;
7f69552c 1505 regnode *convert = NULL;
2e64971a 1506 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1507 /* we just use folder as a flag in utf8 */
1e696034 1508 const U8 * folder = NULL;
a3621e74 1509
2b8b4781
NC
1510#ifdef DEBUGGING
1511 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1512 AV *trie_words = NULL;
1513 /* along with revcharmap, this only used during construction but both are
1514 * useful during debugging so we store them in the struct when debugging.
8e11feef 1515 */
2b8b4781
NC
1516#else
1517 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1518 STRLEN trie_charcount=0;
3dab1dad 1519#endif
2b8b4781 1520 SV *re_trie_maxbuff;
a3621e74 1521 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1522
1523 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1524#ifndef DEBUGGING
1525 PERL_UNUSED_ARG(depth);
1526#endif
a3621e74 1527
1e696034 1528 switch (flags) {
c46d03cf 1529 case EXACT: break;
2f7f8cb1 1530 case EXACTFA:
fab2782b
YO
1531 case EXACTFU_SS:
1532 case EXACTFU_TRICKYFOLD:
1e696034
KW
1533 case EXACTFU: folder = PL_fold_latin1; break;
1534 case EXACTF: folder = PL_fold; break;
1535 case EXACTFL: folder = PL_fold_locale; break;
fab2782b 1536 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1537 }
1538
c944940b 1539 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1540 trie->refcount = 1;
3dab1dad 1541 trie->startstate = 1;
786e8c11 1542 trie->wordcount = word_count;
f8fc2ecf 1543 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1544 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1545 if (flags == EXACT)
c944940b 1546 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1547 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1548 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1549
a3621e74 1550 DEBUG_r({
2b8b4781 1551 trie_words = newAV();
a3621e74 1552 });
a3621e74 1553
0111c4fd 1554 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1555 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1556 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1557 }
df826430 1558 DEBUG_TRIE_COMPILE_r({
3dab1dad 1559 PerlIO_printf( Perl_debug_log,
786e8c11 1560 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1561 (int)depth * 2 + 2, "",
1562 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1563 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1564 (int)depth);
3dab1dad 1565 });
7f69552c
YO
1566
1567 /* Find the node we are going to overwrite */
1568 if ( first == startbranch && OP( last ) != BRANCH ) {
1569 /* whole branch chain */
1570 convert = first;
1571 } else {
1572 /* branch sub-chain */
1573 convert = NEXTOPER( first );
1574 }
1575
a3621e74
YO
1576 /* -- First loop and Setup --
1577
1578 We first traverse the branches and scan each word to determine if it
1579 contains widechars, and how many unique chars there are, this is
1580 important as we have to build a table with at least as many columns as we
1581 have unique chars.
1582
1583 We use an array of integers to represent the character codes 0..255
38a44b82 1584 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1585 native representation of the character value as the key and IV's for the
1586 coded index.
1587
1588 *TODO* If we keep track of how many times each character is used we can
1589 remap the columns so that the table compression later on is more
3b753521 1590 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1591 middle and the least common are on the outside. IMO this would be better
1592 than a most to least common mapping as theres a decent chance the most
1593 common letter will share a node with the least common, meaning the node
486ec47a 1594 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1595 case is when we have the least common nodes twice.
1596
1597 */
1598
a3621e74 1599 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 1600 regnode *noper = NEXTOPER( cur );
e1ec3a88 1601 const U8 *uc = (U8*)STRING( noper );
df826430 1602 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
1603 STRLEN foldlen = 0;
1604 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
fab2782b 1605 STRLEN skiplen = 0;
2af232bd 1606 const U8 *scan = (U8*)NULL;
07be1b83 1607 U32 wordlen = 0; /* required init */
02daf0ab
YO
1608 STRLEN chars = 0;
1609 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1610
3dab1dad 1611 if (OP(noper) == NOTHING) {
df826430
YO
1612 regnode *noper_next= regnext(noper);
1613 if (noper_next != tail && OP(noper_next) == flags) {
1614 noper = noper_next;
1615 uc= (U8*)STRING(noper);
1616 e= uc + STR_LEN(noper);
1617 trie->minlen= STR_LEN(noper);
1618 } else {
1619 trie->minlen= 0;
1620 continue;
1621 }
3dab1dad 1622 }
df826430 1623
fab2782b 1624 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
1625 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1626 regardless of encoding */
fab2782b
YO
1627 if (OP( noper ) == EXACTFU_SS) {
1628 /* false positives are ok, so just set this */
1629 TRIE_BITMAP_SET(trie,0xDF);
1630 }
1631 }
a3621e74 1632 for ( ; uc < e ; uc += len ) {
3dab1dad 1633 TRIE_CHARCOUNT(trie)++;
a3621e74 1634 TRIE_READ_CHAR;
3dab1dad 1635 chars++;
a3621e74 1636 if ( uvc < 256 ) {
fab2782b
YO
1637 if ( folder ) {
1638 U8 folded= folder[ (U8) uvc ];
1639 if ( !trie->charmap[ folded ] ) {
1640 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1641 TRIE_STORE_REVCHAR( folded );
1642 }
1643 }
a3621e74
YO
1644 if ( !trie->charmap[ uvc ] ) {
1645 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 1646 TRIE_STORE_REVCHAR( uvc );
a3621e74 1647 }
02daf0ab 1648 if ( set_bit ) {
62012aee
KW
1649 /* store the codepoint in the bitmap, and its folded
1650 * equivalent. */
fab2782b 1651 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
1652
1653 /* store the folded codepoint */
fab2782b 1654 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
1655
1656 if ( !UTF ) {
1657 /* store first byte of utf8 representation of
acdf4139
KW
1658 variant codepoints */
1659 if (! UNI_IS_INVARIANT(uvc)) {
1660 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1661 }
1662 }
02daf0ab
YO
1663 set_bit = 0; /* We've done our bit :-) */
1664 }
a3621e74
YO
1665 } else {
1666 SV** svpp;
55eed653
NC
1667 if ( !widecharmap )
1668 widecharmap = newHV();
a3621e74 1669
55eed653 1670 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1671
1672 if ( !svpp )
e4584336 1673 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1674
1675 if ( !SvTRUE( *svpp ) ) {
1676 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 1677 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
1678 }
1679 }
1680 }
3dab1dad 1681 if( cur == first ) {
fab2782b
YO
1682 trie->minlen = chars;
1683 trie->maxlen = chars;
3dab1dad 1684 } else if (chars < trie->minlen) {
fab2782b 1685 trie->minlen = chars;
3dab1dad 1686 } else if (chars > trie->maxlen) {
fab2782b
YO
1687 trie->maxlen = chars;
1688 }
1689 if (OP( noper ) == EXACTFU_SS) {
1690 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1691 if (trie->minlen > 1)
1692 trie->minlen= 1;
1693 }
1694 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1695 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1696 * - We assume that any such sequence might match a 2 byte string */
1697 if (trie->minlen > 2 )
1698 trie->minlen= 2;
3dab1dad
YO
1699 }
1700
a3621e74
YO
1701 } /* end first pass */
1702 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1703 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1704 (int)depth * 2 + 2,"",
55eed653 1705 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1706 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1707 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1708 );
a3621e74
YO
1709
1710 /*
1711 We now know what we are dealing with in terms of unique chars and
1712 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1713 representation using a flat table will take. If it's over a reasonable
1714 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1715 conservative but potentially much slower representation using an array
1716 of lists.
1717
1718 At the end we convert both representations into the same compressed
1719 form that will be used in regexec.c for matching with. The latter
1720 is a form that cannot be used to construct with but has memory
1721 properties similar to the list form and access properties similar
1722 to the table form making it both suitable for fast searches and
1723 small enough that its feasable to store for the duration of a program.
1724
1725 See the comment in the code where the compressed table is produced
1726 inplace from the flat tabe representation for an explanation of how
1727 the compression works.
1728
1729 */
1730
1731
2e64971a
DM
1732 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1733 prev_states[1] = 0;
1734
3dab1dad 1735 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1736 /*
1737 Second Pass -- Array Of Lists Representation
1738
1739 Each state will be represented by a list of charid:state records
1740 (reg_trie_trans_le) the first such element holds the CUR and LEN
1741 points of the allocated array. (See defines above).
1742
1743 We build the initial structure using the lists, and then convert
1744 it into the compressed table form which allows faster lookups
1745 (but cant be modified once converted).
a3621e74
YO
1746 */
1747
a3621e74
YO
1748 STRLEN transcount = 1;
1749
1e2e3d02
YO
1750 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1751 "%*sCompiling trie using list compiler\n",
1752 (int)depth * 2 + 2, ""));
686b73d4 1753
c944940b
JH
1754 trie->states = (reg_trie_state *)
1755 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1756 sizeof(reg_trie_state) );
a3621e74
YO
1757 TRIE_LIST_NEW(1);
1758 next_alloc = 2;
1759
1760 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1761
df826430 1762 regnode *noper = NEXTOPER( cur );
c445ea15 1763 U8 *uc = (U8*)STRING( noper );
df826430 1764 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
1765 U32 state = 1; /* required init */
1766 U16 charid = 0; /* sanity init */
1767 U8 *scan = (U8*)NULL; /* sanity init */
1768 STRLEN foldlen = 0; /* required init */
07be1b83 1769 U32 wordlen = 0; /* required init */
c445ea15 1770 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
fab2782b 1771 STRLEN skiplen = 0;
c445ea15 1772
df826430
YO
1773 if (OP(noper) == NOTHING) {
1774 regnode *noper_next= regnext(noper);
1775 if (noper_next != tail && OP(noper_next) == flags) {
1776 noper = noper_next;
1777 uc= (U8*)STRING(noper);
1778 e= uc + STR_LEN(noper);
1779 }
1780 }
1781
3dab1dad 1782 if (OP(noper) != NOTHING) {
786e8c11 1783 for ( ; uc < e ; uc += len ) {
c445ea15 1784
786e8c11 1785 TRIE_READ_CHAR;
c445ea15 1786
786e8c11
YO
1787 if ( uvc < 256 ) {
1788 charid = trie->charmap[ uvc ];
c445ea15 1789 } else {
55eed653 1790 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1791 if ( !svpp ) {
1792 charid = 0;
1793 } else {
1794 charid=(U16)SvIV( *svpp );
1795 }
c445ea15 1796 }
786e8c11
YO
1797 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1798 if ( charid ) {
a3621e74 1799
786e8c11
YO
1800 U16 check;
1801 U32 newstate = 0;
a3621e74 1802
786e8c11
YO
1803 charid--;
1804 if ( !trie->states[ state ].trans.list ) {
1805 TRIE_LIST_NEW( state );
c445ea15 1806 }
786e8c11
YO
1807 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1808 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1809 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1810 break;
1811 }
1812 }
1813 if ( ! newstate ) {
1814 newstate = next_alloc++;
2e64971a 1815 prev_states[newstate] = state;
786e8c11
YO
1816 TRIE_LIST_PUSH( state, charid, newstate );
1817 transcount++;
1818 }
1819 state = newstate;
1820 } else {
1821 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1822 }
a28509cc 1823 }
c445ea15 1824 }
3dab1dad 1825 TRIE_HANDLE_WORD(state);
a3621e74
YO
1826
1827 } /* end second pass */
1828
1e2e3d02
YO
1829 /* next alloc is the NEXT state to be allocated */
1830 trie->statecount = next_alloc;
c944940b
JH
1831 trie->states = (reg_trie_state *)
1832 PerlMemShared_realloc( trie->states,
1833 next_alloc
1834 * sizeof(reg_trie_state) );
a3621e74 1835
3dab1dad 1836 /* and now dump it out before we compress it */
2b8b4781
NC
1837 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1838 revcharmap, next_alloc,
1839 depth+1)
1e2e3d02 1840 );
a3621e74 1841
c944940b
JH
1842 trie->trans = (reg_trie_trans *)
1843 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1844 {
1845 U32 state;
a3621e74
YO
1846 U32 tp = 0;
1847 U32 zp = 0;
1848
1849
1850 for( state=1 ; state < next_alloc ; state ++ ) {
1851 U32 base=0;
1852
1853 /*
1854 DEBUG_TRIE_COMPILE_MORE_r(
1855 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1856 );
1857 */
1858
1859 if (trie->states[state].trans.list) {
1860 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1861 U16 maxid=minid;
a28509cc 1862 U16 idx;
a3621e74
YO
1863
1864 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1865 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1866 if ( forid < minid ) {
1867 minid=forid;
1868 } else if ( forid > maxid ) {
1869 maxid=forid;
1870 }
a3621e74
YO
1871 }
1872 if ( transcount < tp + maxid - minid + 1) {
1873 transcount *= 2;
c944940b
JH
1874 trie->trans = (reg_trie_trans *)
1875 PerlMemShared_realloc( trie->trans,
446bd890
NC
1876 transcount
1877 * sizeof(reg_trie_trans) );
a3621e74
YO
1878 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1879 }
1880 base = trie->uniquecharcount + tp - minid;
1881 if ( maxid == minid ) {
1882 U32 set = 0;
1883 for ( ; zp < tp ; zp++ ) {
1884 if ( ! trie->trans[ zp ].next ) {
1885 base = trie->uniquecharcount + zp - minid;
1886 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1887 trie->trans[ zp ].check = state;
1888 set = 1;
1889 break;
1890 }
1891 }
1892 if ( !set ) {
1893 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1894 trie->trans[ tp ].check = state;
1895 tp++;
1896 zp = tp;
1897 }
1898 } else {
1899 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1900 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1901 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1902 trie->trans[ tid ].check = state;
1903 }
1904 tp += ( maxid - minid + 1 );
1905 }
1906 Safefree(trie->states[ state ].trans.list);
1907 }
1908 /*
1909 DEBUG_TRIE_COMPILE_MORE_r(
1910 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1911 );
1912 */
1913 trie->states[ state ].trans.base=base;
1914 }
cc601c31 1915 trie->lasttrans = tp + 1;
a3621e74
YO
1916 }
1917 } else {
1918 /*
1919 Second Pass -- Flat Table Representation.
1920
1921 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1922 We know that we will need Charcount+1 trans at most to store the data
1923 (one row per char at worst case) So we preallocate both structures
1924 assuming worst case.
1925
1926 We then construct the trie using only the .next slots of the entry
1927 structs.
1928
3b753521 1929 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1930 make compression both faster and easier by keeping track of how many non
1931 zero fields are in the node.
1932
1933 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1934 transition.
1935
1936 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1937 number representing the first entry of the node, and state as a
1938 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1939 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1940 are 2 entrys per node. eg:
1941
1942 A B A B
1943 1. 2 4 1. 3 7
1944 2. 0 3 3. 0 5
1945 3. 0 0 5. 0 0
1946 4. 0 0 7. 0 0
1947
1948 The table is internally in the right hand, idx form. However as we also
1949 have to deal with the states array which is indexed by nodenum we have to
1950 use TRIE_NODENUM() to convert.
1951
1952 */
1e2e3d02
YO
1953 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1954 "%*sCompiling trie using table compiler\n",
1955 (int)depth * 2 + 2, ""));
3dab1dad 1956
c944940b
JH
1957 trie->trans = (reg_trie_trans *)
1958 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1959 * trie->uniquecharcount + 1,
1960 sizeof(reg_trie_trans) );
1961 trie->states = (reg_trie_state *)
1962 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1963 sizeof(reg_trie_state) );
a3621e74
YO
1964 next_alloc = trie->uniquecharcount + 1;
1965
3dab1dad 1966
a3621e74
YO
1967 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1968
df826430 1969 regnode *noper = NEXTOPER( cur );
a28509cc 1970 const U8 *uc = (U8*)STRING( noper );
df826430 1971 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
1972
1973 U32 state = 1; /* required init */
1974
1975 U16 charid = 0; /* sanity init */
1976 U32 accept_state = 0; /* sanity init */
1977 U8 *scan = (U8*)NULL; /* sanity init */
1978
1979 STRLEN foldlen = 0; /* required init */
07be1b83 1980 U32 wordlen = 0; /* required init */
fab2782b 1981 STRLEN skiplen = 0;
a3621e74
YO
1982 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1983
df826430
YO
1984 if (OP(noper) == NOTHING) {
1985 regnode *noper_next= regnext(noper);
1986 if (noper_next != tail && OP(noper_next) == flags) {
1987 noper = noper_next;
1988 uc= (U8*)STRING(noper);
1989 e= uc + STR_LEN(noper);
1990 }
1991 }
fab2782b 1992
3dab1dad 1993 if ( OP(noper) != NOTHING ) {
786e8c11 1994 for ( ; uc < e ; uc += len ) {
a3621e74 1995
786e8c11 1996 TRIE_READ_CHAR;
a3621e74 1997
786e8c11
YO
1998 if ( uvc < 256 ) {
1999 charid = trie->charmap[ uvc ];
2000 } else {
55eed653 2001 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 2002 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2003 }
786e8c11
YO
2004 if ( charid ) {
2005 charid--;
2006 if ( !trie->trans[ state + charid ].next ) {
2007 trie->trans[ state + charid ].next = next_alloc;
2008 trie->trans[ state ].check++;
2e64971a
DM
2009 prev_states[TRIE_NODENUM(next_alloc)]
2010 = TRIE_NODENUM(state);
786e8c11
YO
2011 next_alloc += trie->uniquecharcount;
2012 }
2013 state = trie->trans[ state + charid ].next;
2014 } else {
2015 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2016 }
2017 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 2018 }
a3621e74 2019 }
3dab1dad
YO
2020 accept_state = TRIE_NODENUM( state );
2021 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2022
2023 } /* end second pass */
2024
3dab1dad 2025 /* and now dump it out before we compress it */
2b8b4781
NC
2026 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2027 revcharmap,
2028 next_alloc, depth+1));
a3621e74 2029
a3621e74
YO
2030 {
2031 /*
2032 * Inplace compress the table.*
2033
2034 For sparse data sets the table constructed by the trie algorithm will
2035 be mostly 0/FAIL transitions or to put it another way mostly empty.
2036 (Note that leaf nodes will not contain any transitions.)
2037
2038 This algorithm compresses the tables by eliminating most such
2039 transitions, at the cost of a modest bit of extra work during lookup:
2040
2041 - Each states[] entry contains a .base field which indicates the
2042 index in the state[] array wheres its transition data is stored.
2043
3b753521 2044 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2045
2046 - If .base is nonzero then charid is added to it to find an entry in
2047 the trans array.
2048
2049 -If trans[states[state].base+charid].check!=state then the
2050 transition is taken to be a 0/Fail transition. Thus if there are fail
2051 transitions at the front of the node then the .base offset will point
2052 somewhere inside the previous nodes data (or maybe even into a node
2053 even earlier), but the .check field determines if the transition is
2054 valid.
2055
786e8c11 2056 XXX - wrong maybe?
a3621e74 2057 The following process inplace converts the table to the compressed
3b753521 2058 table: We first do not compress the root node 1,and mark all its
a3621e74 2059 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2060 allows us to do a DFA construction from the compressed table later,
2061 and ensures that any .base pointers we calculate later are greater
2062 than 0.
a3621e74
YO
2063
2064 - We set 'pos' to indicate the first entry of the second node.
2065
2066 - We then iterate over the columns of the node, finding the first and
2067 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2068 and set the .check pointers accordingly, and advance pos
2069 appropriately and repreat for the next node. Note that when we copy
2070 the next pointers we have to convert them from the original
2071 NODEIDX form to NODENUM form as the former is not valid post
2072 compression.
2073
2074 - If a node has no transitions used we mark its base as 0 and do not
2075 advance the pos pointer.
2076
2077 - If a node only has one transition we use a second pointer into the
2078 structure to fill in allocated fail transitions from other states.
2079 This pointer is independent of the main pointer and scans forward
2080 looking for null transitions that are allocated to a state. When it
2081 finds one it writes the single transition into the "hole". If the
786e8c11 2082 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2083
2084 - Once compressed we can Renew/realloc the structures to release the
2085 excess space.
2086
2087 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2088 specifically Fig 3.47 and the associated pseudocode.
2089
2090 demq
2091 */
a3b680e6 2092 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2093 U32 state, charid;
a3621e74 2094 U32 pos = 0, zp=0;
1e2e3d02 2095 trie->statecount = laststate;
a3621e74
YO
2096
2097 for ( state = 1 ; state < laststate ; state++ ) {
2098 U8 flag = 0;
a28509cc
AL
2099 const U32 stateidx = TRIE_NODEIDX( state );
2100 const U32 o_used = trie->trans[ stateidx ].check;
2101 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2102 trie->trans[ stateidx ].check = 0;
2103
2104 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2105 if ( flag || trie->trans[ stateidx + charid ].next ) {
2106 if ( trie->trans[ stateidx + charid ].next ) {
2107 if (o_used == 1) {
2108 for ( ; zp < pos ; zp++ ) {
2109 if ( ! trie->trans[ zp ].next ) {
2110 break;
2111 }
2112 }
2113 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2114 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2115 trie->trans[ zp ].check = state;
2116 if ( ++zp > pos ) pos = zp;
2117 break;
2118 }
2119 used--;
2120 }
2121 if ( !flag ) {
2122 flag = 1;
2123 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2124 }
2125 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2126 trie->trans[ pos ].check = state;
2127 pos++;
2128 }
2129 }
2130 }
cc601c31 2131 trie->lasttrans = pos + 1;
c944940b
JH
2132 trie->states = (reg_trie_state *)
2133 PerlMemShared_realloc( trie->states, laststate
2134 * sizeof(reg_trie_state) );
a3621e74 2135 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2136 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2137 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2138 (int)depth * 2 + 2,"",
2139 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2140 (IV)next_alloc,
2141 (IV)pos,
a3621e74
YO
2142 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2143 );
2144
2145 } /* end table compress */
2146 }
1e2e3d02
YO
2147 DEBUG_TRIE_COMPILE_MORE_r(
2148 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2149 (int)depth * 2 + 2, "",
2150 (UV)trie->statecount,
2151 (UV)trie->lasttrans)
2152 );
cc601c31 2153 /* resize the trans array to remove unused space */
c944940b
JH
2154 trie->trans = (reg_trie_trans *)
2155 PerlMemShared_realloc( trie->trans, trie->lasttrans
2156 * sizeof(reg_trie_trans) );
a3621e74 2157
3b753521 2158 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2159 U8 nodetype =(U8)(flags & 0xFF);
2160 char *str=NULL;
786e8c11 2161
07be1b83 2162#ifdef DEBUGGING
e62cc96a 2163 regnode *optimize = NULL;
7122b237
YO
2164#ifdef RE_TRACK_PATTERN_OFFSETS
2165
b57a0404
JH
2166 U32 mjd_offset = 0;
2167 U32 mjd_nodelen = 0;
7122b237
YO
2168#endif /* RE_TRACK_PATTERN_OFFSETS */
2169#endif /* DEBUGGING */
a3621e74 2170 /*
3dab1dad
YO
2171 This means we convert either the first branch or the first Exact,
2172 depending on whether the thing following (in 'last') is a branch
2173 or not and whther first is the startbranch (ie is it a sub part of
2174 the alternation or is it the whole thing.)
3b753521 2175 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2176 the whole branch sequence, including the first.
a3621e74 2177 */
3dab1dad 2178 /* Find the node we are going to overwrite */
7f69552c 2179 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2180 /* branch sub-chain */
3dab1dad 2181 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2182#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2183 DEBUG_r({
2184 mjd_offset= Node_Offset((convert));
2185 mjd_nodelen= Node_Length((convert));
2186 });
7122b237 2187#endif
7f69552c 2188 /* whole branch chain */
7122b237
YO
2189 }
2190#ifdef RE_TRACK_PATTERN_OFFSETS
2191 else {
7f69552c
YO
2192 DEBUG_r({
2193 const regnode *nop = NEXTOPER( convert );
2194 mjd_offset= Node_Offset((nop));
2195 mjd_nodelen= Node_Length((nop));
2196 });
07be1b83
YO
2197 }
2198 DEBUG_OPTIMISE_r(
2199 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2200 (int)depth * 2 + 2, "",
786e8c11 2201 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2202 );
7122b237 2203#endif
3dab1dad
YO
2204 /* But first we check to see if there is a common prefix we can
2205 split out as an EXACT and put in front of the TRIE node. */
2206 trie->startstate= 1;
55eed653 2207 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2208 U32 state;
1e2e3d02 2209 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2210 U32 ofs = 0;
8e11feef
RGS
2211 I32 idx = -1;
2212 U32 count = 0;
2213 const U32 base = trie->states[ state ].trans.base;
a3621e74 2214
3dab1dad 2215 if ( trie->states[state].wordnum )
8e11feef 2216 count = 1;
a3621e74 2217
8e11feef 2218 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2219 if ( ( base + ofs >= trie->uniquecharcount ) &&
2220 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2221 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2222 {
3dab1dad 2223 if ( ++count > 1 ) {
2b8b4781 2224 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2225 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2226 if ( state == 1 ) break;
3dab1dad
YO
2227 if ( count == 2 ) {
2228 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2229 DEBUG_OPTIMISE_r(
8e11feef
RGS
2230 PerlIO_printf(Perl_debug_log,
2231 "%*sNew Start State=%"UVuf" Class: [",
2232 (int)depth * 2 + 2, "",
786e8c11 2233 (UV)state));
be8e71aa 2234 if (idx >= 0) {
2b8b4781 2235 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2236 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2237
3dab1dad 2238 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2239 if ( folder )
2240 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2241 DEBUG_OPTIMISE_r(
f1f66076 2242 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2243 );
8e11feef
RGS
2244 }
2245 }
2246 TRIE_BITMAP_SET(trie,*ch);
2247 if ( folder )
2248 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2249 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2250 }
2251 idx = ofs;
2252 }
3dab1dad
YO
2253 }
2254 if ( count == 1 ) {
2b8b4781 2255 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2256 STRLEN len;
2257 char *ch = SvPV( *tmp, len );
de734bd5
A
2258 DEBUG_OPTIMISE_r({
2259 SV *sv=sv_newmortal();
8e11feef
RGS
2260 PerlIO_printf( Perl_debug_log,
2261 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2262 (int)depth * 2 + 2, "",
de734bd5
A
2263 (UV)state, (UV)idx,
2264 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2265 PL_colors[0], PL_colors[1],
2266 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2267 PERL_PV_ESCAPE_FIRSTCHAR
2268 )
2269 );
2270 });
3dab1dad
YO
2271 if ( state==1 ) {
2272 OP( convert ) = nodetype;
2273 str=STRING(convert);
2274 STR_LEN(convert)=0;
2275 }
c490c714
YO
2276 STR_LEN(convert) += len;
2277 while (len--)
de734bd5 2278 *str++ = *ch++;
8e11feef 2279 } else {
f9049ba1 2280#ifdef DEBUGGING
8e11feef
RGS
2281 if (state>1)
2282 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2283#endif
8e11feef
RGS
2284 break;
2285 }
2286 }
2e64971a 2287 trie->prefixlen = (state-1);
3dab1dad 2288 if (str) {
8e11feef 2289 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2290 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2291 trie->startstate = state;
07be1b83
YO
2292 trie->minlen -= (state - 1);
2293 trie->maxlen -= (state - 1);
33809eae
JH
2294#ifdef DEBUGGING
2295 /* At least the UNICOS C compiler choked on this
2296 * being argument to DEBUG_r(), so let's just have
2297 * it right here. */
2298 if (
2299#ifdef PERL_EXT_RE_BUILD
2300 1
2301#else
2302 DEBUG_r_TEST
2303#endif
2304 ) {
2305 regnode *fix = convert;
2306 U32 word = trie->wordcount;
2307 mjd_nodelen++;
2308 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2309 while( ++fix < n ) {
2310 Set_Node_Offset_Length(fix, 0, 0);
2311 }
2312 while (word--) {
2313 SV ** const tmp = av_fetch( trie_words, word, 0 );
2314 if (tmp) {
2315 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2316 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2317 else
2318 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2319 }
2320 }
2321 }
2322#endif
8e11feef
RGS
2323 if (trie->maxlen) {
2324 convert = n;
2325 } else {
3dab1dad 2326 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2327 DEBUG_r(optimize= n);
3dab1dad
YO
2328 }
2329 }
2330 }
a5ca303d
YO
2331 if (!jumper)
2332 jumper = last;
3dab1dad 2333 if ( trie->maxlen ) {
8e11feef
RGS
2334 NEXT_OFF( convert ) = (U16)(tail - convert);
2335 ARG_SET( convert, data_slot );
786e8c11
YO
2336 /* Store the offset to the first unabsorbed branch in
2337 jump[0], which is otherwise unused by the jump logic.
2338 We use this when dumping a trie and during optimisation. */
2339 if (trie->jump)
7f69552c 2340 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2341
6c48061a
YO
2342 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2343 * and there is a bitmap
2344 * and the first "jump target" node we found leaves enough room
2345 * then convert the TRIE node into a TRIEC node, with the bitmap
2346 * embedded inline in the opcode - this is hypothetically faster.
2347 */
2348 if ( !trie->states[trie->startstate].wordnum
2349 && trie->bitmap
2350 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2351 {
2352 OP( convert ) = TRIEC;
2353 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2354 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2355 trie->bitmap= NULL;
2356 } else
2357 OP( convert ) = TRIE;
a3621e74 2358
3dab1dad
YO
2359 /* store the type in the flags */
2360 convert->flags = nodetype;
a5ca303d
YO
2361 DEBUG_r({
2362 optimize = convert
2363 + NODE_STEP_REGNODE
2364 + regarglen[ OP( convert ) ];
2365 });
2366 /* XXX We really should free up the resource in trie now,
2367 as we won't use them - (which resources?) dmq */
3dab1dad 2368 }
a3621e74 2369 /* needed for dumping*/
e62cc96a 2370 DEBUG_r(if (optimize) {
07be1b83 2371 regnode *opt = convert;
bcdf7404 2372
e62cc96a 2373 while ( ++opt < optimize) {
07be1b83
YO
2374 Set_Node_Offset_Length(opt,0,0);
2375 }
786e8c11
YO
2376 /*
2377 Try to clean up some of the debris left after the
2378 optimisation.
a3621e74 2379 */
786e8c11 2380 while( optimize < jumper ) {
07be1b83 2381 mjd_nodelen += Node_Length((optimize));
a3621e74 2382 OP( optimize ) = OPTIMIZED;
07be1b83 2383 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2384 optimize++;
2385 }
07be1b83 2386 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2387 });
2388 } /* end node insert */
2e64971a
DM
2389
2390 /* Finish populating the prev field of the wordinfo array. Walk back
2391 * from each accept state until we find another accept state, and if
2392 * so, point the first word's .prev field at the second word. If the
2393 * second already has a .prev field set, stop now. This will be the
2394 * case either if we've already processed that word's accept state,
3b753521
FN
2395 * or that state had multiple words, and the overspill words were
2396 * already linked up earlier.
2e64971a
DM
2397 */
2398 {
2399 U16 word;
2400 U32 state;
2401 U16 prev;
2402
2403 for (word=1; word <= trie->wordcount; word++) {
2404 prev = 0;
2405 if (trie->wordinfo[word].prev)
2406 continue;
2407 state = trie->wordinfo[word].accept;
2408 while (state) {
2409 state = prev_states[state];
2410 if (!state)
2411 break;
2412 prev = trie->states[state].wordnum;
2413 if (prev)
2414 break;
2415 }
2416 trie->wordinfo[word].prev = prev;
2417 }
2418 Safefree(prev_states);
2419 }
2420
2421
2422 /* and now dump out the compressed format */
2423 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2424
55eed653 2425 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2426#ifdef DEBUGGING
2427 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2428 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2429#else
2430 SvREFCNT_dec(revcharmap);
07be1b83 2431#endif
786e8c11
YO
2432 return trie->jump
2433 ? MADE_JUMP_TRIE
2434 : trie->startstate>1
2435 ? MADE_EXACT_TRIE
2436 : MADE_TRIE;
2437}
2438
2439STATIC void
2440S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2441{
3b753521 2442/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2443
2444 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2445 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2446 ISBN 0-201-10088-6
2447
2448 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2449 suffix of the current state's 'word' that is also a proper prefix of another word in our
2450 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2451 the DFA not to have to restart after its tried and failed a word at a given point, it
2452 simply continues as though it had been matching the other word in the first place.
2453 Consider
2454 'abcdgu'=~/abcdefg|cdgu/
2455 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2456 fail, which would bring us to the state representing 'd' in the second word where we would
2457 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2458 */
2459 /* add a fail transition */
3251b653
NC
2460 const U32 trie_offset = ARG(source);
2461 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2462 U32 *q;
2463 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2464 const U32 numstates = trie->statecount;
786e8c11
YO
2465 const U32 ubound = trie->lasttrans + ucharcount;
2466 U32 q_read = 0;
2467 U32 q_write = 0;
2468 U32 charid;
2469 U32 base = trie->states[ 1 ].trans.base;
2470 U32 *fail;
2471 reg_ac_data *aho;
2472 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2473 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2474
2475 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2476#ifndef DEBUGGING
2477 PERL_UNUSED_ARG(depth);
2478#endif
2479
2480
2481 ARG_SET( stclass, data_slot );
c944940b 2482 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2483 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2484 aho->trie=trie_offset;
446bd890
NC
2485 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2486 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2487 Newxz( q, numstates, U32);
c944940b 2488 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2489 aho->refcount = 1;
2490 fail = aho->fail;
2491 /* initialize fail[0..1] to be 1 so that we always have
2492 a valid final fail state */
2493 fail[ 0 ] = fail[ 1 ] = 1;
2494
2495 for ( charid = 0; charid < ucharcount ; charid++ ) {
2496 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2497 if ( newstate ) {
2498 q[ q_write ] = newstate;
2499 /* set to point at the root */
2500 fail[ q[ q_write++ ] ]=1;
2501 }
2502 }
2503 while ( q_read < q_write) {
2504 const U32 cur = q[ q_read++ % numstates ];
2505 base = trie->states[ cur ].trans.base;
2506
2507 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2508 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2509 if (ch_state) {
2510 U32 fail_state = cur;
2511 U32 fail_base;
2512 do {
2513 fail_state = fail[ fail_state ];
2514 fail_base = aho->states[ fail_state ].trans.base;
2515 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2516
2517 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2518 fail[ ch_state ] = fail_state;
2519 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2520 {
2521 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2522 }
2523 q[ q_write++ % numstates] = ch_state;
2524 }
2525 }
2526 }
2527 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2528 when we fail in state 1, this allows us to use the
2529 charclass scan to find a valid start char. This is based on the principle
2530 that theres a good chance the string being searched contains lots of stuff
2531 that cant be a start char.
2532 */
2533 fail[ 0 ] = fail[ 1 ] = 0;
2534 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2535 PerlIO_printf(Perl_debug_log,
2536 "%*sStclass Failtable (%"UVuf" states): 0",
2537 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2538 );
786e8c11
YO
2539 for( q_read=1; q_read<numstates; q_read++ ) {
2540 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2541 }
2542 PerlIO_printf(Perl_debug_log, "\n");
2543 });
2544 Safefree(q);
2545 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2546}
2547
786e8c11 2548
a3621e74 2549/*
5d1c421c
JH
2550 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2551 * These need to be revisited when a newer toolchain becomes available.
2552 */
2553#if defined(__sparc64__) && defined(__GNUC__)
2554# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2555# undef SPARC64_GCC_WORKAROUND
2556# define SPARC64_GCC_WORKAROUND 1
2557# endif
2558#endif
2559
07be1b83 2560#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2561 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2562 SV * const mysv=sv_newmortal(); \
2563 regnode *Next = regnext(scan); \
2564 regprop(RExC_rx, mysv, scan); \
7f69552c 2565 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2566 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2567 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2568 }});
07be1b83 2569
1de06328 2570
bb914485
KW
2571/* The below joins as many adjacent EXACTish nodes as possible into a single
2572 * one, and looks for problematic sequences of characters whose folds vs.
2573 * non-folds have sufficiently different lengths, that the optimizer would be
2574 * fooled into rejecting legitimate matches of them, and the trie construction
2575 * code can't cope with them. The joining is only done if:
2576 * 1) there is room in the current conglomerated node to entirely contain the
2577 * next one.
2578 * 2) they are the exact same node type
2579 *
2580 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2581 * these get optimized out
2582 *
9d071ca8
KW
2583 * If there are problematic code sequences, *min_subtract is set to the delta
2584 * that the minimum size of the node can be less than its actual size. And,
2585 * the node type of the result is changed to reflect that it contains these
bb914485
KW
2586 * sequences.
2587 *
a0c4c608
KW
2588 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2589 * and contains LATIN SMALL LETTER SHARP S
f758bddf 2590 *
bb914485
KW
2591 * This is as good a place as any to discuss the design of handling these
2592 * problematic sequences. It's been wrong in Perl for a very long time. There
2593 * are three code points in Unicode whose folded lengths differ so much from
2594 * the un-folded lengths that it causes problems for the optimizer and trie
2595 * construction. Why only these are problematic, and not others where lengths
2596 * also differ is something I (khw) do not understand. New versions of Unicode
2597 * might add more such code points. Hopefully the logic in fold_grind.t that
287722f3 2598 * figures out what to test (in part by verifying that each size-combination
bb914485 2599 * gets tested) will catch any that do come along, so they can be added to the
287722f3
KW
2600 * special handling below. The chances of new ones are actually rather small,
2601 * as most, if not all, of the world's scripts that have casefolding have
2602 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2603 * made to allow compatibility with pre-existing standards, and almost all of
2604 * those have already been dealt with. These would otherwise be the most
2605 * likely candidates for generating further tricky sequences. In other words,
2606 * Unicode by itself is unlikely to add new ones unless it is for compatibility
a0c4c608 2607 * with pre-existing standards, and there aren't many of those left.
bb914485
KW
2608 *
2609 * The previous designs for dealing with these involved assigning a special
2610 * node for them. This approach doesn't work, as evidenced by this example:
a0c4c608 2611 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
bb914485
KW
2612 * Both these fold to "sss", but if the pattern is parsed to create a node of
2613 * that would match just the \xDF, it won't be able to handle the case where a
2614 * successful match would have to cross the node's boundary. The new approach
2615 * that hopefully generally solves the problem generates an EXACTFU_SS node
2616 * that is "sss".
2617 *
2618 * There are a number of components to the approach (a lot of work for just
2619 * three code points!):
2620 * 1) This routine examines each EXACTFish node that could contain the
9d071ca8
KW
2621 * problematic sequences. It returns in *min_subtract how much to
2622 * subtract from the the actual length of the string to get a real minimum
2623 * for one that could match it. This number is usually 0 except for the
2624 * problematic sequences. This delta is used by the caller to adjust the
2625 * min length of the match, and the delta between min and max, so that the
2626 * optimizer doesn't reject these possibilities based on size constraints.
bb914485
KW
2627 * 2) These sequences are not currently correctly handled by the trie code
2628 * either, so it changes the joined node type to ops that are not handled
fab2782b 2629 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
bb914485
KW
2630 * 3) This is sufficient for the two Greek sequences (described below), but
2631 * the one involving the Sharp s (\xDF) needs more. The node type
2632 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2633 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2634 * case where there is a possible fold length change. That means that a
2635 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2636 * itself with length changes, and so can be processed faster. regexec.c
2637 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2638 * is pre-folded by regcomp.c. This saves effort in regex matching.
2639 * However, probably mostly for historical reasons, the pre-folding isn't
a0c4c608
KW
2640 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2641 * nodes, as what they fold to isn't known until runtime.) The fold
2642 * possibilities for the non-UTF8 patterns are quite simple, except for
2643 * the sharp s. All the ones that don't involve a UTF-8 target string
2644 * are members of a fold-pair, and arrays are set up for all of them
2645 * that quickly find the other member of the pair. It might actually
2646 * be faster to pre-fold these, but it isn't currently done, except for
2647 * the sharp s. Code elsewhere in this file makes sure that it gets
2648 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2649 * issues described in the next item.
bb914485
KW
2650 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2651 * 'ss' or not is not knowable at compile time. It will match iff the
2652 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2653 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2654 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2655 * described in item 3). An assumption that the optimizer part of
2656 * regexec.c (probably unwittingly) makes is that a character in the
2657 * pattern corresponds to at most a single character in the target string.
2658 * (And I do mean character, and not byte here, unlike other parts of the
2659 * documentation that have never been updated to account for multibyte
2660 * Unicode.) This assumption is wrong only in this case, as all other
2661 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2662 * virtue of having this file pre-fold UTF-8 patterns. I'm
2663 * reluctant to try to change this assumption, so instead the code punts.
9d071ca8
KW
2664 * This routine examines EXACTF nodes for the sharp s, and returns a
2665 * boolean indicating whether or not the node is an EXACTF node that
2666 * contains a sharp s. When it is true, the caller sets a flag that later
2667 * causes the optimizer in this file to not set values for the floating
2668 * and fixed string lengths, and thus avoids the optimizer code in
2669 * regexec.c that makes the invalid assumption. Thus, there is no
2670 * optimization based on string lengths for EXACTF nodes that contain the
2671 * sharp s. This only happens for /id rules (which means the pattern
2672 * isn't in UTF-8).
bb914485 2673 */
1de06328 2674
9d071ca8 2675#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 2676 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 2677 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 2678
be8e71aa 2679STATIC U32
9d071ca8 2680S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
2681 /* Merge several consecutive EXACTish nodes into one. */
2682 regnode *n = regnext(scan);
2683 U32 stringok = 1;
2684 regnode *next = scan + NODE_SZ_STR(scan);
2685 U32 merged = 0;
2686 U32 stopnow = 0;
2687#ifdef DEBUGGING
2688 regnode *stop = scan;
72f13be8 2689 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2690#else
d47053eb
RGS
2691 PERL_UNUSED_ARG(depth);
2692#endif
7918f24d
NC
2693
2694 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2695#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2696 PERL_UNUSED_ARG(flags);
2697 PERL_UNUSED_ARG(val);
07be1b83 2698#endif
07be1b83 2699 DEBUG_PEEP("join",scan,depth);
bb914485 2700
3f410cf6
KW
2701 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2702 * EXACT ones that are mergeable to the current one. */
2703 while (n
2704 && (PL_regkind[OP(n)] == NOTHING
2705 || (stringok && OP(n) == OP(scan)))
07be1b83 2706 && NEXT_OFF(n)
3f410cf6
KW
2707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2708 {
07be1b83
YO
2709
2710 if (OP(n) == TAIL || n > next)
2711 stringok = 0;
2712 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2713 DEBUG_PEEP("skip:",n,depth);
2714 NEXT_OFF(scan) += NEXT_OFF(n);
2715 next = n + NODE_STEP_REGNODE;
2716#ifdef DEBUGGING
2717 if (stringok)
2718 stop = n;
2719#endif
2720 n = regnext(n);
2721 }
2722 else if (stringok) {
786e8c11 2723 const unsigned int oldl = STR_LEN(scan);
07be1b83 2724 regnode * const nnext = regnext(n);
b2230d39
KW
2725
2726 if (oldl + STR_LEN(n) > U8_MAX)
2727 break;
07be1b83
YO
2728
2729 DEBUG_PEEP("merg",n,depth);
07be1b83 2730 merged++;
b2230d39 2731
07be1b83
YO
2732 NEXT_OFF(scan) += NEXT_OFF(n);
2733 STR_LEN(scan) += STR_LEN(n);
2734 next = n + NODE_SZ_STR(n);
2735 /* Now we can overwrite *n : */
2736 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2737#ifdef DEBUGGING
2738 stop = next - 1;
2739#endif
2740 n = nnext;
2741 if (stopnow) break;
2742 }
2743
d47053eb
RGS
2744#ifdef EXPERIMENTAL_INPLACESCAN
2745 if (flags && !NEXT_OFF(n)) {
2746 DEBUG_PEEP("atch", val, depth);
2747 if (reg_off_by_arg[OP(n)]) {
2748 ARG_SET(n, val - n);
2749 }
2750 else {
2751 NEXT_OFF(n) = val - n;
2752 }
2753 stopnow = 1;
2754 }
07be1b83
YO
2755#endif
2756 }
2c2b7f86 2757
9d071ca8 2758 *min_subtract = 0;
f758bddf 2759 *has_exactf_sharp_s = FALSE;
f646642f 2760
3f410cf6
KW
2761 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2762 * can now analyze for sequences of problematic code points. (Prior to
2763 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
2764 * hence missed). The sequences only happen in folding, hence for any
2765 * non-EXACT EXACTish node */
86d6fcad 2766 if (OP(scan) != EXACT) {
f758bddf
KW
2767 U8 *s;
2768 U8 * s0 = (U8*) STRING(scan);
2769 U8 * const s_end = s0 + STR_LEN(scan);
2770
2771 /* The below is perhaps overboard, but this allows us to save a test
2772 * each time through the loop at the expense of a mask. This is
2773 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2774 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2775 * This uses an exclusive 'or' to find that bit and then inverts it to
2776 * form a mask, with just a single 0, in the bit position where 'S' and
2777 * 's' differ. */
dbeb8947 2778 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
f758bddf
KW
2779 const U8 s_masked = 's' & S_or_s_mask;
2780
2781 /* One pass is made over the node's string looking for all the
2782 * possibilities. to avoid some tests in the loop, there are two main
2783 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2784 * non-UTF-8 */
2785 if (UTF) {
86d6fcad 2786
f758bddf
KW
2787 /* There are two problematic Greek code points in Unicode
2788 * casefolding
86d6fcad
KW
2789 *
2790 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2791 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2792 *
2793 * which casefold to
2794 *
2795 * Unicode UTF-8
2796 *
2797 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2798 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2799 *
2800 * This means that in case-insensitive matching (or "loose
2801 * matching", as Unicode calls it), an EXACTF of length six (the
2802 * UTF-8 encoded byte length of the above casefolded versions) can
2803 * match a target string of length two (the byte length of UTF-8
2804 * encoded U+0390 or U+03B0). This would rather mess up the
2805 * minimum length computation. (there are other code points that
2806 * also fold to these two sequences, but the delta is smaller)
2807 *
f758bddf
KW
2808 * If these sequences are found, the minimum length is decreased by
2809 * four (six minus two).
86d6fcad 2810 *
f758bddf
KW
2811 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2812 * LETTER SHARP S. We decrease the min length by 1 for each
2813 * occurrence of 'ss' found */
3f410cf6 2814
e294cc5d 2815#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
f758bddf
KW
2816# define U390_first_byte 0xb4
2817 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2818# define U3B0_first_byte 0xb5
2819 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
e294cc5d 2820#else
f758bddf
KW
2821# define U390_first_byte 0xce
2822 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2823# define U3B0_first_byte 0xcf
2824 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
e294cc5d 2825#endif
f758bddf
KW
2826 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2827 yields a net of 0 */
2828 /* Examine the string for one of the problematic sequences */
2829 for (s = s0;
2830 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2831 * sequence we are looking for is 2 */
2832 s += UTF8SKIP(s))
86d6fcad 2833 {
bb914485 2834
f758bddf
KW
2835 /* Look for the first byte in each problematic sequence */
2836 switch (*s) {
2837 /* We don't have to worry about other things that fold to
2838 * 's' (such as the long s, U+017F), as all above-latin1
2839 * code points have been pre-folded */
2840 case 's':
2841 case 'S':
2842
a0c4c608
KW
2843 /* Current character is an 's' or 'S'. If next one is
2844 * as well, we have the dreaded sequence */
f758bddf
KW
2845 if (((*(s+1) & S_or_s_mask) == s_masked)
2846 /* These two node types don't have special handling
2847 * for 'ss' */
2848 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2849 {
9d071ca8 2850 *min_subtract += 1;
f758bddf
KW
2851 OP(scan) = EXACTFU_SS;
2852 s++; /* No need to look at this character again */
2853 }
2854 break;
2855
2856 case U390_first_byte:
2857 if (s_end - s >= len
2858
2859 /* The 1's are because are skipping comparing the
2860 * first byte */
2861 && memEQ(s + 1, U390_tail, len - 1))
2862 {
2863 goto greek_sequence;
2864 }
2865 break;
2866
2867 case U3B0_first_byte:
2868 if (! (s_end - s >= len
2869 && memEQ(s + 1, U3B0_tail, len - 1)))
2870 {
2871 break;
2872 }
2873 greek_sequence:
9d071ca8 2874 *min_subtract += 4;
f758bddf
KW
2875
2876 /* This can't currently be handled by trie's, so change
2877 * the node type to indicate this. If EXACTFA and
2878 * EXACTFL were ever to be handled by trie's, this
2879 * would have to be changed. If this node has already
2880 * been changed to EXACTFU_SS in this loop, leave it as
2881 * is. (I (khw) think it doesn't matter in regexec.c
2882 * for UTF patterns, but no need to change it */
2883 if (OP(scan) == EXACTFU) {
fab2782b 2884 OP(scan) = EXACTFU_TRICKYFOLD;
f758bddf
KW
2885 }
2886 s += 6; /* We already know what this sequence is. Skip
2887 the rest of it */
2888 break;
bb914485
KW
2889 }
2890 }
2891 }
f758bddf 2892 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
bb914485 2893
f758bddf
KW
2894 /* Here, the pattern is not UTF-8. We need to look only for the
2895 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2896 * in the final position. Otherwise we can stop looking 1 byte
2897 * earlier because have to find both the first and second 's' */
2898 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2899
2900 for (s = s0; s < upper; s++) {
2901 switch (*s) {
2902 case 'S':
2903 case 's':
2904 if (s_end - s > 1
2905 && ((*(s+1) & S_or_s_mask) == s_masked))
2906 {
9d071ca8 2907 *min_subtract += 1;
f758bddf
KW
2908
2909 /* EXACTF nodes need to know that the minimum
2910 * length changed so that a sharp s in the string
2911 * can match this ss in the pattern, but they
2912 * remain EXACTF nodes, as they are not trie'able,
2913 * so don't have to invent a new node type to
2914 * exclude them from the trie code */
2915 if (OP(scan) != EXACTF) {
2916 OP(scan) = EXACTFU_SS;
2917 }
2918 s++;
2919 }
2920 break;
2921 case LATIN_SMALL_LETTER_SHARP_S:
2922 if (OP(scan) == EXACTF) {
2923 *has_exactf_sharp_s = TRUE;
2924 }
2925 break;
86d6fcad
KW
2926 }
2927 }
2928 }
07be1b83 2929 }
3f410cf6 2930
07be1b83 2931#ifdef DEBUGGING
bb789b09
DM
2932 /* Allow dumping but overwriting the collection of skipped
2933 * ops and/or strings with fake optimized ops */
07be1b83
YO
2934 n = scan + NODE_SZ_STR(scan);
2935 while (n <= stop) {
bb789b09
DM
2936 OP(n) = OPTIMIZED;
2937 FLAGS(n) = 0;
2938 NEXT_OFF(n) = 0;
07be1b83
YO
2939 n++;
2940 }
2941#endif
2942 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2943 return stopnow;
2944}
2945
486ec47a 2946/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2947 Finds fixed substrings. */
2948
a0288114 2949/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2950 to the position after last scanned or to NULL. */
2951
40d049e4
YO
2952#define INIT_AND_WITHP \
2953 assert(!and_withp); \
2954 Newx(and_withp,1,struct regnode_charclass_class); \
2955 SAVEFREEPV(and_withp)
07be1b83 2956
b515a41d 2957/* this is a chain of data about sub patterns we are processing that
486ec47a 2958 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2959 we can simulate recursion without losing state. */
2960struct scan_frame;
2961typedef struct scan_frame {
2962 regnode *last; /* last node to process in this frame */
2963 regnode *next; /* next node to process when last is reached */
2964 struct scan_frame *prev; /*previous frame*/
2965 I32 stop; /* what stopparen do we use */
2966} scan_frame;
2967
304ee84b
YO
2968
2969#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2970
e1d1eefb
YO
2971#define CASE_SYNST_FNC(nAmE) \
2972case nAmE: \
2973 if (flags & SCF_DO_STCLASS_AND) { \
2974 for (value = 0; value < 256; value++) \
2975 if (!is_ ## nAmE ## _cp(value)) \
2976 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2977 } \
2978 else { \
2979 for (value = 0; value < 256; value++) \
2980 if (is_ ## nAmE ## _cp(value)) \
2981 ANYOF_BITMAP_SET(data->start_class, value); \
2982 } \
2983 break; \
2984case N ## nAmE: \
2985 if (flags & SCF_DO_STCLASS_AND) { \
2986 for (value = 0; value < 256; value++) \
2987 if (is_ ## nAmE ## _cp(value)) \
2988 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2989 } \
2990 else { \
2991 for (value = 0; value < 256; value++) \
2992 if (!is_ ## nAmE ## _cp(value)) \
2993 ANYOF_BITMAP_SET(data->start_class, value); \
2994 } \
2995 break
2996
2997
2998
76e3520e 2999STATIC I32
40d049e4 3000S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 3001 I32 *minlenp, I32 *deltap,
40d049e4
YO
3002 regnode *last,
3003 scan_data_t *data,
3004 I32 stopparen,
3005 U8* recursed,
3006 struct regnode_charclass_class *and_withp,
3007 U32 flags, U32 depth)
c277df42
IZ
3008 /* scanp: Start here (read-write). */
3009 /* deltap: Write maxlen-minlen here. */
3010 /* last: Stop before this one. */
40d049e4
YO
3011 /* data: string data about the pattern */
3012 /* stopparen: treat close N as END */
3013 /* recursed: which subroutines have we recursed into */
3014 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3015{
97aff369 3016 dVAR;
c277df42
IZ
3017 I32 min = 0, pars = 0, code;
3018 regnode *scan = *scanp, *next;
3019 I32 delta = 0;
3020 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3021 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3022 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3023 scan_data_t data_fake;
a3621e74 3024 SV *re_trie_maxbuff = NULL;
786e8c11 3025 regnode *first_non_open = scan;
e2e6a0f1 3026 I32 stopmin = I32_MAX;
8aa23a47 3027 scan_frame *frame = NULL;
a3621e74 3028 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3029
7918f24d
NC
3030 PERL_ARGS_ASSERT_STUDY_CHUNK;
3031
13a24bad 3032#ifdef DEBUGGING
40d049e4 3033 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3034#endif
40d049e4 3035
786e8c11 3036 if ( depth == 0 ) {
40d049e4 3037 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3038 first_non_open=regnext(first_non_open);
3039 }
3040
b81d288d 3041
8aa23a47
YO
3042 fake_study_recurse:
3043 while ( scan && OP(scan) != END && scan < last ){
9d071ca8
KW
3044 UV min_subtract = 0; /* How much to subtract from the minimum node
3045 length to get a real minimum (because the
3046 folded version may be shorter) */
f758bddf 3047 bool has_exactf_sharp_s = FALSE;
8aa23a47 3048 /* Peephole optimizer: */
304ee84b 3049 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47 3050 DEBUG_PEEP("Peep",scan,depth);
a0c4c608
KW
3051
3052 /* Its not clear to khw or hv why this is done here, and not in the
3053 * clauses that deal with EXACT nodes. khw's guess is that it's
3054 * because of a previous design */
9d071ca8 3055 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
3056
3057 /* Follow the next-chain of the current node and optimize
3058 away all the NOTHINGs from it. */
3059 if (OP(scan) != CURLYX) {
3060 const int max = (reg_off_by_arg[OP(scan)]
3061 ? I32_MAX
3062 /* I32 may be smaller than U16 on CRAYs! */
3063 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3064 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3065 int noff;
3066 regnode *n = scan;
686b73d4 3067
8aa23a47
YO
3068 /* Skip NOTHING and LONGJMP. */
3069 while ((n = regnext(n))
3070 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3071 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3072 && off + noff < max)
3073 off += noff;
3074 if (reg_off_by_arg[OP(scan)])
3075 ARG(scan) = off;
3076 else
3077 NEXT_OFF(scan) = off;
3078 }
a3621e74 3079
c277df42 3080
8aa23a47
YO
3081
3082 /* The principal pseudo-switch. Cannot be a switch, since we
3083 look into several different things. */
3084 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3085 || OP(scan) == IFTHEN) {
3086 next = regnext(scan);
3087 code = OP(scan);
3088 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3089
8aa23a47
YO
3090 if (OP(next) == code || code == IFTHEN) {
3091 /* NOTE - There is similar code to this block below for handling
3092 TRIE nodes on a re-study. If you change stuff here check there
3093 too. */
3094 I32 max1 = 0, min1 = I32_MAX, num = 0;
3095 struct regnode_charclass_class accum;
3096 regnode * const startbranch=scan;
686b73d4 3097
8aa23a47 3098 if (flags & SCF_DO_SUBSTR)
304ee84b 3099 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3100 if (flags & SCF_DO_STCLASS)
e755fd73 3101 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
3102
3103 while (OP(scan) == code) {
3104 I32 deltanext, minnext, f = 0, fake;
3105 struct regnode_charclass_class this_class;
3106
3107 num++;
3108 data_fake.flags = 0;
3109 if (data) {
3110 data_fake.whilem_c = data->whilem_c;
3111 data_fake.last_closep = data->last_closep;
3112 }
3113 else
3114 data_fake.last_closep = &fake;
58e23c8d
YO
3115
3116 data_fake.pos_delta = delta;
8aa23a47
YO
3117 next = regnext(scan);
3118 scan = NEXTOPER(scan);
3119 if (code != BRANCH)
c277df42 3120 scan = NEXTOPER(scan);
8aa23a47 3121 if (flags & SCF_DO_STCLASS) {
e755fd73 3122 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3123 data_fake.start_class = &this_class;
3124 f = SCF_DO_STCLASS_AND;
58e23c8d 3125 }
8aa23a47
YO
3126 if (flags & SCF_WHILEM_VISITED_POS)
3127 f |= SCF_WHILEM_VISITED_POS;
3128
3129 /* we suppose the run is continuous, last=next...*/
3130 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3131 next, &data_fake,
3132 stopparen, recursed, NULL, f,depth+1);
3133 if (min1 > minnext)
3134 min1 = minnext;
3135 if (max1 < minnext + deltanext)
3136 max1 = minnext + deltanext;
3137 if (deltanext == I32_MAX)
3138 is_inf = is_inf_internal = 1;
3139 scan = next;
3140 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3141 pars++;
3142 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3143 if ( stopmin > minnext)
3144 stopmin = min + min1;
3145 flags &= ~SCF_DO_SUBSTR;
3146 if (data)
3147 data->flags |= SCF_SEEN_ACCEPT;
3148 }
3149 if (data) {
3150 if (data_fake.flags & SF_HAS_EVAL)
3151 data->flags |= SF_HAS_EVAL;
3152 data->whilem_c = data_fake.whilem_c;
3dab1dad 3153 }
8aa23a47 3154 if (flags & SCF_DO_STCLASS)
3fffb88a 3155 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3156 }
3157 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3158 min1 = 0;
3159 if (flags & SCF_DO_SUBSTR) {
3160 data->pos_min += min1;
3161 data->pos_delta += max1 - min1;
3162 if (max1 != min1 || is_inf)
3163 data->longest = &(data->longest_float);
3164 }
3165 min += min1;
3166 delta += max1 - min1;
3167 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3168 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
3169 if (min1) {
3170 cl_and(data->start_class, and_withp);
3171 flags &= ~SCF_DO_STCLASS;
653099ff 3172 }
8aa23a47
YO
3173 }
3174 else if (flags & SCF_DO_STCLASS_AND) {
3175 if (min1) {
3176 cl_and(data->start_class, &accum);
3177 flags &= ~SCF_DO_STCLASS;
de0c8cb8 3178 }
8aa23a47
YO
3179 else {
3180 /* Switch to OR mode: cache the old value of
3181 * data->start_class */
3182 INIT_AND_WITHP;
3183 StructCopy(data->start_class, and_withp,
3184 struct regnode_charclass_class);
3185 flags &= ~SCF_DO_STCLASS_AND;
3186 StructCopy(&accum, data->start_class,
3187 struct regnode_charclass_class);
3188 flags |= SCF_DO_STCLASS_OR;
3189 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 3190 }
8aa23a47 3191 }
a3621e74 3192
8aa23a47
YO
3193 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3194 /* demq.
a3621e74 3195
8aa23a47
YO
3196 Assuming this was/is a branch we are dealing with: 'scan' now
3197 points at the item that follows the branch sequence, whatever
3198 it is. We now start at the beginning of the sequence and look
3199 for subsequences of
a3621e74 3200
8aa23a47
YO
3201 BRANCH->EXACT=>x1
3202 BRANCH->EXACT=>x2
3203 tail
a3621e74 3204
8aa23a47 3205 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 3206
486ec47a 3207 If we can find such a subsequence we need to turn the first
8aa23a47
YO
3208 element into a trie and then add the subsequent branch exact
3209 strings to the trie.
a3621e74 3210
8aa23a47 3211 We have two cases
a3621e74 3212
3b753521 3213 1. patterns where the whole set of branches can be converted.
a3621e74 3214
8aa23a47 3215 2. patterns where only a subset can be converted.
a3621e74 3216
8aa23a47
YO
3217 In case 1 we can replace the whole set with a single regop
3218 for the trie. In case 2 we need to keep the start and end
3b753521 3219 branches so
a3621e74 3220
8aa23a47
YO
3221 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3222 becomes BRANCH TRIE; BRANCH X;
786e8c11 3223
8aa23a47
YO
3224 There is an additional case, that being where there is a
3225 common prefix, which gets split out into an EXACT like node
3226 preceding the TRIE node.
a3621e74 3227
8aa23a47
YO
3228 If x(1..n)==tail then we can do a simple trie, if not we make
3229 a "jump" trie, such that when we match the appropriate word
486ec47a 3230 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 3231 a nested if into a case structure of sorts.
b515a41d 3232
8aa23a47 3233 */
686b73d4 3234
8aa23a47
YO
3235 int made=0;
3236 if (!re_trie_maxbuff) {
3237 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3238 if (!SvIOK(re_trie_maxbuff))
3239 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3240 }
3241 if ( SvIV(re_trie_maxbuff)>=0 ) {
3242 regnode *cur;
3243 regnode *first = (regnode *)NULL;
3244 regnode *last = (regnode *)NULL;
3245 regnode *tail = scan;
fab2782b 3246 U8 trietype = 0;
8aa23a47 3247 U32 count=0;
a3621e74
YO
3248
3249#ifdef DEBUGGING
8aa23a47 3250 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 3251#endif
8aa23a47
YO
3252 /* var tail is used because there may be a TAIL
3253 regop in the way. Ie, the exacts will point to the
3254 thing following the TAIL, but the last branch will
3255 point at the TAIL. So we advance tail. If we
3256 have nested (?:) we may have to move through several
3257 tails.
3258 */
3259
3260 while ( OP( tail ) == TAIL ) {
3261 /* this is the TAIL generated by (?:) */
3262 tail = regnext( tail );
3263 }
a3621e74 3264
8aa23a47 3265
df826430 3266 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3267 regprop(RExC_rx, mysv, tail );
3268 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3269 (int)depth * 2 + 2, "",
3270 "Looking for TRIE'able sequences. Tail node is: ",
3271 SvPV_nolen_const( mysv )
3272 );
3273 });
3274
3275 /*
3276
fab2782b
YO
3277 Step through the branches
3278 cur represents each branch,
3279 noper is the first thing to be matched as part of that branch
3280 noper_next is the regnext() of that node.
3281
3282 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3283 via a "jump trie" but we also support building with NOJUMPTRIE,
3284 which restricts the trie logic to structures like /FOO|BAR/.
3285
3286 If noper is a trieable nodetype then the branch is a possible optimization
3287 target. If we are building under NOJUMPTRIE then we require that noper_next
3288 is the same as scan (our current position in the regex program).
3289
3290 Once we have two or more consecutive such branches we can create a
3291 trie of the EXACT's contents and stitch it in place into the program.
3292
3293 If the sequence represents all of the branches in the alternation we
3294 replace the entire thing with a single TRIE node.
3295
3296 Otherwise when it is a subsequence we need to stitch it in place and
3297 replace only the relevant branches. This means the first branch has
3298 to remain as it is used by the alternation logic, and its next pointer,
3299 and needs to be repointed at the item on the branch chain following
3300 the last branch we have optimized away.
3301
3302 This could be either a BRANCH, in which case the subsequence is internal,
3303 or it could be the item following the branch sequence in which case the
3304 subsequence is at the end (which does not necessarily mean the first node
3305 is the start of the alternation).
3306
3307 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3308
3309 optype | trietype
3310 ----------------+-----------
3311 NOTHING | NOTHING
3312 EXACT | EXACT
3313 EXACTFU | EXACTFU
3314 EXACTFU_SS | EXACTFU
3315 EXACTFU_TRICKYFOLD | EXACTFU
3316 EXACTFA | 0
3317
8aa23a47
YO
3318
3319 */
fab2782b
YO
3320#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3321 ( EXACT == (X) ) ? EXACT : \
3322 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3323 0 )
8aa23a47
YO
3324
3325 /* dont use tail as the end marker for this traverse */
3326 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3327 regnode * const noper = NEXTOPER( cur );
fab2782b
YO
3328 U8 noper_type = OP( noper );
3329 U8 noper_trietype = TRIE_TYPE( noper_type );
b515a41d 3330#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 3331 regnode * const noper_next = regnext( noper );
df826430
YO
3332 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3333 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
b515a41d
YO
3334#endif
3335
df826430 3336 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3337 regprop(RExC_rx, mysv, cur);
3338 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3339 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3340
3341 regprop(RExC_rx, mysv, noper);
3342 PerlIO_printf( Perl_debug_log, " -> %s",
3343 SvPV_nolen_const(mysv));
3344
3345 if ( noper_next ) {
3346 regprop(RExC_rx, mysv, noper_next );
3347 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3348 SvPV_nolen_const(mysv));
3349 }
df826430
YO
3350 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3351 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3352 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3353 );
8aa23a47 3354 });
fab2782b
YO
3355
3356 /* Is noper a trieable nodetype that can be merged with the
3357 * current trie (if there is one)? */
3358 if ( noper_trietype
3359 &&
3360 (
df826430
YO
3361 ( noper_trietype == NOTHING)
3362 || ( trietype == NOTHING )
a40630bf 3363 || ( trietype == noper_trietype )
fab2782b 3364 )
786e8c11 3365#ifdef NOJUMPTRIE
8aa23a47 3366 && noper_next == tail
786e8c11 3367#endif
8aa23a47
YO
3368 && count < U16_MAX)
3369 {
fab2782b
YO
3370 /* Handle mergable triable node
3371 * Either we are the first node in a new trieable sequence,
3372 * in which case we do some bookkeeping, otherwise we update
3373 * the end pointer. */
fab2782b 3374 if ( !first ) {
df826430
YO
3375 if ( noper_trietype == NOTHING ) {
3376#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3377 regnode * const noper_next = regnext( noper );
3378 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3379 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3380#endif
3381
3382 if ( noper_next_trietype ) {
3383 first = cur;
3384 trietype = noper_next_trietype;
3385 }
3386 } else {
3387 first = cur;
3388 trietype = noper_trietype;
3389 }
8aa23a47 3390 } else {
fab2782b
YO
3391 if ( trietype == NOTHING )
3392 trietype = noper_trietype;
8aa23a47
YO
3393 last = cur;
3394 }
df826430
YO
3395 if (first)
3396 count++;
fab2782b
YO
3397 } /* end handle mergable triable node */
3398 else {
3399 /* handle unmergable node -
3400 * noper may either be a triable node which can not be tried
3401 * together with the current trie, or a non triable node */
729aaeb5
YO
3402 if ( last ) {
3403 /* If last is set and trietype is not NOTHING then we have found
3404 * at least two triable branch sequences in a row of a similar
3405 * trietype so we can turn them into a trie. If/when we
3406 * allow NOTHING to start a trie sequence this condition will be
3407 * required, and it isn't expensive so we leave it in for now. */
3408 if ( trietype != NOTHING )
3409 make_trie( pRExC_state,
3410 startbranch, first, cur, tail, count,
3411 trietype, depth+1 );
fab2782b 3412 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
8aa23a47 3413 }
fab2782b 3414 if ( noper_trietype
786e8c11 3415#ifdef NOJUMPTRIE
8aa23a47 3416 && noper_next == tail
786e8c11 3417#endif
8aa23a47 3418 ){
fab2782b 3419 /* noper is triable, so we can start a new trie sequence */
8aa23a47
YO
3420 count = 1;
3421 first = cur;
fab2782b
YO
3422 trietype = noper_trietype;
3423 } else if (first) {
3424 /* if we already saw a first but the current node is not triable then we have
3425 * to reset the first information. */
8aa23a47
YO
3426 count = 0;
3427 first = NULL;
fab2782b 3428 trietype = 0;
8aa23a47 3429 }
fab2782b
YO
3430 } /* end handle unmergable node */
3431 } /* loop over branches */
df826430 3432 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3433 regprop(RExC_rx, mysv, cur);
3434 PerlIO_printf( Perl_debug_log,
3435 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3436 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3437
3438 });
fab2782b
YO
3439 if ( last && trietype != NOTHING ) {
3440 /* the last branch of the sequence was part of a trie,
3441 * so we have to construct it here outside of the loop
3442 */
3443 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
686b73d4 3444#ifdef TRIE_STUDY_OPT
8aa23a47
YO
3445 if ( ((made == MADE_EXACT_TRIE &&
3446 startbranch == first)
3447 || ( first_non_open == first )) &&
3448 depth==0 ) {
3449 flags |= SCF_TRIE_RESTUDY;
3450 if ( startbranch == first
3451 && scan == tail )
3452 {
3453 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3454 }
3455 }
3dab1dad 3456#endif
fab2782b
YO
3457 } /* end if ( last) */
3458 } /* TRIE_MAXBUF is non zero */
8aa23a47
YO
3459
3460 } /* do trie */
3461
653099ff 3462 }
8aa23a47
YO
3463 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3464 scan = NEXTOPER(NEXTOPER(scan));
3465 } else /* single branch is optimized. */
3466 scan = NEXTOPER(scan);
3467 continue;
3468 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3469 scan_frame *newframe = NULL;
3470 I32 paren;
3471 regnode *start;
3472 regnode *end;
3473
3474 if (OP(scan) != SUSPEND) {
3475 /* set the pointer */
3476 if (OP(scan) == GOSUB) {
3477 paren = ARG(scan);
3478 RExC_recurse[ARG2L(scan)] = scan;
3479 start = RExC_open_parens[paren-1];
3480 end = RExC_close_parens[paren-1];
3481 } else {
3482 paren = 0;
f8fc2ecf 3483 start = RExC_rxi->program + 1;
8aa23a47
YO
3484 end = RExC_opend;
3485 }
3486 if (!recursed) {
3487 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3488 SAVEFREEPV(recursed);
3489 }
3490 if (!PAREN_TEST(recursed,paren+1)) {
3491 PAREN_SET(recursed,paren+1);
3492 Newx(newframe,1,scan_frame);
3493 } else {
3494 if (flags & SCF_DO_SUBSTR) {
304ee84b 3495 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3496 data->longest = &(data->longest_float);
3497 }
3498 is_inf = is_inf_internal = 1;
3499 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3500 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3501 flags &= ~SCF_DO_STCLASS;
3502 }
3503 } else {
3504 Newx(newframe,1,scan_frame);
3505 paren = stopparen;
3506 start = scan+2;
3507 end = regnext(scan);
3508 }
3509 if (newframe) {
3510 assert(start);
3511 assert(end);
3512 SAVEFREEPV(newframe);
3513 newframe->next = regnext(scan);
3514 newframe->last = last;
3515 newframe->stop = stopparen;
3516 newframe->prev = frame;
3517
3518 frame = newframe;
3519 scan = start;
3520 stopparen = paren;
3521 last = end;
3522
3523 continue;
3524 }
3525 }
3526 else if (OP(scan) == EXACT) {
3527 I32 l = STR_LEN(scan);
3528 UV uc;
3529 if (UTF) {
3530 const U8 * const s = (U8*)STRING(scan);
4b88fb76 3531 uc = utf8_to_uvchr_buf(s, s + l, NULL);
8aa23a47 3532 l = utf8_length(s, s + l);
8aa23a47
YO
3533 } else {
3534 uc = *((U8*)STRING(scan));
3535 }
3536 min += l;
3537 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3538 /* The code below prefers earlier match for fixed
3539 offset, later match for variable offset. */
3540 if (data->last_end == -1) { /* Update the start info. */
3541 data->last_start_min = data->pos_min;
3542 data->last_start_max = is_inf
3543 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3544 }
8aa23a47
YO
3545 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3546 if (UTF)
3547 SvUTF8_on(data->last_found);
3548 {
3549 SV * const sv = data->last_found;
3550 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3551 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3552 if (mg && mg->mg_len >= 0)
3553 mg->mg_len += utf8_length((U8*)STRING(scan),
3554 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3555 }
8aa23a47
YO
3556 data->last_end = data->pos_min + l;
3557 data->pos_min += l; /* As in the first entry. */
3558 data->flags &= ~SF_BEFORE_EOL;
3559 }
3560 if (flags & SCF_DO_STCLASS_AND) {
3561 /* Check whether it is compatible with what we know already! */
3562 int compat = 1;
3563
54251c2e 3564
486ec47a 3565 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3566 * in the bitmp and either 1) its bit or its fold is set, or 2)
3567 * it's for a locale. Even if there isn't unicode semantics
3568 * here, at runtime there may be because of matching against a
3569 * utf8 string, so accept a possible false positive for
3570 * latin1-range folds */
8aa23a47
YO
3571 if (uc >= 0x100 ||
3572 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3573 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3574 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3575 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3576 )
d18bf9dc 3577 {
8aa23a47 3578 compat = 0;
d18bf9dc 3579 }
8aa23a47
YO
3580 ANYOF_CLASS_ZERO(data->start_class);
3581 ANYOF_BITMAP_ZERO(data->start_class);
3582 if (compat)
3583 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3584 else if (uc >= 0x100) {
3585 int i;
3586
3587 /* Some Unicode code points fold to the Latin1 range; as
3588 * XXX temporary code, instead of figuring out if this is
3589 * one, just assume it is and set all the start class bits
3590 * that could be some such above 255 code point's fold
3591 * which will generate fals positives. As the code
3592 * elsewhere that does compute the fold settles down, it
3593 * can be extracted out and re-used here */
3594 for (i = 0; i < 256; i++){
94dc5c2d 3595 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
d18bf9dc
KW
3596 ANYOF_BITMAP_SET(data->start_class, i);
3597 }
3598 }
3599 }
8aa23a47
YO
3600 data->start_class->flags &= ~ANYOF_EOS;
3601 if (uc < 0x100)
3602 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3603 }
3604 else if (flags & SCF_DO_STCLASS_OR) {
3605 /* false positive possible if the class is case-folded */
3606 if (uc < 0x100)
3607 ANYOF_BITMAP_SET(data->start_class, uc);
3608 else
3609 data->start_class->flags |= ANYOF_UNICODE_ALL;
3610 data->start_class->flags &= ~ANYOF_EOS;
3611 cl_and(data->start_class, and_withp);
3612 }
3613 flags &= ~SCF_DO_STCLASS;
3614 }
3615 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3616 I32 l = STR_LEN(scan);
3617 UV uc = *((U8*)STRING(scan));
3618
3619 /* Search for fixed substrings supports EXACT only. */
3620 if (flags & SCF_DO_SUBSTR) {
3621 assert(data);
304ee84b 3622 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3623 }
3624 if (UTF) {
3625 const U8 * const s = (U8 *)STRING(scan);
4b88fb76 3626 uc = utf8_to_uvchr_buf(s, s + l, NULL);
8aa23a47 3627 l = utf8_length(s, s + l);
8aa23a47 3628 }
f758bddf
KW
3629 else if (has_exactf_sharp_s) {
3630 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
bb914485 3631 }
9d071ca8 3632 min += l - min_subtract;
f646642f
KW
3633 if (min < 0) {
3634 min = 0;
3635 }
9d071ca8 3636 delta += min_subtract;
f646642f 3637 if (flags & SCF_DO_SUBSTR) {
9d071ca8 3638 data->pos_min += l - min_subtract;
f646642f
KW
3639 if (data->pos_min < 0) {
3640 data->pos_min = 0;
3641 }
9d071ca8
KW
3642 data->pos_delta += min_subtract;
3643 if (min_subtract) {
d2197104
KW
3644 data->longest = &(data->longest_float);
3645 }
f646642f 3646 }
8aa23a47
YO
3647 if (flags & SCF_DO_STCLASS_AND) {
3648 /* Check whether it is compatible with what we know already! */
3649 int compat = 1;
8aa23a47 3650 if (uc >= 0x100 ||
54251c2e
KW
3651 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3652 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3653 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3654 {
8aa23a47 3655 compat = 0;
54251c2e 3656 }
8aa23a47
YO
3657 ANYOF_CLASS_ZERO(data->start_class);
3658 ANYOF_BITMAP_ZERO(data->start_class);
3659 if (compat) {
3660 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3661 data->start_class->flags &= ~ANYOF_EOS;
39065660 3662 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3663 if (OP(scan) == EXACTFL) {
af302e7f
KW
3664 /* XXX This set is probably no longer necessary, and
3665 * probably wrong as LOCALE now is on in the initial
3666 * state */
8aa23a47 3667 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3668 }
3669 else {
3670
54251c2e
KW
3671 /* Also set the other member of the fold pair. In case
3672 * that unicode semantics is called for at runtime, use
3673 * the full latin1 fold. (Can't do this for locale,
a0c4c608 3674 * because not known until runtime) */
54251c2e 3675 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
e22b340a 3676
a0c4c608
KW
3677 /* All other (EXACTFL handled above) folds except under
3678 * /iaa that include s, S, and sharp_s also may include
3679 * the others */
e22b340a
KW
3680 if (OP(scan) != EXACTFA) {
3681 if (uc == 's' || uc == 'S') {
3682 ANYOF_BITMAP_SET(data->start_class,
3683 LATIN_SMALL_LETTER_SHARP_S);
3684 }
3685 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3686 ANYOF_BITMAP_SET(data->start_class, 's');
3687 ANYOF_BITMAP_SET(data->start_class, 'S');
3688 }
3689 }
970c8436 3690 }
653099ff 3691 }
d18bf9dc
KW
3692 else if (uc >= 0x100) {
3693 int i;
3694 for (i = 0; i < 256; i++){
3695 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3696 ANYOF_BITMAP_SET(data->start_class, i);
3697 }
3698 }
3699 }
8aa23a47
YO
3700 }
3701 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3702 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3703 /* false positive possible if the class is case-folded.
3704 Assume that the locale settings are the same... */
970c8436 3705 if (uc < 0x100) {
1aa99e6b 3706 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3707 if (OP(scan) != EXACTFL) {
3708
3709 /* And set the other member of the fold pair, but
3710 * can't do that in locale because not known until
3711 * run-time */
3712 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3713 PL_fold_latin1[uc]);
e22b340a
KW
3714
3715 /* All folds except under /iaa that include s, S,
3716 * and sharp_s also may include the others */
3717 if (OP(scan) != EXACTFA) {
3718 if (uc == 's' || uc == 'S') {
3719 ANYOF_BITMAP_SET(data->start_class,
3720 LATIN_SMALL_LETTER_SHARP_S);
3721 }
3722 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3723 ANYOF_BITMAP_SET(data->start_class, 's');
3724 ANYOF_BITMAP_SET(data->start_class, 'S');
3725 }
3726 }
970c8436
KW
3727 }
3728 }
653099ff
GS
3729 data->start_class->flags &= ~ANYOF_EOS;
3730 }
8aa23a47 3731 cl_and(data->start_class, and_withp);
653099ff 3732 }
8aa23a47
YO
3733 flags &= ~SCF_DO_STCLASS;
3734 }
e52fc539 3735 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3736 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3737 I32 f = flags, pos_before = 0;
3738 regnode * const oscan = scan;
3739 struct regnode_charclass_class this_class;
3740 struct regnode_charclass_class *oclass = NULL;
3741 I32 next_is_eval = 0;
3742
3743 switch (PL_regkind[OP(scan)]) {
3744 case WHILEM: /* End of (?:...)* . */
3745 scan = NEXTOPER(scan);
3746 goto finish;
3747 case PLUS:
3748 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3749 next = NEXTOPER(scan);
3750 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3751 mincount = 1;
3752 maxcount = REG_INFTY;
3753 next = regnext(scan);
3754 scan = NEXTOPER(scan);
3755 goto do_curly;
3756 }
3757 }
3758 if (flags & SCF_DO_SUBSTR)
3759 data->pos_min++;
3760 min++;
3761 /* Fall through. */
3762 case STAR:
3763 if (flags & SCF_DO_STCLASS) {
3764 mincount = 0;
3765 maxcount = REG_INFTY;
3766 next = regnext(scan);
3767 scan = NEXTOPER(scan);
3768 goto do_curly;
3769 }
3770 is_inf = is_inf_internal = 1;
3771 scan = regnext(scan);
c277df42 3772 if (flags & SCF_DO_SUBSTR) {
304ee84b 3773 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3774 data->longest = &(data->longest_float);
c277df42 3775 }
8aa23a47
YO
3776 goto optimize_curly_tail;
3777 case CURLY:
3778 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3779 && (scan->flags == stopparen))
3780 {
3781 mincount = 1;
3782 maxcount = 1;
3783 } else {
3784 mincount = ARG1(scan);
3785 maxcount = ARG2(scan);
653099ff 3786 }
8aa23a47
YO
3787 next = regnext(scan);
3788 if (OP(scan) == CURLYX) {
3789 I32 lp = (data ? *(data->last_closep) : 0);
3790 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3791 }
8aa23a47
YO
3792 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3793 next_is_eval = (OP(scan) == EVAL);
3794 do_curly:
3795 if (flags & SCF_DO_SUBSTR) {
304ee84b 3796 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3797 pos_before = data->pos_min;
b45f050a 3798 }
8aa23a47
YO
3799 if (data) {
3800 fl = data->flags;
3801 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3802 if (is_inf)
3803 data->flags |= SF_IS_INF;
3804 }
3805 if (flags & SCF_DO_STCLASS) {
e755fd73 3806 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3807 oclass = data->start_class;
3808 data->start_class = &this_class;
3809 f |= SCF_DO_STCLASS_AND;
3810 f &= ~SCF_DO_STCLASS_OR;
3811 }
779bcb7d
NC
3812 /* Exclude from super-linear cache processing any {n,m}
3813 regops for which the combination of input pos and regex
3814 pos is not enough information to determine if a match
3815 will be possible.
3816
3817 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3818 regex pos at the \s*, the prospects for a match depend not
3819 only on the input position but also on how many (bar\s*)
3820 repeats into the {4,8} we are. */
3821 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3822 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3823
8aa23a47
YO
3824 /* This will finish on WHILEM, setting scan, or on NULL: */
3825 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3826 last, data, stopparen, recursed, NULL,
3827 (mincount == 0
3828 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3829
8aa23a47
YO
3830 if (flags & SCF_DO_STCLASS)
3831 data->start_class = oclass;
3832 if (mincount == 0 || minnext == 0) {
3833 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3834 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3835 }
3836 else if (flags & SCF_DO_STCLASS_AND) {
3837 /* Switch to OR mode: cache the old value of
3838 * data->start_class */
3839 INIT_AND_WITHP;
3840 StructCopy(data->start_class, and_withp,
3841 struct regnode_charclass_class);
3842 flags &= ~SCF_DO_STCLASS_AND;
3843 StructCopy(&this_class, data->start_class,
3844 struct regnode_charclass_class);
3845 flags |= SCF_DO_STCLASS_OR;
3846 data->start_class->flags |= ANYOF_EOS;
3847 }
3848 } else { /* Non-zero len */
3849 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3850 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3851 cl_and(data->start_class, and_withp);
3852 }
3853 else if (flags & SCF_DO_STCLASS_AND)
3854 cl_and(data->start_class, &this_class);
3855 flags &= ~SCF_DO_STCLASS;
3856 }
3857 if (!scan) /* It was not CURLYX, but CURLY. */
3858 scan = next;
3859 if ( /* ? quantifier ok, except for (?{ ... }) */
3860 (next_is_eval || !(mincount == 0 && maxcount == 1))
3861 && (minnext == 0) && (deltanext == 0)
3862 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3863 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3864 {
668c081a
NC
3865 ckWARNreg(RExC_parse,
3866 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3867 }
3868
3869 min += minnext * mincount;
3870 is_inf_internal |= ((maxcount == REG_INFTY
3871 && (minnext + deltanext) > 0)
3872 || deltanext == I32_MAX);
3873 is_inf |= is_inf_internal;
3874 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3875
3876 /* Try powerful optimization CURLYX => CURLYN. */
3877 if ( OP(oscan) == CURLYX && data
3878 && data->flags & SF_IN_PAR
3879 && !(data->flags & SF_HAS_EVAL)
3880 && !deltanext && minnext == 1 ) {
3881 /* Try to optimize to CURLYN. */
3882 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3883 regnode * const nxt1 = nxt;
497b47a8 3884#ifdef DEBUGGING
8aa23a47 3885 regnode *nxt2;
497b47a8 3886#endif
c277df42 3887
8aa23a47
YO
3888 /* Skip open. */
3889 nxt = regnext(nxt);
e52fc539 3890 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3891 && !(PL_regkind[OP(nxt)] == EXACT
3892 && STR_LEN(nxt) == 1))
3893 goto nogo;
497b47a8 3894#ifdef DEBUGGING
8aa23a47 3895 nxt2 = nxt;
497b47a8 3896#endif
8aa23a47
YO
3897 nxt = regnext(nxt);
3898 if (OP(nxt) != CLOSE)
3899 goto nogo;
3900 if (RExC_open_parens) {
3901 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3902 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3903 }
3904 /* Now we know that nxt2 is the only contents: */
3905 oscan->flags = (U8)ARG(nxt);
3906 OP(oscan) = CURLYN;
3907 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3908
c277df42 3909#ifdef DEBUGGING
8aa23a47 3910 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3911 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3912 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3913 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3914 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3915 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3916#endif
8aa23a47
YO
3917 }
3918 nogo:
3919
3920 /* Try optimization CURLYX => CURLYM. */
3921 if ( OP(oscan) == CURLYX && data
3922 && !(data->flags & SF_HAS_PAR)
3923 && !(data->flags & SF_HAS_EVAL)
3924 && !deltanext /* atom is fixed width */
3925 && minnext != 0 /* CURLYM can't handle zero width */
3926 ) {
3927 /* XXXX How to optimize if data == 0? */
3928 /* Optimize to a simpler form. */
3929 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3930 regnode *nxt2;
3931
3932 OP(oscan) = CURLYM;
3933 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3934 && (OP(nxt2) != WHILEM))
3935 nxt = nxt2;
3936 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3937 /* Need to optimize away parenths. */
b3c0965f 3938 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3939 /* Set the parenth number. */
3940 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3941
8aa23a47
YO
3942 oscan->flags = (U8)ARG(nxt);
3943 if (RExC_open_parens) {
3944 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3945 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3946 }
8aa23a47
YO
3947 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3948 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3949
c277df42 3950#ifdef DEBUGGING
8aa23a47
YO
3951 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3952 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3953 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3954 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3955#endif
c277df42 3956#if 0
8aa23a47
YO
3957 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3958 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3959 if (nnxt == nxt) {
3960 if (reg_off_by_arg[OP(nxt1)])
3961 ARG_SET(nxt1, nxt2 - nxt1);
3962 else if (nxt2 - nxt1 < U16_MAX)
3963 NEXT_OFF(nxt1) = nxt2 - nxt1;
3964 else
3965 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3966 }
8aa23a47 3967 nxt1 = nnxt;
c277df42 3968 }
5d1c421c 3969#endif
8aa23a47
YO
3970 /* Optimize again: */
3971 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3972 NULL, stopparen, recursed, NULL, 0,depth+1);
3973 }
3974 else
3975 oscan->flags = 0;
3976 }
3977 else if ((OP(oscan) == CURLYX)
3978 && (flags & SCF_WHILEM_VISITED_POS)
3979 /* See the comment on a similar expression above.
3b753521 3980 However, this time it's not a subexpression
8aa23a47
YO
3981 we care about, but the expression itself. */
3982 && (maxcount == REG_INFTY)
3983 && data && ++data->whilem_c < 16) {
3984 /* This stays as CURLYX, we can put the count/of pair. */
3985 /* Find WHILEM (as in regexec.c) */
3986 regnode *nxt = oscan + NEXT_OFF(oscan);
3987
3988 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3989 nxt += ARG(nxt);
3990 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3991 | (RExC_whilem_seen << 4)); /* On WHILEM */
3992 }
3993 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3994 pars++;
3995 if (flags & SCF_DO_SUBSTR) {
3996 SV *last_str = NULL;
3997 int counted = mincount != 0;
a0ed51b3 3998
8aa23a47
YO
3999 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4000#if defined(SPARC64_GCC_WORKAROUND)
4001 I32 b = 0;
4002 STRLEN l = 0;
4003 const char *s = NULL;
4004 I32 old = 0;
b515a41d 4005
8aa23a47
YO
4006 if (pos_before >= data->last_start_min)
4007 b = pos_before;
4008 else
4009 b = data->last_start_min;
b515a41d 4010
8aa23a47
YO
4011 l = 0;
4012 s = SvPV_const(data->last_found, l);
4013 old = b - data->last_start_min;
4014
4015#else
4016 I32 b = pos_before >= data->last_start_min
4017 ? pos_before : data->last_start_min;
4018 STRLEN l;
4019 const char * const s = SvPV_const(data->last_found, l);
4020 I32 old = b - data->last_start_min;
4021#endif
4022
4023 if (UTF)
4024 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
4025 l -= old;
4026 /* Get the added string: */
740cce10 4027 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
4028 if (deltanext == 0 && pos_before == b) {
4029 /* What was added is a constant string */
4030 if (mincount > 1) {
4031 SvGROW(last_str, (mincount * l) + 1);
4032 repeatcpy(SvPVX(last_str) + l,
4033 SvPVX_const(last_str), l, mincount - 1);
4034 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4035 /* Add additional parts. */
4036 SvCUR_set(data->last_found,
4037 SvCUR(data->last_found) - l);
4038 sv_catsv(data->last_found, last_str);
4039 {
4040 SV * sv = data->last_found;
4041 MAGIC *mg =
4042 SvUTF8(sv) && SvMAGICAL(sv) ?
4043 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4044 if (mg && mg->mg_len >= 0)
bd94e887 4045 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 4046 }
8aa23a47 4047 data->last_end += l * (mincount - 1);
b515a41d 4048 }
8aa23a47
YO
4049 } else {
4050 /* start offset must point into the last copy */
4051 data->last_start_min += minnext * (mincount - 1);
4052 data->last_start_max += is_inf ? I32_MAX
4053 : (maxcount - 1) * (minnext + data->pos_delta);
4054 }
c277df42 4055 }
8aa23a47
YO
4056 /* It is counted once already... */
4057 data->pos_min += minnext * (mincount - counted);
4058 data->pos_delta += - counted * deltanext +
4059 (minnext + deltanext) * maxcount - minnext * mincount;
4060 if (mincount != maxcount) {
4061 /* Cannot extend fixed substrings found inside
4062 the group. */
304ee84b 4063 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
4064 if (mincount && last_str) {
4065 SV * const sv = data->last_found;
4066 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4067 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4068
4069 if (mg)
4070 mg->mg_len = -1;
4071 sv_setsv(sv, last_str);
4072 data->last_end = data->pos_min;
4073 data->last_start_min =
4074 data->pos_min - CHR_SVLEN(last_str);
4075 data->last_start_max = is_inf
4076 ? I32_MAX
4077 : data->pos_min + data->pos_delta
4078 - CHR_SVLEN(last_str);
4079 }
4080 data->longest = &(data->longest_float);
4081 }
4082 SvREFCNT_dec(last_str);
c277df42 4083 }
8aa23a47
YO
4084 if (data && (fl & SF_HAS_EVAL))
4085 data->flags |= SF_HAS_EVAL;
4086 optimize_curly_tail:
4087 if (OP(oscan) != CURLYX) {
4088 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4089 && NEXT_OFF(next))
4090 NEXT_OFF(oscan) += NEXT_OFF(next);
4091 }
4092 continue;
f56b6394 4093 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 4094 if (flags & SCF_DO_SUBSTR) {
304ee84b 4095 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4096 data->longest = &(data->longest_float);
4097 }
4098 is_inf = is_inf_internal = 1;
4099 if (flags & SCF_DO_STCLASS_OR)
3fffb88a 4100 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
4101 flags &= ~SCF_DO_STCLASS;
4102 break;
c277df42 4103 }
8aa23a47 4104 }
e1d1eefb
YO
4105 else if (OP(scan) == LNBREAK) {
4106 if (flags & SCF_DO_STCLASS) {
4107 int value = 0;
4108 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4109 if (flags & SCF_DO_STCLASS_AND) {
4110 for (value = 0; value < 256; value++)
e64b1bd1 4111 if (!is_VERTWS_cp(value))
b9a59e08
KW
4112 ANYOF_BITMAP_CLEAR(data->start_class, value);
4113 }
4114 else {
e1d1eefb 4115 for (value = 0; value < 256; value++)
e64b1bd1 4116 if (is_VERTWS_cp(value))
b9a59e08
KW
4117 ANYOF_BITMAP_SET(data->start_class, value);
4118 }
e1d1eefb
YO
4119 if (flags & SCF_DO_STCLASS_OR)
4120 cl_and(data->start_class, and_withp);
4121 flags &= ~SCF_DO_STCLASS;
4122 }
4123 min += 1;
f9a79580 4124 delta += 1;
e1d1eefb
YO
4125 if (flags & SCF_DO_SUBSTR) {
4126 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4127 data->pos_min += 1;
f9a79580 4128 data->pos_delta += 1;
e1d1eefb
YO
4129 data->longest = &(data->longest_float);
4130 }
e1d1eefb 4131 }
e52fc539 4132 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 4133 int value = 0;
653099ff 4134
8aa23a47 4135 if (flags & SCF_DO_SUBSTR) {
304ee84b 4136 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
4137 data->pos_min++;
4138 }
4139 min++;
4140 if (flags & SCF_DO_STCLASS) {
4141 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 4142
8aa23a47
YO
4143 /* Some of the logic below assumes that switching
4144 locale on will only add false positives. */
4145 switch (PL_regkind[OP(scan)]) {
4146 case SANY:
4147 default:
4148 do_default:
4149 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4150 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 4151 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
4152 break;
4153 case REG_ANY:
4154 if (OP(scan) == SANY)
4155 goto do_default;
4156 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4157 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 4158 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3fffb88a 4159 cl_anything(pRExC_state, data->start_class);
653099ff 4160 }
8aa23a47
YO
4161 if (flags & SCF_DO_STCLASS_AND || !value)
4162 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4163 break;
4164 case ANYOF:
4165 if (flags & SCF_DO_STCLASS_AND)
4166 cl_and(data->start_class,
4167 (struct regnode_charclass_class*)scan);
653099ff 4168 else
3fffb88a 4169 cl_or(pRExC_state, data->start_class,
8aa23a47
YO
4170 (struct regnode_charclass_class*)scan);
4171 break;
4172 case ALNUM:
4173 if (flags & SCF_DO_STCLASS_AND) {
4174 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4175 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 4176 if (OP(scan) == ALNUMU) {
a12cf05f
KW
4177 for (value = 0; value < 256; value++) {
4178 if (!isWORDCHAR_L1(value)) {
4179 ANYOF_BITMAP_CLEAR(data->start_class, value);
4180 }
4181 }
4182 } else {
4183 for (value = 0; value < 256; value++) {
4184 if (!isALNUM(value)) {
4185 ANYOF_BITMAP_CLEAR(data->start_class, value);
4186 }
4187 }
4188 }
8aa23a47 4189 }
653099ff 4190 }
8aa23a47
YO
4191 else {
4192 if (data->start_class->flags & ANYOF_LOCALE)
4193 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
af302e7f
KW
4194
4195 /* Even if under locale, set the bits for non-locale
4196 * in case it isn't a true locale-node. This will
4197 * create false positives if it truly is locale */
4198 if (OP(scan) == ALNUMU) {
a12cf05f
KW
4199 for (value = 0; value < 256; value++) {
4200 if (isWORDCHAR_L1(value)) {
4201 ANYOF_BITMAP_SET(data->start_class, value);
4202 }
4203 }
4204 } else {
4205 for (value = 0; value < 256; value++) {
4206 if (isALNUM(value)) {
4207 ANYOF_BITMAP_SET(data->start_class, value);
4208 }
4209 }
4210 }
8aa23a47
YO
4211 }
4212 break;
8aa23a47
YO
4213 case NALNUM:
4214 if (flags & SCF_DO_STCLASS_AND) {
4215 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4216 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 4217 if (OP(scan) == NALNUMU) {
a12cf05f
KW
4218 for (value = 0; value < 256; value++) {
4219 if (isWORDCHAR_L1(value)) {
4220 ANYOF_BITMAP_CLEAR(data->start_class, value);
4221 }
4222 }
4223 } else {
4224 for (value = 0; value < 256; value++) {
4225 if (isALNUM(value)) {
4226 ANYOF_BITMAP_CLEAR(data->start_class, value);
4227 }
4228 }
4229 }
653099ff
GS
4230 }
4231 }
8aa23a47
YO
4232 else {
4233 if (data->start_class->flags & ANYOF_LOCALE)
4234 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
af302e7f 4235
75950e1c
KW
4236 /* Even if under locale, set the bits for non-locale in
4237 * case it isn't a true locale-node. This will create
4238 * false positives if it truly is locale */
4239 if (OP(scan) == NALNUMU) {
4240 for (value = 0; value < 256; value++) {
4241 if (! isWORDCHAR_L1(value)) {
4242 ANYOF_BITMAP_SET(data->start_class, value);
4243 }
e9a9c1bc 4244 }
75950e1c
KW
4245 } else {
4246 for (value = 0; value < 256; value++) {
4247 if (! isALNUM(value)) {
4248 ANYOF_BITMAP_SET(data->start_class, value);
4249 }
4250 }
4251 }
653099ff 4252 }
8aa23a47 4253 break;
8aa23a47
YO
4254 case SPACE:
4255 if (flags & SCF_DO_STCLASS_AND) {
4256 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4257 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
980866de 4258 if (OP(scan) == SPACEU) {
a12cf05f
KW
4259 for (value = 0; value < 256; value++) {
4260 if (!isSPACE_L1(value)) {
4261 ANYOF_BITMAP_CLEAR(data->start_class, value);
4262 }
4263 }
4264 } else {
4265 for (value = 0; value < 256; value++) {
4266 if (!isSPACE(value)) {
4267 ANYOF_BITMAP_CLEAR(data->start_class, value);
4268 }
4269 }
4270 }
653099ff
GS
4271 }
4272 }
8aa23a47 4273 else {
a12cf05f 4274 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 4275 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f 4276 }
af302e7f 4277 if (OP(scan) == SPACEU) {
a12cf05f
KW
4278 for (value = 0; value < 256; value++) {
4279 if (isSPACE_L1(value)) {
4280 ANYOF_BITMAP_SET(data->start_class, value);
4281 }
4282 }
4283 } else {
4284 for (value = 0; value < 256; value++) {
4285 if (isSPACE(value)) {
4286 ANYOF_BITMAP_SET(data->start_class, value);
4287 }
4288 }
8aa23a47 4289 }
653099ff 4290 }
8aa23a47 4291 break;
8aa23a47
YO
4292 case NSPACE:
4293 if (flags & SCF_DO_STCLASS_AND) {
4294 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4295 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
980866de 4296 if (OP(scan) == NSPACEU) {
a12cf05f
KW
4297 for (value = 0; value < 256; value++) {
4298 if (isSPACE_L1(value)) {
4299 ANYOF_BITMAP_CLEAR(data->start_class, value);
4300 }
4301 }
4302 } else {
4303 for (value = 0; value < 256; value++) {
4304 if (isSPACE(value)) {
4305 ANYOF_BITMAP_CLEAR(data->start_class, value);
4306 }
4307 }
4308 }
653099ff 4309 }
8aa23a47
YO
4310 }
4311 else {
4312 if (data->start_class->flags & ANYOF_LOCALE)
4313 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
af302e7f 4314 if (OP(scan) == NSPACEU) {
a12cf05f
KW
4315 for (value = 0; value < 256; value++) {
4316 if (!isSPACE_L1(value)) {
4317 ANYOF_BITMAP_SET(data->start_class, value);
4318 }
4319 }
4320 }
4321 else {
4322 for (value = 0; value < 256; value++) {
4323 if (!isSPACE(value)) {
4324 ANYOF_BITMAP_SET(data->start_class, value);
4325 }
4326 }
4327 }
653099ff 4328 }
8aa23a47 4329 break;
8aa23a47
YO
4330 case DIGIT:
4331 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 4332 if (!(data->start_class->flags & ANYOF_LOCALE)) {
bf3c5c06
KW
4333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4334 for (value = 0; value < 256; value++)
4335 if (!isDIGIT(value))
4336 ANYOF_BITMAP_CLEAR(data->start_class, value);
bcc0256f 4337 }
8aa23a47
YO
4338 }
4339 else {
4340 if (data->start_class->flags & ANYOF_LOCALE)
4341 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
75950e1c
KW
4342 for (value = 0; value < 256; value++)
4343 if (isDIGIT(value))
4344 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
4345 }
4346 break;
4347 case NDIGIT:
4348 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 4349 if (!(data->start_class->flags & ANYOF_LOCALE))
bf3c5c06 4350 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
8aa23a47
YO
4351 for (value = 0; value < 256; value++)
4352 if (isDIGIT(value))
4353 ANYOF_BITMAP_CLEAR(data->start_class, value);
4354 }
4355 else {
4356 if (data->start_class->flags & ANYOF_LOCALE)
4357 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
75950e1c
KW
4358 for (value = 0; value < 256; value++)
4359 if (!isDIGIT(value))
4360 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 4361 }
8aa23a47 4362 break;
e1d1eefb
YO
4363 CASE_SYNST_FNC(VERTWS);
4364 CASE_SYNST_FNC(HORIZWS);
686b73d4 4365
8aa23a47
YO
4366 }
4367 if (flags & SCF_DO_STCLASS_OR)
4368 cl_and(data->start_class, and_withp);
4369 flags &= ~SCF_DO_STCLASS;
4370 }
4371 }
4372 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4373 data->flags |= (OP(scan) == MEOL
4374 ? SF_BEFORE_MEOL
4375 : SF_BEFORE_SEOL);
4376 }
4377 else if ( PL_regkind[OP(scan)] == BRANCHJ
4378 /* Lookbehind, or need to calculate parens/evals/stclass: */
4379 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4380 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4381 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4382 || OP(scan) == UNLESSM )
4383 {
4384 /* Negative Lookahead/lookbehind
4385 In this case we can't do fixed string optimisation.
4386 */
1de06328 4387
8aa23a47
YO
4388 I32 deltanext, minnext, fake = 0;
4389 regnode *nscan;
4390 struct regnode_charclass_class intrnl;
4391 int f = 0;
1de06328 4392
8aa23a47
YO
4393 data_fake.flags = 0;
4394 if (data) {
4395 data_fake.whilem_c = data->whilem_c;
4396 data_fake.last_closep = data->last_closep;
c277df42 4397 }
8aa23a47
YO
4398 else
4399 data_fake.last_closep = &fake;
58e23c8d 4400 data_fake.pos_delta = delta;
8aa23a47
YO
4401 if ( flags & SCF_DO_STCLASS && !scan->flags
4402 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4403 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4404 data_fake.start_class = &intrnl;
4405 f |= SCF_DO_STCLASS_AND;
4406 }
4407 if (flags & SCF_WHILEM_VISITED_POS)
4408 f |= SCF_WHILEM_VISITED_POS;
4409 next = regnext(scan);
4410 nscan = NEXTOPER(NEXTOPER(scan));
4411 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4412 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4413 if (scan->flags) {
4414 if (deltanext) {
58e23c8d 4415 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4416 }
4417 else if (minnext > (I32)U8_MAX) {
58e23c8d 4418 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4419 }
4420 scan->flags = (U8)minnext;
4421 }
4422 if (data) {
4423 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4424 pars++;
4425 if (data_fake.flags & SF_HAS_EVAL)
4426 data->flags |= SF_HAS_EVAL;
4427 data->whilem_c = data_fake.whilem_c;
4428 }
4429 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
4430 if (flags & SCF_DO_STCLASS_OR) {
4431 /* OR before, AND after: ideally we would recurse with
4432 * data_fake to get the AND applied by study of the
4433 * remainder of the pattern, and then derecurse;
4434 * *** HACK *** for now just treat as "no information".
4435 * See [perl #56690].
4436 */
e755fd73 4437 cl_init(pRExC_state, data->start_class);
906cdd2b
HS
4438 } else {
4439 /* AND before and after: combine and continue */
4440 const int was = (data->start_class->flags & ANYOF_EOS);
4441
4442 cl_and(data->start_class, &intrnl);
4443 if (was)
4444 data->start_class->flags |= ANYOF_EOS;
4445 }
8aa23a47 4446 }
cb434fcc 4447 }
8aa23a47
YO
4448#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4449 else {
4450 /* Positive Lookahead/lookbehind
4451 In this case we can do fixed string optimisation,
4452 but we must be careful about it. Note in the case of
4453 lookbehind the positions will be offset by the minimum
4454 length of the pattern, something we won't know about
4455 until after the recurse.
4456 */
4457 I32 deltanext, fake = 0;
4458 regnode *nscan;
4459 struct regnode_charclass_class intrnl;
4460 int f = 0;
4461 /* We use SAVEFREEPV so that when the full compile
4462 is finished perl will clean up the allocated
3b753521 4463 minlens when it's all done. This way we don't
8aa23a47
YO
4464 have to worry about freeing them when we know
4465 they wont be used, which would be a pain.
4466 */
4467 I32 *minnextp;
4468 Newx( minnextp, 1, I32 );
4469 SAVEFREEPV(minnextp);
4470
4471 if (data) {
4472 StructCopy(data, &data_fake, scan_data_t);
4473 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4474 f |= SCF_DO_SUBSTR;
4475 if (scan->flags)
304ee84b 4476 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
4477 data_fake.last_found=newSVsv(data->last_found);
4478 }
4479 }
4480 else
4481 data_fake.last_closep = &fake;
4482 data_fake.flags = 0;
58e23c8d 4483 data_fake.pos_delta = delta;
8aa23a47
YO
4484 if (is_inf)
4485 data_fake.flags |= SF_IS_INF;
4486 if ( flags & SCF_DO_STCLASS && !scan->flags
4487 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4488 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4489 data_fake.start_class = &intrnl;
4490 f |= SCF_DO_STCLASS_AND;
4491 }
4492 if (flags & SCF_WHILEM_VISITED_POS)
4493 f |= SCF_WHILEM_VISITED_POS;
4494 next = regnext(scan);
4495 nscan = NEXTOPER(NEXTOPER(scan));
4496
4497 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4498 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4499 if (scan->flags) {
4500 if (deltanext) {
58e23c8d 4501 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4502 }
4503 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 4504 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4505 }
4506 scan->flags = (U8)*minnextp;
4507 }
4508
4509 *minnextp += min;
4510
4511 if (f & SCF_DO_STCLASS_AND) {
4512 const int was = (data->start_class->flags & ANYOF_EOS);
4513
4514 cl_and(data->start_class, &intrnl);
4515 if (was)
4516 data->start_class->flags |= ANYOF_EOS;
4517 }
4518 if (data) {
4519 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4520 pars++;
4521 if (data_fake.flags & SF_HAS_EVAL)
4522 data->flags |= SF_HAS_EVAL;
4523 data->whilem_c = data_fake.whilem_c;
4524 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4525 if (RExC_rx->minlen<*minnextp)
4526 RExC_rx->minlen=*minnextp;
304ee84b 4527 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4528 SvREFCNT_dec(data_fake.last_found);
4529
4530 if ( data_fake.minlen_fixed != minlenp )
4531 {
4532 data->offset_fixed= data_fake.offset_fixed;
4533 data->minlen_fixed= data_fake.minlen_fixed;
4534 data->lookbehind_fixed+= scan->flags;
4535 }
4536 if ( data_fake.minlen_float != minlenp )
4537 {
4538 data->minlen_float= data_fake.minlen_float;
4539 data->offset_float_min=data_fake.offset_float_min;
4540 data->offset_float_max=data_fake.offset_float_max;
4541 data->lookbehind_float+= scan->flags;
4542 }
4543 }
4544 }
4545
4546
40d049e4 4547 }
8aa23a47
YO
4548#endif
4549 }
4550 else if (OP(scan) == OPEN) {
4551 if (stopparen != (I32)ARG(scan))
4552 pars++;
4553 }
4554 else if (OP(scan) == CLOSE) {
4555 if (stopparen == (I32)ARG(scan)) {
4556 break;
4557 }
4558 if ((I32)ARG(scan) == is_par) {
4559 next = regnext(scan);
b515a41d 4560
8aa23a47
YO
4561 if ( next && (OP(next) != WHILEM) && next < last)
4562 is_par = 0; /* Disable optimization */
40d049e4 4563 }
8aa23a47
YO
4564 if (data)
4565 *(data->last_closep) = ARG(scan);
4566 }
4567 else if (OP(scan) == EVAL) {
c277df42
IZ
4568 if (data)
4569 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4570 }
4571 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4572 if (flags & SCF_DO_SUBSTR) {
304ee84b 4573 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4574 flags &= ~SCF_DO_SUBSTR;
40d049e4 4575 }
8aa23a47
YO
4576 if (data && OP(scan)==ACCEPT) {
4577 data->flags |= SCF_SEEN_ACCEPT;
4578 if (stopmin > min)
4579 stopmin = min;
e2e6a0f1 4580 }
8aa23a47
YO
4581 }
4582 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4583 {
0f5d15d6 4584 if (flags & SCF_DO_SUBSTR) {
304ee84b 4585 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4586 data->longest = &(data->longest_float);
4587 }
4588 is_inf = is_inf_internal = 1;
653099ff 4589 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 4590 cl_anything(pRExC_state, data->start_class);
96776eda 4591 flags &= ~SCF_DO_STCLASS;
8aa23a47 4592 }
58e23c8d 4593 else if (OP(scan) == GPOS) {
bbe252da 4594 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4595 !(delta || is_inf || (data && data->pos_delta)))
4596 {
bbe252da
YO
4597 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4598 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4599 if (RExC_rx->gofs < (U32)min)
4600 RExC_rx->gofs = min;
4601 } else {
bbe252da 4602 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4603 RExC_rx->gofs = 0;
4604 }
4605 }
786e8c11 4606#ifdef TRIE_STUDY_OPT
40d049e4 4607#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4608 else if (PL_regkind[OP(scan)] == TRIE) {
4609 /* NOTE - There is similar code to this block above for handling
4610 BRANCH nodes on the initial study. If you change stuff here
4611 check there too. */
4612 regnode *trie_node= scan;
4613 regnode *tail= regnext(scan);
f8fc2ecf 4614 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4615 I32 max1 = 0, min1 = I32_MAX;
4616 struct regnode_charclass_class accum;
4617
4618 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4619 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47 4620 if (flags & SCF_DO_STCLASS)
e755fd73 4621 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
4622
4623 if (!trie->jump) {
4624 min1= trie->minlen;
4625 max1= trie->maxlen;
4626 } else {
4627 const regnode *nextbranch= NULL;
4628 U32 word;
4629
4630 for ( word=1 ; word <= trie->wordcount ; word++)
4631 {
4632 I32 deltanext=0, minnext=0, f = 0, fake;
4633 struct regnode_charclass_class this_class;
4634
4635 data_fake.flags = 0;
4636 if (data) {
4637 data_fake.whilem_c = data->whilem_c;
4638 data_fake.last_closep = data->last_closep;
4639 }
4640 else
4641 data_fake.last_closep = &fake;
58e23c8d 4642 data_fake.pos_delta = delta;
8aa23a47 4643 if (flags & SCF_DO_STCLASS) {
e755fd73 4644 cl_init(pRExC_state, &this_class);
8aa23a47
YO
4645 data_fake.start_class = &this_class;
4646 f = SCF_DO_STCLASS_AND;
4647 }
4648 if (flags & SCF_WHILEM_VISITED_POS)
4649 f |= SCF_WHILEM_VISITED_POS;
4650
4651 if (trie->jump[word]) {
4652 if (!nextbranch)
4653 nextbranch = trie_node + trie->jump[0];
4654 scan= trie_node + trie->jump[word];
4655 /* We go from the jump point to the branch that follows
4656 it. Note this means we need the vestigal unused branches
4657 even though they arent otherwise used.
4658 */
4659 minnext = study_chunk(pRExC_state, &scan, minlenp,
4660 &deltanext, (regnode *)nextbranch, &data_fake,
4661 stopparen, recursed, NULL, f,depth+1);
4662 }
4663 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4664 nextbranch= regnext((regnode*)nextbranch);
4665
4666 if (min1 > (I32)(minnext + trie->minlen))
4667 min1 = minnext + trie->minlen;
4668 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4669 max1 = minnext + deltanext + trie->maxlen;
4670 if (deltanext == I32_MAX)
4671 is_inf = is_inf_internal = 1;
4672
4673 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4674 pars++;
4675 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4676 if ( stopmin > min + min1)
4677 stopmin = min + min1;
4678 flags &= ~SCF_DO_SUBSTR;
4679 if (data)
4680 data->flags |= SCF_SEEN_ACCEPT;
4681 }
4682 if (data) {
4683 if (data_fake.flags & SF_HAS_EVAL)
4684 data->flags |= SF_HAS_EVAL;
4685 data->whilem_c = data_fake.whilem_c;
4686 }
4687 if (flags & SCF_DO_STCLASS)
3fffb88a 4688 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
4689 }
4690 }
4691 if (flags & SCF_DO_SUBSTR) {
4692 data->pos_min += min1;
4693 data->pos_delta += max1 - min1;
4694 if (max1 != min1 || is_inf)
4695 data->longest = &(data->longest_float);
4696 }
4697 min += min1;
4698 delta += max1 - min1;
4699 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 4700 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
4701 if (min1) {
4702 cl_and(data->start_class, and_withp);
4703 flags &= ~SCF_DO_STCLASS;
4704 }
4705 }
4706 else if (flags & SCF_DO_STCLASS_AND) {
4707 if (min1) {
4708 cl_and(data->start_class, &accum);
4709 flags &= ~SCF_DO_STCLASS;
4710 }
4711 else {
4712 /* Switch to OR mode: cache the old value of
4713 * data->start_class */
4714 INIT_AND_WITHP;
4715 StructCopy(data->start_class, and_withp,
4716 struct regnode_charclass_class);
4717 flags &= ~SCF_DO_STCLASS_AND;
4718 StructCopy(&accum, data->start_class,
4719 struct regnode_charclass_class);
4720 flags |= SCF_DO_STCLASS_OR;
4721 data->start_class->flags |= ANYOF_EOS;
4722 }
4723 }
4724 scan= tail;
4725 continue;
4726 }
786e8c11 4727#else
8aa23a47 4728 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4729 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4730 U8*bang=NULL;
4731
4732 min += trie->minlen;
4733 delta += (trie->maxlen - trie->minlen);
4734 flags &= ~SCF_DO_STCLASS; /* xxx */
4735 if (flags & SCF_DO_SUBSTR) {
304ee84b 4736 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4737 data->pos_min += trie->minlen;
4738 data->pos_delta += (trie->maxlen - trie->minlen);
4739 if (trie->maxlen != trie->minlen)
4740 data->longest = &(data->longest_float);
4741 }
4742 if (trie->jump) /* no more substrings -- for now /grr*/
4743 flags &= ~SCF_DO_SUBSTR;
b515a41d 4744 }
8aa23a47 4745#endif /* old or new */
686b73d4 4746#endif /* TRIE_STUDY_OPT */
e1d1eefb 4747
8aa23a47
YO
4748 /* Else: zero-length, ignore. */
4749 scan = regnext(scan);
4750 }
4751 if (frame) {
4752 last = frame->last;
4753 scan = frame->next;
4754 stopparen = frame->stop;
4755 frame = frame->prev;
4756 goto fake_study_recurse;
c277df42
IZ
4757 }
4758
4759 finish:
8aa23a47 4760 assert(!frame);
304ee84b 4761 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4762
c277df42 4763 *scanp = scan;
aca2d497 4764 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4765 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4766 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4767 if (is_par > (I32)U8_MAX)
c277df42
IZ
4768 is_par = 0;
4769 if (is_par && pars==1 && data) {
4770 data->flags |= SF_IN_PAR;
4771 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4772 }
4773 else if (pars && data) {
c277df42
IZ
4774 data->flags |= SF_HAS_PAR;
4775 data->flags &= ~SF_IN_PAR;
4776 }
653099ff 4777 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4778 cl_and(data->start_class, and_withp);
786e8c11
YO
4779 if (flags & SCF_TRIE_RESTUDY)
4780 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4781
304ee84b 4782 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4783
e2e6a0f1 4784 return min < stopmin ? min : stopmin;
c277df42
IZ
4785}
4786
2eccd3b2
NC
4787STATIC U32
4788S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4789{
4a4e7719
NC
4790 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4791
7918f24d
NC
4792 PERL_ARGS_ASSERT_ADD_DATA;
4793
4a4e7719
NC
4794 Renewc(RExC_rxi->data,
4795 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4796 char, struct reg_data);
4797 if(count)
f8fc2ecf 4798 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4799 else
f8fc2ecf 4800 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4801 RExC_rxi->data->count = count + n;
4802 Copy(s, RExC_rxi->data->what + count, n, U8);
4803 return count;
c277df42
IZ
4804}
4805
f8149455 4806/*XXX: todo make this not included in a non debugging perl */
76234dfb 4807#ifndef PERL_IN_XSUB_RE
d88dccdf 4808void
864dbfa3 4809Perl_reginitcolors(pTHX)
d88dccdf 4810{
97aff369 4811 dVAR;
1df70142 4812 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4813 if (s) {
1df70142
AL
4814 char *t = savepv(s);
4815 int i = 0;
4816 PL_colors[0] = t;
d88dccdf 4817 while (++i < 6) {
1df70142
AL
4818 t = strchr(t, '\t');
4819 if (t) {
4820 *t = '\0';
4821 PL_colors[i] = ++t;
d88dccdf
IZ
4822 }
4823 else
1df70142 4824 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4825 }
4826 } else {
1df70142 4827 int i = 0;
b81d288d 4828 while (i < 6)
06b5626a 4829 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4830 }
4831 PL_colorset = 1;
4832}
76234dfb 4833#endif
8615cb43 4834
07be1b83 4835
786e8c11
YO
4836#ifdef TRIE_STUDY_OPT
4837#define CHECK_RESTUDY_GOTO \
4838 if ( \
4839 (data.flags & SCF_TRIE_RESTUDY) \
4840 && ! restudied++ \
4841 ) goto reStudy
4842#else
4843#define CHECK_RESTUDY_GOTO
4844#endif
f9f4320a 4845
a687059c 4846/*
e50aee73 4847 - pregcomp - compile a regular expression into internal code
a687059c
LW
4848 *
4849 * We can't allocate space until we know how big the compiled form will be,
4850 * but we can't compile it (and thus know how big it is) until we've got a
4851 * place to put the code. So we cheat: we compile it twice, once with code
4852 * generation turned off and size counting turned on, and once "for real".
4853 * This also means that we don't allocate space until we are sure that the
4854 * thing really will compile successfully, and we never have to move the
4855 * code and thus invalidate pointers into it. (Note that it has to be in
4856 * one piece because free() must be able to free it all.) [NB: not true in perl]
4857 *
4858 * Beware that the optimization-preparation code in here knows about some
4859 * of the structure of the compiled regexp. [I'll say.]
4860 */
b9b4dddf
YO
4861
4862
4863
f9f4320a 4864#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4865#define RE_ENGINE_PTR &PL_core_reg_engine
4866#else
f9f4320a
YO
4867extern const struct regexp_engine my_reg_engine;
4868#define RE_ENGINE_PTR &my_reg_engine
4869#endif
6d5c990f
RGS
4870
4871#ifndef PERL_IN_XSUB_RE
3ab4a224 4872REGEXP *
1593ad57 4873Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4874{
97aff369 4875 dVAR;
6d5c990f 4876 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4877
4878 PERL_ARGS_ASSERT_PREGCOMP;
4879
f9f4320a
YO
4880 /* Dispatch a request to compile a regexp to correct
4881 regexp engine. */
f9f4320a
YO
4882 if (table) {
4883 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4884 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4885 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4886 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4887 DEBUG_COMPILE_r({
8d8756e7 4888 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4889 SvIV(*ptr));
4890 });
3ab4a224 4891 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4892 }
b9b4dddf 4893 }
3ab4a224 4894 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4895}
6d5c990f 4896#endif
2a5d9b1d 4897
3ab4a224 4898REGEXP *
29b09c41 4899Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4900{
4901 dVAR;
288b8c02
NC
4902 REGEXP *rx;
4903 struct regexp *r;
f8fc2ecf 4904 register regexp_internal *ri;
3ab4a224 4905 STRLEN plen;
4d6b2893 4906 char* VOL exp;
5d51ce98 4907 char* xend;
c277df42 4908 regnode *scan;
a0d0e21e 4909 I32 flags;
a0d0e21e 4910 I32 minlen = 0;
29b09c41 4911 U32 pm_flags;
e7f38d0f
YO
4912
4913 /* these are all flags - maybe they should be turned
4914 * into a single int with different bit masks */
4915 I32 sawlookahead = 0;
a0d0e21e
LW
4916 I32 sawplus = 0;
4917 I32 sawopen = 0;
29b09c41 4918 bool used_setjump = FALSE;
4624b182 4919 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
e7f38d0f 4920
bbd61b5f
KW
4921 U8 jump_ret = 0;
4922 dJMPENV;
2c2d71f5 4923 scan_data_t data;
830247a4 4924 RExC_state_t RExC_state;
be8e71aa 4925 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4926#ifdef TRIE_STUDY_OPT
5d51ce98 4927 int restudied;
07be1b83
YO
4928 RExC_state_t copyRExC_state;
4929#endif
2a5d9b1d 4930 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4931
4932 PERL_ARGS_ASSERT_RE_COMPILE;
4933
6d5c990f 4934 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4935
370b8f2f 4936#ifndef PERL_IN_XSUB_RE
a3e1f3a6
KW
4937 /* Initialize these here instead of as-needed, as is quick and avoids
4938 * having to test them each time otherwise */
4939 if (! PL_AboveLatin1) {
4940 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4941 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4942 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
3f427fd9
KW
4943
4944 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4945 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4946
4947 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4948 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4949
4950 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4951 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4952
dab0c3e7
KW
4953 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4954
3f427fd9
KW
4955 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4956 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4957
4958 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4959
4960 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4961 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4962
4963 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4964 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4965
3f427fd9
KW
4966 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4967 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4968
4969 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4970 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4971
4972 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4973 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4974
4975 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4976 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4977
4978 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4979 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4980
4981 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4982 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4983
4984 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4985
4986 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4987 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4988
4989 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4990 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
a3e1f3a6 4991 }
370b8f2f 4992#endif
a3e1f3a6 4993
11951bcb
KW
4994 exp = SvPV(pattern, plen);
4995
4996 if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4997 RExC_utf8 = RExC_orig_utf8 = 0;
4998 }
4999 else {
5000 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
5001 }
e40e74fe 5002 RExC_uni_semantics = 0;
4624b182 5003 RExC_contains_locale = 0;
7b597bb8 5004
d6bd454d 5005 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
5006 /* Longjmp back to here if have to switch in midstream to utf8 */
5007 if (! RExC_orig_utf8) {
5008 JMPENV_PUSH(jump_ret);
29b09c41 5009 used_setjump = TRUE;
bbd61b5f
KW
5010 }
5011
5d51ce98 5012 if (jump_ret == 0) { /* First time through */
29b09c41 5013 xend = exp + plen;
29b09c41 5014
5d51ce98
KW
5015 DEBUG_COMPILE_r({
5016 SV *dsv= sv_newmortal();
5017 RE_PV_QUOTED_DECL(s, RExC_utf8,
5018 dsv, exp, plen, 60);
5019 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5020 PL_colors[4],PL_colors[5],s);
5021 });
5022 }
5023 else { /* longjumped back */
bbd61b5f
KW
5024 STRLEN len = plen;
5025
5d51ce98
KW
5026 /* If the cause for the longjmp was other than changing to utf8, pop
5027 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
5028 if (jump_ret != UTF8_LONGJMP) {
5029 JMPENV_POP;
5030 JMPENV_JUMP(jump_ret);
5031 }
5032
595598ee
KW
5033 GET_RE_DEBUG_FLAGS;
5034
bbd61b5f
KW
5035 /* It's possible to write a regexp in ascii that represents Unicode
5036 codepoints outside of the byte range, such as via \x{100}. If we
5037 detect such a sequence we have to convert the entire pattern to utf8
5038 and then recompile, as our sizing calculation will have been based
5039 on 1 byte == 1 character, but we will need to use utf8 to encode
5040 at least some part of the pattern, and therefore must convert the whole
5041 thing.
5042 -- dmq */
5043 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5044 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
3e0b93e8
KW
5045 exp = (char*)Perl_bytes_to_utf8(aTHX_
5046 (U8*)SvPV_nomg(pattern, plen),
5047 &len);
bbd61b5f
KW
5048 xend = exp + len;
5049 RExC_orig_utf8 = RExC_utf8 = 1;
5050 SAVEFREEPV(exp);
5051 }
5052
5d51ce98
KW
5053#ifdef TRIE_STUDY_OPT
5054 restudied = 0;
5055#endif
5056
29b09c41 5057 pm_flags = orig_pm_flags;
a62b1201 5058
4624b182
KW
5059 if (initial_charset == REGEX_LOCALE_CHARSET) {
5060 RExC_contains_locale = 1;
5061 }
5062 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5063
5064 /* Set to use unicode semantics if the pattern is in utf8 and has the
5065 * 'depends' charset specified, as it means unicode when utf8 */
a62b1201 5066 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
29b09c41
KW
5067 }
5068
02daf0ab 5069 RExC_precomp = exp;
c737faaf 5070 RExC_flags = pm_flags;
830247a4 5071 RExC_sawback = 0;
bbce6d69 5072
830247a4 5073 RExC_seen = 0;
b57e4118 5074 RExC_in_lookbehind = 0;
830247a4
IZ
5075 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5076 RExC_seen_evals = 0;
5077 RExC_extralen = 0;
e2a7e165 5078 RExC_override_recoding = 0;
c277df42 5079
bbce6d69 5080 /* First pass: determine size, legality. */
830247a4 5081 RExC_parse = exp;
fac92740 5082 RExC_start = exp;
830247a4
IZ
5083 RExC_end = xend;
5084 RExC_naughty = 0;
5085 RExC_npar = 1;
e2e6a0f1 5086 RExC_nestroot = 0;
830247a4
IZ
5087 RExC_size = 0L;
5088 RExC_emit = &PL_regdummy;
5089 RExC_whilem_seen = 0;
40d049e4
YO
5090 RExC_open_parens = NULL;
5091 RExC_close_parens = NULL;
5092 RExC_opend = NULL;
81714fb9 5093 RExC_paren_names = NULL;
1f1031fe
YO
5094#ifdef DEBUGGING
5095 RExC_paren_name_list = NULL;
5096#endif
40d049e4
YO
5097 RExC_recurse = NULL;
5098 RExC_recurse_count = 0;
81714fb9 5099
85ddcde9
JH
5100#if 0 /* REGC() is (currently) a NOP at the first pass.
5101 * Clever compilers notice this and complain. --jhi */
830247a4 5102 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 5103#endif
44bed856
KW
5104 DEBUG_PARSE_r(
5105 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5106 RExC_lastnum=0;
5107 RExC_lastparse=NULL;
5108 );
3dab1dad 5109 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 5110 RExC_precomp = NULL;
a0d0e21e
LW
5111 return(NULL);
5112 }
bbd61b5f 5113
29b09c41
KW
5114 /* Here, finished first pass. Get rid of any added setjmp */
5115 if (used_setjump) {
bbd61b5f 5116 JMPENV_POP;
02daf0ab 5117 }
e40e74fe 5118
07be1b83 5119 DEBUG_PARSE_r({
81714fb9
YO
5120 PerlIO_printf(Perl_debug_log,
5121 "Required size %"IVdf" nodes\n"
5122 "Starting second pass (creation)\n",
5123 (IV)RExC_size);
07be1b83
YO
5124 RExC_lastnum=0;
5125 RExC_lastparse=NULL;
5126 });
e40e74fe
KW
5127
5128 /* The first pass could have found things that force Unicode semantics */
5129 if ((RExC_utf8 || RExC_uni_semantics)
5130 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5131 {
5132 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5133 }
5134
c277df42
IZ
5135 /* Small enough for pointer-storage convention?
5136 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
5137 if (RExC_size >= 0x10000L && RExC_extralen)
5138 RExC_size += RExC_extralen;
c277df42 5139 else
830247a4
IZ
5140 RExC_extralen = 0;
5141 if (RExC_whilem_seen > 15)
5142 RExC_whilem_seen = 15;
a0d0e21e 5143
f9f4320a
YO
5144 /* Allocate space and zero-initialize. Note, the two step process
5145 of zeroing when in debug mode, thus anything assigned has to
5146 happen after that */
d2f13c59 5147 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 5148 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
5149 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5150 char, regexp_internal);
5151 if ( r == NULL || ri == NULL )
b45f050a 5152 FAIL("Regexp out of space");
0f79a09d
GS
5153#ifdef DEBUGGING
5154 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 5155 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 5156#else
f8fc2ecf
YO
5157 /* bulk initialize base fields with 0. */
5158 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 5159#endif
58e23c8d
YO
5160
5161 /* non-zero initialization begins here */
f8fc2ecf 5162 RXi_SET( r, ri );
f9f4320a 5163 r->engine= RE_ENGINE_PTR;
c737faaf 5164 r->extflags = pm_flags;
bcdf7404 5165 {
f7819f85 5166 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
a62b1201 5167 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
c5ea2ffa
KW
5168
5169 /* The caret is output if there are any defaults: if not all the STD
5170 * flags are set, or if no character set specifier is needed */
5171 bool has_default =
5172 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5173 || ! has_charset);
bcdf7404 5174 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
5175 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5176 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
5177 const char *fptr = STD_PAT_MODS; /*"msix"*/
5178 char *p;
fb85c044 5179 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
5180 * on. If more precision is desired, we could do a population count of
5181 * the flags set. This could be done with a small lookup table, or by
5182 * shifting, masking and adding, or even, when available, assembly
5183 * language for a machine-language population count.
5184 * We never output a minus, as all those are defaults, so are
5185 * covered by the caret */
fb85c044 5186 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa 5187 + has_default /* If needs a caret */
a62b1201
KW
5188
5189 /* If needs a character set specifier */
5190 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
bcdf7404
YO
5191 + (sizeof(STD_PAT_MODS) - 1)
5192 + (sizeof("(?:)") - 1);
5193
c5ea2ffa 5194 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 5195 SvPOK_on(rx);
8f6ae13c 5196 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 5197 *p++='('; *p++='?';
9de15fec
KW
5198
5199 /* If a default, cover it using the caret */
c5ea2ffa 5200 if (has_default) {
85508812 5201 *p++= DEFAULT_PAT_MOD;
fb85c044 5202 }
c5ea2ffa 5203 if (has_charset) {
a62b1201
KW
5204 STRLEN len;
5205 const char* const name = get_regex_charset_name(r->extflags, &len);
5206 Copy(name, p, len, char);
5207 p += len;
9de15fec 5208 }
f7819f85
A
5209 if (has_p)
5210 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 5211 {
bcdf7404 5212 char ch;
bcdf7404
YO
5213 while((ch = *fptr++)) {
5214 if(reganch & 1)
5215 *p++ = ch;
bcdf7404
YO
5216 reganch >>= 1;
5217 }
bcdf7404
YO
5218 }
5219
28d8d7f4 5220 *p++ = ':';
bb661a58 5221 Copy(RExC_precomp, p, plen, char);
efd26800
NC
5222 assert ((RX_WRAPPED(rx) - p) < 16);
5223 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 5224 p += plen;
bcdf7404 5225 if (has_runon)
28d8d7f4
YO
5226 *p++ = '\n';
5227 *p++ = ')';
5228 *p = 0;
fb85c044 5229 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
5230 }
5231
bbe252da 5232 r->intflags = 0;
830247a4 5233 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 5234
6bda09f9 5235 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
5236 Newxz(RExC_open_parens, RExC_npar,regnode *);
5237 SAVEFREEPV(RExC_open_parens);
5238 Newxz(RExC_close_parens,RExC_npar,regnode *);
5239 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
5240 }
5241
5242 /* Useful during FAIL. */
7122b237
YO
5243#ifdef RE_TRACK_PATTERN_OFFSETS
5244 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 5245 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 5246 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 5247 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 5248 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
5249#endif
5250 SetProgLen(ri,RExC_size);
288b8c02 5251 RExC_rx_sv = rx;
830247a4 5252 RExC_rx = r;
f8fc2ecf 5253 RExC_rxi = ri;
bbce6d69 5254
5255 /* Second pass: emit code. */
c737faaf 5256 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
5257 RExC_parse = exp;
5258 RExC_end = xend;
5259 RExC_naughty = 0;
5260 RExC_npar = 1;
f8fc2ecf
YO
5261 RExC_emit_start = ri->program;
5262 RExC_emit = ri->program;
3b57cd43
YO
5263 RExC_emit_bound = ri->program + RExC_size + 1;
5264
2cd61cdb 5265 /* Store the count of eval-groups for security checks: */
f8149455 5266 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 5267 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 5268 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 5269 ReREFCNT_dec(rx);
a0d0e21e 5270 return(NULL);
80757612 5271 }
07be1b83
YO
5272 /* XXXX To minimize changes to RE engine we always allocate
5273 3-units-long substrs field. */
5274 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
5275 if (RExC_recurse_count) {
5276 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5277 SAVEFREEPV(RExC_recurse);
5278 }
a0d0e21e 5279
07be1b83 5280reStudy:
e7f38d0f 5281 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 5282 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 5283
07be1b83 5284#ifdef TRIE_STUDY_OPT
0934c9d9
SH
5285 if (!restudied) {
5286 StructCopy(&zero_scan_data, &data, scan_data_t);
5287 copyRExC_state = RExC_state;
5288 } else {
5d458dd8 5289 U32 seen=RExC_seen;
07be1b83 5290 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
5291
5292 RExC_state = copyRExC_state;
5293 if (seen & REG_TOP_LEVEL_BRANCHES)
5294 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5295 else
5296 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 5297 if (data.last_found) {
07be1b83 5298 SvREFCNT_dec(data.longest_fixed);
07be1b83 5299 SvREFCNT_dec(data.longest_float);
07be1b83 5300 SvREFCNT_dec(data.last_found);
1de06328 5301 }
40d049e4 5302 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 5303 }
40d049e4
YO
5304#else
5305 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 5306#endif
fc8cd66c 5307
a0d0e21e 5308 /* Dig out information for optimizations. */
f7819f85 5309 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
5310 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5311
a0ed51b3 5312 if (UTF)
8f6ae13c 5313 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 5314 ri->regstclass = NULL;
830247a4 5315 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 5316 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 5317 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 5318
1de06328
YO
5319 /* testing for BRANCH here tells us whether there is "must appear"
5320 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 5321 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 5322 I32 fake;
c5254dd6 5323 STRLEN longest_float_length, longest_fixed_length;
07be1b83 5324 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 5325 int stclass_flag;
07be1b83 5326 I32 last_close = 0; /* pointed to by data */
5339e136
YO
5327 regnode *first= scan;
5328 regnode *first_next= regnext(first);
639081d6
YO
5329 /*
5330 * Skip introductions and multiplicators >= 1
5331 * so that we can extract the 'meat' of the pattern that must
5332 * match in the large if() sequence following.
5333 * NOTE that EXACT is NOT covered here, as it is normally
5334 * picked up by the optimiser separately.
5335 *
5336 * This is unfortunate as the optimiser isnt handling lookahead
5337 * properly currently.
5338 *
5339 */
a0d0e21e 5340 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 5341 /* An OR of *one* alternative - should not happen now. */
5339e136 5342 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 5343 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 5344 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
5345 (OP(first) == PLUS) ||
5346 (OP(first) == MINMOD) ||
653099ff 5347 /* An {n,m} with n>0 */
5339e136
YO
5348 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5349 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 5350 {
639081d6
YO
5351 /*
5352 * the only op that could be a regnode is PLUS, all the rest
5353 * will be regnode_1 or regnode_2.
5354 *
5355 */
a0d0e21e
LW
5356 if (OP(first) == PLUS)
5357 sawplus = 1;
5358 else
3dab1dad 5359 first += regarglen[OP(first)];
686b73d4 5360
639081d6 5361 first = NEXTOPER(first);
5339e136 5362 first_next= regnext(first);
a687059c
LW
5363 }
5364
a0d0e21e
LW
5365 /* Starting-point info. */
5366 again:
786e8c11 5367 DEBUG_PEEP("first:",first,0);
07be1b83 5368 /* Ignore EXACT as we deal with it later. */
3dab1dad 5369 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 5370 if (OP(first) == EXACT)
6f207bd3 5371 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 5372 else
f8fc2ecf 5373 ri->regstclass = first;
b3c9acc1 5374 }
686b73d4 5375#ifdef TRIE_STCLASS
786e8c11 5376 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 5377 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 5378 {
786e8c11 5379 regnode *trie_op;
07be1b83 5380 /* this can happen only on restudy */
786e8c11 5381 if ( OP(first) == TRIE ) {
c944940b 5382 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 5383 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
5384 StructCopy(first,trieop,struct regnode_1);
5385 trie_op=(regnode *)trieop;
5386 } else {
c944940b 5387 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 5388 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
5389 StructCopy(first,trieop,struct regnode_charclass);
5390 trie_op=(regnode *)trieop;
5391 }
1de06328 5392 OP(trie_op)+=2;
786e8c11 5393 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 5394 ri->regstclass = trie_op;
07be1b83 5395 }
686b73d4 5396#endif
e52fc539 5397 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 5398 ri->regstclass = first;
3dab1dad
YO
5399 else if (PL_regkind[OP(first)] == BOUND ||
5400 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 5401 ri->regstclass = first;
3dab1dad 5402 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
5403 r->extflags |= (OP(first) == MBOL
5404 ? RXf_ANCH_MBOL
cad2e5aa 5405 : (OP(first) == SBOL
bbe252da
YO
5406 ? RXf_ANCH_SBOL
5407 : RXf_ANCH_BOL));
a0d0e21e 5408 first = NEXTOPER(first);
774d564b 5409 goto again;
5410 }
5411 else if (OP(first) == GPOS) {
bbe252da 5412 r->extflags |= RXf_ANCH_GPOS;
774d564b 5413 first = NEXTOPER(first);
5414 goto again;
a0d0e21e 5415 }
cf2a2b69
YO
5416 else if ((!sawopen || !RExC_sawback) &&
5417 (OP(first) == STAR &&
3dab1dad 5418 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 5419 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
5420 {
5421 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
5422 const int type =
5423 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
5424 ? RXf_ANCH_MBOL
5425 : RXf_ANCH_SBOL;
5426 r->extflags |= type;
5427 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 5428 first = NEXTOPER(first);
774d564b 5429 goto again;
a0d0e21e 5430 }
e7f38d0f 5431 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 5432 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 5433 /* x+ must match at the 1st pos of run of x's */
bbe252da 5434 r->intflags |= PREGf_SKIP;
a0d0e21e 5435
c277df42 5436 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 5437#ifdef TRIE_STUDY_OPT
81714fb9 5438 DEBUG_PARSE_r(
be8e71aa
YO
5439 if (!restudied)
5440 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5441 (IV)(first - scan + 1))
5442 );
5443#else
81714fb9 5444 DEBUG_PARSE_r(
be8e71aa
YO
5445 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5446 (IV)(first - scan + 1))
5447 );
5448#endif
5449
5450
a0d0e21e
LW
5451 /*
5452 * If there's something expensive in the r.e., find the
5453 * longest literal string that must appear and make it the
5454 * regmust. Resolve ties in favor of later strings, since
5455 * the regstart check works with the beginning of the r.e.
5456 * and avoiding duplication strengthens checking. Not a
5457 * strong reason, but sufficient in the absence of others.
5458 * [Now we resolve ties in favor of the earlier string if
c277df42 5459 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
5460 * earlier string may buy us something the later one won't.]
5461 */
686b73d4 5462
396482e1
GA
5463 data.longest_fixed = newSVpvs("");
5464 data.longest_float = newSVpvs("");
5465 data.last_found = newSVpvs("");
c277df42
IZ
5466 data.longest = &(data.longest_fixed);
5467 first = scan;
f8fc2ecf 5468 if (!ri->regstclass) {
e755fd73 5469 cl_init(pRExC_state, &ch_class);
653099ff
GS
5470 data.start_class = &ch_class;
5471 stclass_flag = SCF_DO_STCLASS_AND;
5472 } else /* XXXX Check for BOUND? */
5473 stclass_flag = 0;
cb434fcc 5474 data.last_closep = &last_close;
de8c5301 5475
1de06328 5476 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
5477 &data, -1, NULL, NULL,
5478 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 5479
686b73d4 5480
786e8c11
YO
5481 CHECK_RESTUDY_GOTO;
5482
5483
830247a4 5484 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 5485 && data.last_start_min == 0 && data.last_end > 0
830247a4 5486 && !RExC_seen_zerolen
2bf803e2 5487 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
5488 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5489 r->extflags |= RXf_CHECK_ALL;
304ee84b 5490 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
5491 SvREFCNT_dec(data.last_found);
5492
1de06328
YO
5493 /* Note that code very similar to this but for anchored string
5494 follows immediately below, changes may need to be made to both.
5495 Be careful.
5496 */
a0ed51b3 5497 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 5498 if (longest_float_length
c277df42
IZ
5499 || (data.flags & SF_FL_BEFORE_EOL
5500 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 5501 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 5502 {
1182767e 5503 I32 t,ml;
cf93c79d 5504
a0c4c608 5505 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
bb914485
KW
5506 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5507 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5508 && data.offset_fixed == data.offset_float_min
5509 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
aca2d497
IZ
5510 goto remove_float; /* As in (a)+. */
5511
1de06328
YO
5512 /* copy the information about the longest float from the reg_scan_data
5513 over to the program. */
33b8afdf
JH
5514 if (SvUTF8(data.longest_float)) {
5515 r->float_utf8 = data.longest_float;
c445ea15 5516 r->float_substr = NULL;
33b8afdf
JH
5517 } else {
5518 r->float_substr = data.longest_float;
c445ea15 5519 r->float_utf8 = NULL;
33b8afdf 5520 }
1de06328
YO
5521 /* float_end_shift is how many chars that must be matched that
5522 follow this item. We calculate it ahead of time as once the
5523 lookbehind offset is added in we lose the ability to correctly
5524 calculate it.*/
5525 ml = data.minlen_float ? *(data.minlen_float)
1182767e 5526 : (I32)longest_float_length;
1de06328
YO
5527 r->float_end_shift = ml - data.offset_float_min
5528 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5529 + data.lookbehind_float;
5530 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 5531 r->float_max_offset = data.offset_float_max;
1182767e 5532 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
5533 r->float_max_offset -= data.lookbehind_float;
5534
cf93c79d
IZ
5535 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5536 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 5537 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 5538 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
5539 }
5540 else {
aca2d497 5541 remove_float:
c445ea15 5542 r->float_substr = r->float_utf8 = NULL;
c277df42 5543 SvREFCNT_dec(data.longest_float);
c5254dd6 5544 longest_float_length = 0;
a0d0e21e 5545 }
c277df42 5546
1de06328
YO
5547 /* Note that code very similar to this but for floating string
5548 is immediately above, changes may need to be made to both.
5549 Be careful.
5550 */
a0ed51b3 5551 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
a0c4c608
KW
5552
5553 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
bb914485
KW
5554 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5555 && (longest_fixed_length
5556 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5557 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5558 || (RExC_flags & RXf_PMf_MULTILINE)))) )
1de06328 5559 {
1182767e 5560 I32 t,ml;
cf93c79d 5561
1de06328
YO
5562 /* copy the information about the longest fixed
5563 from the reg_scan_data over to the program. */
33b8afdf
JH
5564 if (SvUTF8(data.longest_fixed)) {
5565 r->anchored_utf8 = data.longest_fixed;
c445ea15 5566 r->anchored_substr = NULL;
33b8afdf
JH
5567 } else {
5568 r->anchored_substr = data.longest_fixed;
c445ea15 5569 r->anchored_utf8 = NULL;
33b8afdf 5570 }
1de06328
YO
5571 /* fixed_end_shift is how many chars that must be matched that
5572 follow this item. We calculate it ahead of time as once the
5573 lookbehind offset is added in we lose the ability to correctly
5574 calculate it.*/
5575 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 5576 : (I32)longest_fixed_length;
1de06328
YO
5577 r->anchored_end_shift = ml - data.offset_fixed
5578 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5579 + data.lookbehind_fixed;
5580 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5581
cf93c79d
IZ
5582 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5583 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 5584 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 5585 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
5586 }
5587 else {
c445ea15 5588 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 5589 SvREFCNT_dec(data.longest_fixed);
c5254dd6 5590 longest_fixed_length = 0;
a0d0e21e 5591 }
f8fc2ecf
YO
5592 if (ri->regstclass
5593 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5594 ri->regstclass = NULL;
f4244008 5595
33b8afdf
JH
5596 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5597 && stclass_flag
653099ff 5598 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5599 && !cl_is_anything(data.start_class))
5600 {
2eccd3b2 5601 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 5602 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 5603
f8fc2ecf 5604 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5605 struct regnode_charclass_class);
5606 StructCopy(data.start_class,
f8fc2ecf 5607 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5608 struct regnode_charclass_class);
f8fc2ecf 5609 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5610 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5611 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5612 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5613 PerlIO_printf(Perl_debug_log,
a0288114 5614 "synthetic stclass \"%s\".\n",
3f7c398e 5615 SvPVX_const(sv));});
653099ff 5616 }
c277df42
IZ
5617
5618 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5619 if (longest_fixed_length > longest_float_length) {
1de06328 5620 r->check_end_shift = r->anchored_end_shift;
c277df42 5621 r->check_substr = r->anchored_substr;
33b8afdf 5622 r->check_utf8 = r->anchored_utf8;
c277df42 5623 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5624 if (r->extflags & RXf_ANCH_SINGLE)
5625 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5626 }
5627 else {
1de06328 5628 r->check_end_shift = r->float_end_shift;
c277df42 5629 r->check_substr = r->float_substr;
33b8afdf 5630 r->check_utf8 = r->float_utf8;
1de06328
YO
5631 r->check_offset_min = r->float_min_offset;
5632 r->check_offset_max = r->float_max_offset;
a0d0e21e 5633 }
30382c73
IZ
5634 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5635 This should be changed ASAP! */
bbe252da
YO
5636 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5637 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5638 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5639 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5640 }
1de06328
YO
5641 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5642 if ( (STRLEN)minlen < longest_float_length )
5643 minlen= longest_float_length;
5644 if ( (STRLEN)minlen < longest_fixed_length )
5645 minlen= longest_fixed_length;
5646 */
a0ed51b3
LW
5647 }
5648 else {
c277df42
IZ
5649 /* Several toplevels. Best we can is to set minlen. */
5650 I32 fake;
653099ff 5651 struct regnode_charclass_class ch_class;
cb434fcc 5652 I32 last_close = 0;
686b73d4 5653
5d458dd8 5654 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5655
f8fc2ecf 5656 scan = ri->program + 1;
e755fd73 5657 cl_init(pRExC_state, &ch_class);
653099ff 5658 data.start_class = &ch_class;
cb434fcc 5659 data.last_closep = &last_close;
07be1b83 5660
de8c5301 5661
1de06328 5662 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5663 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5664
786e8c11 5665 CHECK_RESTUDY_GOTO;
07be1b83 5666
33b8afdf 5667 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5668 = r->float_substr = r->float_utf8 = NULL;
f4244008 5669
653099ff 5670 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5671 && !cl_is_anything(data.start_class))
5672 {
2eccd3b2 5673 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 5674 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 5675
f8fc2ecf 5676 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5677 struct regnode_charclass_class);
5678 StructCopy(data.start_class,
f8fc2ecf 5679 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5680 struct regnode_charclass_class);
f8fc2ecf 5681 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5682 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5683 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5684 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5685 PerlIO_printf(Perl_debug_log,
a0288114 5686 "synthetic stclass \"%s\".\n",
3f7c398e 5687 SvPVX_const(sv));});
653099ff 5688 }
a0d0e21e
LW
5689 }
5690
1de06328
YO
5691 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5692 the "real" pattern. */
cf9788e3
RGS
5693 DEBUG_OPTIMISE_r({
5694 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5695 (IV)minlen, (IV)r->minlen);
cf9788e3 5696 });
de8c5301 5697 r->minlenret = minlen;
1de06328
YO
5698 if (r->minlen < minlen)
5699 r->minlen = minlen;
5700
b81d288d 5701 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5702 r->extflags |= RXf_GPOS_SEEN;
830247a4 5703 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5704 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5705 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5706 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5707 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5708 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5709 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5710 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5711 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5712 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5713 if (RExC_paren_names)
85fbaab2 5714 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5715 else
5daac39c 5716 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5717
7bd1e614 5718#ifdef STUPID_PATTERN_CHECKS
5509d87a 5719 if (RX_PRELEN(rx) == 0)
640f820d 5720 r->extflags |= RXf_NULL;
5509d87a 5721 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5722 /* XXX: this should happen BEFORE we compile */
5723 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5724 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5725 r->extflags |= RXf_WHITE;
5509d87a 5726 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5727 r->extflags |= RXf_START_ONLY;
f1b875a0 5728#else
5509d87a 5729 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5730 /* XXX: this should happen BEFORE we compile */
5731 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5732 else {
5733 regnode *first = ri->program + 1;
39aa8307 5734 U8 fop = OP(first);
f6d9469c
DM
5735
5736 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
640f820d 5737 r->extflags |= RXf_NULL;
f6d9469c 5738 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
7bd1e614 5739 r->extflags |= RXf_START_ONLY;
f6d9469c
DM
5740 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5741 && OP(regnext(first)) == END)
7bd1e614
YO
5742 r->extflags |= RXf_WHITE;
5743 }
f1b875a0 5744#endif
1f1031fe
YO
5745#ifdef DEBUGGING
5746 if (RExC_paren_names) {
af534a04 5747 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5748 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5749 } else
1f1031fe 5750#endif
cde0cee5 5751 ri->name_list_idx = 0;
1f1031fe 5752
40d049e4
YO
5753 if (RExC_recurse_count) {
5754 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5755 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5756 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5757 }
5758 }
f0ab9afb 5759 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5760 /* assume we don't need to swap parens around before we match */
5761
be8e71aa
YO
5762 DEBUG_DUMP_r({
5763 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5764 regdump(r);
5765 });
7122b237
YO
5766#ifdef RE_TRACK_PATTERN_OFFSETS
5767 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5768 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5769 U32 i;
5770 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5771 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5772 for (i = 1; i <= len; i++) {
7122b237 5773 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5774 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5775 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5776 }
5777 PerlIO_printf(Perl_debug_log, "\n");
5778 });
7122b237 5779#endif
288b8c02 5780 return rx;
a687059c
LW
5781}
5782
f9f4320a 5783#undef RE_ENGINE_PTR
3dab1dad 5784
93b32b6d 5785
81714fb9 5786SV*
192b9cd1
AB
5787Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5788 const U32 flags)
5789{
7918f24d
NC
5790 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5791
192b9cd1
AB
5792 PERL_UNUSED_ARG(value);
5793
f1b875a0 5794 if (flags & RXapif_FETCH) {
192b9cd1 5795 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5796 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5797 Perl_croak_no_modify(aTHX);
192b9cd1 5798 return NULL;
f1b875a0 5799 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5800 return reg_named_buff_exists(rx, key, flags)
5801 ? &PL_sv_yes
5802 : &PL_sv_no;
f1b875a0 5803 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5804 return reg_named_buff_all(rx, flags);
f1b875a0 5805 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5806 return reg_named_buff_scalar(rx, flags);
5807 } else {
5808 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5809 return NULL;
5810 }
5811}
5812
5813SV*
5814Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5815 const U32 flags)
5816{
7918f24d 5817 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5818 PERL_UNUSED_ARG(lastkey);
5819
f1b875a0 5820 if (flags & RXapif_FIRSTKEY)
192b9cd1 5821 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5822 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5823 return reg_named_buff_nextkey(rx, flags);
5824 else {
5825 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5826 return NULL;
5827 }
5828}
5829
5830SV*
288b8c02
NC
5831Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5832 const U32 flags)
81714fb9 5833{
44a2ac75
YO
5834 AV *retarray = NULL;
5835 SV *ret;
288b8c02 5836 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5837
5838 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5839
f1b875a0 5840 if (flags & RXapif_ALL)
44a2ac75 5841 retarray=newAV();
93b32b6d 5842
5daac39c
NC
5843 if (rx && RXp_PAREN_NAMES(rx)) {
5844 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5845 if (he_str) {
5846 IV i;
5847 SV* sv_dat=HeVAL(he_str);
5848 I32 *nums=(I32*)SvPVX(sv_dat);
5849 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5850 if ((I32)(rx->nparens) >= nums[i]
5851 && rx->offs[nums[i]].start != -1
5852 && rx->offs[nums[i]].end != -1)
93b32b6d 5853 {
49d7dfbc 5854 ret = newSVpvs("");
288b8c02 5855 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5856 if (!retarray)
5857 return ret;
5858 } else {
7402016d
AB
5859 if (retarray)
5860 ret = newSVsv(&PL_sv_undef);
93b32b6d 5861 }
ec83ea38 5862 if (retarray)
93b32b6d 5863 av_push(retarray, ret);
81714fb9 5864 }
93b32b6d 5865 if (retarray)
ad64d0ec 5866 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5867 }
5868 }
5869 return NULL;
5870}
5871
5872bool
288b8c02 5873Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5874 const U32 flags)
5875{
288b8c02 5876 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5877
5878 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5879
5daac39c 5880 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5881 if (flags & RXapif_ALL) {
5daac39c 5882 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5883 } else {
288b8c02 5884 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5885 if (sv) {
5886 SvREFCNT_dec(sv);
192b9cd1
AB
5887 return TRUE;
5888 } else {
5889 return FALSE;
5890 }
5891 }
5892 } else {
5893 return FALSE;
5894 }
5895}
5896
5897SV*
288b8c02 5898Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5899{
288b8c02 5900 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5901
5902 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5903
5daac39c
NC
5904 if ( rx && RXp_PAREN_NAMES(rx) ) {
5905 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5906
288b8c02 5907 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5908 } else {
5909 return FALSE;
5910 }
192b9cd1
AB
5911}
5912
5913SV*
288b8c02 5914Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5915{
288b8c02 5916 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5917 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5918
5919 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5920
5daac39c
NC
5921 if (rx && RXp_PAREN_NAMES(rx)) {
5922 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5923 HE *temphe;
5924 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5925 IV i;
5926 IV parno = 0;
5927 SV* sv_dat = HeVAL(temphe);
5928 I32 *nums = (I32*)SvPVX(sv_dat);
5929 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5930 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5931 rx->offs[nums[i]].start != -1 &&
5932 rx->offs[nums[i]].end != -1)
5933 {
5934 parno = nums[i];
5935 break;
5936 }
5937 }
f1b875a0 5938 if (parno || flags & RXapif_ALL) {
a663657d 5939 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5940 }
81714fb9
YO
5941 }
5942 }
44a2ac75
YO
5943 return NULL;
5944}
5945
192b9cd1 5946SV*
288b8c02 5947Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5948{
5949 SV *ret;
5950 AV *av;
5951 I32 length;
288b8c02 5952 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5953
7918f24d
NC
5954 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5955
5daac39c 5956 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5957 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5958 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5959 } else if (flags & RXapif_ONE) {
288b8c02 5960 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5961 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5962 length = av_len(av);
ec83ea38 5963 SvREFCNT_dec(ret);
192b9cd1
AB
5964 return newSViv(length + 1);
5965 } else {
5966 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5967 return NULL;
5968 }
5969 }
5970 return &PL_sv_undef;
5971}
5972
5973SV*
288b8c02 5974Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5975{
288b8c02 5976 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5977 AV *av = newAV();
5978
7918f24d
NC
5979 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5980
5daac39c
NC
5981 if (rx && RXp_PAREN_NAMES(rx)) {
5982 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5983 HE *temphe;
5984 (void)hv_iterinit(hv);
5985 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5986 IV i;
5987 IV parno = 0;
5988 SV* sv_dat = HeVAL(temphe);
5989 I32 *nums = (I32*)SvPVX(sv_dat);
5990 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5991 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5992 rx->offs[nums[i]].start != -1 &&
5993 rx->offs[nums[i]].end != -1)
5994 {
5995 parno = nums[i];
5996 break;
5997 }
5998 }
f1b875a0 5999 if (parno || flags & RXapif_ALL) {
a663657d 6000 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
6001 }
6002 }
6003 }
6004
ad64d0ec 6005 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
6006}
6007
49d7dfbc 6008void
288b8c02
NC
6009Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6010 SV * const sv)
44a2ac75 6011{
288b8c02 6012 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 6013 char *s = NULL;
a9d504c3 6014 I32 i = 0;
44a2ac75 6015 I32 s1, t1;
7918f24d
NC
6016
6017 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 6018
cde0cee5
YO
6019 if (!rx->subbeg) {
6020 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 6021 return;
cde0cee5
YO
6022 }
6023 else
f1b875a0 6024 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 6025 /* $` */
f0ab9afb 6026 i = rx->offs[0].start;
cde0cee5 6027 s = rx->subbeg;
44a2ac75
YO
6028 }
6029 else
f1b875a0 6030 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 6031 /* $' */
f0ab9afb
NC
6032 s = rx->subbeg + rx->offs[0].end;
6033 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
6034 }
6035 else
6036 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
6037 (s1 = rx->offs[paren].start) != -1 &&
6038 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
6039 {
6040 /* $& $1 ... */
6041 i = t1 - s1;
6042 s = rx->subbeg + s1;
cde0cee5
YO
6043 } else {
6044 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 6045 return;
cde0cee5
YO
6046 }
6047 assert(rx->sublen >= (s - rx->subbeg) + i );
6048 if (i >= 0) {
6049 const int oldtainted = PL_tainted;
6050 TAINT_NOT;
6051 sv_setpvn(sv, s, i);
6052 PL_tainted = oldtainted;
6053 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 6054 ? (RXp_MATCH_UTF8(rx)
cde0cee5 6055 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 6056 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
6057 {
6058 SvUTF8_on(sv);
6059 }
6060 else
6061 SvUTF8_off(sv);
6062 if (PL_tainting) {
07bc277f 6063 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
6064 if (SvTYPE(sv) >= SVt_PVMG) {
6065 MAGIC* const mg = SvMAGIC(sv);
6066 MAGIC* mgt;
6067 PL_tainted = 1;
6068 SvMAGIC_set(sv, mg->mg_moremagic);
6069 SvTAINT(sv);
6070 if ((mgt = SvMAGIC(sv))) {
6071 mg->mg_moremagic = mgt;
6072 SvMAGIC_set(sv, mg);
44a2ac75 6073 }
cde0cee5
YO
6074 } else {
6075 PL_tainted = 1;
6076 SvTAINT(sv);
6077 }
6078 } else
6079 SvTAINTED_off(sv);
44a2ac75 6080 }
81714fb9 6081 } else {
44a2ac75 6082 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 6083 return;
81714fb9
YO
6084 }
6085}
93b32b6d 6086
2fdbfb4d
AB
6087void
6088Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6089 SV const * const value)
6090{
7918f24d
NC
6091 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6092
2fdbfb4d
AB
6093 PERL_UNUSED_ARG(rx);
6094 PERL_UNUSED_ARG(paren);
6095 PERL_UNUSED_ARG(value);
6096
6097 if (!PL_localizing)
6ad8f254 6098 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
6099}
6100
6101I32
288b8c02 6102Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
6103 const I32 paren)
6104{
288b8c02 6105 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
6106 I32 i;
6107 I32 s1, t1;
6108
7918f24d
NC
6109 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6110
2fdbfb4d
AB
6111 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6112 switch (paren) {
192b9cd1 6113 /* $` / ${^PREMATCH} */
f1b875a0 6114 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
6115 if (rx->offs[0].start != -1) {
6116 i = rx->offs[0].start;
6117 if (i > 0) {
6118 s1 = 0;
6119 t1 = i;
6120 goto getlen;
6121 }
6122 }
6123 return 0;
192b9cd1 6124 /* $' / ${^POSTMATCH} */
f1b875a0 6125 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
6126 if (rx->offs[0].end != -1) {
6127 i = rx->sublen - rx->offs[0].end;
6128 if (i > 0) {
6129 s1 = rx->offs[0].end;
6130 t1 = rx->sublen;
6131 goto getlen;
6132 }
6133 }
6134 return 0;
192b9cd1
AB
6135 /* $& / ${^MATCH}, $1, $2, ... */
6136 default:
2fdbfb4d
AB
6137 if (paren <= (I32)rx->nparens &&
6138 (s1 = rx->offs[paren].start) != -1 &&
6139 (t1 = rx->offs[paren].end) != -1)
6140 {
6141 i = t1 - s1;
6142 goto getlen;
6143 } else {
6144 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 6145 report_uninit((const SV *)sv);
2fdbfb4d
AB
6146 return 0;
6147 }
6148 }
6149 getlen:
07bc277f 6150 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
6151 const char * const s = rx->subbeg + s1;
6152 const U8 *ep;
6153 STRLEN el;
6154
6155 i = t1 - s1;
6156 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6157 i = el;
6158 }
6159 return i;
6160}
6161
fe578d7f 6162SV*
49d7dfbc 6163Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 6164{
7918f24d 6165 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 6166 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
6167 if (0)
6168 return NULL;
6169 else
6170 return newSVpvs("Regexp");
fe578d7f 6171}
0a4db386 6172
894be9b7 6173/* Scans the name of a named buffer from the pattern.
0a4db386
YO
6174 * If flags is REG_RSN_RETURN_NULL returns null.
6175 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6176 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6177 * to the parsed name as looked up in the RExC_paren_names hash.
6178 * If there is an error throws a vFAIL().. type exception.
894be9b7 6179 */
0a4db386
YO
6180
6181#define REG_RSN_RETURN_NULL 0
6182#define REG_RSN_RETURN_NAME 1
6183#define REG_RSN_RETURN_DATA 2
6184
894be9b7 6185STATIC SV*
7918f24d
NC
6186S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6187{
894be9b7 6188 char *name_start = RExC_parse;
1f1031fe 6189
7918f24d
NC
6190 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6191
1f1031fe
YO
6192 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6193 /* skip IDFIRST by using do...while */
6194 if (UTF)
6195 do {
6196 RExC_parse += UTF8SKIP(RExC_parse);
6197 } while (isALNUM_utf8((U8*)RExC_parse));
6198 else
6199 do {
6200 RExC_parse++;
6201 } while (isALNUM(*RExC_parse));
894be9b7 6202 }
1f1031fe 6203
0a4db386 6204 if ( flags ) {
59cd0e26
NC
6205 SV* sv_name
6206 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6207 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
6208 if ( flags == REG_RSN_RETURN_NAME)
6209 return sv_name;
6210 else if (flags==REG_RSN_RETURN_DATA) {
6211 HE *he_str = NULL;
6212 SV *sv_dat = NULL;
6213 if ( ! sv_name ) /* should not happen*/
6214 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6215 if (RExC_paren_names)
6216 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6217 if ( he_str )
6218 sv_dat = HeVAL(he_str);
6219 if ( ! sv_dat )
6220 vFAIL("Reference to nonexistent named group");
6221 return sv_dat;
6222 }
6223 else {
5637ef5b
NC
6224 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6225 (unsigned long) flags);
0a4db386
YO
6226 }
6227 /* NOT REACHED */
894be9b7 6228 }
0a4db386 6229 return NULL;
894be9b7
YO
6230}
6231
3dab1dad
YO
6232#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6233 int rem=(int)(RExC_end - RExC_parse); \
6234 int cut; \
6235 int num; \
6236 int iscut=0; \
6237 if (rem>10) { \
6238 rem=10; \
6239 iscut=1; \
6240 } \
6241 cut=10-rem; \
6242 if (RExC_lastparse!=RExC_parse) \
6243 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6244 rem, RExC_parse, \
6245 cut + 4, \
6246 iscut ? "..." : "<" \
6247 ); \
6248 else \
6249 PerlIO_printf(Perl_debug_log,"%16s",""); \
6250 \
6251 if (SIZE_ONLY) \
3b57cd43 6252 num = RExC_size + 1; \
3dab1dad
YO
6253 else \
6254 num=REG_NODE_NUM(RExC_emit); \
6255 if (RExC_lastnum!=num) \
0a4db386 6256 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 6257 else \
0a4db386 6258 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
6259 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6260 (int)((depth*2)), "", \
3dab1dad
YO
6261 (funcname) \
6262 ); \
6263 RExC_lastnum=num; \
6264 RExC_lastparse=RExC_parse; \
6265})
6266
07be1b83
YO
6267
6268
3dab1dad
YO
6269#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6270 DEBUG_PARSE_MSG((funcname)); \
6271 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6272})
6bda09f9
YO
6273#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6274 DEBUG_PARSE_MSG((funcname)); \
6275 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6276})
d764b54e
KW
6277
6278/* This section of code defines the inversion list object and its methods. The
6279 * interfaces are highly subject to change, so as much as possible is static to
fa2d2a23
KW
6280 * this file. An inversion list is here implemented as a malloc'd C UV array
6281 * with some added info that is placed as UVs at the beginning in a header
6282 * portion. An inversion list for Unicode is an array of code points, sorted
6283 * by ordinal number. The zeroth element is the first code point in the list.
6284 * The 1th element is the first element beyond that not in the list. In other
6285 * words, the first range is
6286 * invlist[0]..(invlist[1]-1)
dbe7a391
KW
6287 * The other ranges follow. Thus every element whose index is divisible by two
6288 * marks the beginning of a range that is in the list, and every element not
fa2d2a23
KW
6289 * divisible by two marks the beginning of a range not in the list. A single
6290 * element inversion list that contains the single code point N generally
6291 * consists of two elements
6292 * invlist[0] == N
6293 * invlist[1] == N+1
6294 * (The exception is when N is the highest representable value on the
6295 * machine, in which case the list containing just it would be a single
6296 * element, itself. By extension, if the last range in the list extends to
6297 * infinity, then the first element of that range will be in the inversion list
6298 * at a position that is divisible by two, and is the final element in the
6299 * list.)
f1b67122
KW
6300 * Taking the complement (inverting) an inversion list is quite simple, if the
6301 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6302 * This implementation reserves an element at the beginning of each inversion list
6303 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6304 * beginning of the list is either that element if 0, or the next one if 1.
6305 *
fa2d2a23
KW
6306 * More about inversion lists can be found in "Unicode Demystified"
6307 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
97b14ce7 6308 * More will be coming when functionality is added later.
d764b54e 6309 *
fa2d2a23
KW
6310 * The inversion list data structure is currently implemented as an SV pointing
6311 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6312 * array of UV whose memory management is automatically handled by the existing
6313 * facilities for SV's.
62672576 6314 *
d764b54e
KW
6315 * Some of the methods should always be private to the implementation, and some
6316 * should eventually be made public */
6317
fa2d2a23
KW
6318#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6319#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6320
f59ff194
KW
6321/* This is a combination of a version and data structure type, so that one
6322 * being passed in can be validated to be an inversion list of the correct
6323 * vintage. When the structure of the header is changed, a new random number
6324 * in the range 2**31-1 should be generated and the new() method changed to
6325 * insert that at this location. Then, if an auxiliary program doesn't change
6326 * correspondingly, it will be discovered immediately */
6327#define INVLIST_VERSION_ID_OFFSET 2
6328#define INVLIST_VERSION_ID 1064334010
6329
6330/* For safety, when adding new elements, remember to #undef them at the end of
6331 * the inversion list code section */
6332
6333#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
f1b67122
KW
6334/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6335 * contains the code point U+00000, and begins here. If 1, the inversion list
6336 * doesn't contain U+0000, and it begins at the next UV in the array.
6337 * Inverting an inversion list consists of adding or removing the 0 at the
6338 * beginning of it. By reserving a space for that 0, inversion can be made
6339 * very fast */
6340
6341#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
97b14ce7
KW
6342
6343/* Internally things are UVs */
6344#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6345#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6346
d764b54e 6347#define INVLIST_INITIAL_LEN 10
d764b54e
KW
6348
6349PERL_STATIC_INLINE UV*
f1b67122
KW
6350S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6351{
6352 /* Returns a pointer to the first element in the inversion list's array.
6353 * This is called upon initialization of an inversion list. Where the
6354 * array begins depends on whether the list has the code point U+0000
6355 * in it or not. The other parameter tells it whether the code that
6356 * follows this call is about to put a 0 in the inversion list or not.
6357 * The first element is either the element with 0, if 0, or the next one,
6358 * if 1 */
6359
6360 UV* zero = get_invlist_zero_addr(invlist);
6361
6362 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6363
6364 /* Must be empty */
6365 assert(! *get_invlist_len_addr(invlist));
6366
6367 /* 1^1 = 0; 1^0 = 1 */
6368 *zero = 1 ^ will_have_0;
6369 return zero + *zero;
6370}
6371
6372PERL_STATIC_INLINE UV*
a25abddc 6373S_invlist_array(pTHX_ SV* const invlist)
d764b54e
KW
6374{
6375 /* Returns the pointer to the inversion list's array. Every time the
6376 * length changes, this needs to be called in case malloc or realloc moved
6377 * it */
6378
d764b54e
KW
6379 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6380
dbe7a391
KW
6381 /* Must not be empty. If these fail, you probably didn't check for <len>
6382 * being non-zero before trying to get the array */
f1b67122
KW
6383 assert(*get_invlist_len_addr(invlist));
6384 assert(*get_invlist_zero_addr(invlist) == 0
6385 || *get_invlist_zero_addr(invlist) == 1);
6386
6387 /* The array begins either at the element reserved for zero if the
6388 * list contains 0 (that element will be set to 0), or otherwise the next
6389 * element (in which case the reserved element will be set to 1). */
6390 return (UV *) (get_invlist_zero_addr(invlist)
6391 + *get_invlist_zero_addr(invlist));
d764b54e
KW
6392}
6393
61bdbf38
KW
6394PERL_STATIC_INLINE UV*
6395S_get_invlist_len_addr(pTHX_ SV* invlist)
6396{
6397 /* Return the address of the UV that contains the current number
6398 * of used elements in the inversion list */
6399
6400 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6401
6402 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6403}
6404
d764b54e 6405PERL_STATIC_INLINE UV
a25abddc 6406S_invlist_len(pTHX_ SV* const invlist)
d764b54e 6407{
dbe7a391
KW
6408 /* Returns the current number of elements stored in the inversion list's
6409 * array */
d764b54e 6410
d764b54e
KW
6411 PERL_ARGS_ASSERT_INVLIST_LEN;
6412
61bdbf38 6413 return *get_invlist_len_addr(invlist);
d764b54e
KW
6414}
6415
c56a880b
KW
6416PERL_STATIC_INLINE void
6417S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6418{
6419 /* Sets the current number of elements stored in the inversion list */
6420
6421 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6422
c56a880b 6423 *get_invlist_len_addr(invlist) = len;
f1b67122 6424
32f89ef6
KW
6425 assert(len <= SvLEN(invlist));
6426
f1b67122
KW
6427 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6428 /* If the list contains U+0000, that element is part of the header,
6429 * and should not be counted as part of the array. It will contain
6430 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6431 * subtract:
6432 * SvCUR_set(invlist,
6433 * TO_INTERNAL_SIZE(len
6434 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6435 * But, this is only valid if len is not 0. The consequences of not doing
9479a769
KW
6436 * this is that the memory allocation code may think that 1 more UV is
6437 * being used than actually is, and so might do an unnecessary grow. That
6438 * seems worth not bothering to make this the precise amount.
25e94a65
KW
6439 *
6440 * Note that when inverting, SvCUR shouldn't change */
c56a880b
KW
6441}
6442
d764b54e 6443PERL_STATIC_INLINE UV
a25abddc 6444S_invlist_max(pTHX_ SV* const invlist)
d764b54e
KW
6445{
6446 /* Returns the maximum number of elements storable in the inversion list's
6447 * array, without having to realloc() */
6448
d764b54e
KW
6449 PERL_ARGS_ASSERT_INVLIST_MAX;
6450
005b65ed 6451 return FROM_INTERNAL_SIZE(SvLEN(invlist));
d764b54e
KW
6452}
6453
f1b67122
KW
6454PERL_STATIC_INLINE UV*
6455S_get_invlist_zero_addr(pTHX_ SV* invlist)
6456{
6457 /* Return the address of the UV that is reserved to hold 0 if the inversion
6458 * list contains 0. This has to be the last element of the heading, as the
6459 * list proper starts with either it if 0, or the next element if not.
6460 * (But we force it to contain either 0 or 1) */
6461
6462 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6463
6464 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6465}
d764b54e 6466
8d69a883 6467#ifndef PERL_IN_XSUB_RE
a25abddc 6468SV*
d764b54e
KW
6469Perl__new_invlist(pTHX_ IV initial_size)
6470{
6471
6472 /* Return a pointer to a newly constructed inversion list, with enough
6473 * space to store 'initial_size' elements. If that number is negative, a
6474 * system default is used instead */
6475
97b14ce7
KW
6476 SV* new_list;
6477
d764b54e
KW
6478 if (initial_size < 0) {
6479 initial_size = INVLIST_INITIAL_LEN;
6480 }
6481
6482 /* Allocate the initial space */
97b14ce7
KW
6483 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6484 invlist_set_len(new_list, 0);
6485
f3dc70d1
KW
6486 /* Force iterinit() to be used to get iteration to work */
6487 *get_invlist_iter_addr(new_list) = UV_MAX;
6488
f1b67122
KW
6489 /* This should force a segfault if a method doesn't initialize this
6490 * properly */
6491 *get_invlist_zero_addr(new_list) = UV_MAX;
6492
f59ff194
KW
6493 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6494#if HEADER_LENGTH != 4
6495# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6496#endif
6497
97b14ce7 6498 return new_list;
d764b54e 6499}
8d69a883 6500#endif
d764b54e 6501
f59ff194
KW
6502STATIC SV*
6503S__new_invlist_C_array(pTHX_ UV* list)
6504{
6505 /* Return a pointer to a newly constructed inversion list, initialized to
6506 * point to <list>, which has to be in the exact correct inversion list
6507 * form, including internal fields. Thus this is a dangerous routine that
6508 * should not be used in the wrong hands */
6509
6510 SV* invlist = newSV_type(SVt_PV);
6511
6512 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6513
6514 SvPV_set(invlist, (char *) list);
6515 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6516 shouldn't touch it */
6517 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6518
6519 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6520 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6521 }
6522
6523 return invlist;
6524}
6525
d764b54e 6526STATIC void
a25abddc 6527S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
d764b54e 6528{
62672576 6529 /* Grow the maximum size of an inversion list */
d764b54e
KW
6530
6531 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6532
005b65ed 6533 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
d764b54e
KW
6534}
6535
6536PERL_STATIC_INLINE void
a25abddc 6537S_invlist_trim(pTHX_ SV* const invlist)
d764b54e
KW
6538{
6539 PERL_ARGS_ASSERT_INVLIST_TRIM;
6540
6541 /* Change the length of the inversion list to how many entries it currently
6542 * has */
6543
62672576 6544 SvPV_shrink_to_cur((SV *) invlist);
d764b54e
KW
6545}
6546
6547/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6548 * etc */
4096c37b
KW
6549#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6550#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
d764b54e 6551
8dc9348a
KW
6552#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6553
9d501133
KW
6554STATIC void
6555S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
d764b54e
KW
6556{
6557 /* Subject to change or removal. Append the range from 'start' to 'end' at
6558 * the end of the inversion list. The range must be above any existing
6559 * ones. */
6560
f1b67122 6561 UV* array;
d764b54e
KW
6562 UV max = invlist_max(invlist);
6563 UV len = invlist_len(invlist);
6564
6565 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6566
f1b67122
KW
6567 if (len == 0) { /* Empty lists must be initialized */
6568 array = _invlist_array_init(invlist, start == 0);
6569 }
6570 else {
d764b54e
KW
6571 /* Here, the existing list is non-empty. The current max entry in the
6572 * list is generally the first value not in the set, except when the
6573 * set extends to the end of permissible values, in which case it is
6574 * the first entry in that final set, and so this call is an attempt to
6575 * append out-of-order */
6576
6577 UV final_element = len - 1;
f1b67122 6578 array = invlist_array(invlist);
d764b54e 6579 if (array[final_element] > start
4096c37b 6580 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
d764b54e 6581 {
5637ef5b
NC
6582 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6583 array[final_element], start,
6584 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
d764b54e
KW
6585 }
6586
6587 /* Here, it is a legal append. If the new range begins with the first
6588 * value not in the set, it is extending the set, so the new first
6589 * value not in the set is one greater than the newly extended range.
6590 * */
6591 if (array[final_element] == start) {
6592 if (end != UV_MAX) {
6593 array[final_element] = end + 1;
6594 }
6595 else {
6596 /* But if the end is the maximum representable on the machine,
dbe7a391 6597 * just let the range that this would extend to have no end */
d764b54e
KW
6598 invlist_set_len(invlist, len - 1);
6599 }
6600 return;
6601 }
6602 }
6603
6604 /* Here the new range doesn't extend any existing set. Add it */
6605
6606 len += 2; /* Includes an element each for the start and end of range */
6607
6608 /* If overflows the existing space, extend, which may cause the array to be
6609 * moved */
6610 if (max < len) {
6611 invlist_extend(invlist, len);
f1b67122
KW
6612 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
6613 failure in invlist_array() */
d764b54e
KW
6614 array = invlist_array(invlist);
6615 }
f1b67122
KW
6616 else {
6617 invlist_set_len(invlist, len);
6618 }
d764b54e
KW
6619
6620 /* The next item on the list starts the range, the one after that is
6621 * one past the new range. */
6622 array[len - 2] = start;
6623 if (end != UV_MAX) {
6624 array[len - 1] = end + 1;
6625 }
6626 else {
6627 /* But if the end is the maximum representable on the machine, just let
6628 * the range have no end */
6629 invlist_set_len(invlist, len - 1);
6630 }
6631}
6632
9d501133
KW
6633#ifndef PERL_IN_XSUB_RE
6634
d5e82ecc
KW
6635STATIC IV
6636S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6637{
6638 /* Searches the inversion list for the entry that contains the input code
6639 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
6640 * return value is the index into the list's array of the range that
6641 * contains <cp> */
6642
6643 IV low = 0;
6644 IV high = invlist_len(invlist);
6645 const UV * const array = invlist_array(invlist);
6646
6647 PERL_ARGS_ASSERT_INVLIST_SEARCH;
6648
6649 /* If list is empty or the code point is before the first element, return
6650 * failure. */
6651 if (high == 0 || cp < array[0]) {
6652 return -1;
6653 }
6654
6655 /* Binary search. What we are looking for is <i> such that
6656 * array[i] <= cp < array[i+1]
6657 * The loop below converges on the i+1. */
6658 while (low < high) {
6659 IV mid = (low + high) / 2;
6660 if (array[mid] <= cp) {
6661 low = mid + 1;
6662
6663 /* We could do this extra test to exit the loop early.
6664 if (cp < array[low]) {
6665 return mid;
6666 }
6667 */
6668 }
6669 else { /* cp < array[mid] */
6670 high = mid;
6671 }
6672 }
6673
6674 return high - 1;
6675}
6676
86f766ab 6677void
b6a0ff33
KW
6678Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6679{
6680 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6681 * but is used when the swash has an inversion list. This makes this much
6682 * faster, as it uses a binary search instead of a linear one. This is
6683 * intimately tied to that function, and perhaps should be in utf8.c,
6684 * except it is intimately tied to inversion lists as well. It assumes
6685 * that <swatch> is all 0's on input */
6686
6687 UV current = start;
6688 const IV len = invlist_len(invlist);
6689 IV i;
6690 const UV * array;
6691
6692 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6693
6694 if (len == 0) { /* Empty inversion list */
6695 return;
6696 }
6697
6698 array = invlist_array(invlist);
6699
6700 /* Find which element it is */
6701 i = invlist_search(invlist, start);
6702
6703 /* We populate from <start> to <end> */
6704 while (current < end) {
6705 UV upper;
6706
6707 /* The inversion list gives the results for every possible code point
6708 * after the first one in the list. Only those ranges whose index is
6709 * even are ones that the inversion list matches. For the odd ones,
6710 * and if the initial code point is not in the list, we have to skip
6711 * forward to the next element */
6712 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6713 i++;
6714 if (i >= len) { /* Finished if beyond the end of the array */
6715 return;
6716 }
6717 current = array[i];
6718 if (current >= end) { /* Finished if beyond the end of what we
6719 are populating */
6720 return;
6721 }
6722 }
6723 assert(current >= start);
6724
6725 /* The current range ends one below the next one, except don't go past
6726 * <end> */
6727 i++;
6728 upper = (i < len && array[i] < end) ? array[i] : end;
6729
6730 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
6731 * for each code point in it */
6732 for (; current < upper; current++) {
6733 const STRLEN offset = (STRLEN)(current - start);
6734 swatch[offset >> 3] |= 1 << (offset & 7);
6735 }
6736
6737 /* Quit if at the end of the list */
6738 if (i >= len) {
6739
6740 /* But first, have to deal with the highest possible code point on
6741 * the platform. The previous code assumes that <end> is one
6742 * beyond where we want to populate, but that is impossible at the
6743 * platform's infinity, so have to handle it specially */
6744 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6745 {
6746 const STRLEN offset = (STRLEN)(end - start);
6747 swatch[offset >> 3] |= 1 << (offset & 7);
6748 }
6749 return;
6750 }
6751
6752 /* Advance to the next range, which will be for code points not in the
6753 * inversion list */
6754 current = array[i];
6755 }
6756
6757 return;
6758}
6759
8dc9348a 6760
b6a0ff33 6761void
164173a2 6762Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
d764b54e 6763{
4065ba03
KW
6764 /* Take the union of two inversion lists and point <output> to it. *output
6765 * should be defined upon input, and if it points to one of the two lists,
f45adb79
KW
6766 * the reference count to that list will be decremented. The first list,
6767 * <a>, may be NULL, in which case a copy of the second list is returned.
164173a2
KW
6768 * If <complement_b> is TRUE, the union is taken of the complement
6769 * (inversion) of <b> instead of b itself.
f45adb79 6770 *
d764b54e
KW
6771 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6772 * Richard Gillam, published by Addison-Wesley, and explained at some
6773 * length there. The preface says to incorporate its examples into your
6774 * code at your own risk.
6775 *
6776 * The algorithm is like a merge sort.
6777 *
6778 * XXX A potential performance improvement is to keep track as we go along
6779 * if only one of the inputs contributes to the result, meaning the other
6780 * is a subset of that one. In that case, we can skip the final copy and
a2995b7f
KW
6781 * return the larger of the input lists, but then outside code might need
6782 * to keep track of whether to free the input list or not */
d764b54e 6783
f1b67122
KW
6784 UV* array_a; /* a's array */
6785 UV* array_b;
6786 UV len_a; /* length of a's array */
6787 UV len_b;
d764b54e 6788
a25abddc 6789 SV* u; /* the resulting union */
d764b54e
KW
6790 UV* array_u;
6791 UV len_u;
6792
6793 UV i_a = 0; /* current index into a's array */
6794 UV i_b = 0;
6795 UV i_u = 0;
6796
6797 /* running count, as explained in the algorithm source book; items are
6798 * stopped accumulating and are output when the count changes to/from 0.
6799 * The count is incremented when we start a range that's in the set, and
6800 * decremented when we start a range that's not in the set. So its range
6801 * is 0 to 2. Only when the count is zero is something not in the set.
6802 */
6803 UV count = 0;
6804
164173a2 6805 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
60825692 6806 assert(a != b);
d764b54e 6807
f1b67122 6808 /* If either one is empty, the union is the other one */
f45adb79 6809 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
4065ba03 6810 if (*output == a) {
f45adb79
KW
6811 if (a != NULL) {
6812 SvREFCNT_dec(a);
6813 }
f1b67122 6814 }
4065ba03 6815 if (*output != b) {
f1b67122 6816 *output = invlist_clone(b);
164173a2
KW
6817 if (complement_b) {
6818 _invlist_invert(*output);
6819 }
dbe7a391 6820 } /* else *output already = b; */
f1b67122
KW
6821 return;
6822 }
6823 else if ((len_b = invlist_len(b)) == 0) {
4065ba03 6824 if (*output == b) {
f1b67122
KW
6825 SvREFCNT_dec(b);
6826 }
164173a2
KW
6827
6828 /* The complement of an empty list is a list that has everything in it,
6829 * so the union with <a> includes everything too */
6830 if (complement_b) {
6831 if (a == *output) {
6832 SvREFCNT_dec(a);
6833 }
6834 *output = _new_invlist(1);
6835 _append_range_to_invlist(*output, 0, UV_MAX);
6836 }
6837 else if (*output != a) {
6838 *output = invlist_clone(a);
6839 }
6840 /* else *output already = a; */
f1b67122
KW
6841 return;
6842 }
6843
6844 /* Here both lists exist and are non-empty */
6845 array_a = invlist_array(a);
6846 array_b = invlist_array(b);
6847
164173a2
KW
6848 /* If are to take the union of 'a' with the complement of b, set it
6849 * up so are looking at b's complement. */
6850 if (complement_b) {
6851
6852 /* To complement, we invert: if the first element is 0, remove it. To
6853 * do this, we just pretend the array starts one later, and clear the
6854 * flag as we don't have to do anything else later */
6855 if (array_b[0] == 0) {
6856 array_b++;
6857 len_b--;
6858 complement_b = FALSE;
6859 }
6860 else {
6861
6862 /* But if the first element is not zero, we unshift a 0 before the
6863 * array. The data structure reserves a space for that 0 (which
6864 * should be a '1' right now), so physical shifting is unneeded,
6865 * but temporarily change that element to 0. Before exiting the
6866 * routine, we must restore the element to '1' */
6867 array_b--;
6868 len_b++;
6869 array_b[0] = 0;
6870 }
6871 }
6872
d764b54e
KW
6873 /* Size the union for the worst case: that the sets are completely
6874 * disjoint */
6875 u = _new_invlist(len_a + len_b);
f1b67122
KW
6876
6877 /* Will contain U+0000 if either component does */
6878 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6879 || (len_b > 0 && array_b[0] == 0));
d764b54e
KW
6880
6881 /* Go through each list item by item, stopping when exhausted one of
6882 * them */
6883 while (i_a < len_a && i_b < len_b) {
6884 UV cp; /* The element to potentially add to the union's array */
6885 bool cp_in_set; /* is it in the the input list's set or not */
6886
6887 /* We need to take one or the other of the two inputs for the union.
6888 * Since we are merging two sorted lists, we take the smaller of the
6889 * next items. In case of a tie, we take the one that is in its set
6890 * first. If we took one not in the set first, it would decrement the
6891 * count, possibly to 0 which would cause it to be output as ending the
6892 * range, and the next time through we would take the same number, and
6893 * output it again as beginning the next range. By doing it the
6894 * opposite way, there is no possibility that the count will be
6895 * momentarily decremented to 0, and thus the two adjoining ranges will
6896 * be seamlessly merged. (In a tie and both are in the set or both not
6897 * in the set, it doesn't matter which we take first.) */
6898 if (array_a[i_a] < array_b[i_b]
4096c37b
KW
6899 || (array_a[i_a] == array_b[i_b]
6900 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
d764b54e 6901 {
4096c37b 6902 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
d764b54e
KW
6903 cp= array_a[i_a++];
6904 }
6905 else {
4096c37b 6906 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
d764b54e
KW
6907 cp= array_b[i_b++];
6908 }
6909
6910 /* Here, have chosen which of the two inputs to look at. Only output
6911 * if the running count changes to/from 0, which marks the
6912 * beginning/end of a range in that's in the set */
6913 if (cp_in_set) {
6914 if (count == 0) {
6915 array_u[i_u++] = cp;
6916 }
6917 count++;
6918 }
6919 else {
6920 count--;
6921 if (count == 0) {
6922 array_u[i_u++] = cp;
6923 }
6924 }
6925 }
6926
6927 /* Here, we are finished going through at least one of the lists, which
6928 * means there is something remaining in at most one. We check if the list
6929 * that hasn't been exhausted is positioned such that we are in the middle
bac5f0ae
KW
6930 * of a range in its set or not. (i_a and i_b point to the element beyond
6931 * the one we care about.) If in the set, we decrement 'count'; if 0, there
6932 * is potentially more to output.
d764b54e
KW
6933 * There are four cases:
6934 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6935 * in the union is entirely from the non-exhausted set.
6936 * 2) Both were in their sets, count is 2. Nothing further should
6937 * be output, as everything that remains will be in the exhausted
6938 * list's set, hence in the union; decrementing to 1 but not 0 insures
6939 * that
6940 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6941 * Nothing further should be output because the union includes
bac5f0ae 6942 * everything from the exhausted set. Not decrementing ensures that.
d764b54e
KW
6943 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6944 * decrementing to 0 insures that we look at the remainder of the
6945 * non-exhausted set */
4096c37b
KW
6946 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6947 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
d764b54e
KW
6948 {
6949 count--;
6950 }
6951
6952 /* The final length is what we've output so far, plus what else is about to
6953 * be output. (If 'count' is non-zero, then the input list we exhausted
6954 * has everything remaining up to the machine's limit in its set, and hence
6955 * in the union, so there will be no further output. */
6956 len_u = i_u;
6957 if (count == 0) {
6958 /* At most one of the subexpressions will be non-zero */
6959 len_u += (len_a - i_a) + (len_b - i_b);
6960 }
6961
6962 /* Set result to final length, which can change the pointer to array_u, so
6963 * re-find it */
6964 if (len_u != invlist_len(u)) {
6965 invlist_set_len(u, len_u);
6966 invlist_trim(u);
6967 array_u = invlist_array(u);
6968 }
6969
6970 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6971 * the other) ended with everything above it not in its set. That means
6972 * that the remaining part of the union is precisely the same as the
6973 * non-exhausted list, so can just copy it unchanged. (If both list were
6974 * exhausted at the same time, then the operations below will be both 0.)
6975 */
6976 if (count == 0) {
6977 IV copy_count; /* At most one will have a non-zero copy count */
6978 if ((copy_count = len_a - i_a) > 0) {
6979 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6980 }
6981 else if ((copy_count = len_b - i_b) > 0) {
6982 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6983 }
6984 }
6985
a2995b7f 6986 /* We may be removing a reference to one of the inputs */
4065ba03 6987 if (a == *output || b == *output) {
a2995b7f
KW
6988 SvREFCNT_dec(*output);
6989 }
6990
164173a2
KW
6991 /* If we've changed b, restore it */
6992 if (complement_b) {
6993 array_b[0] = 1;
6994 }
6995
a2995b7f
KW
6996 *output = u;
6997 return;
d764b54e
KW
6998}
6999
86f766ab 7000void
52ae8f7e 7001Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
d764b54e 7002{
4065ba03
KW
7003 /* Take the intersection of two inversion lists and point <i> to it. *i
7004 * should be defined upon input, and if it points to one of the two lists,
7005 * the reference count to that list will be decremented.
52ae8f7e
KW
7006 * If <complement_b> is TRUE, the result will be the intersection of <a>
7007 * and the complement (or inversion) of <b> instead of <b> directly.
7008 *
a2995b7f
KW
7009 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7010 * Richard Gillam, published by Addison-Wesley, and explained at some
7011 * length there. The preface says to incorporate its examples into your
7012 * code at your own risk. In fact, it had bugs
d764b54e
KW
7013 *
7014 * The algorithm is like a merge sort, and is essentially the same as the
7015 * union above
7016 */
7017
f1b67122
KW
7018 UV* array_a; /* a's array */
7019 UV* array_b;
7020 UV len_a; /* length of a's array */
7021 UV len_b;
d764b54e 7022
a25abddc 7023 SV* r; /* the resulting intersection */
d764b54e
KW
7024 UV* array_r;
7025 UV len_r;
7026
7027 UV i_a = 0; /* current index into a's array */
7028 UV i_b = 0;
7029 UV i_r = 0;
7030
7031 /* running count, as explained in the algorithm source book; items are
7032 * stopped accumulating and are output when the count changes to/from 2.
7033 * The count is incremented when we start a range that's in the set, and
7034 * decremented when we start a range that's not in the set. So its range
7035 * is 0 to 2. Only when the count is 2 is something in the intersection.
7036 */
7037 UV count = 0;
7038
52ae8f7e 7039 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
60825692 7040 assert(a != b);
d764b54e 7041
52ae8f7e 7042 /* Special case if either one is empty */
f1b67122
KW
7043 len_a = invlist_len(a);
7044 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
f1b67122 7045
52ae8f7e
KW
7046 if (len_a != 0 && complement_b) {
7047
7048 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7049 * be empty. Here, also we are using 'b's complement, which hence
7050 * must be every possible code point. Thus the intersection is
7051 * simply 'a'. */
7052 if (*i != a) {
7053 *i = invlist_clone(a);
7054
7055 if (*i == b) {
7056 SvREFCNT_dec(b);
7057 }
7058 }
7059 /* else *i is already 'a' */
7060 return;
7061 }
7062
7063 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7064 * intersection must be empty */
4065ba03 7065 if (*i == a) {
f1b67122
KW
7066 SvREFCNT_dec(a);
7067 }
4065ba03 7068 else if (*i == b) {
f1b67122
KW
7069 SvREFCNT_dec(b);
7070 }
2ea86699 7071 *i = _new_invlist(0);
f1b67122
KW
7072 return;
7073 }
7074
7075 /* Here both lists exist and are non-empty */
7076 array_a = invlist_array(a);
7077 array_b = invlist_array(b);
7078
52ae8f7e
KW
7079 /* If are to take the intersection of 'a' with the complement of b, set it
7080 * up so are looking at b's complement. */
7081 if (complement_b) {
7082
7083 /* To complement, we invert: if the first element is 0, remove it. To
7084 * do this, we just pretend the array starts one later, and clear the
7085 * flag as we don't have to do anything else later */
7086 if (array_b[0] == 0) {
7087 array_b++;
7088 len_b--;
7089 complement_b = FALSE;
7090 }
7091 else {
7092
7093 /* But if the first element is not zero, we unshift a 0 before the
7094 * array. The data structure reserves a space for that 0 (which
7095 * should be a '1' right now), so physical shifting is unneeded,
7096 * but temporarily change that element to 0. Before exiting the
7097 * routine, we must restore the element to '1' */
7098 array_b--;
7099 len_b++;
7100 array_b[0] = 0;
7101 }
7102 }
7103
d764b54e
KW
7104 /* Size the intersection for the worst case: that the intersection ends up
7105 * fragmenting everything to be completely disjoint */
7106 r= _new_invlist(len_a + len_b);
f1b67122
KW
7107
7108 /* Will contain U+0000 iff both components do */
7109 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7110 && len_b > 0 && array_b[0] == 0);
d764b54e
KW
7111
7112 /* Go through each list item by item, stopping when exhausted one of
7113 * them */
7114 while (i_a < len_a && i_b < len_b) {
7115 UV cp; /* The element to potentially add to the intersection's
7116 array */
7117 bool cp_in_set; /* Is it in the input list's set or not */
7118
c4a30257
KW
7119 /* We need to take one or the other of the two inputs for the
7120 * intersection. Since we are merging two sorted lists, we take the
7121 * smaller of the next items. In case of a tie, we take the one that
7122 * is not in its set first (a difference from the union algorithm). If
7123 * we took one in the set first, it would increment the count, possibly
7124 * to 2 which would cause it to be output as starting a range in the
7125 * intersection, and the next time through we would take that same
7126 * number, and output it again as ending the set. By doing it the
7127 * opposite of this, there is no possibility that the count will be
7128 * momentarily incremented to 2. (In a tie and both are in the set or
7129 * both not in the set, it doesn't matter which we take first.) */
d764b54e 7130 if (array_a[i_a] < array_b[i_b]
4096c37b
KW
7131 || (array_a[i_a] == array_b[i_b]
7132 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
d764b54e 7133 {
4096c37b 7134 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
d764b54e
KW
7135 cp= array_a[i_a++];
7136 }
7137 else {
4096c37b 7138 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
d764b54e
KW
7139 cp= array_b[i_b++];
7140 }
7141
7142 /* Here, have chosen which of the two inputs to look at. Only output
7143 * if the running count changes to/from 2, which marks the
7144 * beginning/end of a range that's in the intersection */
7145 if (cp_in_set) {
7146 count++;
7147 if (count == 2) {
7148 array_r[i_r++] = cp;
7149 }
7150 }
7151 else {
7152 if (count == 2) {
7153 array_r[i_r++] = cp;
7154 }
7155 count--;
7156 }
7157 }
7158
c4a30257
KW
7159 /* Here, we are finished going through at least one of the lists, which
7160 * means there is something remaining in at most one. We check if the list
7161 * that has been exhausted is positioned such that we are in the middle
7162 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7163 * the ones we care about.) There are four cases:
7164 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7165 * nothing left in the intersection.
7166 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7167 * above 2. What should be output is exactly that which is in the
7168 * non-exhausted set, as everything it has is also in the intersection
7169 * set, and everything it doesn't have can't be in the intersection
7170 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7171 * gets incremented to 2. Like the previous case, the intersection is
7172 * everything that remains in the non-exhausted set.
7173 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7174 * remains 1. And the intersection has nothing more. */
4096c37b
KW
7175 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7176 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
d764b54e 7177 {
c4a30257 7178 count++;
d764b54e
KW
7179 }
7180
7181 /* The final length is what we've output so far plus what else is in the
c4a30257 7182 * intersection. At most one of the subexpressions below will be non-zero */
d764b54e 7183 len_r = i_r;
c4a30257 7184 if (count >= 2) {
d764b54e
KW
7185 len_r += (len_a - i_a) + (len_b - i_b);
7186 }
7187
7188 /* Set result to final length, which can change the pointer to array_r, so
7189 * re-find it */
7190 if (len_r != invlist_len(r)) {
7191 invlist_set_len(r, len_r);
7192 invlist_trim(r);
7193 array_r = invlist_array(r);
7194 }
7195
7196 /* Finish outputting any remaining */
c4a30257 7197 if (count >= 2) { /* At most one will have a non-zero copy count */
d764b54e
KW
7198 IV copy_count;
7199 if ((copy_count = len_a - i_a) > 0) {
7200 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7201 }
7202 else if ((copy_count = len_b - i_b) > 0) {
7203 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7204 }
7205 }
7206
a2995b7f 7207 /* We may be removing a reference to one of the inputs */
4065ba03 7208 if (a == *i || b == *i) {
a2995b7f
KW
7209 SvREFCNT_dec(*i);
7210 }
7211
52ae8f7e
KW
7212 /* If we've changed b, restore it */
7213 if (complement_b) {
7214 array_b[0] = 1;
7215 }
7216
a2995b7f
KW
7217 *i = r;
7218 return;
d764b54e
KW
7219}
7220
9d501133
KW
7221SV*
7222Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
d764b54e
KW
7223{
7224 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7225 * set. A pointer to the inversion list is returned. This may actually be
c52a3e71
KW
7226 * a new list, in which case the passed in one has been destroyed. The
7227 * passed in inversion list can be NULL, in which case a new one is created
7228 * with just the one range in it */
d764b54e 7229
a25abddc 7230 SV* range_invlist;
c52a3e71 7231 UV len;
d764b54e 7232
c52a3e71
KW
7233 if (invlist == NULL) {
7234 invlist = _new_invlist(2);
7235 len = 0;
7236 }
7237 else {
7238 len = invlist_len(invlist);
7239 }
d764b54e
KW
7240
7241 /* If comes after the final entry, can just append it to the end */
7242 if (len == 0
7243 || start >= invlist_array(invlist)
7244 [invlist_len(invlist) - 1])
7245 {
7246 _append_range_to_invlist(invlist, start, end);
7247 return invlist;
7248 }
7249
7250 /* Here, can't just append things, create and return a new inversion list
7251 * which is the union of this range and the existing inversion list */
7252 range_invlist = _new_invlist(2);
7253 _append_range_to_invlist(range_invlist, start, end);
7254
37e85ffe 7255 _invlist_union(invlist, range_invlist, &invlist);
d764b54e 7256
0a89af2f 7257 /* The temporary can be freed */
318c430e 7258 SvREFCNT_dec(range_invlist);
d764b54e 7259
6d63a9fb 7260 return invlist;
d764b54e
KW
7261}
7262
9d501133
KW
7263#endif
7264
a25abddc
KW
7265PERL_STATIC_INLINE SV*
7266S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9d501133 7267 return _add_range_to_invlist(invlist, cp, cp);
c229b64c
KW
7268}
7269
3c234b35 7270#ifndef PERL_IN_XSUB_RE
86f766ab
KW
7271void
7272Perl__invlist_invert(pTHX_ SV* const invlist)
25e94a65
KW
7273{
7274 /* Complement the input inversion list. This adds a 0 if the list didn't
7275 * have a zero; removes it otherwise. As described above, the data
7276 * structure is set up so that this is very efficient */
7277
7278 UV* len_pos = get_invlist_len_addr(invlist);
7279
37e85ffe 7280 PERL_ARGS_ASSERT__INVLIST_INVERT;
25e94a65
KW
7281
7282 /* The inverse of matching nothing is matching everything */
7283 if (*len_pos == 0) {
7284 _append_range_to_invlist(invlist, 0, UV_MAX);
7285 return;
7286 }
7287
7288 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7289 * zero element was a 0, so it is being removed, so the length decrements
7290 * by 1; and vice-versa. SvCUR is unaffected */
7291 if (*get_invlist_zero_addr(invlist) ^= 1) {
7292 (*len_pos)--;
7293 }
7294 else {
7295 (*len_pos)++;
7296 }
7297}
89302fc2
KW
7298
7299void
7300Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7301{
7302 /* Complement the input inversion list (which must be a Unicode property,
7303 * all of which don't match above the Unicode maximum code point.) And
7304 * Perl has chosen to not have the inversion match above that either. This
7305 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7306 */
7307
7308 UV len;
7309 UV* array;
7310
7311 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7312
7313 _invlist_invert(invlist);
7314
7315 len = invlist_len(invlist);
7316
7317 if (len != 0) { /* If empty do nothing */
7318 array = invlist_array(invlist);
7319 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7320 /* Add 0x110000. First, grow if necessary */
7321 len++;
7322 if (invlist_max(invlist) < len) {
7323 invlist_extend(invlist, len);
7324 array = invlist_array(invlist);
7325 }
7326 invlist_set_len(invlist, len);
7327 array[len - 1] = PERL_UNICODE_MAX + 1;
7328 }
7329 else { /* Remove the 0x110000 */
7330 invlist_set_len(invlist, len - 1);
7331 }
7332 }
7333
7334 return;
7335}
3c234b35 7336#endif
25e94a65
KW
7337
7338PERL_STATIC_INLINE SV*
7339S_invlist_clone(pTHX_ SV* const invlist)
7340{
7341
7342 /* Return a new inversion list that is a copy of the input one, which is
7343 * unchanged */
7344
6c6c83ac
KW
7345 /* Need to allocate extra space to accommodate Perl's addition of a
7346 * trailing NUL to SvPV's, since it thinks they are always strings */
7347 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
6d47fb3d 7348 STRLEN length = SvCUR(invlist);
25e94a65
KW
7349
7350 PERL_ARGS_ASSERT_INVLIST_CLONE;
7351
6d47fb3d
KW
7352 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7353 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7354
25e94a65
KW
7355 return new_invlist;
7356}
7357
f3dc70d1
KW
7358PERL_STATIC_INLINE UV*
7359S_get_invlist_iter_addr(pTHX_ SV* invlist)
7360{
7361 /* Return the address of the UV that contains the current iteration
7362 * position */
7363
7364 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7365
7366 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7367}
7368
f59ff194
KW
7369PERL_STATIC_INLINE UV*
7370S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7371{
7372 /* Return the address of the UV that contains the version id. */
7373
7374 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7375
7376 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7377}
7378
f3dc70d1
KW
7379PERL_STATIC_INLINE void
7380S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7381{
7382 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7383
7384 *get_invlist_iter_addr(invlist) = 0;
7385}
7386
7387STATIC bool
7388S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7389{
dbe7a391
KW
7390 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7391 * This call sets in <*start> and <*end>, the next range in <invlist>.
7392 * Returns <TRUE> if successful and the next call will return the next
7393 * range; <FALSE> if was already at the end of the list. If the latter,
7394 * <*start> and <*end> are unchanged, and the next call to this function
7395 * will start over at the beginning of the list */
7396
f3dc70d1
KW
7397 UV* pos = get_invlist_iter_addr(invlist);
7398 UV len = invlist_len(invlist);
7399 UV *array;
7400
7401 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7402
7403 if (*pos >= len) {
7404 *pos = UV_MAX; /* Force iternit() to be required next time */
7405 return FALSE;
7406 }
7407
7408 array = invlist_array(invlist);
7409
7410 *start = array[(*pos)++];
7411
7412 if (*pos >= len) {
7413 *end = UV_MAX;
7414 }
7415 else {
7416 *end = array[(*pos)++] - 1;
7417 }
7418
7419 return TRUE;
7420}
7421
b2b97e77
KW
7422#ifndef PERL_IN_XSUB_RE
7423SV *
7424Perl__invlist_contents(pTHX_ SV* const invlist)
7425{
7426 /* Get the contents of an inversion list into a string SV so that they can
7427 * be printed out. It uses the format traditionally done for debug tracing
7428 */
7429
7430 UV start, end;
7431 SV* output = newSVpvs("\n");
7432
7433 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7434
7435 invlist_iterinit(invlist);
7436 while (invlist_iternext(invlist, &start, &end)) {
7437 if (end == UV_MAX) {
7438 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7439 }
7440 else if (end != start) {
7441 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7442 start, end);
7443 }
7444 else {
7445 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7446 }
7447 }
7448
7449 return output;
7450}
7451#endif
7452
768318b8
KW
7453#if 0
7454void
7455S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7456{
7457 /* Dumps out the ranges in an inversion list. The string 'header'
7458 * if present is output on a line before the first range */
7459
7460 UV start, end;
7461
7462 if (header && strlen(header)) {
7463 PerlIO_printf(Perl_debug_log, "%s\n", header);
7464 }
7465 invlist_iterinit(invlist);
7466 while (invlist_iternext(invlist, &start, &end)) {
7467 if (end == UV_MAX) {
7468 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7469 }
7470 else {
7471 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7472 }
7473 }
7474}
7475#endif
7476
97b14ce7 7477#undef HEADER_LENGTH
060b7a35 7478#undef INVLIST_INITIAL_LENGTH
005b65ed
KW
7479#undef TO_INTERNAL_SIZE
7480#undef FROM_INTERNAL_SIZE
f1b67122
KW
7481#undef INVLIST_LEN_OFFSET
7482#undef INVLIST_ZERO_OFFSET
f3dc70d1 7483#undef INVLIST_ITER_OFFSET
f59ff194 7484#undef INVLIST_VERSION_ID
060b7a35 7485
d764b54e
KW
7486/* End of inversion list object */
7487
a687059c
LW
7488/*
7489 - reg - regular expression, i.e. main body or parenthesized thing
7490 *
7491 * Caller must absorb opening parenthesis.
7492 *
7493 * Combining parenthesis handling with the base level of regular expression
7494 * is a trifle forced, but the need to tie the tails of the branches to what
7495 * follows makes it hard to avoid.
7496 */
07be1b83
YO
7497#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7498#ifdef DEBUGGING
7499#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7500#else
7501#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7502#endif
3dab1dad 7503
76e3520e 7504STATIC regnode *
3dab1dad 7505S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 7506 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 7507{
27da23d5 7508 dVAR;
c277df42
IZ
7509 register regnode *ret; /* Will be the head of the group. */
7510 register regnode *br;
7511 register regnode *lastbr;
cbbf8932 7512 register regnode *ender = NULL;
a0d0e21e 7513 register I32 parno = 0;
cbbf8932 7514 I32 flags;
f7819f85 7515 U32 oregflags = RExC_flags;
6136c704
AL
7516 bool have_branch = 0;
7517 bool is_open = 0;
594d7033
YO
7518 I32 freeze_paren = 0;
7519 I32 after_freeze = 0;
9d1d55b5
JP
7520
7521 /* for (?g), (?gc), and (?o) warnings; warning
7522 about (?c) will warn about (?g) -- japhy */
7523
6136c704
AL
7524#define WASTED_O 0x01
7525#define WASTED_G 0x02
7526#define WASTED_C 0x04
7527#define WASTED_GC (0x02|0x04)
cbbf8932 7528 I32 wastedflags = 0x00;
9d1d55b5 7529
fac92740 7530 char * parse_start = RExC_parse; /* MJD */
a28509cc 7531 char * const oregcomp_parse = RExC_parse;
a0d0e21e 7532
3dab1dad 7533 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7534
7535 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
7536 DEBUG_PARSE("reg ");
7537
821b33a5 7538 *flagp = 0; /* Tentatively. */
a0d0e21e 7539
9d1d55b5 7540
a0d0e21e
LW
7541 /* Make an OPEN node, if parenthesized. */
7542 if (paren) {
e2e6a0f1
YO
7543 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7544 char *start_verb = RExC_parse;
7545 STRLEN verb_len = 0;
7546 char *start_arg = NULL;
7547 unsigned char op = 0;
7548 int argok = 1;
7549 int internal_argval = 0; /* internal_argval is only useful if !argok */
7550 while ( *RExC_parse && *RExC_parse != ')' ) {
7551 if ( *RExC_parse == ':' ) {
7552 start_arg = RExC_parse + 1;
7553 break;
7554 }
7555 RExC_parse++;
7556 }
7557 ++start_verb;
7558 verb_len = RExC_parse - start_verb;
7559 if ( start_arg ) {
7560 RExC_parse++;
7561 while ( *RExC_parse && *RExC_parse != ')' )
7562 RExC_parse++;
7563 if ( *RExC_parse != ')' )
7564 vFAIL("Unterminated verb pattern argument");
7565 if ( RExC_parse == start_arg )
7566 start_arg = NULL;
7567 } else {
7568 if ( *RExC_parse != ')' )
7569 vFAIL("Unterminated verb pattern");
7570 }
5d458dd8 7571
e2e6a0f1
YO
7572 switch ( *start_verb ) {
7573 case 'A': /* (*ACCEPT) */
568a785a 7574 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
7575 op = ACCEPT;
7576 internal_argval = RExC_nestroot;
7577 }
7578 break;
7579 case 'C': /* (*COMMIT) */
568a785a 7580 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 7581 op = COMMIT;
e2e6a0f1
YO
7582 break;
7583 case 'F': /* (*FAIL) */
568a785a 7584 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
7585 op = OPFAIL;
7586 argok = 0;
7587 }
7588 break;
5d458dd8
YO
7589 case ':': /* (*:NAME) */
7590 case 'M': /* (*MARK:NAME) */
568a785a 7591 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 7592 op = MARKPOINT;
5d458dd8
YO
7593 argok = -1;
7594 }
7595 break;
7596 case 'P': /* (*PRUNE) */
568a785a 7597 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 7598 op = PRUNE;
e2e6a0f1 7599 break;
5d458dd8 7600 case 'S': /* (*SKIP) */
568a785a 7601 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
7602 op = SKIP;
7603 break;
7604 case 'T': /* (*THEN) */
7605 /* [19:06] <TimToady> :: is then */
568a785a 7606 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
7607 op = CUTGROUP;
7608 RExC_seen |= REG_SEEN_CUTGROUP;
7609 }
e2e6a0f1
YO
7610 break;
7611 }
7612 if ( ! op ) {
7613 RExC_parse++;
7614 vFAIL3("Unknown verb pattern '%.*s'",
7615 verb_len, start_verb);
7616 }
7617 if ( argok ) {
7618 if ( start_arg && internal_argval ) {
7619 vFAIL3("Verb pattern '%.*s' may not have an argument",
7620 verb_len, start_verb);
7621 } else if ( argok < 0 && !start_arg ) {
7622 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7623 verb_len, start_verb);
7624 } else {
7625 ret = reganode(pRExC_state, op, internal_argval);
7626 if ( ! internal_argval && ! SIZE_ONLY ) {
7627 if (start_arg) {
7628 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7629 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 7630 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
7631 ret->flags = 0;
7632 } else {
7633 ret->flags = 1;
7634 }
7635 }
7636 }
7637 if (!internal_argval)
7638 RExC_seen |= REG_SEEN_VERBARG;
7639 } else if ( start_arg ) {
7640 vFAIL3("Verb pattern '%.*s' may not have an argument",
7641 verb_len, start_verb);
7642 } else {
7643 ret = reg_node(pRExC_state, op);
7644 }
7645 nextchar(pRExC_state);
7646 return ret;
7647 } else
fac92740 7648 if (*RExC_parse == '?') { /* (?...) */
6136c704 7649 bool is_logical = 0;
a28509cc 7650 const char * const seqstart = RExC_parse;
fb85c044 7651 bool has_use_defaults = FALSE;
ca9dfc88 7652
830247a4
IZ
7653 RExC_parse++;
7654 paren = *RExC_parse++;
c277df42 7655 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 7656 switch (paren) {
894be9b7 7657
1f1031fe
YO
7658 case 'P': /* (?P...) variants for those used to PCRE/Python */
7659 paren = *RExC_parse++;
7660 if ( paren == '<') /* (?P<...>) named capture */
7661 goto named_capture;
7662 else if (paren == '>') { /* (?P>name) named recursion */
7663 goto named_recursion;
7664 }
7665 else if (paren == '=') { /* (?P=...) named backref */
7666 /* this pretty much dupes the code for \k<NAME> in regatom(), if
7667 you change this make sure you change that */
7668 char* name_start = RExC_parse;
7669 U32 num = 0;
7670 SV *sv_dat = reg_scan_name(pRExC_state,
7671 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7672 if (RExC_parse == name_start || *RExC_parse != ')')
7673 vFAIL2("Sequence %.3s... not terminated",parse_start);
7674
7675 if (!SIZE_ONLY) {
7676 num = add_data( pRExC_state, 1, "S" );
7677 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7678 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7679 }
7680 RExC_sawback = 1;
4444fd9f
KW
7681 ret = reganode(pRExC_state,
7682 ((! FOLD)
7683 ? NREF
2f7f8cb1
KW
7684 : (MORE_ASCII_RESTRICTED)
7685 ? NREFFA
7686 : (AT_LEAST_UNI_SEMANTICS)
7687 ? NREFFU
7688 : (LOC)
7689 ? NREFFL
7690 : NREFF),
4444fd9f 7691 num);
1f1031fe
YO
7692 *flagp |= HASWIDTH;
7693
7694 Set_Node_Offset(ret, parse_start+1);
7695 Set_Node_Cur_Length(ret); /* MJD */
7696
7697 nextchar(pRExC_state);
7698 return ret;
7699 }
57b84237
YO
7700 RExC_parse++;
7701 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7702 /*NOTREACHED*/
7703 case '<': /* (?<...) */
b81d288d 7704 if (*RExC_parse == '!')
c277df42 7705 paren = ',';
0a4db386 7706 else if (*RExC_parse != '=')
1f1031fe 7707 named_capture:
0a4db386 7708 { /* (?<...>) */
81714fb9 7709 char *name_start;
894be9b7 7710 SV *svname;
81714fb9
YO
7711 paren= '>';
7712 case '\'': /* (?'...') */
7713 name_start= RExC_parse;
0a4db386
YO
7714 svname = reg_scan_name(pRExC_state,
7715 SIZE_ONLY ? /* reverse test from the others */
7716 REG_RSN_RETURN_NAME :
7717 REG_RSN_RETURN_NULL);
57b84237
YO
7718 if (RExC_parse == name_start) {
7719 RExC_parse++;
7720 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7721 /*NOTREACHED*/
7722 }
81714fb9
YO
7723 if (*RExC_parse != paren)
7724 vFAIL2("Sequence (?%c... not terminated",
7725 paren=='>' ? '<' : paren);
7726 if (SIZE_ONLY) {
e62cc96a
YO
7727 HE *he_str;
7728 SV *sv_dat = NULL;
486ec47a 7729 if (!svname) /* shouldn't happen */
894be9b7
YO
7730 Perl_croak(aTHX_
7731 "panic: reg_scan_name returned NULL");
81714fb9
YO
7732 if (!RExC_paren_names) {
7733 RExC_paren_names= newHV();
ad64d0ec 7734 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
7735#ifdef DEBUGGING
7736 RExC_paren_name_list= newAV();
ad64d0ec 7737 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 7738#endif
81714fb9
YO
7739 }
7740 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 7741 if ( he_str )
81714fb9 7742 sv_dat = HeVAL(he_str);
e62cc96a 7743 if ( ! sv_dat ) {
81714fb9 7744 /* croak baby croak */
e62cc96a
YO
7745 Perl_croak(aTHX_
7746 "panic: paren_name hash element allocation failed");
7747 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
7748 /* (?|...) can mean we have dupes so scan to check
7749 its already been stored. Maybe a flag indicating
7750 we are inside such a construct would be useful,
7751 but the arrays are likely to be quite small, so
7752 for now we punt -- dmq */
7753 IV count = SvIV(sv_dat);
7754 I32 *pv = (I32*)SvPVX(sv_dat);
7755 IV i;
7756 for ( i = 0 ; i < count ; i++ ) {
7757 if ( pv[i] == RExC_npar ) {
7758 count = 0;
7759 break;
7760 }
7761 }
7762 if ( count ) {
7763 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7764 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7765 pv[count] = RExC_npar;
3a92e6ae 7766 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 7767 }
81714fb9
YO
7768 } else {
7769 (void)SvUPGRADE(sv_dat,SVt_PVNV);
7770 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7771 SvIOK_on(sv_dat);
3ec35e0f 7772 SvIV_set(sv_dat, 1);
e62cc96a 7773 }
1f1031fe 7774#ifdef DEBUGGING
17a3c617 7775 /* Yes this does cause a memory leak in debugging Perls */
1f1031fe
YO
7776 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7777 SvREFCNT_dec(svname);
7778#endif
e62cc96a 7779
81714fb9
YO
7780 /*sv_dump(sv_dat);*/
7781 }
7782 nextchar(pRExC_state);
7783 paren = 1;
7784 goto capturing_parens;
7785 }
7786 RExC_seen |= REG_SEEN_LOOKBEHIND;
b57e4118 7787 RExC_in_lookbehind++;
830247a4 7788 RExC_parse++;
fac92740 7789 case '=': /* (?=...) */
89c6a13e 7790 RExC_seen_zerolen++;
5c3fa2e7 7791 break;
fac92740 7792 case '!': /* (?!...) */
830247a4 7793 RExC_seen_zerolen++;
e2e6a0f1
YO
7794 if (*RExC_parse == ')') {
7795 ret=reg_node(pRExC_state, OPFAIL);
7796 nextchar(pRExC_state);
7797 return ret;
7798 }
594d7033
YO
7799 break;
7800 case '|': /* (?|...) */
7801 /* branch reset, behave like a (?:...) except that
7802 buffers in alternations share the same numbers */
7803 paren = ':';
7804 after_freeze = freeze_paren = RExC_npar;
7805 break;
fac92740
MJD
7806 case ':': /* (?:...) */
7807 case '>': /* (?>...) */
a0d0e21e 7808 break;
fac92740
MJD
7809 case '$': /* (?$...) */
7810 case '@': /* (?@...) */
8615cb43 7811 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 7812 break;
fac92740 7813 case '#': /* (?#...) */
830247a4
IZ
7814 while (*RExC_parse && *RExC_parse != ')')
7815 RExC_parse++;
7816 if (*RExC_parse != ')')
c277df42 7817 FAIL("Sequence (?#... not terminated");
830247a4 7818 nextchar(pRExC_state);
a0d0e21e
LW
7819 *flagp = TRYAGAIN;
7820 return NULL;
894be9b7
YO
7821 case '0' : /* (?0) */
7822 case 'R' : /* (?R) */
7823 if (*RExC_parse != ')')
6bda09f9 7824 FAIL("Sequence (?R) not terminated");
1a147d38 7825 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 7826 *flagp |= POSTPONED;
7f69552c
YO
7827 nextchar(pRExC_state);
7828 return ret;
7829 /*notreached*/
894be9b7
YO
7830 { /* named and numeric backreferences */
7831 I32 num;
894be9b7
YO
7832 case '&': /* (?&NAME) */
7833 parse_start = RExC_parse - 1;
1f1031fe 7834 named_recursion:
894be9b7 7835 {
0a4db386
YO
7836 SV *sv_dat = reg_scan_name(pRExC_state,
7837 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7838 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
7839 }
7840 goto gen_recurse_regop;
7841 /* NOT REACHED */
542fa716
YO
7842 case '+':
7843 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7844 RExC_parse++;
7845 vFAIL("Illegal pattern");
7846 }
7847 goto parse_recursion;
7848 /* NOT REACHED*/
7849 case '-': /* (?-1) */
7850 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7851 RExC_parse--; /* rewind to let it be handled later */
7852 goto parse_flags;
7853 }
7854 /*FALLTHROUGH */
6bda09f9
YO
7855 case '1': case '2': case '3': case '4': /* (?1) */
7856 case '5': case '6': case '7': case '8': case '9':
7857 RExC_parse--;
542fa716 7858 parse_recursion:
894be9b7
YO
7859 num = atoi(RExC_parse);
7860 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
7861 if (*RExC_parse == '-')
7862 RExC_parse++;
6bda09f9
YO
7863 while (isDIGIT(*RExC_parse))
7864 RExC_parse++;
7865 if (*RExC_parse!=')')
7866 vFAIL("Expecting close bracket");
686b73d4 7867
894be9b7 7868 gen_recurse_regop:
542fa716
YO
7869 if ( paren == '-' ) {
7870 /*
7871 Diagram of capture buffer numbering.
7872 Top line is the normal capture buffer numbers
3b753521 7873 Bottom line is the negative indexing as from
542fa716
YO
7874 the X (the (?-2))
7875
7876 + 1 2 3 4 5 X 6 7
7877 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7878 - 5 4 3 2 1 X x x
7879
7880 */
7881 num = RExC_npar + num;
7882 if (num < 1) {
7883 RExC_parse++;
7884 vFAIL("Reference to nonexistent group");
7885 }
7886 } else if ( paren == '+' ) {
7887 num = RExC_npar + num - 1;
7888 }
7889
1a147d38 7890 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
7891 if (!SIZE_ONLY) {
7892 if (num > (I32)RExC_rx->nparens) {
7893 RExC_parse++;
7894 vFAIL("Reference to nonexistent group");
7895 }
40d049e4 7896 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 7897 RExC_emit++;
226de585 7898 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 7899 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 7900 } else {
6bda09f9 7901 RExC_size++;
6bda09f9 7902 }
0a4db386 7903 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 7904 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
7905 Set_Node_Offset(ret, parse_start); /* MJD */
7906
a3b492c3 7907 *flagp |= POSTPONED;
6bda09f9
YO
7908 nextchar(pRExC_state);
7909 return ret;
894be9b7
YO
7910 } /* named and numeric backreferences */
7911 /* NOT REACHED */
7912
fac92740 7913 case '?': /* (??...) */
6136c704 7914 is_logical = 1;
57b84237
YO
7915 if (*RExC_parse != '{') {
7916 RExC_parse++;
7917 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7918 /*NOTREACHED*/
7919 }
a3b492c3 7920 *flagp |= POSTPONED;
830247a4 7921 paren = *RExC_parse++;
0f5d15d6 7922 /* FALL THROUGH */
fac92740 7923 case '{': /* (?{...}) */
c277df42 7924 {
2eccd3b2
NC
7925 I32 count = 1;
7926 U32 n = 0;
c277df42 7927 char c;
830247a4 7928 char *s = RExC_parse;
c277df42 7929
830247a4
IZ
7930 RExC_seen_zerolen++;
7931 RExC_seen |= REG_SEEN_EVAL;
7932 while (count && (c = *RExC_parse)) {
6136c704
AL
7933 if (c == '\\') {
7934 if (RExC_parse[1])
7935 RExC_parse++;
7936 }
b81d288d 7937 else if (c == '{')
c277df42 7938 count++;
b81d288d 7939 else if (c == '}')
c277df42 7940 count--;
830247a4 7941 RExC_parse++;
c277df42 7942 }
6136c704 7943 if (*RExC_parse != ')') {
686b73d4 7944 RExC_parse = s;
b45f050a
JF
7945 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7946 }
c277df42 7947 if (!SIZE_ONLY) {
f3548bdc 7948 PAD *pad;
6136c704
AL
7949 OP_4tree *sop, *rop;
7950 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 7951
569233ed
SB
7952 ENTER;
7953 Perl_save_re_context(aTHX);
d59a8b3e 7954 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
7955 sop->op_private |= OPpREFCOUNTED;
7956 /* re_dup will OpREFCNT_inc */
7957 OpREFCNT_set(sop, 1);
569233ed 7958 LEAVE;
c277df42 7959
830247a4 7960 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
7961 RExC_rxi->data->data[n] = (void*)rop;
7962 RExC_rxi->data->data[n+1] = (void*)sop;
7963 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 7964 SvREFCNT_dec(sv);
a0ed51b3 7965 }
e24b16f9 7966 else { /* First pass */
830247a4 7967 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 7968 && IN_PERL_RUNTIME)
2cd61cdb
IZ
7969 /* No compiled RE interpolated, has runtime
7970 components ===> unsafe. */
7971 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 7972 if (PL_tainting && PL_tainted)
cc6b7395 7973 FAIL("Eval-group in insecure regular expression");
54df2634 7974#if PERL_VERSION > 8
923e4eb5 7975 if (IN_PERL_COMPILETIME)
b5c19bd7 7976 PL_cv_has_eval = 1;
54df2634 7977#endif
c277df42 7978 }
b5c19bd7 7979
830247a4 7980 nextchar(pRExC_state);
6136c704 7981 if (is_logical) {
830247a4 7982 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
7983 if (!SIZE_ONLY)
7984 ret->flags = 2;
3dab1dad 7985 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 7986 /* deal with the length of this later - MJD */
0f5d15d6
IZ
7987 return ret;
7988 }
ccb2c380
MP
7989 ret = reganode(pRExC_state, EVAL, n);
7990 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7991 Set_Node_Offset(ret, parse_start);
7992 return ret;
c277df42 7993 }
fac92740 7994 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 7995 {
0a4db386 7996 int is_define= 0;
fac92740 7997 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
7998 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7999 || RExC_parse[1] == '<'
830247a4 8000 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42 8001 I32 flag;
686b73d4 8002
830247a4 8003 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
8004 if (!SIZE_ONLY)
8005 ret->flags = 1;
3dab1dad 8006 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 8007 goto insert_if;
b81d288d 8008 }
a0ed51b3 8009 }
0a4db386
YO
8010 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8011 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8012 {
8013 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8014 char *name_start= RExC_parse++;
2eccd3b2 8015 U32 num = 0;
0a4db386
YO
8016 SV *sv_dat=reg_scan_name(pRExC_state,
8017 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8018 if (RExC_parse == name_start || *RExC_parse != ch)
8019 vFAIL2("Sequence (?(%c... not terminated",
8020 (ch == '>' ? '<' : ch));
8021 RExC_parse++;
8022 if (!SIZE_ONLY) {
8023 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 8024 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 8025 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
8026 }
8027 ret = reganode(pRExC_state,NGROUPP,num);
8028 goto insert_if_check_paren;
8029 }
8030 else if (RExC_parse[0] == 'D' &&
8031 RExC_parse[1] == 'E' &&
8032 RExC_parse[2] == 'F' &&
8033 RExC_parse[3] == 'I' &&
8034 RExC_parse[4] == 'N' &&
8035 RExC_parse[5] == 'E')
8036 {
8037 ret = reganode(pRExC_state,DEFINEP,0);
8038 RExC_parse +=6 ;
8039 is_define = 1;
8040 goto insert_if_check_paren;
8041 }
8042 else if (RExC_parse[0] == 'R') {
8043 RExC_parse++;
8044 parno = 0;
8045 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8046 parno = atoi(RExC_parse++);
8047 while (isDIGIT(*RExC_parse))
8048 RExC_parse++;
8049 } else if (RExC_parse[0] == '&') {
8050 SV *sv_dat;
8051 RExC_parse++;
8052 sv_dat = reg_scan_name(pRExC_state,
8053 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8054 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8055 }
1a147d38 8056 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
8057 goto insert_if_check_paren;
8058 }
830247a4 8059 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 8060 /* (?(1)...) */
6136c704 8061 char c;
830247a4 8062 parno = atoi(RExC_parse++);
c277df42 8063
830247a4
IZ
8064 while (isDIGIT(*RExC_parse))
8065 RExC_parse++;
fac92740 8066 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 8067
0a4db386 8068 insert_if_check_paren:
830247a4 8069 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 8070 vFAIL("Switch condition not recognized");
c277df42 8071 insert_if:
3dab1dad
YO
8072 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8073 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 8074 if (br == NULL)
830247a4 8075 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 8076 else
3dab1dad 8077 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 8078 c = *nextchar(pRExC_state);
d1b80229
IZ
8079 if (flags&HASWIDTH)
8080 *flagp |= HASWIDTH;
c277df42 8081 if (c == '|') {
0a4db386
YO
8082 if (is_define)
8083 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 8084 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
8085 regbranch(pRExC_state, &flags, 1,depth+1);
8086 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
8087 if (flags&HASWIDTH)
8088 *flagp |= HASWIDTH;
830247a4 8089 c = *nextchar(pRExC_state);
a0ed51b3
LW
8090 }
8091 else
c277df42
IZ
8092 lastbr = NULL;
8093 if (c != ')')
8615cb43 8094 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 8095 ender = reg_node(pRExC_state, TAIL);
3dab1dad 8096 REGTAIL(pRExC_state, br, ender);
c277df42 8097 if (lastbr) {
3dab1dad
YO
8098 REGTAIL(pRExC_state, lastbr, ender);
8099 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
8100 }
8101 else
3dab1dad 8102 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
8103 RExC_size++; /* XXX WHY do we need this?!!
8104 For large programs it seems to be required
8105 but I can't figure out why. -- dmq*/
c277df42 8106 return ret;
a0ed51b3
LW
8107 }
8108 else {
830247a4 8109 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
8110 }
8111 }
1b1626e4 8112 case 0:
830247a4 8113 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 8114 vFAIL("Sequence (? incomplete");
1b1626e4 8115 break;
85508812
KW
8116 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8117 that follow */
fb85c044
KW
8118 has_use_defaults = TRUE;
8119 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
e40e74fe
KW
8120 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8121 ? REGEX_UNICODE_CHARSET
8122 : REGEX_DEPENDS_CHARSET);
fb85c044 8123 goto parse_flags;
a0d0e21e 8124 default:
cde0cee5
YO
8125 --RExC_parse;
8126 parse_flags: /* (?i) */
8127 {
8128 U32 posflags = 0, negflags = 0;
8129 U32 *flagsp = &posflags;
f6a766d5 8130 char has_charset_modifier = '\0';
96f54887
KW
8131 regex_charset cs = get_regex_charset(RExC_flags);
8132 if (cs == REGEX_DEPENDS_CHARSET
8133 && (RExC_utf8 || RExC_uni_semantics))
8134 {
8135 cs = REGEX_UNICODE_CHARSET;
8136 }
cde0cee5
YO
8137
8138 while (*RExC_parse) {
8139 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
8140 /* (?g), (?gc) and (?o) are useless here
8141 and must be globally applied -- japhy */
cde0cee5
YO
8142 switch (*RExC_parse) {
8143 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec 8144 case LOCALE_PAT_MOD:
f6a766d5
KW
8145 if (has_charset_modifier) {
8146 goto excess_modifier;
8147 }
8148 else if (flagsp == &negflags) {
9442e3b8 8149 goto neg_modifier;
9de15fec 8150 }
a62b1201 8151 cs = REGEX_LOCALE_CHARSET;
f6a766d5 8152 has_charset_modifier = LOCALE_PAT_MOD;
4624b182 8153 RExC_contains_locale = 1;
9de15fec
KW
8154 break;
8155 case UNICODE_PAT_MOD:
f6a766d5
KW
8156 if (has_charset_modifier) {
8157 goto excess_modifier;
8158 }
8159 else if (flagsp == &negflags) {
9442e3b8 8160 goto neg_modifier;
9de15fec 8161 }
a62b1201 8162 cs = REGEX_UNICODE_CHARSET;
f6a766d5 8163 has_charset_modifier = UNICODE_PAT_MOD;
9de15fec 8164 break;
cfaf538b 8165 case ASCII_RESTRICT_PAT_MOD:
f6a766d5 8166 if (flagsp == &negflags) {
9442e3b8 8167 goto neg_modifier;
cfaf538b 8168 }
f6a766d5
KW
8169 if (has_charset_modifier) {
8170 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8171 goto excess_modifier;
8172 }
2f7f8cb1 8173 /* Doubled modifier implies more restricted */
f6a766d5
KW
8174 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8175 }
2f7f8cb1
KW
8176 else {
8177 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8178 }
f6a766d5 8179 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
cfaf538b 8180 break;
50e91148 8181 case DEPENDS_PAT_MOD:
9442e3b8 8182 if (has_use_defaults) {
9de15fec 8183 goto fail_modifiers;
f6a766d5 8184 }
9442e3b8
KW
8185 else if (flagsp == &negflags) {
8186 goto neg_modifier;
8187 }
f6a766d5
KW
8188 else if (has_charset_modifier) {
8189 goto excess_modifier;
9de15fec 8190 }
7b98bc43
KW
8191
8192 /* The dual charset means unicode semantics if the
8193 * pattern (or target, not known until runtime) are
e40e74fe
KW
8194 * utf8, or something in the pattern indicates unicode
8195 * semantics */
8196 cs = (RExC_utf8 || RExC_uni_semantics)
a62b1201
KW
8197 ? REGEX_UNICODE_CHARSET
8198 : REGEX_DEPENDS_CHARSET;
f6a766d5 8199 has_charset_modifier = DEPENDS_PAT_MOD;
9de15fec 8200 break;
f6a766d5
KW
8201 excess_modifier:
8202 RExC_parse++;
8203 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
0c96c706 8204 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
f6a766d5
KW
8205 }
8206 else if (has_charset_modifier == *(RExC_parse - 1)) {
0c96c706 8207 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
f6a766d5
KW
8208 }
8209 else {
0c96c706 8210 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
f6a766d5
KW
8211 }
8212 /*NOTREACHED*/
9442e3b8
KW
8213 neg_modifier:
8214 RExC_parse++;
8215 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8216 /*NOTREACHED*/
f7819f85
A
8217 case ONCE_PAT_MOD: /* 'o' */
8218 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 8219 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 8220 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
8221 if (! (wastedflags & wflagbit) ) {
8222 wastedflags |= wflagbit;
8223 vWARN5(
8224 RExC_parse + 1,
8225 "Useless (%s%c) - %suse /%c modifier",
8226 flagsp == &negflags ? "?-" : "?",
8227 *RExC_parse,
8228 flagsp == &negflags ? "don't " : "",
8229 *RExC_parse
8230 );
8231 }
8232 }
cde0cee5
YO
8233 break;
8234
f7819f85 8235 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 8236 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
8237 if (! (wastedflags & WASTED_C) ) {
8238 wastedflags |= WASTED_GC;
9d1d55b5
JP
8239 vWARN3(
8240 RExC_parse + 1,
8241 "Useless (%sc) - %suse /gc modifier",
8242 flagsp == &negflags ? "?-" : "?",
8243 flagsp == &negflags ? "don't " : ""
8244 );
8245 }
8246 }
cde0cee5 8247 break;
f7819f85 8248 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 8249 if (flagsp == &negflags) {
668c081a
NC
8250 if (SIZE_ONLY)
8251 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
8252 } else {
8253 *flagsp |= RXf_PMf_KEEPCOPY;
8254 }
8255 break;
8256 case '-':
3b753521 8257 /* A flag is a default iff it is following a minus, so
fb85c044
KW
8258 * if there is a minus, it means will be trying to
8259 * re-specify a default which is an error */
8260 if (has_use_defaults || flagsp == &negflags) {
9de15fec 8261 fail_modifiers:
57b84237
YO
8262 RExC_parse++;
8263 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8264 /*NOTREACHED*/
8265 }
cde0cee5
YO
8266 flagsp = &negflags;
8267 wastedflags = 0; /* reset so (?g-c) warns twice */
8268 break;
8269 case ':':
8270 paren = ':';
8271 /*FALLTHROUGH*/
8272 case ')':
8273 RExC_flags |= posflags;
8274 RExC_flags &= ~negflags;
a62b1201 8275 set_regex_charset(&RExC_flags, cs);
f7819f85
A
8276 if (paren != ':') {
8277 oregflags |= posflags;
8278 oregflags &= ~negflags;
a62b1201 8279 set_regex_charset(&oregflags, cs);
f7819f85 8280 }
cde0cee5
YO
8281 nextchar(pRExC_state);
8282 if (paren != ':') {
8283 *flagp = TRYAGAIN;
8284 return NULL;
8285 } else {
8286 ret = NULL;
8287 goto parse_rest;
8288 }
8289 /*NOTREACHED*/
8290 default:
cde0cee5
YO
8291 RExC_parse++;
8292 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8293 /*NOTREACHED*/
8294 }
830247a4 8295 ++RExC_parse;
48c036b1 8296 }
cde0cee5 8297 }} /* one for the default block, one for the switch */
a0d0e21e 8298 }
fac92740 8299 else { /* (...) */
81714fb9 8300 capturing_parens:
830247a4
IZ
8301 parno = RExC_npar;
8302 RExC_npar++;
e2e6a0f1 8303
830247a4 8304 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
8305 if (!SIZE_ONLY ){
8306 if (!RExC_nestroot)
8307 RExC_nestroot = parno;
c009da3d
YO
8308 if (RExC_seen & REG_SEEN_RECURSE
8309 && !RExC_open_parens[parno-1])
8310 {
e2e6a0f1 8311 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
8312 "Setting open paren #%"IVdf" to %d\n",
8313 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
8314 RExC_open_parens[parno-1]= ret;
8315 }
6bda09f9 8316 }
fac92740
MJD
8317 Set_Node_Length(ret, 1); /* MJD */
8318 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 8319 is_open = 1;
a0d0e21e 8320 }
a0ed51b3 8321 }
fac92740 8322 else /* ! paren */
a0d0e21e 8323 ret = NULL;
cde0cee5
YO
8324
8325 parse_rest:
a0d0e21e 8326 /* Pick up the branches, linking them together. */
fac92740 8327 parse_start = RExC_parse; /* MJD */
3dab1dad 8328 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e 8329
fac92740 8330 /* branch_len = (paren != 0); */
2af232bd 8331
a0d0e21e
LW
8332 if (br == NULL)
8333 return(NULL);
830247a4
IZ
8334 if (*RExC_parse == '|') {
8335 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 8336 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 8337 }
fac92740 8338 else { /* MJD */
6bda09f9 8339 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
8340 Set_Node_Length(br, paren != 0);
8341 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8342 }
c277df42
IZ
8343 have_branch = 1;
8344 if (SIZE_ONLY)
830247a4 8345 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
8346 }
8347 else if (paren == ':') {
c277df42
IZ
8348 *flagp |= flags&SIMPLE;
8349 }
6136c704 8350 if (is_open) { /* Starts with OPEN. */
3dab1dad 8351 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
8352 }
8353 else if (paren != '?') /* Not Conditional */
a0d0e21e 8354 ret = br;
8ae10a67 8355 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 8356 lastbr = br;
830247a4
IZ
8357 while (*RExC_parse == '|') {
8358 if (!SIZE_ONLY && RExC_extralen) {
8359 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 8360 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
8361 }
8362 if (SIZE_ONLY)
830247a4
IZ
8363 RExC_extralen += 2; /* Account for LONGJMP. */
8364 nextchar(pRExC_state);
594d7033
YO
8365 if (freeze_paren) {
8366 if (RExC_npar > after_freeze)
8367 after_freeze = RExC_npar;
8368 RExC_npar = freeze_paren;
8369 }
3dab1dad 8370 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 8371
a687059c 8372 if (br == NULL)
a0d0e21e 8373 return(NULL);
3dab1dad 8374 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 8375 lastbr = br;
8ae10a67 8376 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
8377 }
8378
c277df42
IZ
8379 if (have_branch || paren != ':') {
8380 /* Make a closing node, and hook it on the end. */
8381 switch (paren) {
8382 case ':':
830247a4 8383 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
8384 break;
8385 case 1:
830247a4 8386 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
8387 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8388 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8389 "Setting close paren #%"IVdf" to %d\n",
8390 (IV)parno, REG_NODE_NUM(ender)));
8391 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
8392 if (RExC_nestroot == parno)
8393 RExC_nestroot = 0;
40d049e4 8394 }
fac92740
MJD
8395 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8396 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
8397 break;
8398 case '<':
c277df42
IZ
8399 case ',':
8400 case '=':
8401 case '!':
c277df42 8402 *flagp &= ~HASWIDTH;
821b33a5
IZ
8403 /* FALL THROUGH */
8404 case '>':
830247a4 8405 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
8406 break;
8407 case 0:
830247a4 8408 ender = reg_node(pRExC_state, END);
40d049e4
YO
8409 if (!SIZE_ONLY) {
8410 assert(!RExC_opend); /* there can only be one! */
8411 RExC_opend = ender;
8412 }
c277df42
IZ
8413 break;
8414 }
eaf3ca90 8415 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 8416
9674d46a 8417 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
8418 if (depth==1)
8419 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8420
c277df42 8421 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
8422 for (br = ret; br; br = regnext(br)) {
8423 const U8 op = PL_regkind[OP(br)];
8424 if (op == BRANCH) {
07be1b83 8425 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
8426 }
8427 else if (op == BRANCHJ) {
07be1b83 8428 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 8429 }
c277df42
IZ
8430 }
8431 }
a0d0e21e 8432 }
c277df42
IZ
8433
8434 {
e1ec3a88
AL
8435 const char *p;
8436 static const char parens[] = "=!<,>";
c277df42
IZ
8437
8438 if (paren && (p = strchr(parens, paren))) {
eb160463 8439 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
8440 int flag = (p - parens) > 1;
8441
8442 if (paren == '>')
8443 node = SUSPEND, flag = 0;
6bda09f9 8444 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
8445 Set_Node_Cur_Length(ret);
8446 Set_Node_Offset(ret, parse_start + 1);
c277df42 8447 ret->flags = flag;
07be1b83 8448 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 8449 }
a0d0e21e
LW
8450 }
8451
8452 /* Check for proper termination. */
ce3e6498 8453 if (paren) {
e2509266 8454 RExC_flags = oregflags;
830247a4
IZ
8455 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8456 RExC_parse = oregcomp_parse;
380a0633 8457 vFAIL("Unmatched (");
ce3e6498 8458 }
a0ed51b3 8459 }
830247a4
IZ
8460 else if (!paren && RExC_parse < RExC_end) {
8461 if (*RExC_parse == ')') {
8462 RExC_parse++;
380a0633 8463 vFAIL("Unmatched )");
a0ed51b3
LW
8464 }
8465 else
b45f050a 8466 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
8467 /* NOTREACHED */
8468 }
b57e4118
KW
8469
8470 if (RExC_in_lookbehind) {
8471 RExC_in_lookbehind--;
8472 }
fd4be6f0 8473 if (after_freeze > RExC_npar)
594d7033 8474 RExC_npar = after_freeze;
a0d0e21e 8475 return(ret);
a687059c
LW
8476}
8477
8478/*
8479 - regbranch - one alternative of an | operator
8480 *
8481 * Implements the concatenation operator.
8482 */
76e3520e 8483STATIC regnode *
3dab1dad 8484S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 8485{
97aff369 8486 dVAR;
c277df42
IZ
8487 register regnode *ret;
8488 register regnode *chain = NULL;
8489 register regnode *latest;
8490 I32 flags = 0, c = 0;
3dab1dad 8491 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8492
8493 PERL_ARGS_ASSERT_REGBRANCH;
8494
3dab1dad 8495 DEBUG_PARSE("brnc");
02daf0ab 8496
b81d288d 8497 if (first)
c277df42
IZ
8498 ret = NULL;
8499 else {
b81d288d 8500 if (!SIZE_ONLY && RExC_extralen)
830247a4 8501 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 8502 else {
830247a4 8503 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
8504 Set_Node_Length(ret, 1);
8505 }
c277df42 8506 }
686b73d4 8507
b81d288d 8508 if (!first && SIZE_ONLY)
830247a4 8509 RExC_extralen += 1; /* BRANCHJ */
b81d288d 8510
c277df42 8511 *flagp = WORST; /* Tentatively. */
a0d0e21e 8512
830247a4
IZ
8513 RExC_parse--;
8514 nextchar(pRExC_state);
8515 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 8516 flags &= ~TRYAGAIN;
3dab1dad 8517 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
8518 if (latest == NULL) {
8519 if (flags & TRYAGAIN)
8520 continue;
8521 return(NULL);
a0ed51b3
LW
8522 }
8523 else if (ret == NULL)
c277df42 8524 ret = latest;
8ae10a67 8525 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 8526 if (chain == NULL) /* First piece. */
a0d0e21e
LW
8527 *flagp |= flags&SPSTART;
8528 else {
830247a4 8529 RExC_naughty++;
3dab1dad 8530 REGTAIL(pRExC_state, chain, latest);
a687059c 8531 }
a0d0e21e 8532 chain = latest;
c277df42
IZ
8533 c++;
8534 }
8535 if (chain == NULL) { /* Loop ran zero times. */
830247a4 8536 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
8537 if (ret == NULL)
8538 ret = chain;
8539 }
8540 if (c == 1) {
8541 *flagp |= flags&SIMPLE;
a0d0e21e 8542 }
a687059c 8543
d4c19fe8 8544 return ret;
a687059c
LW
8545}
8546
8547/*
8548 - regpiece - something followed by possible [*+?]
8549 *
8550 * Note that the branching code sequences used for ? and the general cases
8551 * of * and + are somewhat optimized: they use the same NOTHING node as
8552 * both the endmarker for their branch list and the body of the last branch.
8553 * It might seem that this node could be dispensed with entirely, but the
8554 * endmarker role is not redundant.
8555 */
76e3520e 8556STATIC regnode *
3dab1dad 8557S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 8558{
97aff369 8559 dVAR;
c277df42 8560 register regnode *ret;
a0d0e21e
LW
8561 register char op;
8562 register char *next;
8563 I32 flags;
1df70142 8564 const char * const origparse = RExC_parse;
a0d0e21e 8565 I32 min;
c277df42 8566 I32 max = REG_INFTY;
f19a8d85 8567#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8568 char *parse_start;
f19a8d85 8569#endif
10edeb5d 8570 const char *maxpos = NULL;
3dab1dad 8571 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8572
8573 PERL_ARGS_ASSERT_REGPIECE;
8574
3dab1dad 8575 DEBUG_PARSE("piec");
a0d0e21e 8576
3dab1dad 8577 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
8578 if (ret == NULL) {
8579 if (flags & TRYAGAIN)
8580 *flagp |= TRYAGAIN;
8581 return(NULL);
8582 }
8583
830247a4 8584 op = *RExC_parse;
a0d0e21e 8585
830247a4 8586 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 8587 maxpos = NULL;
f19a8d85 8588#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8589 parse_start = RExC_parse; /* MJD */
f19a8d85 8590#endif
830247a4 8591 next = RExC_parse + 1;
a0d0e21e
LW
8592 while (isDIGIT(*next) || *next == ',') {
8593 if (*next == ',') {
8594 if (maxpos)
8595 break;
8596 else
8597 maxpos = next;
a687059c 8598 }
a0d0e21e
LW
8599 next++;
8600 }
8601 if (*next == '}') { /* got one */
8602 if (!maxpos)
8603 maxpos = next;
830247a4
IZ
8604 RExC_parse++;
8605 min = atoi(RExC_parse);
a0d0e21e
LW
8606 if (*maxpos == ',')
8607 maxpos++;
8608 else
830247a4 8609 maxpos = RExC_parse;
a0d0e21e
LW
8610 max = atoi(maxpos);
8611 if (!max && *maxpos != '0')
c277df42
IZ
8612 max = REG_INFTY; /* meaning "infinity" */
8613 else if (max >= REG_INFTY)
8615cb43 8614 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
8615 RExC_parse = next;
8616 nextchar(pRExC_state);
a0d0e21e
LW
8617
8618 do_curly:
8619 if ((flags&SIMPLE)) {
830247a4 8620 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 8621 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
8622 Set_Node_Offset(ret, parse_start+1); /* MJD */
8623 Set_Node_Cur_Length(ret);
a0d0e21e
LW
8624 }
8625 else {
3dab1dad 8626 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
8627
8628 w->flags = 0;
3dab1dad 8629 REGTAIL(pRExC_state, ret, w);
830247a4 8630 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
8631 reginsert(pRExC_state, LONGJMP,ret, depth+1);
8632 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
8633 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
8634 }
6bda09f9 8635 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
8636 /* MJD hk */
8637 Set_Node_Offset(ret, parse_start+1);
2af232bd 8638 Set_Node_Length(ret,
fac92740 8639 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 8640
830247a4 8641 if (!SIZE_ONLY && RExC_extralen)
c277df42 8642 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 8643 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 8644 if (SIZE_ONLY)
830247a4
IZ
8645 RExC_whilem_seen++, RExC_extralen += 3;
8646 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 8647 }
c277df42 8648 ret->flags = 0;
a0d0e21e
LW
8649
8650 if (min > 0)
821b33a5
IZ
8651 *flagp = WORST;
8652 if (max > 0)
8653 *flagp |= HASWIDTH;
8fa23287 8654 if (max < min)
8615cb43 8655 vFAIL("Can't do {n,m} with n > m");
c277df42 8656 if (!SIZE_ONLY) {
eb160463
GS
8657 ARG1_SET(ret, (U16)min);
8658 ARG2_SET(ret, (U16)max);
a687059c 8659 }
a687059c 8660
a0d0e21e 8661 goto nest_check;
a687059c 8662 }
a0d0e21e 8663 }
a687059c 8664
a0d0e21e
LW
8665 if (!ISMULT1(op)) {
8666 *flagp = flags;
a687059c 8667 return(ret);
a0d0e21e 8668 }
bb20fd44 8669
c277df42 8670#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
8671
8672 /* if this is reinstated, don't forget to put this back into perldiag:
8673
8674 =item Regexp *+ operand could be empty at {#} in regex m/%s/
8675
8676 (F) The part of the regexp subject to either the * or + quantifier
8677 could match an empty string. The {#} shows in the regular
8678 expression about where the problem was discovered.
8679
8680 */
8681
bb20fd44 8682 if (!(flags&HASWIDTH) && op != '?')
b45f050a 8683 vFAIL("Regexp *+ operand could be empty");
b81d288d 8684#endif
bb20fd44 8685
f19a8d85 8686#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8687 parse_start = RExC_parse;
f19a8d85 8688#endif
830247a4 8689 nextchar(pRExC_state);
a0d0e21e 8690
821b33a5 8691 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
8692
8693 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 8694 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 8695 ret->flags = 0;
830247a4 8696 RExC_naughty += 4;
a0d0e21e
LW
8697 }
8698 else if (op == '*') {
8699 min = 0;
8700 goto do_curly;
a0ed51b3
LW
8701 }
8702 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 8703 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 8704 ret->flags = 0;
830247a4 8705 RExC_naughty += 3;
a0d0e21e
LW
8706 }
8707 else if (op == '+') {
8708 min = 1;
8709 goto do_curly;
a0ed51b3
LW
8710 }
8711 else if (op == '?') {
a0d0e21e
LW
8712 min = 0; max = 1;
8713 goto do_curly;
8714 }
8715 nest_check:
668c081a
NC
8716 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8717 ckWARN3reg(RExC_parse,
8718 "%.*s matches null string many times",
8719 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8720 origparse);
a0d0e21e
LW
8721 }
8722
b9b4dddf 8723 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 8724 nextchar(pRExC_state);
6bda09f9 8725 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 8726 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 8727 }
b9b4dddf
YO
8728#ifndef REG_ALLOW_MINMOD_SUSPEND
8729 else
8730#endif
8731 if (RExC_parse < RExC_end && *RExC_parse == '+') {
8732 regnode *ender;
8733 nextchar(pRExC_state);
8734 ender = reg_node(pRExC_state, SUCCEED);
8735 REGTAIL(pRExC_state, ret, ender);
8736 reginsert(pRExC_state, SUSPEND, ret, depth+1);
8737 ret->flags = 0;
8738 ender = reg_node(pRExC_state, TAIL);
8739 REGTAIL(pRExC_state, ret, ender);
8740 /*ret= ender;*/
8741 }
8742
8743 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 8744 RExC_parse++;
b45f050a
JF
8745 vFAIL("Nested quantifiers");
8746 }
a0d0e21e
LW
8747
8748 return(ret);
a687059c
LW
8749}
8750
fc8cd66c 8751
9d64099b 8752/* reg_namedseq(pRExC_state,UVp, UV depth)
fc8cd66c
YO
8753
8754 This is expected to be called by a parser routine that has
afefe6bf 8755 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
8756 expected to point at the first char following the N at the time
8757 of the call.
ff3f963a
KW
8758
8759 The \N may be inside (indicated by valuep not being NULL) or outside a
8760 character class.
8761
8762 \N may begin either a named sequence, or if outside a character class, mean
8763 to match a non-newline. For non single-quoted regexes, the tokenizer has
8764 attempted to decide which, and in the case of a named sequence converted it
8765 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8766 where c1... are the characters in the sequence. For single-quoted regexes,
8767 the tokenizer passes the \N sequence through unchanged; this code will not
8768 attempt to determine this nor expand those. The net effect is that if the
8769 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8770 signals that this \N occurrence means to match a non-newline.
8771
8772 Only the \N{U+...} form should occur in a character class, for the same
8773 reason that '.' inside a character class means to just match a period: it
8774 just doesn't make sense.
fc8cd66c
YO
8775
8776 If valuep is non-null then it is assumed that we are parsing inside
8777 of a charclass definition and the first codepoint in the resolved
8778 string is returned via *valuep and the routine will return NULL.
8779 In this mode if a multichar string is returned from the charnames
ff3f963a 8780 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
8781 sequence will be examined. If the string returned is zero length
8782 then the value of *valuep is undefined and NON-NULL will
8783 be returned to indicate failure. (This will NOT be a valid pointer
8784 to a regnode.)
8785
ff3f963a
KW
8786 If valuep is null then it is assumed that we are parsing normal text and a
8787 new EXACT node is inserted into the program containing the resolved string,
8788 and a pointer to the new node is returned. But if the string is zero length
8789 a NOTHING node is emitted instead.
afefe6bf 8790
fc8cd66c 8791 On success RExC_parse is set to the char following the endbrace.
ff3f963a 8792 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
8793 */
8794STATIC regnode *
9d64099b 8795S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
fc8cd66c 8796{
c3c41406 8797 char * endbrace; /* '}' following the name */
fc8cd66c 8798 regnode *ret = NULL;
c3c41406 8799 char* p;
ff3f963a
KW
8800
8801 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8802
8803 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
8804
8805 GET_RE_DEBUG_FLAGS;
c3c41406
KW
8806
8807 /* The [^\n] meaning of \N ignores spaces and comments under the /x
8808 * modifier. The other meaning does not */
8809 p = (RExC_flags & RXf_PMf_EXTENDED)
8810 ? regwhite( pRExC_state, RExC_parse )
8811 : RExC_parse;
7918f24d 8812
ff3f963a 8813 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
8814 * [^\n]. The former is assumed when it can't be the latter. */
8815 if (*p != '{' || regcurly(p)) {
8816 RExC_parse = p;
ff3f963a 8817 if (valuep) {
afefe6bf 8818 /* no bare \N in a charclass */
ff3f963a
KW
8819 vFAIL("\\N in a character class must be a named character: \\N{...}");
8820 }
afefe6bf
RGS
8821 nextchar(pRExC_state);
8822 ret = reg_node(pRExC_state, REG_ANY);
8823 *flagp |= HASWIDTH|SIMPLE;
8824 RExC_naughty++;
8825 RExC_parse--;
8826 Set_Node_Length(ret, 1); /* MJD */
8827 return ret;
fc8cd66c 8828 }
a4893424 8829
c3c41406
KW
8830 /* Here, we have decided it should be a named sequence */
8831
8832 /* The test above made sure that the next real character is a '{', but
8833 * under the /x modifier, it could be separated by space (or a comment and
8834 * \n) and this is not allowed (for consistency with \x{...} and the
8835 * tokenizer handling of \N{NAME}). */
8836 if (*RExC_parse != '{') {
8837 vFAIL("Missing braces on \\N{}");
8838 }
8839
ff3f963a 8840 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
8841
8842 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8843 || ! (endbrace == RExC_parse /* nothing between the {} */
8844 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
8845 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8846 {
8847 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
8848 vFAIL("\\N{NAME} must be resolved by the lexer");
8849 }
8850
ff3f963a
KW
8851 if (endbrace == RExC_parse) { /* empty: \N{} */
8852 if (! valuep) {
8853 RExC_parse = endbrace + 1;
8854 return reg_node(pRExC_state,NOTHING);
a4893424 8855 }
fc8cd66c 8856
ff3f963a
KW
8857 if (SIZE_ONLY) {
8858 ckWARNreg(RExC_parse,
8859 "Ignoring zero length \\N{} in character class"
8860 );
8861 RExC_parse = endbrace + 1;
8862 }
8863 *valuep = 0;
8864 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 8865 }
ff3f963a 8866
62fed28b 8867 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
8868 RExC_parse += 2; /* Skip past the 'U+' */
8869
8870 if (valuep) { /* In a bracketed char class */
8871 /* We only pay attention to the first char of
8872 multichar strings being returned. I kinda wonder
8873 if this makes sense as it does change the behaviour
8874 from earlier versions, OTOH that behaviour was broken
8875 as well. XXX Solution is to recharacterize as
8876 [rest-of-class]|multi1|multi2... */
8877
8878 STRLEN length_of_hex;
8879 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8880 | PERL_SCAN_DISALLOW_PREFIX
8881 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8882
37820adc
KW
8883 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8884 if (endchar < endbrace) {
ff3f963a
KW
8885 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8886 }
ff3f963a
KW
8887
8888 length_of_hex = (STRLEN)(endchar - RExC_parse);
8889 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8890
8891 /* The tokenizer should have guaranteed validity, but it's possible to
8892 * bypass it by using single quoting, so check */
c3c41406
KW
8893 if (length_of_hex == 0
8894 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8895 {
8896 RExC_parse += length_of_hex; /* Includes all the valid */
8897 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
8898 ? UTF8SKIP(RExC_parse)
8899 : 1;
8900 /* Guard against malformed utf8 */
8901 if (RExC_parse >= endchar) RExC_parse = endchar;
8902 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
8903 }
8904
8905 RExC_parse = endbrace + 1;
8906 if (endchar == endbrace) return NULL;
8907
8908 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 8909 }
ff3f963a 8910 else { /* Not a char class */
e2a7e165
KW
8911
8912 /* What is done here is to convert this to a sub-pattern of the form
8913 * (?:\x{char1}\x{char2}...)
8914 * and then call reg recursively. That way, it retains its atomicness,
8915 * while not having to worry about special handling that some code
8916 * points may have. toke.c has converted the original Unicode values
8917 * to native, so that we can just pass on the hex values unchanged. We
8918 * do have to set a flag to keep recoding from happening in the
8919 * recursion */
8920
8921 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8922 STRLEN len;
ff3f963a
KW
8923 char *endchar; /* Points to '.' or '}' ending cur char in the input
8924 stream */
e2a7e165
KW
8925 char *orig_end = RExC_end;
8926
8927 while (RExC_parse < endbrace) {
ff3f963a
KW
8928
8929 /* Code points are separated by dots. If none, there is only one
8930 * code point, and is terminated by the brace */
37820adc 8931 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a 8932
e2a7e165
KW
8933 /* Convert to notation the rest of the code understands */
8934 sv_catpv(substitute_parse, "\\x{");
8935 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8936 sv_catpv(substitute_parse, "}");
ff3f963a
KW
8937
8938 /* Point to the beginning of the next character in the sequence. */
8939 RExC_parse = endchar + 1;
ff3f963a 8940 }
e2a7e165 8941 sv_catpv(substitute_parse, ")");
ff3f963a 8942
e2a7e165 8943 RExC_parse = SvPV(substitute_parse, len);
ff3f963a 8944
e2a7e165
KW
8945 /* Don't allow empty number */
8946 if (len < 8) {
8947 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a 8948 }
e2a7e165 8949 RExC_end = RExC_parse + len;
ff3f963a 8950
e2a7e165
KW
8951 /* The values are Unicode, and therefore not subject to recoding */
8952 RExC_override_recoding = 1;
8953
8954 ret = reg(pRExC_state, 1, flagp, depth+1);
8955
8956 RExC_parse = endbrace;
8957 RExC_end = orig_end;
8958 RExC_override_recoding = 0;
ff3f963a 8959
ff3f963a
KW
8960 nextchar(pRExC_state);
8961 }
8962
8963 return ret;
fc8cd66c
YO
8964}
8965
8966
9e08bc66
TS
8967/*
8968 * reg_recode
8969 *
8970 * It returns the code point in utf8 for the value in *encp.
8971 * value: a code value in the source encoding
8972 * encp: a pointer to an Encode object
8973 *
8974 * If the result from Encode is not a single character,
8975 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8976 */
8977STATIC UV
8978S_reg_recode(pTHX_ const char value, SV **encp)
8979{
8980 STRLEN numlen = 1;
59cd0e26 8981 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 8982 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
8983 const STRLEN newlen = SvCUR(sv);
8984 UV uv = UNICODE_REPLACEMENT;
8985
7918f24d
NC
8986 PERL_ARGS_ASSERT_REG_RECODE;
8987
9e08bc66
TS
8988 if (newlen)
8989 uv = SvUTF8(sv)
8990 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8991 : *(U8*)s;
8992
8993 if (!newlen || numlen != newlen) {
8994 uv = UNICODE_REPLACEMENT;
c86f7df5 8995 *encp = NULL;
9e08bc66
TS
8996 }
8997 return uv;
8998}
8999
fc8cd66c 9000
a687059c
LW
9001/*
9002 - regatom - the lowest level
ee9b8eae
YO
9003
9004 Try to identify anything special at the start of the pattern. If there
9005 is, then handle it as required. This may involve generating a single regop,
9006 such as for an assertion; or it may involve recursing, such as to
9007 handle a () structure.
9008
9009 If the string doesn't start with something special then we gobble up
9010 as much literal text as we can.
9011
9012 Once we have been able to handle whatever type of thing started the
9013 sequence, we return.
9014
9015 Note: we have to be careful with escapes, as they can be both literal
9016 and special, and in the case of \10 and friends can either, depending
486ec47a 9017 on context. Specifically there are two separate switches for handling
ee9b8eae
YO
9018 escape sequences, with the one for handling literal escapes requiring
9019 a dummy entry for all of the special escapes that are actually handled
9020 by the other.
9021*/
9022
76e3520e 9023STATIC regnode *
3dab1dad 9024S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 9025{
97aff369 9026 dVAR;
cbbf8932 9027 register regnode *ret = NULL;
a0d0e21e 9028 I32 flags;
45948336 9029 char *parse_start = RExC_parse;
980866de 9030 U8 op;
3dab1dad
YO
9031 GET_RE_DEBUG_FLAGS_DECL;
9032 DEBUG_PARSE("atom");
a0d0e21e
LW
9033 *flagp = WORST; /* Tentatively. */
9034
7918f24d 9035 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 9036
a0d0e21e 9037tryagain:
f9a79580 9038 switch ((U8)*RExC_parse) {
a0d0e21e 9039 case '^':
830247a4
IZ
9040 RExC_seen_zerolen++;
9041 nextchar(pRExC_state);
bbe252da 9042 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 9043 ret = reg_node(pRExC_state, MBOL);
bbe252da 9044 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 9045 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 9046 else
830247a4 9047 ret = reg_node(pRExC_state, BOL);
fac92740 9048 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
9049 break;
9050 case '$':
830247a4 9051 nextchar(pRExC_state);
b81d288d 9052 if (*RExC_parse)
830247a4 9053 RExC_seen_zerolen++;
bbe252da 9054 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 9055 ret = reg_node(pRExC_state, MEOL);
bbe252da 9056 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 9057 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 9058 else
830247a4 9059 ret = reg_node(pRExC_state, EOL);
fac92740 9060 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
9061 break;
9062 case '.':
830247a4 9063 nextchar(pRExC_state);
bbe252da 9064 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
9065 ret = reg_node(pRExC_state, SANY);
9066 else
9067 ret = reg_node(pRExC_state, REG_ANY);
9068 *flagp |= HASWIDTH|SIMPLE;
830247a4 9069 RExC_naughty++;
fac92740 9070 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
9071 break;
9072 case '[':
b45f050a 9073 {
3dab1dad
YO
9074 char * const oregcomp_parse = ++RExC_parse;
9075 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
9076 if (*RExC_parse != ']') {
9077 RExC_parse = oregcomp_parse;
b45f050a
JF
9078 vFAIL("Unmatched [");
9079 }
830247a4 9080 nextchar(pRExC_state);
a0d0e21e 9081 *flagp |= HASWIDTH|SIMPLE;
fac92740 9082 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 9083 break;
b45f050a 9084 }
a0d0e21e 9085 case '(':
830247a4 9086 nextchar(pRExC_state);
3dab1dad 9087 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 9088 if (ret == NULL) {
bf93d4cc 9089 if (flags & TRYAGAIN) {
830247a4 9090 if (RExC_parse == RExC_end) {
bf93d4cc
GS
9091 /* Make parent create an empty node if needed. */
9092 *flagp |= TRYAGAIN;
9093 return(NULL);
9094 }
a0d0e21e 9095 goto tryagain;
bf93d4cc 9096 }
a0d0e21e
LW
9097 return(NULL);
9098 }
a3b492c3 9099 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
9100 break;
9101 case '|':
9102 case ')':
9103 if (flags & TRYAGAIN) {
9104 *flagp |= TRYAGAIN;
9105 return NULL;
9106 }
b45f050a 9107 vFAIL("Internal urp");
a0d0e21e
LW
9108 /* Supposed to be caught earlier. */
9109 break;
9110 case '?':
9111 case '+':
9112 case '*':
830247a4 9113 RExC_parse++;
b45f050a 9114 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
9115 break;
9116 case '\\':
ee9b8eae
YO
9117 /* Special Escapes
9118
9119 This switch handles escape sequences that resolve to some kind
9120 of special regop and not to literal text. Escape sequnces that
9121 resolve to literal text are handled below in the switch marked
9122 "Literal Escapes".
9123
9124 Every entry in this switch *must* have a corresponding entry
9125 in the literal escape switch. However, the opposite is not
9126 required, as the default for this switch is to jump to the
9127 literal text handling code.
9128 */
a0a388a1 9129 switch ((U8)*++RExC_parse) {
ee9b8eae 9130 /* Special Escapes */
a0d0e21e 9131 case 'A':
830247a4
IZ
9132 RExC_seen_zerolen++;
9133 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 9134 *flagp |= SIMPLE;
ee9b8eae 9135 goto finish_meta_pat;
a0d0e21e 9136 case 'G':
830247a4
IZ
9137 ret = reg_node(pRExC_state, GPOS);
9138 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 9139 *flagp |= SIMPLE;
ee9b8eae
YO
9140 goto finish_meta_pat;
9141 case 'K':
9142 RExC_seen_zerolen++;
9143 ret = reg_node(pRExC_state, KEEPS);
9144 *flagp |= SIMPLE;
37923168
RGS
9145 /* XXX:dmq : disabling in-place substitution seems to
9146 * be necessary here to avoid cases of memory corruption, as
9147 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9148 */
9149 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 9150 goto finish_meta_pat;
a0d0e21e 9151 case 'Z':
830247a4 9152 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 9153 *flagp |= SIMPLE;
a1917ab9 9154 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 9155 goto finish_meta_pat;
b85d18e9 9156 case 'z':
830247a4 9157 ret = reg_node(pRExC_state, EOS);
b85d18e9 9158 *flagp |= SIMPLE;
830247a4 9159 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 9160 goto finish_meta_pat;
4a2d328f 9161 case 'C':
f33976b4
DB
9162 ret = reg_node(pRExC_state, CANY);
9163 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 9164 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9165 goto finish_meta_pat;
a0ed51b3 9166 case 'X':
830247a4 9167 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 9168 *flagp |= HASWIDTH;
ee9b8eae 9169 goto finish_meta_pat;
a0d0e21e 9170 case 'w':
980866de
KW
9171 switch (get_regex_charset(RExC_flags)) {
9172 case REGEX_LOCALE_CHARSET:
9173 op = ALNUML;
9174 break;
9175 case REGEX_UNICODE_CHARSET:
9176 op = ALNUMU;
9177 break;
cfaf538b 9178 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9179 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9180 op = ALNUMA;
9181 break;
980866de
KW
9182 case REGEX_DEPENDS_CHARSET:
9183 op = ALNUM;
9184 break;
9185 default:
9186 goto bad_charset;
a12cf05f 9187 }
980866de 9188 ret = reg_node(pRExC_state, op);
a0d0e21e 9189 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9190 goto finish_meta_pat;
a0d0e21e 9191 case 'W':
980866de
KW
9192 switch (get_regex_charset(RExC_flags)) {
9193 case REGEX_LOCALE_CHARSET:
9194 op = NALNUML;
9195 break;
9196 case REGEX_UNICODE_CHARSET:
9197 op = NALNUMU;
9198 break;
cfaf538b 9199 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9200 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9201 op = NALNUMA;
9202 break;
980866de
KW
9203 case REGEX_DEPENDS_CHARSET:
9204 op = NALNUM;
9205 break;
9206 default:
9207 goto bad_charset;
a12cf05f 9208 }
980866de 9209 ret = reg_node(pRExC_state, op);
a0d0e21e 9210 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9211 goto finish_meta_pat;
a0d0e21e 9212 case 'b':
830247a4
IZ
9213 RExC_seen_zerolen++;
9214 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
9215 switch (get_regex_charset(RExC_flags)) {
9216 case REGEX_LOCALE_CHARSET:
9217 op = BOUNDL;
9218 break;
9219 case REGEX_UNICODE_CHARSET:
9220 op = BOUNDU;
9221 break;
cfaf538b 9222 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9223 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9224 op = BOUNDA;
9225 break;
63ac0dad
KW
9226 case REGEX_DEPENDS_CHARSET:
9227 op = BOUND;
9228 break;
9229 default:
9230 goto bad_charset;
a12cf05f 9231 }
63ac0dad 9232 ret = reg_node(pRExC_state, op);
b988e673 9233 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 9234 *flagp |= SIMPLE;
ee9b8eae 9235 goto finish_meta_pat;
a0d0e21e 9236 case 'B':
830247a4
IZ
9237 RExC_seen_zerolen++;
9238 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
9239 switch (get_regex_charset(RExC_flags)) {
9240 case REGEX_LOCALE_CHARSET:
9241 op = NBOUNDL;
9242 break;
9243 case REGEX_UNICODE_CHARSET:
9244 op = NBOUNDU;
9245 break;
cfaf538b 9246 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9247 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9248 op = NBOUNDA;
9249 break;
63ac0dad
KW
9250 case REGEX_DEPENDS_CHARSET:
9251 op = NBOUND;
9252 break;
9253 default:
9254 goto bad_charset;
a12cf05f 9255 }
63ac0dad 9256 ret = reg_node(pRExC_state, op);
b988e673 9257 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 9258 *flagp |= SIMPLE;
ee9b8eae 9259 goto finish_meta_pat;
a0d0e21e 9260 case 's':
980866de
KW
9261 switch (get_regex_charset(RExC_flags)) {
9262 case REGEX_LOCALE_CHARSET:
9263 op = SPACEL;
9264 break;
9265 case REGEX_UNICODE_CHARSET:
9266 op = SPACEU;
9267 break;
cfaf538b 9268 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9269 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9270 op = SPACEA;
9271 break;
980866de
KW
9272 case REGEX_DEPENDS_CHARSET:
9273 op = SPACE;
9274 break;
9275 default:
9276 goto bad_charset;
a12cf05f 9277 }
980866de 9278 ret = reg_node(pRExC_state, op);
a0d0e21e 9279 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9280 goto finish_meta_pat;
a0d0e21e 9281 case 'S':
980866de
KW
9282 switch (get_regex_charset(RExC_flags)) {
9283 case REGEX_LOCALE_CHARSET:
9284 op = NSPACEL;
9285 break;
9286 case REGEX_UNICODE_CHARSET:
9287 op = NSPACEU;
9288 break;
cfaf538b 9289 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9290 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9291 op = NSPACEA;
9292 break;
980866de
KW
9293 case REGEX_DEPENDS_CHARSET:
9294 op = NSPACE;
9295 break;
9296 default:
9297 goto bad_charset;
a12cf05f 9298 }
980866de 9299 ret = reg_node(pRExC_state, op);
a0d0e21e 9300 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9301 goto finish_meta_pat;
a0d0e21e 9302 case 'd':
56ae17b4
KW
9303 switch (get_regex_charset(RExC_flags)) {
9304 case REGEX_LOCALE_CHARSET:
9305 op = DIGITL;
9306 break;
cfaf538b 9307 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9308 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9309 op = DIGITA;
9310 break;
56ae17b4
KW
9311 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9312 case REGEX_UNICODE_CHARSET:
9313 op = DIGIT;
9314 break;
9315 default:
9316 goto bad_charset;
6ab9ea91 9317 }
56ae17b4 9318 ret = reg_node(pRExC_state, op);
a0d0e21e 9319 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9320 goto finish_meta_pat;
a0d0e21e 9321 case 'D':
56ae17b4
KW
9322 switch (get_regex_charset(RExC_flags)) {
9323 case REGEX_LOCALE_CHARSET:
9324 op = NDIGITL;
9325 break;
cfaf538b 9326 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9327 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9328 op = NDIGITA;
9329 break;
56ae17b4
KW
9330 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9331 case REGEX_UNICODE_CHARSET:
9332 op = NDIGIT;
9333 break;
9334 default:
9335 goto bad_charset;
6ab9ea91 9336 }
56ae17b4 9337 ret = reg_node(pRExC_state, op);
a0d0e21e 9338 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9339 goto finish_meta_pat;
e1d1eefb
YO
9340 case 'R':
9341 ret = reg_node(pRExC_state, LNBREAK);
9342 *flagp |= HASWIDTH|SIMPLE;
9343 goto finish_meta_pat;
9344 case 'h':
9345 ret = reg_node(pRExC_state, HORIZWS);
9346 *flagp |= HASWIDTH|SIMPLE;
9347 goto finish_meta_pat;
9348 case 'H':
9349 ret = reg_node(pRExC_state, NHORIZWS);
9350 *flagp |= HASWIDTH|SIMPLE;
9351 goto finish_meta_pat;
ee9b8eae 9352 case 'v':
e1d1eefb
YO
9353 ret = reg_node(pRExC_state, VERTWS);
9354 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
9355 goto finish_meta_pat;
9356 case 'V':
e1d1eefb
YO
9357 ret = reg_node(pRExC_state, NVERTWS);
9358 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9359 finish_meta_pat:
830247a4 9360 nextchar(pRExC_state);
fac92740 9361 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 9362 break;
a14b48bc
LW
9363 case 'p':
9364 case 'P':
686b73d4 9365 {
3dab1dad 9366 char* const oldregxend = RExC_end;
d008bc60 9367#ifdef DEBUGGING
ccb2c380 9368 char* parse_start = RExC_parse - 2;
d008bc60 9369#endif
a14b48bc 9370
830247a4 9371 if (RExC_parse[1] == '{') {
3568d838 9372 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
9373 RExC_end = strchr(RExC_parse, '}');
9374 if (!RExC_end) {
3dab1dad 9375 const U8 c = (U8)*RExC_parse;
830247a4
IZ
9376 RExC_parse += 2;
9377 RExC_end = oldregxend;
0da60cf5 9378 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 9379 }
830247a4 9380 RExC_end++;
a14b48bc 9381 }
af6f566e 9382 else {
830247a4 9383 RExC_end = RExC_parse + 2;
af6f566e
HS
9384 if (RExC_end > oldregxend)
9385 RExC_end = oldregxend;
9386 }
830247a4 9387 RExC_parse--;
a14b48bc 9388
3dab1dad 9389 ret = regclass(pRExC_state,depth+1);
a14b48bc 9390
830247a4
IZ
9391 RExC_end = oldregxend;
9392 RExC_parse--;
ccb2c380
MP
9393
9394 Set_Node_Offset(ret, parse_start + 2);
9395 Set_Node_Cur_Length(ret);
830247a4 9396 nextchar(pRExC_state);
a14b48bc
LW
9397 *flagp |= HASWIDTH|SIMPLE;
9398 }
9399 break;
fc8cd66c 9400 case 'N':
afefe6bf 9401 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
9402 multicharacter. join_exact() will join them up later on.
9403 Also this makes sure that things like /\N{BLAH}+/ and
9404 \N{BLAH} being multi char Just Happen. dmq*/
9405 ++RExC_parse;
9d64099b 9406 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
fc8cd66c 9407 break;
0a4db386 9408 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 9409 parse_named_seq:
81714fb9
YO
9410 {
9411 char ch= RExC_parse[1];
1f1031fe
YO
9412 if (ch != '<' && ch != '\'' && ch != '{') {
9413 RExC_parse++;
9414 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 9415 } else {
1f1031fe
YO
9416 /* this pretty much dupes the code for (?P=...) in reg(), if
9417 you change this make sure you change that */
81714fb9 9418 char* name_start = (RExC_parse += 2);
2eccd3b2 9419 U32 num = 0;
0a4db386
YO
9420 SV *sv_dat = reg_scan_name(pRExC_state,
9421 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 9422 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 9423 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
9424 vFAIL2("Sequence %.3s... not terminated",parse_start);
9425
9426 if (!SIZE_ONLY) {
9427 num = add_data( pRExC_state, 1, "S" );
9428 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 9429 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
9430 }
9431
81714fb9
YO
9432 RExC_sawback = 1;
9433 ret = reganode(pRExC_state,
4444fd9f
KW
9434 ((! FOLD)
9435 ? NREF
2f7f8cb1
KW
9436 : (MORE_ASCII_RESTRICTED)
9437 ? NREFFA
9438 : (AT_LEAST_UNI_SEMANTICS)
9439 ? NREFFU
9440 : (LOC)
9441 ? NREFFL
9442 : NREFF),
4444fd9f 9443 num);
81714fb9 9444 *flagp |= HASWIDTH;
1f1031fe 9445
81714fb9
YO
9446 /* override incorrect value set in reganode MJD */
9447 Set_Node_Offset(ret, parse_start+1);
9448 Set_Node_Cur_Length(ret); /* MJD */
9449 nextchar(pRExC_state);
1f1031fe 9450
81714fb9
YO
9451 }
9452 break;
1f1031fe 9453 }
2bf803e2 9454 case 'g':
a0d0e21e
LW
9455 case '1': case '2': case '3': case '4':
9456 case '5': case '6': case '7': case '8': case '9':
9457 {
c74340f9 9458 I32 num;
2bf803e2
YO
9459 bool isg = *RExC_parse == 'g';
9460 bool isrel = 0;
9461 bool hasbrace = 0;
9462 if (isg) {
c74340f9 9463 RExC_parse++;
2bf803e2
YO
9464 if (*RExC_parse == '{') {
9465 RExC_parse++;
9466 hasbrace = 1;
9467 }
9468 if (*RExC_parse == '-') {
9469 RExC_parse++;
9470 isrel = 1;
9471 }
1f1031fe
YO
9472 if (hasbrace && !isDIGIT(*RExC_parse)) {
9473 if (isrel) RExC_parse--;
9474 RExC_parse -= 2;
9475 goto parse_named_seq;
9476 } }
c74340f9 9477 num = atoi(RExC_parse);
b72d83b2
RGS
9478 if (isg && num == 0)
9479 vFAIL("Reference to invalid group 0");
c74340f9 9480 if (isrel) {
5624f11d 9481 num = RExC_npar - num;
c74340f9
YO
9482 if (num < 1)
9483 vFAIL("Reference to nonexistent or unclosed group");
9484 }
2bf803e2 9485 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
9486 goto defchar;
9487 else {
3dab1dad 9488 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
9489 while (isDIGIT(*RExC_parse))
9490 RExC_parse++;
1f1031fe
YO
9491 if (parse_start == RExC_parse - 1)
9492 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
9493 if (hasbrace) {
9494 if (*RExC_parse != '}')
9495 vFAIL("Unterminated \\g{...} pattern");
9496 RExC_parse++;
9497 }
c74340f9
YO
9498 if (!SIZE_ONLY) {
9499 if (num > (I32)RExC_rx->nparens)
9500 vFAIL("Reference to nonexistent group");
c74340f9 9501 }
830247a4 9502 RExC_sawback = 1;
eb160463 9503 ret = reganode(pRExC_state,
4444fd9f
KW
9504 ((! FOLD)
9505 ? REF
2f7f8cb1
KW
9506 : (MORE_ASCII_RESTRICTED)
9507 ? REFFA
9508 : (AT_LEAST_UNI_SEMANTICS)
9509 ? REFFU
9510 : (LOC)
9511 ? REFFL
9512 : REFF),
4444fd9f 9513 num);
a0d0e21e 9514 *flagp |= HASWIDTH;
2af232bd 9515
fac92740 9516 /* override incorrect value set in reganode MJD */
2af232bd 9517 Set_Node_Offset(ret, parse_start+1);
fac92740 9518 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
9519 RExC_parse--;
9520 nextchar(pRExC_state);
a0d0e21e
LW
9521 }
9522 }
9523 break;
9524 case '\0':
830247a4 9525 if (RExC_parse >= RExC_end)
b45f050a 9526 FAIL("Trailing \\");
a0d0e21e
LW
9527 /* FALL THROUGH */
9528 default:
a0288114 9529 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 9530 back into the quick-grab loop below */
45948336 9531 parse_start--;
a0d0e21e
LW
9532 goto defchar;
9533 }
9534 break;
4633a7c4
LW
9535
9536 case '#':
bbe252da 9537 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 9538 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
9539 goto tryagain;
9540 }
9541 /* FALL THROUGH */
9542
f9a79580 9543 default:
561784a5
KW
9544
9545 parse_start = RExC_parse - 1;
9546
9547 RExC_parse++;
9548
9549 defchar: {
ba210ebe 9550 register STRLEN len;
58ae7d3f 9551 register UV ender;
a0d0e21e 9552 register char *p;
3dab1dad 9553 char *s;
80aecb99 9554 STRLEN foldlen;
89ebb4a3 9555 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
d82f9944 9556 U8 node_type;
f06dbbb7 9557
bb914485
KW
9558 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
9559 * it is folded to 'ss' even if not utf8 */
9560 bool is_exactfu_sharp_s;
9561
58ae7d3f 9562 ender = 0;
3f410cf6
KW
9563 node_type = ((! FOLD) ? EXACT
9564 : (LOC)
9565 ? EXACTFL
9566 : (MORE_ASCII_RESTRICTED)
9567 ? EXACTFA
9568 : (AT_LEAST_UNI_SEMANTICS)
9569 ? EXACTFU
9570 : EXACTF);
d82f9944 9571 ret = reg_node(pRExC_state, node_type);
cd439c50 9572 s = STRING(ret);
3f410cf6
KW
9573
9574 /* XXX The node can hold up to 255 bytes, yet this only goes to
9575 * 127. I (khw) do not know why. Keeping it somewhat less than
9576 * 255 allows us to not have to worry about overflow due to
9577 * converting to utf8 and fold expansion, but that value is
9578 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
9579 * split up by this limit into a single one using the real max of
9580 * 255. Even at 127, this breaks under rare circumstances. If
9581 * folding, we do not want to split a node at a character that is a
9582 * non-final in a multi-char fold, as an input string could just
9583 * happen to want to match across the node boundary. The join
9584 * would solve that problem if the join actually happens. But a
9585 * series of more than two nodes in a row each of 127 would cause
9586 * the first join to succeed to get to 254, but then there wouldn't
9587 * be room for the next one, which could at be one of those split
9588 * multi-char folds. I don't know of any fool-proof solution. One
9589 * could back off to end with only a code point that isn't such a
9590 * non-final, but it is possible for there not to be any in the
9591 * entire node. */
830247a4 9592 for (len = 0, p = RExC_parse - 1;
3f410cf6
KW
9593 len < 127 && p < RExC_end;
9594 len++)
a0d0e21e 9595 {
3dab1dad 9596 char * const oldp = p;
5b5a24f7 9597
bbe252da 9598 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 9599 p = regwhite( pRExC_state, p );
f9a79580 9600 switch ((U8)*p) {
a0d0e21e
LW
9601 case '^':
9602 case '$':
9603 case '.':
9604 case '[':
9605 case '(':
9606 case ')':
9607 case '|':
9608 goto loopdone;
9609 case '\\':
ee9b8eae
YO
9610 /* Literal Escapes Switch
9611
9612 This switch is meant to handle escape sequences that
9613 resolve to a literal character.
9614
9615 Every escape sequence that represents something
9616 else, like an assertion or a char class, is handled
9617 in the switch marked 'Special Escapes' above in this
9618 routine, but also has an entry here as anything that
9619 isn't explicitly mentioned here will be treated as
9620 an unescaped equivalent literal.
9621 */
9622
a0a388a1 9623 switch ((U8)*++p) {
ee9b8eae
YO
9624 /* These are all the special escapes. */
9625 case 'A': /* Start assertion */
9626 case 'b': case 'B': /* Word-boundary assertion*/
9627 case 'C': /* Single char !DANGEROUS! */
9628 case 'd': case 'D': /* digit class */
9629 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 9630 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
9631 case 'k': case 'K': /* named backref, keep marker */
9632 case 'N': /* named char sequence */
38a44b82 9633 case 'p': case 'P': /* Unicode property */
e1d1eefb 9634 case 'R': /* LNBREAK */
ee9b8eae 9635 case 's': case 'S': /* space class */
e1d1eefb 9636 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
9637 case 'w': case 'W': /* word class */
9638 case 'X': /* eXtended Unicode "combining character sequence" */
9639 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
9640 --p;
9641 goto loopdone;
ee9b8eae
YO
9642
9643 /* Anything after here is an escape that resolves to a
9644 literal. (Except digits, which may or may not)
9645 */
a0d0e21e
LW
9646 case 'n':
9647 ender = '\n';
9648 p++;
a687059c 9649 break;
a0d0e21e
LW
9650 case 'r':
9651 ender = '\r';
9652 p++;
a687059c 9653 break;
a0d0e21e
LW
9654 case 't':
9655 ender = '\t';
9656 p++;
a687059c 9657 break;
a0d0e21e
LW
9658 case 'f':
9659 ender = '\f';
9660 p++;
a687059c 9661 break;
a0d0e21e 9662 case 'e':
c7f1f016 9663 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 9664 p++;
a687059c 9665 break;
a0d0e21e 9666 case 'a':
c7f1f016 9667 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 9668 p++;
a687059c 9669 break;
f0a2b745
KW
9670 case 'o':
9671 {
9672 STRLEN brace_len = len;
00c0cb6d 9673 UV result;
454155d9
KW
9674 const char* error_msg;
9675
9676 bool valid = grok_bslash_o(p,
9677 &result,
9678 &brace_len,
9679 &error_msg,
9680 1);
9681 p += brace_len;
9682 if (! valid) {
9683 RExC_parse = p; /* going to die anyway; point
9684 to exact spot of failure */
f0a2b745
KW
9685 vFAIL(error_msg);
9686 }
00c0cb6d
DG
9687 else
9688 {
9689 ender = result;
9690 }
f0a2b745
KW
9691 if (PL_encoding && ender < 0x100) {
9692 goto recode_encoding;
9693 }
9694 if (ender > 0xff) {
62fed28b 9695 REQUIRE_UTF8;
f0a2b745
KW
9696 }
9697 break;
9698 }
a0d0e21e 9699 case 'x':
a0ed51b3 9700 if (*++p == '{') {
1df70142 9701 char* const e = strchr(p, '}');
686b73d4 9702
b45f050a 9703 if (!e) {
830247a4 9704 RExC_parse = p + 1;
b45f050a
JF
9705 vFAIL("Missing right brace on \\x{}");
9706 }
de5f0749 9707 else {
a4c04bdc
NC
9708 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9709 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 9710 STRLEN numlen = e - p - 1;
53305cf1 9711 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 9712 if (ender > 0xff)
62fed28b 9713 REQUIRE_UTF8;
a0ed51b3
LW
9714 p = e + 1;
9715 }
a0ed51b3
LW
9716 }
9717 else {
a4c04bdc 9718 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 9719 STRLEN numlen = 2;
53305cf1 9720 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
9721 p += numlen;
9722 }
9e08bc66
TS
9723 if (PL_encoding && ender < 0x100)
9724 goto recode_encoding;
a687059c 9725 break;
a0d0e21e
LW
9726 case 'c':
9727 p++;
17a3df4c 9728 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
a687059c 9729 break;
a0d0e21e
LW
9730 case '0': case '1': case '2': case '3':case '4':
9731 case '5': case '6': case '7': case '8':case '9':
9732 if (*p == '0' ||
ca67da41 9733 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
9734 {
9735 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 9736 STRLEN numlen = 3;
53305cf1 9737 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 9738 if (ender > 0xff) {
62fed28b 9739 REQUIRE_UTF8;
609122bd 9740 }
a0d0e21e
LW
9741 p += numlen;
9742 }
9743 else {
9744 --p;
9745 goto loopdone;
a687059c 9746 }
9e08bc66
TS
9747 if (PL_encoding && ender < 0x100)
9748 goto recode_encoding;
9749 break;
9750 recode_encoding:
e2a7e165 9751 if (! RExC_override_recoding) {
9e08bc66
TS
9752 SV* enc = PL_encoding;
9753 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
9754 if (!enc && SIZE_ONLY)
9755 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 9756 REQUIRE_UTF8;
9e08bc66 9757 }
a687059c 9758 break;
a0d0e21e 9759 case '\0':
830247a4 9760 if (p >= RExC_end)
b45f050a 9761 FAIL("Trailing \\");
a687059c 9762 /* FALL THROUGH */
a0d0e21e 9763 default:
216bfc0a 9764 if (!SIZE_ONLY&& isALPHA(*p)) {
2a53d331 9765 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
216bfc0a 9766 }
a0ed51b3 9767 goto normal_default;
a0d0e21e
LW
9768 }
9769 break;
2a53d331
KW
9770 case '{':
9771 /* Currently we don't warn when the lbrace is at the start
9772 * of a construct. This catches it in the middle of a
9773 * literal string, or when its the first thing after
9774 * something like "\b" */
9775 if (! SIZE_ONLY
9776 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
9777 {
9778 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
9779 }
9780 /*FALLTHROUGH*/
a687059c 9781 default:
a0ed51b3 9782 normal_default:
fd400ab9 9783 if (UTF8_IS_START(*p) && UTF) {
1df70142 9784 STRLEN numlen;
5e12f4fb 9785 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 9786 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
9787 p += numlen;
9788 }
9789 else
5b67c30a 9790 ender = (U8) *p++;
a0d0e21e 9791 break;
7e2509c1
KW
9792 } /* End of switch on the literal */
9793
bb914485
KW
9794 is_exactfu_sharp_s = (node_type == EXACTFU
9795 && ender == LATIN_SMALL_LETTER_SHARP_S);
bcdf7404
YO
9796 if ( RExC_flags & RXf_PMf_EXTENDED)
9797 p = regwhite( pRExC_state, p );
bb914485 9798 if ((UTF && FOLD) || is_exactfu_sharp_s) {
17580e7a
KW
9799 /* Prime the casefolded buffer. Locale rules, which apply
9800 * only to code points < 256, aren't known until execution,
9801 * so for them, just output the original character using
a0c4c608
KW
9802 * utf8. If we start to fold non-UTF patterns, be sure to
9803 * update join_exact() */
17580e7a
KW
9804 if (LOC && ender < 256) {
9805 if (UNI_IS_INVARIANT(ender)) {
9806 *tmpbuf = (U8) ender;
9807 foldlen = 1;
9808 } else {
9809 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9810 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9811 foldlen = 2;
9812 }
9813 }
9814 else if (isASCII(ender)) { /* Note: Here can't also be LOC
9815 */
2f7f8cb1 9816 ender = toLOWER(ender);
cd64649c 9817 *tmpbuf = (U8) ender;
2f7f8cb1
KW
9818 foldlen = 1;
9819 }
17580e7a
KW
9820 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9821
9822 /* Locale and /aa require more selectivity about the
9823 * fold, so are handled below. Otherwise, here, just
9824 * use the fold */
2f7f8cb1
KW
9825 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9826 }
9827 else {
17580e7a
KW
9828 /* Under locale rules or /aa we are not to mix,
9829 * respectively, ords < 256 or ASCII with non-. So
9830 * reject folds that mix them, using only the
9831 * non-folded code point. So do the fold to a
9832 * temporary, and inspect each character in it. */
2f7f8cb1
KW
9833 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9834 U8* s = trialbuf;
9835 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9836 U8* e = s + foldlen;
9837 bool fold_ok = TRUE;
9838
9839 while (s < e) {
17580e7a
KW
9840 if (isASCII(*s)
9841 || (LOC && (UTF8_IS_INVARIANT(*s)
9842 || UTF8_IS_DOWNGRADEABLE_START(*s))))
9843 {
2f7f8cb1
KW
9844 fold_ok = FALSE;
9845 break;
9846 }
9847 s += UTF8SKIP(s);
9848 }
9849 if (fold_ok) {
9850 Copy(trialbuf, tmpbuf, foldlen, U8);
9851 ender = tmpender;
9852 }
9853 else {
9854 uvuni_to_utf8(tmpbuf, ender);
9855 foldlen = UNISKIP(ender);
9856 }
9857 }
60a8b682 9858 }
bcdf7404 9859 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
9860 if (len)
9861 p = oldp;
bb914485 9862 else if (UTF || is_exactfu_sharp_s) {
80aecb99 9863 if (FOLD) {
60a8b682 9864 /* Emit all the Unicode characters. */
1df70142 9865 STRLEN numlen;
80aecb99
JH
9866 for (foldbuf = tmpbuf;
9867 foldlen;
9868 foldlen -= numlen) {
4b88fb76
KW
9869
9870 /* tmpbuf has been constructed by us, so we
9871 * know it is valid utf8 */
9872 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 9873 if (numlen > 0) {
71207a34 9874 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
9875 s += unilen;
9876 len += unilen;
9877 /* In EBCDIC the numlen
9878 * and unilen can differ. */
9dc45d57 9879 foldbuf += numlen;
47654450
JH
9880 if (numlen >= foldlen)
9881 break;
9dc45d57
JH
9882 }
9883 else
9884 break; /* "Can't happen." */
80aecb99
JH
9885 }
9886 }
9887 else {
71207a34 9888 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 9889 if (unilen > 0) {
0ebc6274
JH
9890 s += unilen;
9891 len += unilen;
9dc45d57 9892 }
80aecb99 9893 }
a0ed51b3 9894 }
a0d0e21e
LW
9895 else {
9896 len++;
eb160463 9897 REGC((char)ender, s++);
a0d0e21e
LW
9898 }
9899 break;
a687059c 9900 }
bb914485 9901 if (UTF || is_exactfu_sharp_s) {
80aecb99 9902 if (FOLD) {
60a8b682 9903 /* Emit all the Unicode characters. */
1df70142 9904 STRLEN numlen;
80aecb99
JH
9905 for (foldbuf = tmpbuf;
9906 foldlen;
9907 foldlen -= numlen) {
4b88fb76 9908 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 9909 if (numlen > 0) {
71207a34 9910 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
9911 len += unilen;
9912 s += unilen;
9913 /* In EBCDIC the numlen
9914 * and unilen can differ. */
9dc45d57 9915 foldbuf += numlen;
47654450
JH
9916 if (numlen >= foldlen)
9917 break;
9dc45d57
JH
9918 }
9919 else
9920 break;
80aecb99
JH
9921 }
9922 }
9923 else {
71207a34 9924 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 9925 if (unilen > 0) {
0ebc6274
JH
9926 s += unilen;
9927 len += unilen;
9dc45d57 9928 }
80aecb99
JH
9929 }
9930 len--;
a0ed51b3 9931 }
d669c36c 9932 else {
eb160463 9933 REGC((char)ender, s++);
d669c36c 9934 }
a0d0e21e 9935 }
7e2509c1
KW
9936 loopdone: /* Jumped to when encounters something that shouldn't be in
9937 the node */
830247a4 9938 RExC_parse = p - 1;
fac92740 9939 Set_Node_Cur_Length(ret); /* MJD */
830247a4 9940 nextchar(pRExC_state);
793db0cb
JH
9941 {
9942 /* len is STRLEN which is unsigned, need to copy to signed */
9943 IV iv = len;
9944 if (iv < 0)
9945 vFAIL("Internal disaster");
9946 }
a0d0e21e
LW
9947 if (len > 0)
9948 *flagp |= HASWIDTH;
090f7165 9949 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 9950 *flagp |= SIMPLE;
686b73d4 9951
cd439c50 9952 if (SIZE_ONLY)
830247a4 9953 RExC_size += STR_SZ(len);
3dab1dad
YO
9954 else {
9955 STR_LEN(ret) = len;
830247a4 9956 RExC_emit += STR_SZ(len);
07be1b83 9957 }
3dab1dad 9958 }
a0d0e21e
LW
9959 break;
9960 }
a687059c 9961
a0d0e21e 9962 return(ret);
980866de
KW
9963
9964/* Jumped to when an unrecognized character set is encountered */
9965bad_charset:
9966 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9967 return(NULL);
a687059c
LW
9968}
9969
873ef191 9970STATIC char *
bcdf7404 9971S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 9972{
bcdf7404 9973 const char *e = RExC_end;
7918f24d
NC
9974
9975 PERL_ARGS_ASSERT_REGWHITE;
9976
5b5a24f7
CS
9977 while (p < e) {
9978 if (isSPACE(*p))
9979 ++p;
9980 else if (*p == '#') {
bcdf7404 9981 bool ended = 0;
5b5a24f7 9982 do {
bcdf7404
YO
9983 if (*p++ == '\n') {
9984 ended = 1;
9985 break;
9986 }
9987 } while (p < e);
9988 if (!ended)
9989 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
9990 }
9991 else
9992 break;
9993 }
9994 return p;
9995}
9996
b8c5462f
JH
9997/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9998 Character classes ([:foo:]) can also be negated ([:^foo:]).
9999 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10000 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 10001 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
10002
10003#define POSIXCC_DONE(c) ((c) == ':')
10004#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10005#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10006
b8c5462f 10007STATIC I32
830247a4 10008S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 10009{
97aff369 10010 dVAR;
936ed897 10011 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 10012
7918f24d
NC
10013 PERL_ARGS_ASSERT_REGPPOSIXCC;
10014
830247a4 10015 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 10016 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 10017 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 10018 const char c = UCHARAT(RExC_parse);
097eb12c 10019 char* const s = RExC_parse++;
686b73d4 10020
9a86a77b 10021 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
10022 RExC_parse++;
10023 if (RExC_parse == RExC_end)
620e46c5 10024 /* Grandfather lone [:, [=, [. */
830247a4 10025 RExC_parse = s;
620e46c5 10026 else {
3dab1dad 10027 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
10028 assert(*t == c);
10029
9a86a77b 10030 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 10031 const char *posixcc = s + 1;
830247a4 10032 RExC_parse++; /* skip over the ending ] */
3dab1dad 10033
b8c5462f 10034 if (*s == ':') {
1df70142
AL
10035 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10036 const I32 skip = t - posixcc;
80916619
NC
10037
10038 /* Initially switch on the length of the name. */
10039 switch (skip) {
10040 case 4:
3dab1dad
YO
10041 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10042 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 10043 break;
80916619
NC
10044 case 5:
10045 /* Names all of length 5. */
10046 /* alnum alpha ascii blank cntrl digit graph lower
10047 print punct space upper */
10048 /* Offset 4 gives the best switch position. */
10049 switch (posixcc[4]) {
10050 case 'a':
3dab1dad
YO
10051 if (memEQ(posixcc, "alph", 4)) /* alpha */
10052 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
10053 break;
10054 case 'e':
3dab1dad
YO
10055 if (memEQ(posixcc, "spac", 4)) /* space */
10056 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
10057 break;
10058 case 'h':
3dab1dad
YO
10059 if (memEQ(posixcc, "grap", 4)) /* graph */
10060 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
10061 break;
10062 case 'i':
3dab1dad
YO
10063 if (memEQ(posixcc, "asci", 4)) /* ascii */
10064 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
10065 break;
10066 case 'k':
3dab1dad
YO
10067 if (memEQ(posixcc, "blan", 4)) /* blank */
10068 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
10069 break;
10070 case 'l':
3dab1dad
YO
10071 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10072 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
10073 break;
10074 case 'm':
3dab1dad
YO
10075 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10076 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
10077 break;
10078 case 'r':
3dab1dad
YO
10079 if (memEQ(posixcc, "lowe", 4)) /* lower */
10080 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10081 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10082 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
10083 break;
10084 case 't':
3dab1dad
YO
10085 if (memEQ(posixcc, "digi", 4)) /* digit */
10086 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10087 else if (memEQ(posixcc, "prin", 4)) /* print */
10088 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10089 else if (memEQ(posixcc, "punc", 4)) /* punct */
10090 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 10091 break;
b8c5462f
JH
10092 }
10093 break;
80916619 10094 case 6:
3dab1dad
YO
10095 if (memEQ(posixcc, "xdigit", 6))
10096 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
10097 break;
10098 }
80916619
NC
10099
10100 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
10101 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10102 t - s - 1, s + 1);
80916619
NC
10103 assert (posixcc[skip] == ':');
10104 assert (posixcc[skip+1] == ']');
b45f050a 10105 } else if (!SIZE_ONLY) {
b8c5462f 10106 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 10107
830247a4 10108 /* adjust RExC_parse so the warning shows after
b45f050a 10109 the class closes */
9a86a77b 10110 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 10111 RExC_parse++;
b45f050a
JF
10112 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10113 }
b8c5462f
JH
10114 } else {
10115 /* Maternal grandfather:
10116 * "[:" ending in ":" but not in ":]" */
830247a4 10117 RExC_parse = s;
767d463e 10118 }
620e46c5
JH
10119 }
10120 }
10121
b8c5462f
JH
10122 return namedclass;
10123}
10124
10125STATIC void
830247a4 10126S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 10127{
97aff369 10128 dVAR;
7918f24d
NC
10129
10130 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10131
3dab1dad 10132 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
10133 const char *s = RExC_parse;
10134 const char c = *s++;
b8c5462f 10135
3dab1dad 10136 while (isALNUM(*s))
b8c5462f
JH
10137 s++;
10138 if (*s && c == *s && s[1] == ']') {
668c081a
NC
10139 ckWARN3reg(s+2,
10140 "POSIX syntax [%c %c] belongs inside character classes",
10141 c, c);
b45f050a
JF
10142
10143 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 10144 if (POSIXCC_NOTYET(c)) {
830247a4 10145 /* adjust RExC_parse so the error shows after
b45f050a 10146 the class closes */
9a86a77b 10147 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 10148 NOOP;
b45f050a
JF
10149 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10150 }
b8c5462f
JH
10151 }
10152 }
620e46c5
JH
10153}
10154
ea317ccb
KW
10155/* Generate the code to add a full posix character <class> to the bracketed
10156 * character class given by <node>. (<node> is needed only under locale rules)
10157 * destlist is the inversion list for non-locale rules that this class is
10158 * to be added to
10159 * sourcelist is the ASCII-range inversion list to add under /a rules
10160 * Xsourcelist is the full Unicode range list to use otherwise. */
10161#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10162 if (LOC) { \
10163 SV* scratch_list = NULL; \
10164 \
10165 /* Set this class in the node for runtime matching */ \
10166 ANYOF_CLASS_SET(node, class); \
10167 \
10168 /* For above Latin1 code points, we use the full Unicode range */ \
10169 _invlist_intersection(PL_AboveLatin1, \
10170 Xsourcelist, \
10171 &scratch_list); \
10172 /* And set the output to it, adding instead if there already is an \
10173 * output. Checking if <destlist> is NULL first saves an extra \
10174 * clone. Its reference count will be decremented at the next \
10175 * union, etc, or if this is the only instance, at the end of the \
10176 * routine */ \
10177 if (! destlist) { \
10178 destlist = scratch_list; \
10179 } \
10180 else { \
10181 _invlist_union(destlist, scratch_list, &destlist); \
10182 SvREFCNT_dec(scratch_list); \
10183 } \
10184 } \
10185 else { \
10186 /* For non-locale, just add it to any existing list */ \
10187 _invlist_union(destlist, \
10188 (AT_LEAST_ASCII_RESTRICTED) \
10189 ? sourcelist \
10190 : Xsourcelist, \
10191 &destlist); \
10192 }
10193
10194/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10195 */
10196#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10197 if (LOC) { \
10198 SV* scratch_list = NULL; \
10199 ANYOF_CLASS_SET(node, class); \
10200 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10201 if (! destlist) { \
10202 destlist = scratch_list; \
10203 } \
10204 else { \
10205 _invlist_union(destlist, scratch_list, &destlist); \
10206 SvREFCNT_dec(scratch_list); \
10207 } \
10208 } \
10209 else { \
10210 _invlist_union_complement_2nd(destlist, \
10211 (AT_LEAST_ASCII_RESTRICTED) \
10212 ? sourcelist \
10213 : Xsourcelist, \
10214 &destlist); \
10215 /* Under /d, everything in the upper half of the Latin1 range \
10216 * matches this complement */ \
10217 if (DEPENDS_SEMANTICS) { \
10218 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10219 } \
10220 }
10221
10222/* Generate the code to add a posix character <class> to the bracketed
10223 * character class given by <node>. (<node> is needed only under locale rules)
10224 * destlist is the inversion list for non-locale rules that this class is
10225 * to be added to
10226 * sourcelist is the ASCII-range inversion list to add under /a rules
10227 * l1_sourcelist is the Latin1 range list to use otherwise.
10228 * Xpropertyname is the name to add to <run_time_list> of the property to
10229 * specify the code points above Latin1 that will have to be
10230 * determined at run-time
10231 * run_time_list is a SV* that contains text names of properties that are to
10232 * be computed at run time. This concatenates <Xpropertyname>
10233 * to it, apppropriately
10234 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10235 * time */
10236#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10237 l1_sourcelist, Xpropertyname, run_time_list) \
4f6289a3
RB
10238 /* First, resolve whether to use the ASCII-only list or the L1 \
10239 * list */ \
10240 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10241 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10242 Xpropertyname, run_time_list)
10243
10244#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10245 Xpropertyname, run_time_list) \
ea317ccb
KW
10246 /* If not /a matching, there are going to be code points we will have \
10247 * to defer to runtime to look-up */ \
10248 if (! AT_LEAST_ASCII_RESTRICTED) { \
10249 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10250 } \
10251 if (LOC) { \
10252 ANYOF_CLASS_SET(node, class); \
10253 } \
10254 else { \
4f6289a3 10255 _invlist_union(destlist, sourcelist, &destlist); \
ea317ccb
KW
10256 }
10257
10258/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10259 * this and DO_N_POSIX */
10260#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10261 l1_sourcelist, Xpropertyname, run_time_list) \
10262 if (AT_LEAST_ASCII_RESTRICTED) { \
10263 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10264 } \
10265 else { \
10266 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10267 if (LOC) { \
10268 ANYOF_CLASS_SET(node, namedclass); \
10269 } \
10270 else { \
10271 SV* scratch_list = NULL; \
10272 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10273 if (! destlist) { \
10274 destlist = scratch_list; \
10275 } \
10276 else { \
10277 _invlist_union(destlist, scratch_list, &destlist); \
10278 SvREFCNT_dec(scratch_list); \
10279 } \
10280 if (DEPENDS_SEMANTICS) { \
10281 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10282 } \
10283 } \
10284 }
a12cf05f 10285
2283d326 10286STATIC U8
a25abddc 10287S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
10288{
10289
10290 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10291 * Locale folding is done at run-time, so this function should not be
10292 * called for nodes that are for locales.
10293 *
d50a4f90 10294 * This function sets the bit corresponding to the fold of the input
2283d326
KW
10295 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10296 * 'F' is 'f'.
10297 *
d50a4f90
KW
10298 * It also knows about the characters that are in the bitmap that have
10299 * folds that are matchable only outside it, and sets the appropriate lists
10300 * and flags.
10301 *
10302 * It returns the number of bits that actually changed from 0 to 1 */
2283d326
KW
10303
10304 U8 stored = 0;
2283d326
KW
10305 U8 fold;
10306
4c9daa0a
KW
10307 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10308
cfaf538b 10309 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
2f7f8cb1 10310 : PL_fold[value];
2283d326
KW
10311
10312 /* It assumes the bit for 'value' has already been set */
10313 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10314 ANYOF_BITMAP_SET(node, fold);
10315 stored++;
10316 }
d50a4f90
KW
10317 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10318 /* Certain Latin1 characters have matches outside the bitmap. To get
10319 * here, 'value' is one of those characters. None of these matches is
10320 * valid for ASCII characters under /aa, which have been excluded by
10321 * the 'if' above. The matches fall into three categories:
10322 * 1) They are singly folded-to or -from an above 255 character, as
10323 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10324 * WITH DIAERESIS;
10325 * 2) They are part of a multi-char fold with another character in the
10326 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10327 * 3) They are part of a multi-char fold with a character not in the
10328 * bitmap, such as various ligatures.
10329 * We aren't dealing fully with multi-char folds, except we do deal
10330 * with the pattern containing a character that has a multi-char fold
10331 * (not so much the inverse).
10332 * For types 1) and 3), the matches only happen when the target string
10333 * is utf8; that's not true for 2), and we set a flag for it.
10334 *
10335 * The code below adds to the passed in inversion list the single fold
10336 * closures for 'value'. The values are hard-coded here so that an
10337 * innocent-looking character class, like /[ks]/i won't have to go out
10338 * to disk to find the possible matches. XXX It would be better to
10339 * generate these via regen, in case a new version of the Unicode
10340 * standard adds new mappings, though that is not really likely. */
10341 switch (value) {
10342 case 'k':
10343 case 'K':
10344 /* KELVIN SIGN */
10345 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10346 break;
10347 case 's':
10348 case 'S':
10349 /* LATIN SMALL LETTER LONG S */
10350 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10351 break;
10352 case MICRO_SIGN:
10353 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10354 GREEK_SMALL_LETTER_MU);
10355 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10356 GREEK_CAPITAL_LETTER_MU);
10357 break;
10358 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10359 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10360 /* ANGSTROM SIGN */
10361 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10362 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10363 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10364 PL_fold_latin1[value]);
10365 }
10366 break;
10367 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10368 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10369 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10370 break;
10371 case LATIN_SMALL_LETTER_SHARP_S:
1d4120df
KW
10372 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10373 LATIN_CAPITAL_LETTER_SHARP_S);
d50a4f90 10374
419d8974 10375 /* Under /a, /d, and /u, this can match the two chars "ss" */
d50a4f90
KW
10376 if (! MORE_ASCII_RESTRICTED) {
10377 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10378
419d8974
KW
10379 /* And under /u or /a, it can match even if the target is
10380 * not utf8 */
10381 if (AT_LEAST_UNI_SEMANTICS) {
d50a4f90
KW
10382 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10383 }
10384 }
10385 break;
10386 case 'F': case 'f':
10387 case 'I': case 'i':
10388 case 'L': case 'l':
10389 case 'T': case 't':
d50a4f90
KW
10390 case 'A': case 'a':
10391 case 'H': case 'h':
10392 case 'J': case 'j':
10393 case 'N': case 'n':
10394 case 'W': case 'w':
10395 case 'Y': case 'y':
f580a93d
KW
10396 /* These all are targets of multi-character folds from code
10397 * points that require UTF8 to express, so they can't match
10398 * unless the target string is in UTF-8, so no action here is
10399 * necessary, as regexec.c properly handles the general case
10400 * for UTF-8 matching */
d50a4f90
KW
10401 break;
10402 default:
10403 /* Use deprecated warning to increase the chances of this
10404 * being output */
10405 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10406 break;
10407 }
10408 }
10409 else if (DEPENDS_SEMANTICS
f56b6394 10410 && ! isASCII(value)
d50a4f90
KW
10411 && PL_fold_latin1[value] != value)
10412 {
10413 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10414 * folds only when the target string is in UTF-8. We add the fold
10415 * here to the list of things to match outside the bitmap, which
10416 * won't be looked at unless it is UTF8 (or else if something else
10417 * says to look even if not utf8, but those things better not happen
10418 * under DEPENDS semantics. */
10419 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
2283d326
KW
10420 }
10421
10422 return stored;
10423}
10424
10425
10426PERL_STATIC_INLINE U8
a25abddc 10427S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
10428{
10429 /* This inline function sets a bit in the bitmap if not already set, and if
10430 * appropriate, its fold, returning the number of bits that actually
10431 * changed from 0 to 1 */
10432
10433 U8 stored;
10434
4c9daa0a
KW
10435 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10436
2283d326
KW
10437 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10438 return 0;
10439 }
10440
10441 ANYOF_BITMAP_SET(node, value);
10442 stored = 1;
10443
10444 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
2c6aa593 10445 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
2283d326
KW
10446 }
10447
10448 return stored;
10449}
10450
c8453963
KW
10451STATIC void
10452S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10453{
10454 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10455 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10456 * the multi-character folds of characters in the node */
10457 SV *sv;
10458
10459 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10460
10461 if (! *alternate_ptr) {
10462 *alternate_ptr = newAV();
10463 }
10464 sv = newSVpvn_utf8((char*)string, len, TRUE);
10465 av_push(*alternate_ptr, sv);
10466 return;
10467}
10468
7f6f358c
YO
10469/*
10470 parse a class specification and produce either an ANYOF node that
ddad5e0b 10471 matches the pattern or perhaps will be optimized into an EXACTish node
679d1424
KW
10472 instead. The node contains a bit map for the first 256 characters, with the
10473 corresponding bit set if that character is in the list. For characters
10474 above 255, a range list is used */
89836f1f 10475
76e3520e 10476STATIC regnode *
3dab1dad 10477S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 10478{
97aff369 10479 dVAR;
9a86a77b 10480 register UV nextvalue;
3568d838 10481 register IV prevvalue = OOB_UNICODE;
ffc61ed2 10482 register IV range = 0;
e1d1eefb 10483 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 10484 register regnode *ret;
ba210ebe 10485 STRLEN numlen;
ffc61ed2 10486 IV namedclass;
cbbf8932 10487 char *rangebegin = NULL;
936ed897 10488 bool need_class = 0;
827f5bb8 10489 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
c445ea15 10490 SV *listsv = NULL;
137165a6
KW
10491 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10492 than just initialized. */
dc415b4a
KW
10493 SV* properties = NULL; /* Code points that match \p{} \P{} */
10494 UV element_count = 0; /* Number of distinct elements in the class.
10495 Optimizations may be possible if this is tiny */
ffc61ed2 10496 UV n;
53742956 10497
dc415b4a 10498 /* Unicode properties are stored in a swash; this holds the current one
88d45d28
KW
10499 * being parsed. If this swash is the only above-latin1 component of the
10500 * character class, an optimization is to pass it directly on to the
10501 * execution engine. Otherwise, it is set to NULL to indicate that there
10502 * are other things in the class that have to be dealt with at execution
10503 * time */
dc415b4a
KW
10504 SV* swash = NULL; /* Code points that match \p{} \P{} */
10505
10506 /* Set if a component of this character class is user-defined; just passed
10507 * on to the engine */
10508 UV has_user_defined_property = 0;
10509
53742956 10510 /* code points this node matches that can't be stored in the bitmap */
a25abddc 10511 SV* nonbitmap = NULL;
53742956
KW
10512
10513 /* The items that are to match that aren't stored in the bitmap, but are a
10514 * result of things that are stored there. This is the fold closure of
10515 * such a character, either because it has DEPENDS semantics and shouldn't
10516 * be matched unless the target string is utf8, or is a code point that is
10517 * too large for the bit map, as for example, the fold of the MICRO SIGN is
10518 * above 255. This all is solely for performance reasons. By having this
10519 * code know the outside-the-bitmap folds that the bitmapped characters are
10520 * involved with, we don't have to go out to disk to find the list of
10521 * matches, unless the character class includes code points that aren't
10522 * storable in the bit map. That means that a character class with an 's'
10523 * in it, for example, doesn't need to go out to disk to find everything
10524 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
10525 * empty unless there is something whose fold we don't know about, and will
10526 * have to go out to the disk to find. */
a25abddc 10527 SV* l1_fold_invlist = NULL;
53742956
KW
10528
10529 /* List of multi-character folds that are matched by this node */
cbbf8932 10530 AV* unicode_alternate = NULL;
1b2d223b
JH
10531#ifdef EBCDIC
10532 UV literal_endpoint = 0;
10533#endif
ffc130aa 10534 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 10535
3dab1dad 10536 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 10537 case we need to change the emitted regop to an EXACT. */
07be1b83 10538 const char * orig_parse = RExC_parse;
72f13be8 10539 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10540
10541 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
10542#ifndef DEBUGGING
10543 PERL_UNUSED_ARG(depth);
10544#endif
72f13be8 10545
3dab1dad 10546 DEBUG_PARSE("clas");
7f6f358c
YO
10547
10548 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
10549 ret = reganode(pRExC_state, ANYOF, 0);
10550
56ca34ca
KW
10551
10552 if (!SIZE_ONLY) {
ffc61ed2 10553 ANYOF_FLAGS(ret) = 0;
56ca34ca 10554 }
ffc61ed2 10555
9a86a77b 10556 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
10557 RExC_naughty++;
10558 RExC_parse++;
10559 if (!SIZE_ONLY)
10560 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
827f5bb8
KW
10561
10562 /* We have decided to not allow multi-char folds in inverted character
ac455f4c
KW
10563 * classes, due to the confusion that can happen, especially with
10564 * classes that are designed for a non-Unicode world: You have the
10565 * peculiar case that:
827f5bb8
KW
10566 "s s" =~ /^[^\xDF]+$/i => Y
10567 "ss" =~ /^[^\xDF]+$/i => N
10568 *
10569 * See [perl #89750] */
10570 allow_full_fold = FALSE;
ffc61ed2 10571 }
a0d0e21e 10572
73060fc4 10573 if (SIZE_ONLY) {
830247a4 10574 RExC_size += ANYOF_SKIP;
73060fc4
JH
10575 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10576 }
936ed897 10577 else {
830247a4 10578 RExC_emit += ANYOF_SKIP;
3a15e693 10579 if (LOC) {
936ed897 10580 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693 10581 }
ffc61ed2 10582 ANYOF_BITMAP_ZERO(ret);
396482e1 10583 listsv = newSVpvs("# comment\n");
137165a6 10584 initial_listsv_len = SvCUR(listsv);
a0d0e21e 10585 }
b8c5462f 10586
9a86a77b
JH
10587 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10588
b938889d 10589 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 10590 checkposixcc(pRExC_state);
b8c5462f 10591
f064b6ad
HS
10592 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10593 if (UCHARAT(RExC_parse) == ']')
10594 goto charclassloop;
ffc61ed2 10595
fc8cd66c 10596parseit:
9a86a77b 10597 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
10598
10599 charclassloop:
10600
10601 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10602
dc415b4a 10603 if (!range) {
830247a4 10604 rangebegin = RExC_parse;
dc415b4a
KW
10605 element_count++;
10606 }
ffc61ed2 10607 if (UTF) {
5e12f4fb 10608 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 10609 RExC_end - RExC_parse,
9f7f3913 10610 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
10611 RExC_parse += numlen;
10612 }
10613 else
10614 value = UCHARAT(RExC_parse++);
7f6f358c 10615
9a86a77b
JH
10616 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10617 if (value == '[' && POSIXCC(nextvalue))
830247a4 10618 namedclass = regpposixcc(pRExC_state, value);
620e46c5 10619 else if (value == '\\') {
ffc61ed2 10620 if (UTF) {
5e12f4fb 10621 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 10622 RExC_end - RExC_parse,
9f7f3913 10623 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
10624 RExC_parse += numlen;
10625 }
10626 else
10627 value = UCHARAT(RExC_parse++);
470c3474 10628 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 10629 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
10630 * be a problem later if we want switch on Unicode.
10631 * A similar issue a little bit later when switching on
10632 * namedclass. --jhi */
ffc61ed2 10633 switch ((I32)value) {
b8c5462f
JH
10634 case 'w': namedclass = ANYOF_ALNUM; break;
10635 case 'W': namedclass = ANYOF_NALNUM; break;
10636 case 's': namedclass = ANYOF_SPACE; break;
10637 case 'S': namedclass = ANYOF_NSPACE; break;
10638 case 'd': namedclass = ANYOF_DIGIT; break;
10639 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
10640 case 'v': namedclass = ANYOF_VERTWS; break;
10641 case 'V': namedclass = ANYOF_NVERTWS; break;
10642 case 'h': namedclass = ANYOF_HORIZWS; break;
10643 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
10644 case 'N': /* Handle \N{NAME} in class */
10645 {
10646 /* We only pay attention to the first char of
10647 multichar strings being returned. I kinda wonder
10648 if this makes sense as it does change the behaviour
10649 from earlier versions, OTOH that behaviour was broken
10650 as well. */
10651 UV v; /* value is register so we cant & it /grrr */
9d64099b 10652 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
fc8cd66c
YO
10653 goto parseit;
10654 }
10655 value= v;
10656 }
10657 break;
ffc61ed2
JH
10658 case 'p':
10659 case 'P':
3dab1dad
YO
10660 {
10661 char *e;
af6f566e 10662 if (RExC_parse >= RExC_end)
2a4859cd 10663 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 10664 if (*RExC_parse == '{') {
1df70142 10665 const U8 c = (U8)value;
ffc61ed2
JH
10666 e = strchr(RExC_parse++, '}');
10667 if (!e)
0da60cf5 10668 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
10669 while (isSPACE(UCHARAT(RExC_parse)))
10670 RExC_parse++;
10671 if (e == RExC_parse)
0da60cf5 10672 vFAIL2("Empty \\%c{}", c);
ffc61ed2 10673 n = e - RExC_parse;
ab13f0c7
JH
10674 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10675 n--;
ffc61ed2
JH
10676 }
10677 else {
10678 e = RExC_parse;
10679 n = 1;
10680 }
ee410026 10681 if (!SIZE_ONLY) {
dc415b4a
KW
10682 SV** invlistsvp;
10683 SV* invlist;
10684 char* name;
ab13f0c7
JH
10685 if (UCHARAT(RExC_parse) == '^') {
10686 RExC_parse++;
10687 n--;
10688 value = value == 'p' ? 'P' : 'p'; /* toggle */
10689 while (isSPACE(UCHARAT(RExC_parse))) {
10690 RExC_parse++;
10691 n--;
10692 }
10693 }
dc415b4a
KW
10694 /* Try to get the definition of the property into
10695 * <invlist>. If /i is in effect, the effective property
10696 * will have its name be <__NAME_i>. The design is
10697 * discussed in commit
10698 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10699 Newx(name, n + sizeof("_i__\n"), char);
10700
10701 sprintf(name, "%s%.*s%s\n",
10702 (FOLD) ? "__" : "",
10703 (int)n,
10704 RExC_parse,
10705 (FOLD) ? "_i" : ""
10706 );
10707
10708 /* Look up the property name, and get its swash and
10709 * inversion list, if the property is found */
dc415b4a
KW
10710 if (swash) {
10711 SvREFCNT_dec(swash);
10712 }
10713 swash = _core_swash_init("utf8", name, &PL_sv_undef,
10714 1, /* binary */
10715 0, /* not tr/// */
10716 TRUE, /* this routine will handle
10717 undefined properties */
10718 NULL, FALSE /* No inversion list */
10719 );
b6c46382 10720 if ( ! swash
dc415b4a
KW
10721 || ! SvROK(swash)
10722 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10723 || ! (invlistsvp =
10724 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10725 "INVLIST", FALSE))
10726 || ! (invlist = *invlistsvp))
10727 {
10728 if (swash) {
10729 SvREFCNT_dec(swash);
10730 swash = NULL;
10731 }
10732
10733 /* Here didn't find it. It could be a user-defined
10734 * property that will be available at run-time. Add it
10735 * to the list to look up then */
10736 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10737 (value == 'p' ? '+' : '!'),
10738 name);
10739 has_user_defined_property = 1;
10740
10741 /* We don't know yet, so have to assume that the
10742 * property could match something in the Latin1 range,
10743 * hence something that isn't utf8 */
10744 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10745 }
10746 else {
10747
10748 /* Here, did get the swash and its inversion list. If
10749 * the swash is from a user-defined property, then this
10750 * whole character class should be regarded as such */
10751 SV** user_defined_svp =
10752 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10753 "USER_DEFINED", FALSE);
10754 if (user_defined_svp) {
10755 has_user_defined_property
10756 |= SvUV(*user_defined_svp);
10757 }
10758
10759 /* Invert if asking for the complement */
10760 if (value == 'P') {
8dc9348a 10761 _invlist_union_complement_2nd(properties, invlist, &properties);
2f833f52 10762
dc415b4a
KW
10763 /* The swash can't be used as-is, because we've
10764 * inverted things; delay removing it to here after
10765 * have copied its invlist above */
10766 SvREFCNT_dec(swash);
10767 swash = NULL;
10768 }
10769 else {
112b0fc6 10770 _invlist_union(properties, invlist, &properties);
dc415b4a
KW
10771 }
10772 }
10773 Safefree(name);
ffc61ed2
JH
10774 }
10775 RExC_parse = e + 1;
f81125e2 10776 namedclass = ANYOF_MAX; /* no official name, but it's named */
e40e74fe
KW
10777
10778 /* \p means they want Unicode semantics */
10779 RExC_uni_semantics = 1;
3dab1dad 10780 }
f81125e2 10781 break;
b8c5462f
JH
10782 case 'n': value = '\n'; break;
10783 case 'r': value = '\r'; break;
10784 case 't': value = '\t'; break;
10785 case 'f': value = '\f'; break;
10786 case 'b': value = '\b'; break;
c7f1f016
NIS
10787 case 'e': value = ASCII_TO_NATIVE('\033');break;
10788 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
10789 case 'o':
10790 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
10791 {
10792 const char* error_msg;
10793 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
10794 &value,
10795 &numlen,
454155d9
KW
10796 &error_msg,
10797 SIZE_ONLY);
10798 RExC_parse += numlen;
10799 if (! valid) {
10800 vFAIL(error_msg);
10801 }
f0a2b745 10802 }
f0a2b745
KW
10803 if (PL_encoding && value < 0x100) {
10804 goto recode_encoding;
10805 }
10806 break;
b8c5462f 10807 case 'x':
ffc61ed2 10808 if (*RExC_parse == '{') {
a4c04bdc
NC
10809 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10810 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 10811 char * const e = strchr(RExC_parse++, '}');
b81d288d 10812 if (!e)
ffc61ed2 10813 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
10814
10815 numlen = e - RExC_parse;
10816 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
10817 RExC_parse = e + 1;
10818 }
10819 else {
a4c04bdc 10820 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
10821 numlen = 2;
10822 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
10823 RExC_parse += numlen;
10824 }
9e08bc66
TS
10825 if (PL_encoding && value < 0x100)
10826 goto recode_encoding;
b8c5462f
JH
10827 break;
10828 case 'c':
17a3df4c 10829 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
b8c5462f
JH
10830 break;
10831 case '0': case '1': case '2': case '3': case '4':
c99e91e9 10832 case '5': case '6': case '7':
9e08bc66 10833 {
c99e91e9
KW
10834 /* Take 1-3 octal digits */
10835 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
10836 numlen = 3;
10837 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10838 RExC_parse += numlen;
10839 if (PL_encoding && value < 0x100)
10840 goto recode_encoding;
10841 break;
10842 }
10843 recode_encoding:
e2a7e165 10844 if (! RExC_override_recoding) {
9e08bc66
TS
10845 SV* enc = PL_encoding;
10846 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
10847 if (!enc && SIZE_ONLY)
10848 ckWARNreg(RExC_parse,
10849 "Invalid escape in the specified encoding");
9e08bc66
TS
10850 break;
10851 }
1028017a 10852 default:
c99e91e9
KW
10853 /* Allow \_ to not give an error */
10854 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
10855 ckWARN2reg(RExC_parse,
10856 "Unrecognized escape \\%c in character class passed through",
10857 (int)value);
c99e91e9 10858 }
1028017a 10859 break;
b8c5462f 10860 }
ffc61ed2 10861 } /* end of \blah */
1b2d223b
JH
10862#ifdef EBCDIC
10863 else
10864 literal_endpoint++;
10865#endif
ffc61ed2
JH
10866
10867 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10868
2c63ecad
KW
10869 /* What matches in a locale is not known until runtime, so need to
10870 * (one time per class) allocate extra space to pass to regexec.
10871 * The space will contain a bit for each named class that is to be
10872 * matched against. This isn't needed for \p{} and pseudo-classes,
10873 * as they are not affected by locale, and hence are dealt with
10874 * separately */
10875 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10876 need_class = 1;
10877 if (SIZE_ONLY) {
dd58aee1 10878 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
10879 }
10880 else {
dd58aee1 10881 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
10882 ANYOF_CLASS_ZERO(ret);
10883 }
9051cfd9 10884 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 10885 }
ffc61ed2 10886
d5788240 10887 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
1d791ab2
KW
10888 * literal, as is the character that began the false range, i.e.
10889 * the 'a' in the examples */
ffc61ed2 10890 if (range) {
73b437c8 10891 if (!SIZE_ONLY) {
668c081a
NC
10892 const int w =
10893 RExC_parse >= rangebegin ?
10894 RExC_parse - rangebegin : 0;
10895 ckWARN4reg(RExC_parse,
b45f050a 10896 "False [] range \"%*.*s\"",
097eb12c 10897 w, w, rangebegin);
668c081a 10898
1d791ab2 10899 stored +=
5bfec14d 10900 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
3568d838 10901 if (prevvalue < 256) {
2283d326 10902 stored +=
5bfec14d 10903 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
ffc61ed2
JH
10904 }
10905 else {
1d791ab2 10906 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
ffc61ed2 10907 }
b8c5462f 10908 }
ffc61ed2
JH
10909
10910 range = 0; /* this was not a true range */
73b437c8 10911 }
ffc61ed2 10912
73b437c8 10913 if (!SIZE_ONLY) {
c49a72a9 10914
e2962f66
JH
10915 /* Possible truncation here but in some 64-bit environments
10916 * the compiler gets heartburn about switch on 64-bit values.
10917 * A similar issue a little earlier when switching on value.
98f323fa 10918 * --jhi */
e2962f66 10919 switch ((I32)namedclass) {
ea317ccb
KW
10920
10921 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10922 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10923 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10924 break;
10925 case ANYOF_NALNUMC:
10926 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10927 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10928 break;
10929 case ANYOF_ALPHA:
10930 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10931 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10932 break;
10933 case ANYOF_NALPHA:
10934 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10935 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10936 break;
73b437c8 10937 case ANYOF_ASCII:
ea317ccb
KW
10938 if (LOC) {
10939 ANYOF_CLASS_SET(ret, namedclass);
73b437c8 10940 }
ea317ccb
KW
10941 else {
10942 _invlist_union(properties, PL_ASCII, &properties);
10943 }
73b437c8
JH
10944 break;
10945 case ANYOF_NASCII:
ea317ccb
KW
10946 if (LOC) {
10947 ANYOF_CLASS_SET(ret, namedclass);
73b437c8 10948 }
ea317ccb
KW
10949 else {
10950 _invlist_union_complement_2nd(properties,
10951 PL_ASCII, &properties);
10952 if (DEPENDS_SEMANTICS) {
10953 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10954 }
10955 }
10956 break;
10957 case ANYOF_BLANK:
10958 DO_POSIX(ret, namedclass, properties,
10959 PL_PosixBlank, PL_XPosixBlank);
10960 break;
10961 case ANYOF_NBLANK:
10962 DO_N_POSIX(ret, namedclass, properties,
10963 PL_PosixBlank, PL_XPosixBlank);
10964 break;
10965 case ANYOF_CNTRL:
10966 DO_POSIX(ret, namedclass, properties,
10967 PL_PosixCntrl, PL_XPosixCntrl);
10968 break;
10969 case ANYOF_NCNTRL:
10970 DO_N_POSIX(ret, namedclass, properties,
10971 PL_PosixCntrl, PL_XPosixCntrl);
10972 break;
ffc61ed2 10973 case ANYOF_DIGIT:
4f6289a3
RB
10974 /* There are no digits in the Latin1 range outside of
10975 * ASCII, so call the macro that doesn't have to resolve
10976 * them */
10977 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
10978 PL_PosixDigit, "XPosixDigit", listsv);
ea317ccb
KW
10979 break;
10980 case ANYOF_NDIGIT:
10981 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10982 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10983 break;
10984 case ANYOF_GRAPH:
10985 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10986 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10987 break;
10988 case ANYOF_NGRAPH:
10989 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10990 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10991 break;
10992 case ANYOF_HORIZWS:
10993 /* For these, we use the nonbitmap, as /d doesn't make a
10994 * difference in what these match. There would be problems
10995 * if these characters had folds other than themselves, as
24caacbc
KW
10996 * nonbitmap is subject to folding. It turns out that \h
10997 * is just a synonym for XPosixBlank */
10998 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
ea317ccb
KW
10999 break;
11000 case ANYOF_NHORIZWS:
11001 _invlist_union_complement_2nd(nonbitmap,
24caacbc 11002 PL_XPosixBlank, &nonbitmap);
ea317ccb
KW
11003 break;
11004 case ANYOF_LOWER:
11005 case ANYOF_NLOWER:
11006 { /* These require special handling, as they differ under
dab0c3e7
KW
11007 folding, matching Cased there (which in the ASCII range
11008 is the same as Alpha */
ea317ccb
KW
11009
11010 SV* ascii_source;
11011 SV* l1_source;
11012 const char *Xname;
11013
11014 if (FOLD && ! LOC) {
11015 ascii_source = PL_PosixAlpha;
dab0c3e7
KW
11016 l1_source = PL_L1Cased;
11017 Xname = "Cased";
ea317ccb 11018 }
ffc61ed2 11019 else {
ea317ccb
KW
11020 ascii_source = PL_PosixLower;
11021 l1_source = PL_L1PosixLower;
11022 Xname = "XPosixLower";
11023 }
11024 if (namedclass == ANYOF_LOWER) {
11025 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11026 ascii_source, l1_source, Xname, listsv);
11027 }
11028 else {
11029 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11030 properties, ascii_source, l1_source, Xname, listsv);
ffc61ed2 11031 }
ffc61ed2 11032 break;
ea317ccb
KW
11033 }
11034 case ANYOF_PRINT:
11035 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11036 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11037 break;
11038 case ANYOF_NPRINT:
11039 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11040 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11041 break;
11042 case ANYOF_PUNCT:
11043 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11044 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11045 break;
11046 case ANYOF_NPUNCT:
11047 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11048 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11049 break;
11050 case ANYOF_PSXSPC:
11051 DO_POSIX(ret, namedclass, properties,
11052 PL_PosixSpace, PL_XPosixSpace);
11053 break;
11054 case ANYOF_NPSXSPC:
11055 DO_N_POSIX(ret, namedclass, properties,
11056 PL_PosixSpace, PL_XPosixSpace);
11057 break;
11058 case ANYOF_SPACE:
11059 DO_POSIX(ret, namedclass, properties,
11060 PL_PerlSpace, PL_XPerlSpace);
11061 break;
11062 case ANYOF_NSPACE:
11063 DO_N_POSIX(ret, namedclass, properties,
11064 PL_PerlSpace, PL_XPerlSpace);
11065 break;
11066 case ANYOF_UPPER: /* Same as LOWER, above */
11067 case ANYOF_NUPPER:
11068 {
11069 SV* ascii_source;
11070 SV* l1_source;
11071 const char *Xname;
11072
11073 if (FOLD && ! LOC) {
11074 ascii_source = PL_PosixAlpha;
dab0c3e7
KW
11075 l1_source = PL_L1Cased;
11076 Xname = "Cased";
ea317ccb 11077 }
ffc61ed2 11078 else {
ea317ccb
KW
11079 ascii_source = PL_PosixUpper;
11080 l1_source = PL_L1PosixUpper;
11081 Xname = "XPosixUpper";
ffc61ed2 11082 }
ea317ccb
KW
11083 if (namedclass == ANYOF_UPPER) {
11084 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11085 ascii_source, l1_source, Xname, listsv);
cfaf538b 11086 }
ea317ccb
KW
11087 else {
11088 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11089 properties, ascii_source, l1_source, Xname, listsv);
11090 }
11091 break;
11092 }
11093 case ANYOF_ALNUM: /* Really is 'Word' */
11094 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11095 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11096 break;
11097 case ANYOF_NALNUM:
11098 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11099 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11100 break;
11101 case ANYOF_VERTWS:
11102 /* For these, we use the nonbitmap, as /d doesn't make a
11103 * difference in what these match. There would be problems
11104 * if these characters had folds other than themselves, as
11105 * nonbitmap is subject to folding */
11106 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11107 break;
11108 case ANYOF_NVERTWS:
11109 _invlist_union_complement_2nd(nonbitmap,
11110 PL_VertSpace, &nonbitmap);
11111 break;
11112 case ANYOF_XDIGIT:
11113 DO_POSIX(ret, namedclass, properties,
11114 PL_PosixXDigit, PL_XPosixXDigit);
11115 break;
11116 case ANYOF_NXDIGIT:
11117 DO_N_POSIX(ret, namedclass, properties,
11118 PL_PosixXDigit, PL_XPosixXDigit);
11119 break;
f81125e2
JP
11120 case ANYOF_MAX:
11121 /* this is to handle \p and \P */
11122 break;
73b437c8 11123 default:
b45f050a 11124 vFAIL("Invalid [::] class");
73b437c8 11125 break;
b8c5462f 11126 }
ce1c68b2 11127
73b437c8 11128 continue;
a0d0e21e 11129 }
ffc61ed2
JH
11130 } /* end of namedclass \blah */
11131
a0d0e21e 11132 if (range) {
eb160463 11133 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
11134 const int w = RExC_parse - rangebegin;
11135 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 11136 range = 0; /* not a valid range */
73b437c8 11137 }
a0d0e21e
LW
11138 }
11139 else {
3568d838 11140 prevvalue = value; /* save the beginning of the range */
646253b5
KW
11141 if (RExC_parse+1 < RExC_end
11142 && *RExC_parse == '-'
11143 && RExC_parse[1] != ']')
11144 {
830247a4 11145 RExC_parse++;
ffc61ed2
JH
11146
11147 /* a bad range like \w-, [:word:]- ? */
11148 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 11149 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 11150 const int w =
afd78fd5
JH
11151 RExC_parse >= rangebegin ?
11152 RExC_parse - rangebegin : 0;
830247a4 11153 vWARN4(RExC_parse,
b45f050a 11154 "False [] range \"%*.*s\"",
097eb12c 11155 w, w, rangebegin);
afd78fd5 11156 }
73b437c8 11157 if (!SIZE_ONLY)
2283d326 11158 stored +=
5bfec14d 11159 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
73b437c8 11160 } else
ffc61ed2
JH
11161 range = 1; /* yeah, it's a range! */
11162 continue; /* but do it the next time */
a0d0e21e 11163 }
a687059c 11164 }
ffc61ed2 11165
046c4055
KW
11166 /* non-Latin1 code point implies unicode semantics. Must be set in
11167 * pass1 so is there for the whole of pass 2 */
56ca34ca
KW
11168 if (value > 255) {
11169 RExC_uni_semantics = 1;
11170 }
11171
93733859 11172 /* now is the next time */
ae5c130c 11173 if (!SIZE_ONLY) {
3568d838 11174 if (prevvalue < 256) {
1df70142 11175 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 11176 IV i;
3568d838 11177#ifdef EBCDIC
1b2d223b
JH
11178 /* In EBCDIC [\x89-\x91] should include
11179 * the \x8e but [i-j] should not. */
11180 if (literal_endpoint == 2 &&
11181 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11182 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 11183 {
3568d838
JH
11184 if (isLOWER(prevvalue)) {
11185 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 11186 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 11187 stored +=
5bfec14d 11188 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 11189 }
ffc61ed2 11190 } else {
3568d838 11191 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 11192 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 11193 stored +=
5bfec14d 11194 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 11195 }
ffc61ed2 11196 }
8ada0baa 11197 }
ffc61ed2 11198 else
8ada0baa 11199#endif
07be1b83 11200 for (i = prevvalue; i <= ceilvalue; i++) {
5bfec14d 11201 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
07be1b83 11202 }
3568d838 11203 }
56ca34ca
KW
11204 if (value > 255) {
11205 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11206 const UV natvalue = NATIVE_TO_UNI(value);
9d501133 11207 nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
56ca34ca 11208 }
1b2d223b
JH
11209#ifdef EBCDIC
11210 literal_endpoint = 0;
11211#endif
8ada0baa 11212 }
ffc61ed2
JH
11213
11214 range = 0; /* this range (if it was one) is done now */
a0d0e21e 11215 }
ffc61ed2 11216
ffc61ed2 11217
7f6f358c
YO
11218
11219 if (SIZE_ONLY)
11220 return ret;
11221 /****** !SIZE_ONLY AFTER HERE *********/
11222
0c6e4288
KW
11223 /* If folding and there are code points above 255, we calculate all
11224 * characters that could fold to or from the ones already on the list */
11225 if (FOLD && nonbitmap) {
0d527bf8 11226 UV start, end; /* End points of code point ranges */
56ca34ca 11227
4065ba03 11228 SV* fold_intersection = NULL;
93e5bb1c
KW
11229
11230 /* This is a list of all the characters that participate in folds
11231 * (except marks, etc in multi-char folds */
11232 if (! PL_utf8_foldable) {
11233 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11234 PL_utf8_foldable = _swash_to_invlist(swash);
0501f9a8 11235 SvREFCNT_dec(swash);
93e5bb1c 11236 }
56ca34ca 11237
93e5bb1c
KW
11238 /* This is a hash that for a particular fold gives all characters
11239 * that are involved in it */
11240 if (! PL_utf8_foldclosures) {
11241
11242 /* If we were unable to find any folds, then we likely won't be
11243 * able to find the closures. So just create an empty list.
11244 * Folding will effectively be restricted to the non-Unicode rules
11245 * hard-coded into Perl. (This case happens legitimately during
11246 * compilation of Perl itself before the Unicode tables are
11247 * generated) */
11248 if (invlist_len(PL_utf8_foldable) == 0) {
ddc1cd80 11249 PL_utf8_foldclosures = newHV();
93e5bb1c
KW
11250 } else {
11251 /* If the folds haven't been read in, call a fold function
11252 * to force that */
11253 if (! PL_utf8_tofold) {
11254 U8 dummy[UTF8_MAXBYTES+1];
11255 STRLEN dummy_len;
f26f1b9c
KW
11256
11257 /* This particular string is above \xff in both UTF-8 and
11258 * UTFEBCDIC */
11259 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
686c8a98 11260 assert(PL_utf8_tofold); /* Verify that worked */
56ca34ca 11261 }
93e5bb1c 11262 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
56ca34ca 11263 }
93e5bb1c
KW
11264 }
11265
dbe7a391
KW
11266 /* Only the characters in this class that participate in folds need be
11267 * checked. Get the intersection of this class and all the possible
11268 * characters that are foldable. This can quickly narrow down a large
11269 * class */
37e85ffe 11270 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
93e5bb1c
KW
11271
11272 /* Now look at the foldable characters in this class individually */
0d527bf8
KW
11273 invlist_iterinit(fold_intersection);
11274 while (invlist_iternext(fold_intersection, &start, &end)) {
93e5bb1c
KW
11275 UV j;
11276
93e5bb1c
KW
11277 /* Look at every character in the range */
11278 for (j = start; j <= end; j++) {
11279
11280 /* Get its fold */
11281 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11282 STRLEN foldlen;
827f5bb8 11283 const UV f =
a0270393
KW
11284 _to_uni_fold_flags(j, foldbuf, &foldlen,
11285 (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
93e5bb1c
KW
11286
11287 if (foldlen > (STRLEN)UNISKIP(f)) {
11288
dbe7a391
KW
11289 /* Any multicharacter foldings (disallowed in lookbehind
11290 * patterns) require the following transform: [ABCDEF] ->
11291 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11292 * folds into "rst", all other characters fold to single
11293 * characters. We save away these multicharacter foldings,
11294 * to be later saved as part of the additional "s" data. */
93e5bb1c
KW
11295 if (! RExC_in_lookbehind) {
11296 U8* loc = foldbuf;
11297 U8* e = foldbuf + foldlen;
11298
dbe7a391
KW
11299 /* If any of the folded characters of this are in the
11300 * Latin1 range, tell the regex engine that this can
11301 * match a non-utf8 target string. The only multi-byte
11302 * fold whose source is in the Latin1 range (U+00DF)
11303 * applies only when the target string is utf8, or
11304 * under unicode rules */
93e5bb1c
KW
11305 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11306 while (loc < e) {
11307
11308 /* Can't mix ascii with non- under /aa */
11309 if (MORE_ASCII_RESTRICTED
11310 && (isASCII(*loc) != isASCII(j)))
11311 {
11312 goto end_multi_fold;
11313 }
11314 if (UTF8_IS_INVARIANT(*loc)
11315 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11316 {
dbe7a391
KW
11317 /* Can't mix above and below 256 under LOC
11318 */
93e5bb1c 11319 if (LOC) {
2f7f8cb1
KW
11320 goto end_multi_fold;
11321 }
93e5bb1c
KW
11322 ANYOF_FLAGS(ret)
11323 |= ANYOF_NONBITMAP_NON_UTF8;
11324 break;
8e3094e5 11325 }
93e5bb1c 11326 loc += UTF8SKIP(loc);
8e3094e5 11327 }
56ca34ca 11328 }
17580e7a 11329
93e5bb1c
KW
11330 add_alternate(&unicode_alternate, foldbuf, foldlen);
11331 end_multi_fold: ;
11332 }
14e30abc
KW
11333
11334 /* This is special-cased, as it is the only letter which
11335 * has both a multi-fold and single-fold in Latin1. All
11336 * the other chars that have single and multi-folds are
11337 * always in utf8, and the utf8 folding algorithm catches
11338 * them */
11339 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11340 stored += set_regclass_bit(pRExC_state,
11341 ret,
11342 LATIN_SMALL_LETTER_SHARP_S,
11343 &l1_fold_invlist, &unicode_alternate);
11344 }
93e5bb1c
KW
11345 }
11346 else {
11347 /* Single character fold. Add everything in its fold
dbe7a391 11348 * closure to the list that this node should match */
93e5bb1c
KW
11349 SV** listp;
11350
dbe7a391
KW
11351 /* The fold closures data structure is a hash with the keys
11352 * being every character that is folded to, like 'k', and
11353 * the values each an array of everything that folds to its
11354 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
93e5bb1c
KW
11355 if ((listp = hv_fetch(PL_utf8_foldclosures,
11356 (char *) foldbuf, foldlen, FALSE)))
11357 {
11358 AV* list = (AV*) *listp;
11359 IV k;
11360 for (k = 0; k <= av_len(list); k++) {
11361 SV** c_p = av_fetch(list, k, FALSE);
11362 UV c;
11363 if (c_p == NULL) {
11364 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11365 }
11366 c = SvUV(*c_p);
11367
dbe7a391
KW
11368 /* /aa doesn't allow folds between ASCII and non-;
11369 * /l doesn't allow them between above and below
11370 * 256 */
93e5bb1c
KW
11371 if ((MORE_ASCII_RESTRICTED
11372 && (isASCII(c) != isASCII(j)))
11373 || (LOC && ((c < 256) != (j < 256))))
11374 {
11375 continue;
11376 }
56ca34ca 11377
93e5bb1c
KW
11378 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11379 stored += set_regclass_bit(pRExC_state,
11380 ret,
11381 (U8) c,
11382 &l1_fold_invlist, &unicode_alternate);
11383 }
dbe7a391
KW
11384 /* It may be that the code point is already in
11385 * this range or already in the bitmap, in
11386 * which case we need do nothing */
93e5bb1c
KW
11387 else if ((c < start || c > end)
11388 && (c > 255
11389 || ! ANYOF_BITMAP_TEST(ret, c)))
11390 {
11391 nonbitmap = add_cp_to_invlist(nonbitmap, c);
56ca34ca
KW
11392 }
11393 }
11394 }
11395 }
11396 }
93e5bb1c 11397 }
318c430e 11398 SvREFCNT_dec(fold_intersection);
56ca34ca
KW
11399 }
11400
53742956
KW
11401 /* Combine the two lists into one. */
11402 if (l1_fold_invlist) {
11403 if (nonbitmap) {
37e85ffe 11404 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
318c430e 11405 SvREFCNT_dec(l1_fold_invlist);
53742956
KW
11406 }
11407 else {
11408 nonbitmap = l1_fold_invlist;
11409 }
11410 }
11411
dc415b4a
KW
11412 /* And combine the result (if any) with any inversion list from properties.
11413 * The lists are kept separate up to now because we don't want to fold the
11414 * properties */
11415 if (properties) {
11416 if (nonbitmap) {
11417 _invlist_union(nonbitmap, properties, &nonbitmap);
11418 SvREFCNT_dec(properties);
11419 }
11420 else {
11421 nonbitmap = properties;
11422 }
11423 }
11424
e4e94b48
KW
11425 /* Here, <nonbitmap> contains all the code points we can determine at
11426 * compile time that we haven't put into the bitmap. Go through it, and
11427 * for things that belong in the bitmap, put them there, and delete from
11428 * <nonbitmap> */
11429 if (nonbitmap) {
11430
11431 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11432 * possibly only should match when the target string is UTF-8 */
11433 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11434
11435 /* This gets set if we actually need to modify things */
11436 bool change_invlist = FALSE;
11437
11438 UV start, end;
11439
11440 /* Start looking through <nonbitmap> */
11441 invlist_iterinit(nonbitmap);
11442 while (invlist_iternext(nonbitmap, &start, &end)) {
11443 UV high;
11444 int i;
11445
11446 /* Quit if are above what we should change */
11447 if (start > max_cp_to_set) {
11448 break;
11449 }
11450
11451 change_invlist = TRUE;
11452
11453 /* Set all the bits in the range, up to the max that we are doing */
11454 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11455 for (i = start; i <= (int) high; i++) {
11456 if (! ANYOF_BITMAP_TEST(ret, i)) {
11457 ANYOF_BITMAP_SET(ret, i);
11458 stored++;
11459 prevvalue = value;
11460 value = i;
11461 }
11462 }
11463 }
11464
a3e1f3a6
KW
11465 /* Done with loop; remove any code points that are in the bitmap from
11466 * <nonbitmap> */
e4e94b48 11467 if (change_invlist) {
a3e1f3a6
KW
11468 _invlist_subtract(nonbitmap,
11469 (DEPENDS_SEMANTICS)
11470 ? PL_ASCII
11471 : PL_Latin1,
11472 &nonbitmap);
e4e94b48
KW
11473 }
11474
11475 /* If have completely emptied it, remove it completely */
11476 if (invlist_len(nonbitmap) == 0) {
11477 SvREFCNT_dec(nonbitmap);
11478 nonbitmap = NULL;
11479 }
11480 }
dc415b4a 11481
fb9bfbf7 11482 /* Here, we have calculated what code points should be in the character
e4e94b48
KW
11483 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11484 * case of DEPENDS rules.
dbe7a391
KW
11485 *
11486 * Now we can see about various optimizations. Fold calculation (which we
11487 * did above) needs to take place before inversion. Otherwise /[^k]/i
11488 * would invert to include K, which under /i would match k, which it
11489 * shouldn't. */
fb9bfbf7 11490
f56b6394 11491 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
dbe7a391 11492 * set the FOLD flag yet, so this does optimize those. It doesn't
40c78556
KW
11493 * optimize locale. Doing so perhaps could be done as long as there is
11494 * nothing like \w in it; some thought also would have to be given to the
11495 * interaction with above 0x100 chars */
dbe7a391
KW
11496 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11497 && ! LOC
137165a6 11498 && ! unicode_alternate
2fde50e1
KW
11499 /* In case of /d, there are some things that should match only when in
11500 * not in the bitmap, i.e., they require UTF8 to match. These are
dbe7a391
KW
11501 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11502 * case, they don't require UTF8, so can invert here */
2fde50e1
KW
11503 && (! nonbitmap
11504 || ! DEPENDS_SEMANTICS
11505 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
137165a6
KW
11506 && SvCUR(listsv) == initial_listsv_len)
11507 {
a9746a27 11508 int i;
2fde50e1 11509 if (! nonbitmap) {
a9746a27
KW
11510 for (i = 0; i < 256; ++i) {
11511 if (ANYOF_BITMAP_TEST(ret, i)) {
11512 ANYOF_BITMAP_CLEAR(ret, i);
11513 }
11514 else {
11515 ANYOF_BITMAP_SET(ret, i);
11516 prevvalue = value;
11517 value = i;
11518 }
11519 }
2fde50e1
KW
11520 /* The inversion means that everything above 255 is matched */
11521 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11522 }
11523 else {
4004d578
KW
11524 /* Here, also has things outside the bitmap that may overlap with
11525 * the bitmap. We have to sync them up, so that they get inverted
11526 * in both places. Earlier, we removed all overlaps except in the
11527 * case of /d rules, so no syncing is needed except for this case
11528 */
11529 SV *remove_list = NULL;
11530
11531 if (DEPENDS_SEMANTICS) {
11532 UV start, end;
11533
11534 /* Set the bits that correspond to the ones that aren't in the
11535 * bitmap. Otherwise, when we invert, we'll miss these.
11536 * Earlier, we removed from the nonbitmap all code points
11537 * < 128, so there is no extra work here */
11538 invlist_iterinit(nonbitmap);
11539 while (invlist_iternext(nonbitmap, &start, &end)) {
11540 if (start > 255) { /* The bit map goes to 255 */
11541 break;
11542 }
11543 if (end > 255) {
11544 end = 255;
11545 }
11546 for (i = start; i <= (int) end; ++i) {
11547 ANYOF_BITMAP_SET(ret, i);
11548 prevvalue = value;
11549 value = i;
11550 }
11551 }
11552 }
a9746a27
KW
11553
11554 /* Now invert both the bitmap and the nonbitmap. Anything in the
4004d578
KW
11555 * bitmap has to also be removed from the non-bitmap, but again,
11556 * there should not be overlap unless is /d rules. */
37e85ffe 11557 _invlist_invert(nonbitmap);
4004d578 11558
5d103fec
KW
11559 /* Any swash can't be used as-is, because we've inverted things */
11560 if (swash) {
11561 SvREFCNT_dec(swash);
11562 swash = NULL;
11563 }
11564
a9746a27
KW
11565 for (i = 0; i < 256; ++i) {
11566 if (ANYOF_BITMAP_TEST(ret, i)) {
11567 ANYOF_BITMAP_CLEAR(ret, i);
4004d578
KW
11568 if (DEPENDS_SEMANTICS) {
11569 if (! remove_list) {
11570 remove_list = _new_invlist(2);
11571 }
11572 remove_list = add_cp_to_invlist(remove_list, i);
11573 }
2fde50e1
KW
11574 }
11575 else {
a9746a27
KW
11576 ANYOF_BITMAP_SET(ret, i);
11577 prevvalue = value;
11578 value = i;
2fde50e1
KW
11579 }
11580 }
dbe7a391
KW
11581
11582 /* And do the removal */
4004d578
KW
11583 if (DEPENDS_SEMANTICS) {
11584 if (remove_list) {
11585 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11586 SvREFCNT_dec(remove_list);
11587 }
11588 }
11589 else {
11590 /* There is no overlap for non-/d, so just delete anything
11591 * below 256 */
a3e1f3a6 11592 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
4004d578 11593 }
2fde50e1
KW
11594 }
11595
40c78556
KW
11596 stored = 256 - stored;
11597
2fde50e1
KW
11598 /* Clear the invert flag since have just done it here */
11599 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
40c78556
KW
11600 }
11601
0222889f
KW
11602 /* Folding in the bitmap is taken care of above, but not for locale (for
11603 * which we have to wait to see what folding is in effect at runtime), and
d9105c95
KW
11604 * for some things not in the bitmap (only the upper latin folds in this
11605 * case, as all other single-char folding has been set above). Set
11606 * run-time fold flag for these */
11607 if (FOLD && (LOC
11608 || (DEPENDS_SEMANTICS
11609 && nonbitmap
11610 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11611 || unicode_alternate))
11612 {
0222889f 11613 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
f56b6394
KW
11614 }
11615
2786be71
KW
11616 /* A single character class can be "optimized" into an EXACTish node.
11617 * Note that since we don't currently count how many characters there are
11618 * outside the bitmap, we are XXX missing optimization possibilities for
11619 * them. This optimization can't happen unless this is a truly single
11620 * character class, which means that it can't be an inversion into a
11621 * many-character class, and there must be no possibility of there being
11622 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
11623 * \w, etc, so have to make a special test that they aren't present
11624 *
11625 * Similarly A 2-character class of the very special form like [bB] can be
11626 * optimized into an EXACTFish node, but only for non-locales, and for
11627 * characters which only have the two folds; so things like 'fF' and 'Ii'
11628 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11629 * FI'. */
137165a6 11630 if (! nonbitmap
53742956 11631 && ! unicode_alternate
137165a6
KW
11632 && SvCUR(listsv) == initial_listsv_len
11633 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
11634 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11635 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11636 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11637 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11638 /* If the latest code point has a fold whose
11639 * bit is set, it must be the only other one */
2dcac756 11640 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 11641 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
11642 {
11643 /* Note that the information needed to decide to do this optimization
11644 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
11645 * used EXACTish node takes less space than the calculated ANYOF node,
11646 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
11647 * than actually used, so this optimization doesn't gain us any space.
11648 * But an EXACT node is faster than an ANYOF node, and can be combined
11649 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
11650 * gains. The speed of executing an EXACTF is similar to an ANYOF
11651 * node, so the optimization advantage comes from the ability to join
11652 * it to adjacent EXACT nodes */
2786be71 11653
07be1b83 11654 const char * cur_parse= RExC_parse;
6da63e10 11655 U8 op;
07be1b83
YO
11656 RExC_emit = (regnode *)orig_emit;
11657 RExC_parse = (char *)orig_parse;
2786be71 11658
6da63e10
KW
11659 if (stored == 1) {
11660
11661 /* A locale node with one point can be folded; all the other cases
11662 * with folding will have two points, since we calculate them above
11663 */
39065660 11664 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
11665 op = EXACTFL;
11666 }
11667 else {
11668 op = EXACT;
11669 }
b36527fc
KW
11670 }
11671 else { /* else 2 chars in the bit map: the folds of each other */
11672
11673 /* Use the folded value, which for the cases where we get here,
11674 * is just the lower case of the current one (which may resolve to
11675 * itself, or to the other one */
11676 value = toLOWER_LATIN1(value);
6da63e10 11677
bf4c00b4
KW
11678 /* To join adjacent nodes, they must be the exact EXACTish type.
11679 * Try to use the most likely type, by using EXACTFA if possible,
11680 * then EXACTFU if the regex calls for it, or is required because
11681 * the character is non-ASCII. (If <value> is ASCII, its fold is
11682 * also ASCII for the cases where we get here.) */
11683 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11684 op = EXACTFA;
11685 }
11686 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
e62862f6
KW
11687 op = EXACTFU;
11688 }
11689 else { /* Otherwise, more likely to be EXACTF type */
11690 op = EXACTF;
11691 }
b36527fc 11692 }
6da63e10
KW
11693
11694 ret = reg_node(pRExC_state, op);
07be1b83 11695 RExC_parse = (char *)cur_parse;
2786be71
KW
11696 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11697 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11698 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11699 STR_LEN(ret)= 2;
11700 RExC_emit += STR_SZ(2);
11701 }
11702 else {
11703 *STRING(ret)= (char)value;
11704 STR_LEN(ret)= 1;
11705 RExC_emit += STR_SZ(1);
11706 }
ef8d46e8 11707 SvREFCNT_dec(listsv);
7f6f358c
YO
11708 return ret;
11709 }
ffc61ed2 11710
dc415b4a
KW
11711 /* If there is a swash and more than one element, we can't use the swash in
11712 * the optimization below. */
11713 if (swash && element_count > 1) {
11714 SvREFCNT_dec(swash);
11715 swash = NULL;
11716 }
c16787fd
KW
11717 if (! nonbitmap
11718 && SvCUR(listsv) == initial_listsv_len
11719 && ! unicode_alternate)
11720 {
137165a6
KW
11721 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11722 SvREFCNT_dec(listsv);
11723 SvREFCNT_dec(unicode_alternate);
11724 }
11725 else {
0bd1039c
KW
11726 /* av[0] stores the character class description in its textual form:
11727 * used later (regexec.c:Perl_regclass_swash()) to initialize the
11728 * appropriate swash, and is also useful for dumping the regnode.
11729 * av[1] if NULL, is a placeholder to later contain the swash computed
11730 * from av[0]. But if no further computation need be done, the
11731 * swash is stored there now.
11732 * av[2] stores the multicharacter foldings, used later in
11733 * regexec.c:S_reginclass().
11734 * av[3] stores the nonbitmap inversion list for use in addition or
11735 * instead of av[0]; not used if av[1] isn't NULL
11736 * av[4] is set if any component of the class is from a user-defined
11737 * property; not used if av[1] isn't NULL */
097eb12c 11738 AV * const av = newAV();
ffc61ed2 11739 SV *rv;
0bd1039c 11740
c16787fd
KW
11741 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11742 ? &PL_sv_undef
11743 : listsv);
88d45d28
KW
11744 if (swash) {
11745 av_store(av, 1, swash);
11746 SvREFCNT_dec(nonbitmap);
11747 }
11748 else {
11749 av_store(av, 1, NULL);
c16787fd
KW
11750 if (nonbitmap) {
11751 av_store(av, 3, nonbitmap);
dc415b4a 11752 av_store(av, 4, newSVuv(has_user_defined_property));
c16787fd 11753 }
88d45d28 11754 }
827f5bb8
KW
11755
11756 /* Store any computed multi-char folds only if we are allowing
11757 * them */
11758 if (allow_full_fold) {
7b4a7e58
KW
11759 av_store(av, 2, MUTABLE_SV(unicode_alternate));
11760 if (unicode_alternate) { /* This node is variable length */
11761 OP(ret) = ANYOFV;
11762 }
827f5bb8
KW
11763 }
11764 else {
11765 av_store(av, 2, NULL);
11766 }
ad64d0ec 11767 rv = newRV_noinc(MUTABLE_SV(av));
19860706 11768 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 11769 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 11770 ARG_SET(ret, n);
a0ed51b3 11771 }
a0ed51b3
LW
11772 return ret;
11773}
89836f1f 11774
a0ed51b3 11775
bcdf7404
YO
11776/* reg_skipcomment()
11777
11778 Absorbs an /x style # comments from the input stream.
11779 Returns true if there is more text remaining in the stream.
11780 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11781 terminates the pattern without including a newline.
11782
11783 Note its the callers responsibility to ensure that we are
11784 actually in /x mode
11785
11786*/
11787
11788STATIC bool
11789S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11790{
11791 bool ended = 0;
7918f24d
NC
11792
11793 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11794
bcdf7404
YO
11795 while (RExC_parse < RExC_end)
11796 if (*RExC_parse++ == '\n') {
11797 ended = 1;
11798 break;
11799 }
11800 if (!ended) {
11801 /* we ran off the end of the pattern without ending
11802 the comment, so we have to add an \n when wrapping */
11803 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11804 return 0;
11805 } else
11806 return 1;
11807}
11808
11809/* nextchar()
11810
3b753521 11811 Advances the parse position, and optionally absorbs
bcdf7404
YO
11812 "whitespace" from the inputstream.
11813
11814 Without /x "whitespace" means (?#...) style comments only,
11815 with /x this means (?#...) and # comments and whitespace proper.
11816
11817 Returns the RExC_parse point from BEFORE the scan occurs.
11818
11819 This is the /x friendly way of saying RExC_parse++.
11820*/
11821
76e3520e 11822STATIC char*
830247a4 11823S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 11824{
097eb12c 11825 char* const retval = RExC_parse++;
a0d0e21e 11826
7918f24d
NC
11827 PERL_ARGS_ASSERT_NEXTCHAR;
11828
4633a7c4 11829 for (;;) {
d224c965
KW
11830 if (RExC_end - RExC_parse >= 3
11831 && *RExC_parse == '('
11832 && RExC_parse[1] == '?'
11833 && RExC_parse[2] == '#')
11834 {
e994fd66
AE
11835 while (*RExC_parse != ')') {
11836 if (RExC_parse == RExC_end)
11837 FAIL("Sequence (?#... not terminated");
830247a4 11838 RExC_parse++;
e994fd66 11839 }
830247a4 11840 RExC_parse++;
4633a7c4
LW
11841 continue;
11842 }
bbe252da 11843 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
11844 if (isSPACE(*RExC_parse)) {
11845 RExC_parse++;
748a9306
LW
11846 continue;
11847 }
830247a4 11848 else if (*RExC_parse == '#') {
bcdf7404
YO
11849 if ( reg_skipcomment( pRExC_state ) )
11850 continue;
748a9306 11851 }
748a9306 11852 }
4633a7c4 11853 return retval;
a0d0e21e 11854 }
a687059c
LW
11855}
11856
11857/*
c277df42 11858- reg_node - emit a node
a0d0e21e 11859*/
76e3520e 11860STATIC regnode * /* Location. */
830247a4 11861S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 11862{
97aff369 11863 dVAR;
c277df42 11864 register regnode *ptr;
504618e9 11865 regnode * const ret = RExC_emit;
07be1b83 11866 GET_RE_DEBUG_FLAGS_DECL;
a687059c 11867
7918f24d
NC
11868 PERL_ARGS_ASSERT_REG_NODE;
11869
c277df42 11870 if (SIZE_ONLY) {
830247a4
IZ
11871 SIZE_ALIGN(RExC_size);
11872 RExC_size += 1;
a0d0e21e
LW
11873 return(ret);
11874 }
3b57cd43 11875 if (RExC_emit >= RExC_emit_bound)
5637ef5b
NC
11876 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11877 op, RExC_emit, RExC_emit_bound);
3b57cd43 11878
c277df42 11879 NODE_ALIGN_FILL(ret);
a0d0e21e 11880 ptr = ret;
c277df42 11881 FILL_ADVANCE_NODE(ptr, op);
7122b237 11882#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 11883 if (RExC_offsets) { /* MJD */
07be1b83 11884 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 11885 "reg_node", __LINE__,
13d6edb4 11886 PL_reg_name[op],
07be1b83
YO
11887 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11888 ? "Overwriting end of array!\n" : "OK",
11889 (UV)(RExC_emit - RExC_emit_start),
11890 (UV)(RExC_parse - RExC_start),
11891 (UV)RExC_offsets[0]));
ccb2c380 11892 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 11893 }
7122b237 11894#endif
830247a4 11895 RExC_emit = ptr;
a0d0e21e 11896 return(ret);
a687059c
LW
11897}
11898
11899/*
a0d0e21e
LW
11900- reganode - emit a node with an argument
11901*/
76e3520e 11902STATIC regnode * /* Location. */
830247a4 11903S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 11904{
97aff369 11905 dVAR;
c277df42 11906 register regnode *ptr;
504618e9 11907 regnode * const ret = RExC_emit;
07be1b83 11908 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 11909
7918f24d
NC
11910 PERL_ARGS_ASSERT_REGANODE;
11911
c277df42 11912 if (SIZE_ONLY) {
830247a4
IZ
11913 SIZE_ALIGN(RExC_size);
11914 RExC_size += 2;
6bda09f9
YO
11915 /*
11916 We can't do this:
11917
11918 assert(2==regarglen[op]+1);
686b73d4 11919
6bda09f9
YO
11920 Anything larger than this has to allocate the extra amount.
11921 If we changed this to be:
11922
11923 RExC_size += (1 + regarglen[op]);
11924
11925 then it wouldn't matter. Its not clear what side effect
11926 might come from that so its not done so far.
11927 -- dmq
11928 */
a0d0e21e
LW
11929 return(ret);
11930 }
3b57cd43 11931 if (RExC_emit >= RExC_emit_bound)
5637ef5b
NC
11932 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11933 op, RExC_emit, RExC_emit_bound);
3b57cd43 11934
c277df42 11935 NODE_ALIGN_FILL(ret);
a0d0e21e 11936 ptr = ret;
c277df42 11937 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 11938#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 11939 if (RExC_offsets) { /* MJD */
07be1b83 11940 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 11941 "reganode",
ccb2c380 11942 __LINE__,
13d6edb4 11943 PL_reg_name[op],
07be1b83 11944 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 11945 "Overwriting end of array!\n" : "OK",
07be1b83
YO
11946 (UV)(RExC_emit - RExC_emit_start),
11947 (UV)(RExC_parse - RExC_start),
11948 (UV)RExC_offsets[0]));
ccb2c380 11949 Set_Cur_Node_Offset;
fac92740 11950 }
7122b237 11951#endif
830247a4 11952 RExC_emit = ptr;
a0d0e21e 11953 return(ret);
fe14fcc3
LW
11954}
11955
11956/*
cd439c50 11957- reguni - emit (if appropriate) a Unicode character
a0ed51b3 11958*/
71207a34
AL
11959STATIC STRLEN
11960S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 11961{
97aff369 11962 dVAR;
7918f24d
NC
11963
11964 PERL_ARGS_ASSERT_REGUNI;
11965
71207a34 11966 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
11967}
11968
11969/*
a0d0e21e
LW
11970- reginsert - insert an operator in front of already-emitted operand
11971*
11972* Means relocating the operand.
11973*/
76e3520e 11974STATIC void
6bda09f9 11975S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 11976{
97aff369 11977 dVAR;
c277df42
IZ
11978 register regnode *src;
11979 register regnode *dst;
11980 register regnode *place;
504618e9 11981 const int offset = regarglen[(U8)op];
6bda09f9 11982 const int size = NODE_STEP_REGNODE + offset;
07be1b83 11983 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11984
11985 PERL_ARGS_ASSERT_REGINSERT;
def51078 11986 PERL_UNUSED_ARG(depth);
22c35a8c 11987/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 11988 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 11989 if (SIZE_ONLY) {
6bda09f9 11990 RExC_size += size;
a0d0e21e
LW
11991 return;
11992 }
a687059c 11993
830247a4 11994 src = RExC_emit;
6bda09f9 11995 RExC_emit += size;
830247a4 11996 dst = RExC_emit;
40d049e4 11997 if (RExC_open_parens) {
6bda09f9 11998 int paren;
3b57cd43 11999 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 12000 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 12001 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 12002 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
12003 RExC_open_parens[paren] += size;
12004 } else {
3b57cd43 12005 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
12006 }
12007 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 12008 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
12009 RExC_close_parens[paren] += size;
12010 } else {
3b57cd43 12011 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
12012 }
12013 }
6bda09f9 12014 }
40d049e4 12015
fac92740 12016 while (src > opnd) {
c277df42 12017 StructCopy(--src, --dst, regnode);
7122b237 12018#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 12019 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 12020 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 12021 "reg_insert",
ccb2c380 12022 __LINE__,
13d6edb4 12023 PL_reg_name[op],
07be1b83
YO
12024 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12025 ? "Overwriting end of array!\n" : "OK",
12026 (UV)(src - RExC_emit_start),
12027 (UV)(dst - RExC_emit_start),
12028 (UV)RExC_offsets[0]));
ccb2c380
MP
12029 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12030 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 12031 }
7122b237 12032#endif
fac92740
MJD
12033 }
12034
a0d0e21e
LW
12035
12036 place = opnd; /* Op node, where operand used to be. */
7122b237 12037#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 12038 if (RExC_offsets) { /* MJD */
07be1b83 12039 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 12040 "reginsert",
ccb2c380 12041 __LINE__,
13d6edb4 12042 PL_reg_name[op],
07be1b83 12043 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 12044 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
12045 (UV)(place - RExC_emit_start),
12046 (UV)(RExC_parse - RExC_start),
786e8c11 12047 (UV)RExC_offsets[0]));
ccb2c380 12048 Set_Node_Offset(place, RExC_parse);
45948336 12049 Set_Node_Length(place, 1);
fac92740 12050 }
7122b237 12051#endif
c277df42
IZ
12052 src = NEXTOPER(place);
12053 FILL_ADVANCE_NODE(place, op);
12054 Zero(src, offset, regnode);
a687059c
LW
12055}
12056
12057/*
c277df42 12058- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 12059- SEE ALSO: regtail_study
a0d0e21e 12060*/
097eb12c 12061/* TODO: All three parms should be const */
76e3520e 12062STATIC void
3dab1dad 12063S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 12064{
97aff369 12065 dVAR;
c277df42 12066 register regnode *scan;
72f13be8 12067 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
12068
12069 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
12070#ifndef DEBUGGING
12071 PERL_UNUSED_ARG(depth);
12072#endif
a0d0e21e 12073
c277df42 12074 if (SIZE_ONLY)
a0d0e21e
LW
12075 return;
12076
12077 /* Find last node. */
12078 scan = p;
12079 for (;;) {
504618e9 12080 regnode * const temp = regnext(scan);
3dab1dad
YO
12081 DEBUG_PARSE_r({
12082 SV * const mysv=sv_newmortal();
12083 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12084 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
12085 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12086 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12087 (temp == NULL ? "->" : ""),
13d6edb4 12088 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 12089 );
3dab1dad
YO
12090 });
12091 if (temp == NULL)
12092 break;
12093 scan = temp;
12094 }
12095
12096 if (reg_off_by_arg[OP(scan)]) {
12097 ARG_SET(scan, val - scan);
12098 }
12099 else {
12100 NEXT_OFF(scan) = val - scan;
12101 }
12102}
12103
07be1b83 12104#ifdef DEBUGGING
3dab1dad
YO
12105/*
12106- regtail_study - set the next-pointer at the end of a node chain of p to val.
12107- Look for optimizable sequences at the same time.
12108- currently only looks for EXACT chains.
07be1b83 12109
486ec47a 12110This is experimental code. The idea is to use this routine to perform
07be1b83
YO
12111in place optimizations on branches and groups as they are constructed,
12112with the long term intention of removing optimization from study_chunk so
12113that it is purely analytical.
12114
12115Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12116to control which is which.
12117
3dab1dad
YO
12118*/
12119/* TODO: All four parms should be const */
07be1b83 12120
3dab1dad
YO
12121STATIC U8
12122S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12123{
12124 dVAR;
12125 register regnode *scan;
07be1b83
YO
12126 U8 exact = PSEUDO;
12127#ifdef EXPERIMENTAL_INPLACESCAN
12128 I32 min = 0;
12129#endif
3dab1dad
YO
12130 GET_RE_DEBUG_FLAGS_DECL;
12131
7918f24d
NC
12132 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12133
07be1b83 12134
3dab1dad
YO
12135 if (SIZE_ONLY)
12136 return exact;
12137
12138 /* Find last node. */
12139
12140 scan = p;
12141 for (;;) {
12142 regnode * const temp = regnext(scan);
07be1b83 12143#ifdef EXPERIMENTAL_INPLACESCAN
f758bddf
KW
12144 if (PL_regkind[OP(scan)] == EXACT) {
12145 bool has_exactf_sharp_s; /* Unexamined in this routine */
12146 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
07be1b83 12147 return EXACT;
f758bddf 12148 }
07be1b83 12149#endif
3dab1dad
YO
12150 if ( exact ) {
12151 switch (OP(scan)) {
12152 case EXACT:
12153 case EXACTF:
2f7f8cb1 12154 case EXACTFA:
2c2b7f86 12155 case EXACTFU:
3c760661 12156 case EXACTFU_SS:
fab2782b 12157 case EXACTFU_TRICKYFOLD:
3dab1dad
YO
12158 case EXACTFL:
12159 if( exact == PSEUDO )
12160 exact= OP(scan);
07be1b83
YO
12161 else if ( exact != OP(scan) )
12162 exact= 0;
3dab1dad
YO
12163 case NOTHING:
12164 break;
12165 default:
12166 exact= 0;
12167 }
12168 }
12169 DEBUG_PARSE_r({
12170 SV * const mysv=sv_newmortal();
12171 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12172 regprop(RExC_rx, mysv, scan);
eaf3ca90 12173 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 12174 SvPV_nolen_const(mysv),
eaf3ca90 12175 REG_NODE_NUM(scan),
13d6edb4 12176 PL_reg_name[exact]);
3dab1dad 12177 });
a0d0e21e
LW
12178 if (temp == NULL)
12179 break;
12180 scan = temp;
12181 }
07be1b83
YO
12182 DEBUG_PARSE_r({
12183 SV * const mysv_val=sv_newmortal();
12184 DEBUG_PARSE_MSG("");
12185 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
12186 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12187 SvPV_nolen_const(mysv_val),
12188 (IV)REG_NODE_NUM(val),
12189 (IV)(val - scan)
07be1b83
YO
12190 );
12191 });
c277df42
IZ
12192 if (reg_off_by_arg[OP(scan)]) {
12193 ARG_SET(scan, val - scan);
a0ed51b3
LW
12194 }
12195 else {
c277df42
IZ
12196 NEXT_OFF(scan) = val - scan;
12197 }
3dab1dad
YO
12198
12199 return exact;
a687059c 12200}
07be1b83 12201#endif
a687059c
LW
12202
12203/*
fd181c75 12204 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 12205 */
f7819f85 12206#ifdef DEBUGGING
c33269f7 12207static void
7918f24d
NC
12208S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12209{
f7819f85
A
12210 int bit;
12211 int set=0;
a62b1201 12212 regex_charset cs;
7918f24d 12213
f7819f85
A
12214 for (bit=0; bit<32; bit++) {
12215 if (flags & (1<<bit)) {
a62b1201
KW
12216 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12217 continue;
12218 }
f7819f85
A
12219 if (!set++ && lead)
12220 PerlIO_printf(Perl_debug_log, "%s",lead);
12221 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12222 }
12223 }
a62b1201
KW
12224 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12225 if (!set++ && lead) {
12226 PerlIO_printf(Perl_debug_log, "%s",lead);
12227 }
12228 switch (cs) {
12229 case REGEX_UNICODE_CHARSET:
12230 PerlIO_printf(Perl_debug_log, "UNICODE");
12231 break;
12232 case REGEX_LOCALE_CHARSET:
12233 PerlIO_printf(Perl_debug_log, "LOCALE");
12234 break;
cfaf538b
KW
12235 case REGEX_ASCII_RESTRICTED_CHARSET:
12236 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12237 break;
2f7f8cb1
KW
12238 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12239 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12240 break;
a62b1201
KW
12241 default:
12242 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12243 break;
12244 }
12245 }
f7819f85
A
12246 if (lead) {
12247 if (set)
12248 PerlIO_printf(Perl_debug_log, "\n");
12249 else
12250 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12251 }
12252}
12253#endif
12254
a687059c 12255void
097eb12c 12256Perl_regdump(pTHX_ const regexp *r)
a687059c 12257{
35ff7856 12258#ifdef DEBUGGING
97aff369 12259 dVAR;
c445ea15 12260 SV * const sv = sv_newmortal();
ab3bbdeb 12261 SV *dsv= sv_newmortal();
f8fc2ecf 12262 RXi_GET_DECL(r,ri);
f7819f85 12263 GET_RE_DEBUG_FLAGS_DECL;
a687059c 12264
7918f24d
NC
12265 PERL_ARGS_ASSERT_REGDUMP;
12266
f8fc2ecf 12267 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
12268
12269 /* Header fields of interest. */
ab3bbdeb
YO
12270 if (r->anchored_substr) {
12271 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12272 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 12273 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12274 "anchored %s%s at %"IVdf" ",
12275 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 12276 (IV)r->anchored_offset);
ab3bbdeb
YO
12277 } else if (r->anchored_utf8) {
12278 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12279 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 12280 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12281 "anchored utf8 %s%s at %"IVdf" ",
12282 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 12283 (IV)r->anchored_offset);
ab3bbdeb
YO
12284 }
12285 if (r->float_substr) {
12286 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12287 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 12288 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12289 "floating %s%s at %"IVdf"..%"UVuf" ",
12290 s, RE_SV_TAIL(r->float_substr),
7b0972df 12291 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
12292 } else if (r->float_utf8) {
12293 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12294 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 12295 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12296 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12297 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 12298 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 12299 }
33b8afdf 12300 if (r->check_substr || r->check_utf8)
b81d288d 12301 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
12302 (const char *)
12303 (r->check_substr == r->float_substr
12304 && r->check_utf8 == r->float_utf8
12305 ? "(checking floating" : "(checking anchored"));
bbe252da 12306 if (r->extflags & RXf_NOSCAN)
c277df42 12307 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 12308 if (r->extflags & RXf_CHECK_ALL)
c277df42 12309 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 12310 if (r->check_substr || r->check_utf8)
c277df42
IZ
12311 PerlIO_printf(Perl_debug_log, ") ");
12312
f8fc2ecf
YO
12313 if (ri->regstclass) {
12314 regprop(r, sv, ri->regstclass);
1de06328 12315 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 12316 }
bbe252da 12317 if (r->extflags & RXf_ANCH) {
774d564b 12318 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 12319 if (r->extflags & RXf_ANCH_BOL)
774d564b 12320 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 12321 if (r->extflags & RXf_ANCH_MBOL)
c277df42 12322 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 12323 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 12324 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 12325 if (r->extflags & RXf_ANCH_GPOS)
774d564b 12326 PerlIO_printf(Perl_debug_log, "(GPOS)");
12327 PerlIO_putc(Perl_debug_log, ' ');
12328 }
bbe252da 12329 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 12330 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 12331 if (r->intflags & PREGf_SKIP)
760ac839 12332 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 12333 if (r->intflags & PREGf_IMPLICIT)
760ac839 12334 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 12335 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 12336 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 12337 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 12338 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 12339 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 12340#else
7918f24d 12341 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 12342 PERL_UNUSED_CONTEXT;
65e66c80 12343 PERL_UNUSED_ARG(r);
17c3b450 12344#endif /* DEBUGGING */
a687059c
LW
12345}
12346
12347/*
a0d0e21e
LW
12348- regprop - printable representation of opcode
12349*/
3339dfd8
YO
12350#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12351STMT_START { \
12352 if (do_sep) { \
12353 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12354 if (flags & ANYOF_INVERT) \
12355 /*make sure the invert info is in each */ \
12356 sv_catpvs(sv, "^"); \
12357 do_sep = 0; \
12358 } \
12359} STMT_END
12360
46fc3d4c 12361void
32fc9b6a 12362Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 12363{
35ff7856 12364#ifdef DEBUGGING
97aff369 12365 dVAR;
9b155405 12366 register int k;
f8fc2ecf 12367 RXi_GET_DECL(prog,progi);
1de06328 12368 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 12369
7918f24d 12370 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 12371
76f68e9b 12372 sv_setpvs(sv, "");
8aa23a47 12373
03363afd 12374 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
12375 /* It would be nice to FAIL() here, but this may be called from
12376 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 12377 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 12378 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 12379
3dab1dad 12380 k = PL_regkind[OP(o)];
9b155405 12381
2a782b5b 12382 if (k == EXACT) {
f92a2122 12383 sv_catpvs(sv, " ");
ab3bbdeb
YO
12384 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12385 * is a crude hack but it may be the best for now since
12386 * we have no flag "this EXACTish node was UTF-8"
12387 * --jhi */
f92a2122
NC
12388 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12389 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 12390 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
12391 PERL_PV_PRETTY_ELLIPSES |
12392 PERL_PV_PRETTY_LTGT |
12393 PERL_PV_PRETTY_NOCLEAR
12394 );
bb263b4e 12395 } else if (k == TRIE) {
3dab1dad 12396 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 12397 * progi->data isn't available here */
1de06328 12398 const char op = OP(o);
647f639f 12399 const U32 n = ARG(o);
1de06328 12400 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 12401 (reg_ac_data *)progi->data->data[n] :
1de06328 12402 NULL;
3251b653
NC
12403 const reg_trie_data * const trie
12404 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 12405
13d6edb4 12406 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
12407 DEBUG_TRIE_COMPILE_r(
12408 Perl_sv_catpvf(aTHX_ sv,
12409 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12410 (UV)trie->startstate,
1e2e3d02 12411 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
12412 (UV)trie->wordcount,
12413 (UV)trie->minlen,
12414 (UV)trie->maxlen,
12415 (UV)TRIE_CHARCOUNT(trie),
12416 (UV)trie->uniquecharcount
12417 )
12418 );
12419 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12420 int i;
12421 int rangestart = -1;
f46cb337 12422 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 12423 sv_catpvs(sv, "[");
1de06328
YO
12424 for (i = 0; i <= 256; i++) {
12425 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12426 if (rangestart == -1)
12427 rangestart = i;
12428 } else if (rangestart != -1) {
12429 if (i <= rangestart + 3)
12430 for (; rangestart < i; rangestart++)
12431 put_byte(sv, rangestart);
12432 else {
12433 put_byte(sv, rangestart);
12434 sv_catpvs(sv, "-");
12435 put_byte(sv, i - 1);
12436 }
12437 rangestart = -1;
12438 }
12439 }
f3a2811a 12440 sv_catpvs(sv, "]");
1de06328
YO
12441 }
12442
a3621e74 12443 } else if (k == CURLY) {
cb434fcc 12444 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
12445 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12446 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 12447 }
2c2d71f5
JH
12448 else if (k == WHILEM && o->flags) /* Ordinal/of */
12449 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 12450 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 12451 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 12452 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 12453 if ( k != REF || (OP(o) < NREF)) {
502c6561 12454 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
12455 SV **name= av_fetch(list, ARG(o), 0 );
12456 if (name)
12457 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12458 }
12459 else {
502c6561 12460 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 12461 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
12462 I32 *nums=(I32*)SvPVX(sv_dat);
12463 SV **name= av_fetch(list, nums[0], 0 );
12464 I32 n;
12465 if (name) {
12466 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12467 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12468 (n ? "," : ""), (IV)nums[n]);
12469 }
12470 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 12471 }
1f1031fe 12472 }
ee9b8eae 12473 }
1f1031fe 12474 } else if (k == GOSUB)
6bda09f9 12475 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
12476 else if (k == VERB) {
12477 if (!o->flags)
12478 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 12479 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 12480 } else if (k == LOGICAL)
04ebc1ab 12481 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
12482 else if (k == ANYOF) {
12483 int i, rangestart = -1;
2d03de9c 12484 const U8 flags = ANYOF_FLAGS(o);
24d786f4 12485 int do_sep = 0;
0bd48802
AL
12486
12487 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12488 static const char * const anyofs[] = {
653099ff
GS
12489 "\\w",
12490 "\\W",
12491 "\\s",
12492 "\\S",
12493 "\\d",
12494 "\\D",
12495 "[:alnum:]",
12496 "[:^alnum:]",
12497 "[:alpha:]",
12498 "[:^alpha:]",
12499 "[:ascii:]",
12500 "[:^ascii:]",
24d786f4
YO
12501 "[:cntrl:]",
12502 "[:^cntrl:]",
653099ff
GS
12503 "[:graph:]",
12504 "[:^graph:]",
12505 "[:lower:]",
12506 "[:^lower:]",
12507 "[:print:]",
12508 "[:^print:]",
12509 "[:punct:]",
12510 "[:^punct:]",
12511 "[:upper:]",
aaa51d5e 12512 "[:^upper:]",
653099ff 12513 "[:xdigit:]",
aaa51d5e
JF
12514 "[:^xdigit:]",
12515 "[:space:]",
12516 "[:^space:]",
12517 "[:blank:]",
12518 "[:^blank:]"
653099ff
GS
12519 };
12520
19860706 12521 if (flags & ANYOF_LOCALE)
396482e1 12522 sv_catpvs(sv, "{loc}");
39065660 12523 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 12524 sv_catpvs(sv, "{i}");
653099ff 12525 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 12526 if (flags & ANYOF_INVERT)
396482e1 12527 sv_catpvs(sv, "^");
686b73d4 12528
3339dfd8 12529 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
12530 for (i = 0; i <= 256; i++) {
12531 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12532 if (rangestart == -1)
12533 rangestart = i;
12534 } else if (rangestart != -1) {
12535 if (i <= rangestart + 3)
12536 for (; rangestart < i; rangestart++)
653099ff 12537 put_byte(sv, rangestart);
ffc61ed2
JH
12538 else {
12539 put_byte(sv, rangestart);
396482e1 12540 sv_catpvs(sv, "-");
ffc61ed2 12541 put_byte(sv, i - 1);
653099ff 12542 }
24d786f4 12543 do_sep = 1;
ffc61ed2 12544 rangestart = -1;
653099ff 12545 }
847a199f 12546 }
3339dfd8
YO
12547
12548 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
12549 /* output any special charclass tests (used entirely under use locale) */
12550 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 12551 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 12552 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 12553 sv_catpv(sv, anyofs[i]);
24d786f4
YO
12554 do_sep = 1;
12555 }
12556
3339dfd8
YO
12557 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12558
11454c59
KW
12559 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12560 sv_catpvs(sv, "{non-utf8-latin1-all}");
12561 }
12562
3339dfd8 12563 /* output information about the unicode matching */
ef87b810 12564 if (flags & ANYOF_UNICODE_ALL)
396482e1 12565 sv_catpvs(sv, "{unicode_all}");
137165a6 12566 else if (ANYOF_NONBITMAP(o))
ef87b810 12567 sv_catpvs(sv, "{unicode}");
f5ecd18d 12568 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 12569 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2 12570
1aa9930e 12571 if (ANYOF_NONBITMAP(o)) {
dbe7a391 12572 SV *lv; /* Set if there is something outside the bit map */
32fc9b6a 12573 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
f1114c30
KW
12574 bool byte_output = FALSE; /* If something in the bitmap has been
12575 output */
686b73d4 12576
c16787fd 12577 if (lv && lv != &PL_sv_undef) {
ffc61ed2 12578 if (sw) {
89ebb4a3 12579 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 12580
dbe7a391 12581 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
1df70142 12582 uvchr_to_utf8(s, i);
686b73d4 12583
dcf8909a
KW
12584 if (i < 256
12585 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
12586 things already
12587 output as part
12588 of the bitmap */
12589 && swash_fetch(sw, s, TRUE))
12590 {
ffc61ed2
JH
12591 if (rangestart == -1)
12592 rangestart = i;
12593 } else if (rangestart != -1) {
f1114c30 12594 byte_output = TRUE;
ffc61ed2
JH
12595 if (i <= rangestart + 3)
12596 for (; rangestart < i; rangestart++) {
7128c099 12597 put_byte(sv, rangestart);
ffc61ed2
JH
12598 }
12599 else {
7128c099 12600 put_byte(sv, rangestart);
396482e1 12601 sv_catpvs(sv, "-");
7128c099 12602 put_byte(sv, i-1);
ffc61ed2 12603 }
e87973a9 12604 rangestart = -1;
19860706 12605 }
e87973a9 12606 }
19860706 12607 }
fde631ed 12608
ffc61ed2 12609 {
2e0de35c 12610 char *s = savesvpv(lv);
c445ea15 12611 char * const origs = s;
686b73d4 12612
3dab1dad
YO
12613 while (*s && *s != '\n')
12614 s++;
686b73d4 12615
ffc61ed2 12616 if (*s == '\n') {
2d03de9c 12617 const char * const t = ++s;
686b73d4 12618
f1114c30
KW
12619 if (byte_output) {
12620 sv_catpvs(sv, " ");
12621 }
12622
ffc61ed2 12623 while (*s) {
c574ffb9
KW
12624 if (*s == '\n') {
12625
12626 /* Truncate very long output */
12627 if (s - origs > 256) {
12628 Perl_sv_catpvf(aTHX_ sv,
12629 "%.*s...",
12630 (int) (s - origs - 1),
12631 t);
12632 goto out_dump;
12633 }
ffc61ed2 12634 *s = ' ';
1a9c8476
KW
12635 }
12636 else if (*s == '\t') {
12637 *s = '-';
12638 }
ffc61ed2
JH
12639 s++;
12640 }
12641 if (s[-1] == ' ')
12642 s[-1] = 0;
686b73d4 12643
ffc61ed2 12644 sv_catpv(sv, t);
fde631ed 12645 }
686b73d4 12646
c574ffb9
KW
12647 out_dump:
12648
ffc61ed2 12649 Safefree(origs);
fde631ed 12650 }
c16787fd 12651 SvREFCNT_dec(lv);
fde631ed 12652 }
653099ff 12653 }
ffc61ed2 12654
653099ff
GS
12655 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12656 }
9b155405 12657 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 12658 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 12659#else
96a5add6 12660 PERL_UNUSED_CONTEXT;
65e66c80
SP
12661 PERL_UNUSED_ARG(sv);
12662 PERL_UNUSED_ARG(o);
f9049ba1 12663 PERL_UNUSED_ARG(prog);
17c3b450 12664#endif /* DEBUGGING */
35ff7856 12665}
a687059c 12666
cad2e5aa 12667SV *
288b8c02 12668Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 12669{ /* Assume that RE_INTUIT is set */
97aff369 12670 dVAR;
288b8c02 12671 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 12672 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
12673
12674 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
12675 PERL_UNUSED_CONTEXT;
12676
a3621e74 12677 DEBUG_COMPILE_r(
cfd0369c 12678 {
2d03de9c 12679 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 12680 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
12681
12682 if (!PL_colorset) reginitcolors();
12683 PerlIO_printf(Perl_debug_log,
a0288114 12684 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
12685 PL_colors[4],
12686 prog->check_substr ? "" : "utf8 ",
12687 PL_colors[5],PL_colors[0],
cad2e5aa
JH
12688 s,
12689 PL_colors[1],
12690 (strlen(s) > 60 ? "..." : ""));
12691 } );
12692
33b8afdf 12693 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
12694}
12695
84da74a7 12696/*
f8149455 12697 pregfree()
84da74a7 12698
f8149455
YO
12699 handles refcounting and freeing the perl core regexp structure. When
12700 it is necessary to actually free the structure the first thing it
3b753521 12701 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
12702 the regexp, allowing the handling of the void *pprivate; member
12703 first. (This routine is not overridable by extensions, which is why
12704 the extensions free is called first.)
12705
12706 See regdupe and regdupe_internal if you change anything here.
84da74a7 12707*/
f8149455 12708#ifndef PERL_IN_XSUB_RE
2b69d0c2 12709void
84679df5 12710Perl_pregfree(pTHX_ REGEXP *r)
a687059c 12711{
288b8c02
NC
12712 SvREFCNT_dec(r);
12713}
12714
12715void
12716Perl_pregfree2(pTHX_ REGEXP *rx)
12717{
27da23d5 12718 dVAR;
288b8c02 12719 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 12720 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 12721
7918f24d
NC
12722 PERL_ARGS_ASSERT_PREGFREE2;
12723
28d8d7f4
YO
12724 if (r->mother_re) {
12725 ReREFCNT_dec(r->mother_re);
12726 } else {
288b8c02 12727 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 12728 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
12729 }
12730 if (r->substrs) {
ef8d46e8
VP
12731 SvREFCNT_dec(r->anchored_substr);
12732 SvREFCNT_dec(r->anchored_utf8);
12733 SvREFCNT_dec(r->float_substr);
12734 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
12735 Safefree(r->substrs);
12736 }
288b8c02 12737 RX_MATCH_COPY_FREE(rx);
f8c7b90f 12738#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 12739 SvREFCNT_dec(r->saved_copy);
ed252734 12740#endif
f0ab9afb 12741 Safefree(r->offs);
f8149455 12742}
28d8d7f4
YO
12743
12744/* reg_temp_copy()
12745
12746 This is a hacky workaround to the structural issue of match results
12747 being stored in the regexp structure which is in turn stored in
12748 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12749 could be PL_curpm in multiple contexts, and could require multiple
12750 result sets being associated with the pattern simultaneously, such
12751 as when doing a recursive match with (??{$qr})
12752
12753 The solution is to make a lightweight copy of the regexp structure
12754 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 12755 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
12756 the starp/end and the actual regexp structure itself.
12757
12758*/
12759
12760
84679df5 12761REGEXP *
f0826785 12762Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 12763{
f0826785 12764 struct regexp *ret;
288b8c02 12765 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 12766 register const I32 npar = r->nparens+1;
7918f24d
NC
12767
12768 PERL_ARGS_ASSERT_REG_TEMP_COPY;
12769
f0826785
BM
12770 if (!ret_x)
12771 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12772 ret = (struct regexp *)SvANY(ret_x);
12773
288b8c02 12774 (void)ReREFCNT_inc(rx);
f7c278bf
NC
12775 /* We can take advantage of the existing "copied buffer" mechanism in SVs
12776 by pointing directly at the buffer, but flagging that the allocated
12777 space in the copy is zero. As we've just done a struct copy, it's now
12778 a case of zero-ing that, rather than copying the current length. */
12779 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 12780 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
12781 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12782 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 12783 SvLEN_set(ret_x, 0);
b9ad13ac 12784 SvSTASH_set(ret_x, NULL);
703c388d 12785 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
12786 Newx(ret->offs, npar, regexp_paren_pair);
12787 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 12788 if (r->substrs) {
28d8d7f4 12789 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
12790 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12791
12792 SvREFCNT_inc_void(ret->anchored_substr);
12793 SvREFCNT_inc_void(ret->anchored_utf8);
12794 SvREFCNT_inc_void(ret->float_substr);
12795 SvREFCNT_inc_void(ret->float_utf8);
12796
12797 /* check_substr and check_utf8, if non-NULL, point to either their
12798 anchored or float namesakes, and don't hold a second reference. */
486913e4 12799 }
288b8c02 12800 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 12801#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 12802 ret->saved_copy = NULL;
28d8d7f4 12803#endif
288b8c02 12804 ret->mother_re = rx;
28d8d7f4 12805
288b8c02 12806 return ret_x;
28d8d7f4 12807}
f8149455
YO
12808#endif
12809
12810/* regfree_internal()
12811
12812 Free the private data in a regexp. This is overloadable by
12813 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 12814 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
12815 know about, however of course we have to handle the
12816 regexp_internal structure when no extension is in use.
12817
12818 Note this is called before freeing anything in the regexp
12819 structure.
12820 */
12821
12822void
288b8c02 12823Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
12824{
12825 dVAR;
288b8c02 12826 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
12827 RXi_GET_DECL(r,ri);
12828 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
12829
12830 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12831
f8149455
YO
12832 DEBUG_COMPILE_r({
12833 if (!PL_colorset)
12834 reginitcolors();
12835 {
12836 SV *dsv= sv_newmortal();
3c8556c3 12837 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 12838 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
12839 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12840 PL_colors[4],PL_colors[5],s);
12841 }
12842 });
7122b237
YO
12843#ifdef RE_TRACK_PATTERN_OFFSETS
12844 if (ri->u.offsets)
12845 Safefree(ri->u.offsets); /* 20010421 MJD */
12846#endif
f8fc2ecf
YO
12847 if (ri->data) {
12848 int n = ri->data->count;
f3548bdc
DM
12849 PAD* new_comppad = NULL;
12850 PAD* old_comppad;
4026c95a 12851 PADOFFSET refcnt;
dfad63ad 12852
c277df42 12853 while (--n >= 0) {
261faec3 12854 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 12855 switch (ri->data->what[n]) {
af534a04 12856 case 'a':
c277df42 12857 case 's':
81714fb9 12858 case 'S':
55eed653 12859 case 'u':
ad64d0ec 12860 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 12861 break;
653099ff 12862 case 'f':
f8fc2ecf 12863 Safefree(ri->data->data[n]);
653099ff 12864 break;
dfad63ad 12865 case 'p':
502c6561 12866 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 12867 break;
c277df42 12868 case 'o':
dfad63ad 12869 if (new_comppad == NULL)
cea2e8a9 12870 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
12871 PAD_SAVE_LOCAL(old_comppad,
12872 /* Watch out for global destruction's random ordering. */
c445ea15 12873 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 12874 );
b34c0dd4 12875 OP_REFCNT_LOCK;
f8fc2ecf 12876 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
12877 OP_REFCNT_UNLOCK;
12878 if (!refcnt)
f8fc2ecf 12879 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 12880
f3548bdc 12881 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 12882 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 12883 new_comppad = NULL;
c277df42
IZ
12884 break;
12885 case 'n':
9e55ce06 12886 break;
07be1b83 12887 case 'T':
be8e71aa
YO
12888 { /* Aho Corasick add-on structure for a trie node.
12889 Used in stclass optimization only */
07be1b83 12890 U32 refcount;
f8fc2ecf 12891 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
12892 OP_REFCNT_LOCK;
12893 refcount = --aho->refcount;
12894 OP_REFCNT_UNLOCK;
12895 if ( !refcount ) {
446bd890
NC
12896 PerlMemShared_free(aho->states);
12897 PerlMemShared_free(aho->fail);
446bd890
NC
12898 /* do this last!!!! */
12899 PerlMemShared_free(ri->data->data[n]);
12900 PerlMemShared_free(ri->regstclass);
07be1b83
YO
12901 }
12902 }
12903 break;
a3621e74 12904 case 't':
07be1b83 12905 {
be8e71aa 12906 /* trie structure. */
07be1b83 12907 U32 refcount;
f8fc2ecf 12908 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
12909 OP_REFCNT_LOCK;
12910 refcount = --trie->refcount;
12911 OP_REFCNT_UNLOCK;
12912 if ( !refcount ) {
446bd890 12913 PerlMemShared_free(trie->charmap);
446bd890
NC
12914 PerlMemShared_free(trie->states);
12915 PerlMemShared_free(trie->trans);
07be1b83 12916 if (trie->bitmap)
446bd890 12917 PerlMemShared_free(trie->bitmap);
786e8c11 12918 if (trie->jump)
446bd890 12919 PerlMemShared_free(trie->jump);
2e64971a 12920 PerlMemShared_free(trie->wordinfo);
446bd890
NC
12921 /* do this last!!!! */
12922 PerlMemShared_free(ri->data->data[n]);
a3621e74 12923 }
07be1b83
YO
12924 }
12925 break;
c277df42 12926 default:
f8fc2ecf 12927 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
12928 }
12929 }
f8fc2ecf
YO
12930 Safefree(ri->data->what);
12931 Safefree(ri->data);
a0d0e21e 12932 }
28d8d7f4 12933
f8fc2ecf 12934 Safefree(ri);
a687059c 12935}
c277df42 12936
a09252eb
NC
12937#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12938#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
12939#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
12940
12941/*
32cd70f6 12942 re_dup - duplicate a regexp.
84da74a7 12943
8233f606
DM
12944 This routine is expected to clone a given regexp structure. It is only
12945 compiled under USE_ITHREADS.
32cd70f6 12946
f8149455
YO
12947 After all of the core data stored in struct regexp is duplicated
12948 the regexp_engine.dupe method is used to copy any private data
12949 stored in the *pprivate pointer. This allows extensions to handle
12950 any duplication it needs to do.
12951
12952 See pregfree() and regfree_internal() if you change anything here.
84da74a7 12953*/
a3c0e9ca 12954#if defined(USE_ITHREADS)
f8149455 12955#ifndef PERL_IN_XSUB_RE
288b8c02
NC
12956void
12957Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 12958{
84da74a7 12959 dVAR;
a86a1ca7 12960 I32 npar;
288b8c02
NC
12961 const struct regexp *r = (const struct regexp *)SvANY(sstr);
12962 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 12963
7918f24d
NC
12964 PERL_ARGS_ASSERT_RE_DUP_GUTS;
12965
84da74a7 12966 npar = r->nparens+1;
f0ab9afb
NC
12967 Newx(ret->offs, npar, regexp_paren_pair);
12968 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 12969 if(ret->swap) {
28d8d7f4 12970 /* no need to copy these */
f0ab9afb 12971 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 12972 }
84da74a7 12973
6057429f 12974 if (ret->substrs) {
32cd70f6
NC
12975 /* Do it this way to avoid reading from *r after the StructCopy().
12976 That way, if any of the sv_dup_inc()s dislodge *r from the L1
12977 cache, it doesn't matter. */
66b1de87
NC
12978 const bool anchored = r->check_substr
12979 ? r->check_substr == r->anchored_substr
12980 : r->check_utf8 == r->anchored_utf8;
785a26d5 12981 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
12982 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12983
32cd70f6
NC
12984 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12985 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12986 ret->float_substr = sv_dup_inc(ret->float_substr, param);
12987 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 12988
32cd70f6
NC
12989 /* check_substr and check_utf8, if non-NULL, point to either their
12990 anchored or float namesakes, and don't hold a second reference. */
12991
12992 if (ret->check_substr) {
12993 if (anchored) {
12994 assert(r->check_utf8 == r->anchored_utf8);
12995 ret->check_substr = ret->anchored_substr;
12996 ret->check_utf8 = ret->anchored_utf8;
12997 } else {
12998 assert(r->check_substr == r->float_substr);
12999 assert(r->check_utf8 == r->float_utf8);
13000 ret->check_substr = ret->float_substr;
13001 ret->check_utf8 = ret->float_utf8;
13002 }
66b1de87
NC
13003 } else if (ret->check_utf8) {
13004 if (anchored) {
13005 ret->check_utf8 = ret->anchored_utf8;
13006 } else {
13007 ret->check_utf8 = ret->float_utf8;
13008 }
32cd70f6 13009 }
6057429f 13010 }
f8149455 13011
5daac39c 13012 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 13013
6057429f 13014 if (ret->pprivate)
288b8c02 13015 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 13016
288b8c02 13017 if (RX_MATCH_COPIED(dstr))
6057429f 13018 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
13019 else
13020 ret->subbeg = NULL;
13021#ifdef PERL_OLD_COPY_ON_WRITE
13022 ret->saved_copy = NULL;
13023#endif
6057429f 13024
c2123ae3
NC
13025 if (ret->mother_re) {
13026 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13027 /* Our storage points directly to our mother regexp, but that's
13028 1: a buffer in a different thread
13029 2: something we no longer hold a reference on
13030 so we need to copy it locally. */
d5aafdca
FC
13031 /* Note we need to use SvCUR(), rather than
13032 SvLEN(), on our mother_re, because it, in
c2123ae3
NC
13033 turn, may well be pointing to its own mother_re. */
13034 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13035 SvCUR(ret->mother_re)+1));
13036 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13037 }
13038 ret->mother_re = NULL;
13039 }
6057429f 13040 ret->gofs = 0;
f8149455
YO
13041}
13042#endif /* PERL_IN_XSUB_RE */
13043
13044/*
13045 regdupe_internal()
13046
13047 This is the internal complement to regdupe() which is used to copy
13048 the structure pointed to by the *pprivate pointer in the regexp.
13049 This is the core version of the extension overridable cloning hook.
13050 The regexp structure being duplicated will be copied by perl prior
13051 to this and will be provided as the regexp *r argument, however
13052 with the /old/ structures pprivate pointer value. Thus this routine
13053 may override any copying normally done by perl.
13054
13055 It returns a pointer to the new regexp_internal structure.
13056*/
13057
13058void *
288b8c02 13059Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
13060{
13061 dVAR;
288b8c02 13062 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455 13063 regexp_internal *reti;
0780bc72 13064 int len;
f8149455 13065 RXi_GET_DECL(r,ri);
7918f24d
NC
13066
13067 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455 13068
7122b237 13069 len = ProgLen(ri);
f8149455 13070
45cf4570 13071 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
13072 Copy(ri->program, reti->program, len+1, regnode);
13073
f8149455 13074
f8fc2ecf 13075 reti->regstclass = NULL;
bcdf7404 13076
f8fc2ecf 13077 if (ri->data) {
84da74a7 13078 struct reg_data *d;
f8fc2ecf 13079 const int count = ri->data->count;
84da74a7
YO
13080 int i;
13081
13082 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13083 char, struct reg_data);
13084 Newx(d->what, count, U8);
13085
13086 d->count = count;
13087 for (i = 0; i < count; i++) {
f8fc2ecf 13088 d->what[i] = ri->data->what[i];
84da74a7 13089 switch (d->what[i]) {
af534a04 13090 /* legal options are one of: sSfpontTua
84da74a7 13091 see also regcomp.h and pregfree() */
af534a04 13092 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 13093 case 's':
81714fb9 13094 case 'S':
0536c0a7 13095 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 13096 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 13097 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 13098 break;
84da74a7
YO
13099 case 'f':
13100 /* This is cheating. */
13101 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 13102 StructCopy(ri->data->data[i], d->data[i],
84da74a7 13103 struct regnode_charclass_class);
f8fc2ecf 13104 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
13105 break;
13106 case 'o':
bbe252da
YO
13107 /* Compiled op trees are readonly and in shared memory,
13108 and can thus be shared without duplication. */
84da74a7 13109 OP_REFCNT_LOCK;
f8fc2ecf 13110 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
13111 OP_REFCNT_UNLOCK;
13112 break;
23eab42c
NC
13113 case 'T':
13114 /* Trie stclasses are readonly and can thus be shared
13115 * without duplication. We free the stclass in pregfree
13116 * when the corresponding reg_ac_data struct is freed.
13117 */
13118 reti->regstclass= ri->regstclass;
13119 /* Fall through */
84da74a7 13120 case 't':
84da74a7 13121 OP_REFCNT_LOCK;
0536c0a7 13122 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 13123 OP_REFCNT_UNLOCK;
0536c0a7
NC
13124 /* Fall through */
13125 case 'n':
13126 d->data[i] = ri->data->data[i];
84da74a7 13127 break;
84da74a7 13128 default:
f8fc2ecf 13129 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
13130 }
13131 }
13132
f8fc2ecf 13133 reti->data = d;
84da74a7
YO
13134 }
13135 else
f8fc2ecf 13136 reti->data = NULL;
84da74a7 13137
cde0cee5
YO
13138 reti->name_list_idx = ri->name_list_idx;
13139
7122b237
YO
13140#ifdef RE_TRACK_PATTERN_OFFSETS
13141 if (ri->u.offsets) {
13142 Newx(reti->u.offsets, 2*len+1, U32);
13143 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13144 }
13145#else
13146 SetProgLen(reti,len);
13147#endif
13148
f8149455 13149 return (void*)reti;
84da74a7 13150}
f8149455
YO
13151
13152#endif /* USE_ITHREADS */
84da74a7 13153
f8149455 13154#ifndef PERL_IN_XSUB_RE
bcdf7404 13155
c277df42
IZ
13156/*
13157 - regnext - dig the "next" pointer out of a node
c277df42
IZ
13158 */
13159regnode *
864dbfa3 13160Perl_regnext(pTHX_ register regnode *p)
c277df42 13161{
97aff369 13162 dVAR;
c277df42
IZ
13163 register I32 offset;
13164
f8fc2ecf 13165 if (!p)
c277df42
IZ
13166 return(NULL);
13167
35db910f
KW
13168 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13169 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13170 }
13171
c277df42
IZ
13172 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13173 if (offset == 0)
13174 return(NULL);
13175
c277df42 13176 return(p+offset);
c277df42 13177}
76234dfb 13178#endif
c277df42 13179
686b73d4 13180STATIC void
cea2e8a9 13181S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
13182{
13183 va_list args;
13184 STRLEN l1 = strlen(pat1);
13185 STRLEN l2 = strlen(pat2);
13186 char buf[512];
06bf62c7 13187 SV *msv;
73d840c0 13188 const char *message;
c277df42 13189
7918f24d
NC
13190 PERL_ARGS_ASSERT_RE_CROAK2;
13191
c277df42
IZ
13192 if (l1 > 510)
13193 l1 = 510;
13194 if (l1 + l2 > 510)
13195 l2 = 510 - l1;
13196 Copy(pat1, buf, l1 , char);
13197 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
13198 buf[l1 + l2] = '\n';
13199 buf[l1 + l2 + 1] = '\0';
8736538c
AS
13200#ifdef I_STDARG
13201 /* ANSI variant takes additional second argument */
c277df42 13202 va_start(args, pat2);
8736538c
AS
13203#else
13204 va_start(args);
13205#endif
5a844595 13206 msv = vmess(buf, &args);
c277df42 13207 va_end(args);
cfd0369c 13208 message = SvPV_const(msv,l1);
c277df42
IZ
13209 if (l1 > 512)
13210 l1 = 512;
13211 Copy(message, buf, l1 , char);
197cf9b9 13212 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 13213 Perl_croak(aTHX_ "%s", buf);
c277df42 13214}
a0ed51b3
LW
13215
13216/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13217
76234dfb 13218#ifndef PERL_IN_XSUB_RE
a0ed51b3 13219void
864dbfa3 13220Perl_save_re_context(pTHX)
b81d288d 13221{
97aff369 13222 dVAR;
1ade1aa1
NC
13223
13224 struct re_save_state *state;
13225
13226 SAVEVPTR(PL_curcop);
13227 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13228
13229 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13230 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 13231 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 13232
46ab3289 13233 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 13234
a0ed51b3 13235 PL_reg_start_tmp = 0;
a0ed51b3 13236 PL_reg_start_tmpl = 0;
c445ea15 13237 PL_reg_oldsaved = NULL;
a5db57d6 13238 PL_reg_oldsavedlen = 0;
a5db57d6 13239 PL_reg_maxiter = 0;
a5db57d6 13240 PL_reg_leftiter = 0;
c445ea15 13241 PL_reg_poscache = NULL;
a5db57d6 13242 PL_reg_poscache_size = 0;
1ade1aa1
NC
13243#ifdef PERL_OLD_COPY_ON_WRITE
13244 PL_nrs = NULL;
13245#endif
ada6e8a9 13246
c445ea15
AL
13247 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13248 if (PL_curpm) {
13249 const REGEXP * const rx = PM_GETRE(PL_curpm);
13250 if (rx) {
1df70142 13251 U32 i;
07bc277f 13252 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 13253 char digits[TYPE_CHARS(long)];
d9fad198 13254 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
13255 GV *const *const gvp
13256 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13257
b37c2d43
AL
13258 if (gvp) {
13259 GV * const gv = *gvp;
13260 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13261 save_scalar(gv);
49f27e4b 13262 }
ada6e8a9
AMS
13263 }
13264 }
13265 }
a0ed51b3 13266}
76234dfb 13267#endif
51371543 13268
51371543 13269static void
acfe0abc 13270clear_re(pTHX_ void *r)
51371543 13271{
97aff369 13272 dVAR;
84679df5 13273 ReREFCNT_dec((REGEXP *)r);
51371543 13274}
ffbc6a93 13275
a28509cc
AL
13276#ifdef DEBUGGING
13277
13278STATIC void
13279S_put_byte(pTHX_ SV *sv, int c)
13280{
7918f24d
NC
13281 PERL_ARGS_ASSERT_PUT_BYTE;
13282
7fddd944
NC
13283 /* Our definition of isPRINT() ignores locales, so only bytes that are
13284 not part of UTF-8 are considered printable. I assume that the same
13285 holds for UTF-EBCDIC.
13286 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13287 which Wikipedia says:
13288
13289 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13290 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13291 identical, to the ASCII delete (DEL) or rubout control character.
13292 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
13293 if (!isPRINT(c)) {
13294 if (c < 256) {
13295 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13296 }
13297 else {
13298 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13299 }
13300 }
5e7aa789 13301 else {
88c9ea1e 13302 const char string = c;
5e7aa789
NC
13303 if (c == '-' || c == ']' || c == '\\' || c == '^')
13304 sv_catpvs(sv, "\\");
13305 sv_catpvn(sv, &string, 1);
13306 }
a28509cc
AL
13307}
13308
786e8c11 13309
3dab1dad
YO
13310#define CLEAR_OPTSTART \
13311 if (optstart) STMT_START { \
70685ca0 13312 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
13313 optstart=NULL; \
13314 } STMT_END
13315
786e8c11 13316#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 13317
b5a2f8d8
NC
13318STATIC const regnode *
13319S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
13320 const regnode *last, const regnode *plast,
13321 SV* sv, I32 indent, U32 depth)
a28509cc 13322{
97aff369 13323 dVAR;
786e8c11 13324 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 13325 register const regnode *next;
3dab1dad 13326 const regnode *optstart= NULL;
1f1031fe 13327
f8fc2ecf 13328 RXi_GET_DECL(r,ri);
3dab1dad 13329 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
13330
13331 PERL_ARGS_ASSERT_DUMPUNTIL;
13332
786e8c11
YO
13333#ifdef DEBUG_DUMPUNTIL
13334 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13335 last ? last-start : 0,plast ? plast-start : 0);
13336#endif
13337
13338 if (plast && plast < last)
13339 last= plast;
13340
13341 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 13342 /* While that wasn't END last time... */
a28509cc
AL
13343 NODE_ALIGN(node);
13344 op = OP(node);
de734bd5 13345 if (op == CLOSE || op == WHILEM)
786e8c11 13346 indent--;
b5a2f8d8 13347 next = regnext((regnode *)node);
1f1031fe 13348
a28509cc 13349 /* Where, what. */
8e11feef 13350 if (OP(node) == OPTIMIZED) {
e68ec53f 13351 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 13352 optstart = node;
3dab1dad 13353 else
8e11feef 13354 goto after_print;
3dab1dad
YO
13355 } else
13356 CLEAR_OPTSTART;
686b73d4 13357
32fc9b6a 13358 regprop(r, sv, node);
a28509cc 13359 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 13360 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
13361
13362 if (OP(node) != OPTIMIZED) {
13363 if (next == NULL) /* Next ptr. */
13364 PerlIO_printf(Perl_debug_log, " (0)");
13365 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13366 PerlIO_printf(Perl_debug_log, " (FAIL)");
13367 else
13368 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13369 (void)PerlIO_putc(Perl_debug_log, '\n');
13370 }
13371
a28509cc
AL
13372 after_print:
13373 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
13374 assert(next);
13375 {
13376 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
13377 ? regnext((regnode *)next)
13378 : next);
be8e71aa
YO
13379 if (last && nnode > last)
13380 nnode = last;
786e8c11 13381 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 13382 }
a28509cc
AL
13383 }
13384 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 13385 assert(next);
786e8c11 13386 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
13387 }
13388 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 13389 const regnode *this_trie = node;
1de06328 13390 const char op = OP(node);
647f639f 13391 const U32 n = ARG(node);
1de06328 13392 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 13393 (reg_ac_data *)ri->data->data[n] :
1de06328 13394 NULL;
3251b653
NC
13395 const reg_trie_data * const trie =
13396 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 13397#ifdef DEBUGGING
502c6561 13398 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 13399#endif
786e8c11 13400 const regnode *nextbranch= NULL;
a28509cc 13401 I32 word_idx;
76f68e9b 13402 sv_setpvs(sv, "");
786e8c11 13403 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 13404 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
686b73d4 13405
786e8c11
YO
13406 PerlIO_printf(Perl_debug_log, "%*s%s ",
13407 (int)(2*(indent+3)), "",
13408 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
13409 PL_colors[0], PL_colors[1],
13410 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 13411 PERL_PV_PRETTY_ELLIPSES |
7f69552c 13412 PERL_PV_PRETTY_LTGT
786e8c11
YO
13413 )
13414 : "???"
13415 );
13416 if (trie->jump) {
40d049e4 13417 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
13418 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13419 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
13420 if (dist) {
13421 if (!nextbranch)
24b23f37 13422 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
13423 DUMPUNTIL(this_trie + dist, nextbranch);
13424 }
786e8c11
YO
13425 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13426 nextbranch= regnext((regnode *)nextbranch);
13427 } else {
13428 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 13429 }
786e8c11
YO
13430 }
13431 if (last && next > last)
13432 node= last;
13433 else
13434 node= next;
a28509cc 13435 }
786e8c11
YO
13436 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13437 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13438 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
13439 }
13440 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 13441 assert(next);
786e8c11 13442 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
13443 }
13444 else if ( op == PLUS || op == STAR) {
786e8c11 13445 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc 13446 }
f56b6394 13447 else if (PL_regkind[(U8)op] == ANYOF) {
a28509cc 13448 /* arglen 1 + class block */
4a3ee7a8 13449 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
13450 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13451 node = NEXTOPER(node);
13452 }
13453 else if (PL_regkind[(U8)op] == EXACT) {
13454 /* Literal string, where present. */
13455 node += NODE_SZ_STR(node) - 1;
13456 node = NEXTOPER(node);
13457 }
13458 else {
13459 node = NEXTOPER(node);
13460 node += regarglen[(U8)op];
13461 }
13462 if (op == CURLYX || op == OPEN)
786e8c11 13463 indent++;
a28509cc 13464 }
3dab1dad 13465 CLEAR_OPTSTART;
786e8c11 13466#ifdef DEBUG_DUMPUNTIL
70685ca0 13467 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 13468#endif
1de06328 13469 return node;
a28509cc
AL
13470}
13471
13472#endif /* DEBUGGING */
13473
241d1a3b
NC
13474/*
13475 * Local variables:
13476 * c-indentation-style: bsd
13477 * c-basic-offset: 4
14d04a33 13478 * indent-tabs-mode: nil
241d1a3b
NC
13479 * End:
13480 *
14d04a33 13481 * ex: set ts=8 sts=4 sw=4 et:
37442d52 13482 */