This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Put a cap on op slab sizes
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d 88#include "dquote_static.c"
370b8f2f
TC
89#ifndef PERL_IN_XSUB_RE
90# include "charclass_invlists.h"
91#endif
04e98a4d 92
94dc5c2d
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
94
d4cce5f1 95#ifdef op
11343788 96#undef op
d4cce5f1 97#endif /* op */
11343788 98
fe14fcc3 99#ifdef MSDOS
7e4e8c89 100# if defined(BUGGY_MSC6)
fe14fcc3 101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 102# pragma optimize("a",off)
fe14fcc3 103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
104# pragma optimize("w",on )
105# endif /* BUGGY_MSC6 */
fe14fcc3
LW
106#endif /* MSDOS */
107
a687059c
LW
108#ifndef STATIC
109#define STATIC static
110#endif
111
b1603ef8 112
830247a4 113typedef struct RExC_state_t {
514a91f1
DM
114 U32 flags; /* RXf_* are we folding, multilining? */
115 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 116 char *precomp; /* uncompiled string. */
288b8c02 117 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
118 regexp *rx; /* perl core regexp structure */
119 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 120 char *start; /* Start of input for compile */
830247a4
IZ
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 124 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 125 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 126 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
129 U32 seen;
130 I32 size; /* Code size. */
c74340f9
YO
131 I32 npar; /* Capture buffer count, (OPEN). */
132 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 133 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
134 I32 extralen;
135 I32 seen_zerolen;
40d049e4
YO
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
02daf0ab
YO
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
e40e74fe
KW
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
145 utf8 */
81714fb9 146 HV *paren_names; /* Paren names */
1f1031fe 147
40d049e4
YO
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
b57e4118 150 I32 in_lookbehind;
4624b182 151 I32 contains_locale;
bb3f3ed2 152 I32 override_recoding;
3d2bd50a 153 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 154 within pattern */
b1603ef8
DM
155 int num_code_blocks; /* size of code_blocks[] */
156 int code_index; /* next code_blocks[] slot */
830247a4
IZ
157#if ADD_TO_REGEXEC
158 char *starttry; /* -Dr: where regtry was called. */
159#define RExC_starttry (pRExC_state->starttry)
160#endif
d24ca0c5 161 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 162#ifdef DEBUGGING
be8e71aa 163 const char *lastparse;
3dab1dad 164 I32 lastnum;
1f1031fe 165 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
166#define RExC_lastparse (pRExC_state->lastparse)
167#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 168#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 169#endif
830247a4
IZ
170} RExC_state_t;
171
e2509266 172#define RExC_flags (pRExC_state->flags)
514a91f1 173#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 174#define RExC_precomp (pRExC_state->precomp)
288b8c02 175#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 176#define RExC_rx (pRExC_state->rx)
f8fc2ecf 177#define RExC_rxi (pRExC_state->rxi)
fac92740 178#define RExC_start (pRExC_state->start)
830247a4
IZ
179#define RExC_end (pRExC_state->end)
180#define RExC_parse (pRExC_state->parse)
181#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
182#ifdef RE_TRACK_PATTERN_OFFSETS
183#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
184#endif
830247a4 185#define RExC_emit (pRExC_state->emit)
fac92740 186#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 187#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
188#define RExC_naughty (pRExC_state->naughty)
189#define RExC_sawback (pRExC_state->sawback)
190#define RExC_seen (pRExC_state->seen)
191#define RExC_size (pRExC_state->size)
192#define RExC_npar (pRExC_state->npar)
e2e6a0f1 193#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
194#define RExC_extralen (pRExC_state->extralen)
195#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 196#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 197#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 198#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
199#define RExC_open_parens (pRExC_state->open_parens)
200#define RExC_close_parens (pRExC_state->close_parens)
201#define RExC_opend (pRExC_state->opend)
81714fb9 202#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
203#define RExC_recurse (pRExC_state->recurse)
204#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 205#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 206#define RExC_contains_locale (pRExC_state->contains_locale)
bb3f3ed2 207#define RExC_override_recoding (pRExC_state->override_recoding)
830247a4 208
cde0cee5 209
a687059c
LW
210#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
212 ((*s) == '{' && regcurly(s)))
a687059c 213
35c8bce7
LW
214#ifdef SPSTART
215#undef SPSTART /* dratted cpp namespace... */
216#endif
a687059c
LW
217/*
218 * Flags to be passed up and down.
219 */
a687059c 220#define WORST 0 /* Worst case. */
a3b492c3 221#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
222
223/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
cf8c372d
KW
224 * character, and if utf8, must be invariant. Note that this is not the same
225 * thing as REGNODE_SIMPLE */
fda99bee 226#define SIMPLE 0x02
a3b492c3
YO
227#define SPSTART 0x04 /* Starts with * or +. */
228#define TRYAGAIN 0x08 /* Weeded out a declaration. */
229#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 230
3dab1dad
YO
231#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
232
07be1b83
YO
233/* whether trie related optimizations are enabled */
234#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
235#define TRIE_STUDY_OPT
786e8c11 236#define FULL_TRIE_STUDY
07be1b83
YO
237#define TRIE_STCLASS
238#endif
1de06328
YO
239
240
40d049e4
YO
241
242#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
243#define PBITVAL(paren) (1 << ((paren) & 7))
244#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
245#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
246#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
247
bbd61b5f
KW
248/* If not already in utf8, do a longjmp back to the beginning */
249#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
250#define REQUIRE_UTF8 STMT_START { \
251 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
252 } STMT_END
40d049e4 253
1de06328
YO
254/* About scan_data_t.
255
256 During optimisation we recurse through the regexp program performing
257 various inplace (keyhole style) optimisations. In addition study_chunk
258 and scan_commit populate this data structure with information about
259 what strings MUST appear in the pattern. We look for the longest
3b753521 260 string that must appear at a fixed location, and we look for the
1de06328
YO
261 longest string that may appear at a floating location. So for instance
262 in the pattern:
263
264 /FOO[xX]A.*B[xX]BAR/
265
266 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
267 strings (because they follow a .* construct). study_chunk will identify
268 both FOO and BAR as being the longest fixed and floating strings respectively.
269
270 The strings can be composites, for instance
271
272 /(f)(o)(o)/
273
274 will result in a composite fixed substring 'foo'.
275
276 For each string some basic information is maintained:
277
278 - offset or min_offset
279 This is the position the string must appear at, or not before.
280 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
281 characters must match before the string we are searching for.
282 Likewise when combined with minlenp and the length of the string it
1de06328
YO
283 tells us how many characters must appear after the string we have
284 found.
285
286 - max_offset
287 Only used for floating strings. This is the rightmost point that
3b753521 288 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
289 string can occur infinitely far to the right.
290
291 - minlenp
292 A pointer to the minimum length of the pattern that the string
293 was found inside. This is important as in the case of positive
294 lookahead or positive lookbehind we can have multiple patterns
295 involved. Consider
296
297 /(?=FOO).*F/
298
299 The minimum length of the pattern overall is 3, the minimum length
300 of the lookahead part is 3, but the minimum length of the part that
301 will actually match is 1. So 'FOO's minimum length is 3, but the
302 minimum length for the F is 1. This is important as the minimum length
303 is used to determine offsets in front of and behind the string being
304 looked for. Since strings can be composites this is the length of the
486ec47a 305 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
306 the length is calculated by study_chunk, so that the minimum lengths
307 are not known until the full pattern has been compiled, thus the
308 pointer to the value.
309
310 - lookbehind
311
312 In the case of lookbehind the string being searched for can be
313 offset past the start point of the final matching string.
314 If this value was just blithely removed from the min_offset it would
315 invalidate some of the calculations for how many chars must match
316 before or after (as they are derived from min_offset and minlen and
317 the length of the string being searched for).
318 When the final pattern is compiled and the data is moved from the
319 scan_data_t structure into the regexp structure the information
320 about lookbehind is factored in, with the information that would
321 have been lost precalculated in the end_shift field for the
322 associated string.
323
324 The fields pos_min and pos_delta are used to store the minimum offset
325 and the delta to the maximum offset at the current point in the pattern.
326
327*/
2c2d71f5
JH
328
329typedef struct scan_data_t {
1de06328
YO
330 /*I32 len_min; unused */
331 /*I32 len_delta; unused */
2c2d71f5
JH
332 I32 pos_min;
333 I32 pos_delta;
334 SV *last_found;
1de06328 335 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
336 I32 last_start_min;
337 I32 last_start_max;
1de06328
YO
338 SV **longest; /* Either &l_fixed, or &l_float. */
339 SV *longest_fixed; /* longest fixed string found in pattern */
340 I32 offset_fixed; /* offset where it starts */
486ec47a 341 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
342 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
343 SV *longest_float; /* longest floating string found in pattern */
344 I32 offset_float_min; /* earliest point in string it can appear */
345 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 346 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 347 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
348 I32 flags;
349 I32 whilem_c;
cb434fcc 350 I32 *last_closep;
653099ff 351 struct regnode_charclass_class *start_class;
2c2d71f5
JH
352} scan_data_t;
353
a687059c 354/*
e50aee73 355 * Forward declarations for pregcomp()'s friends.
a687059c 356 */
a0d0e21e 357
27da23d5 358static const scan_data_t zero_scan_data =
1de06328 359 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
360
361#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
362#define SF_BEFORE_SEOL 0x0001
363#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
364#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
365#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
366
09b7f37c
CB
367#ifdef NO_UNARY_PLUS
368# define SF_FIX_SHIFT_EOL (0+2)
369# define SF_FL_SHIFT_EOL (0+4)
370#else
371# define SF_FIX_SHIFT_EOL (+2)
372# define SF_FL_SHIFT_EOL (+4)
373#endif
c277df42
IZ
374
375#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
376#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
377
378#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
379#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
380#define SF_IS_INF 0x0040
381#define SF_HAS_PAR 0x0080
382#define SF_IN_PAR 0x0100
383#define SF_HAS_EVAL 0x0200
384#define SCF_DO_SUBSTR 0x0400
653099ff
GS
385#define SCF_DO_STCLASS_AND 0x0800
386#define SCF_DO_STCLASS_OR 0x1000
387#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 388#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 389
786e8c11 390#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 391#define SCF_SEEN_ACCEPT 0x8000
07be1b83 392
43fead97 393#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
394
395/* The enums for all these are ordered so things work out correctly */
a62b1201 396#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
cfaf538b 397#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
00b27cfc 398#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
399#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
400#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
401#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
402#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 403
43fead97 404#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 405
ffc61ed2 406#define OOB_UNICODE 12345678
93733859 407#define OOB_NAMEDCLASS -1
b8c5462f 408
a0ed51b3
LW
409#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
410#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
411
8615cb43 412
b45f050a
JF
413/* length of regex to show in messages that don't mark a position within */
414#define RegexLengthToShowInErrorMessages 127
415
416/*
417 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
418 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
419 * op/pragma/warn/regcomp.
420 */
7253e4e3
RK
421#define MARKER1 "<-- HERE" /* marker as it appears in the description */
422#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 423
7253e4e3 424#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
425
426/*
427 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
428 * arg. Show regex, up to a maximum length. If it's too long, chop and add
429 * "...".
430 */
58e23c8d 431#define _FAIL(code) STMT_START { \
bfed75c6 432 const char *ellipses = ""; \
ccb2c380
MP
433 IV len = RExC_end - RExC_precomp; \
434 \
435 if (!SIZE_ONLY) \
288b8c02 436 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
437 if (len > RegexLengthToShowInErrorMessages) { \
438 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
439 len = RegexLengthToShowInErrorMessages - 10; \
440 ellipses = "..."; \
441 } \
58e23c8d 442 code; \
ccb2c380 443} STMT_END
8615cb43 444
58e23c8d
YO
445#define FAIL(msg) _FAIL( \
446 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
447 msg, (int)len, RExC_precomp, ellipses))
448
449#define FAIL2(msg,arg) _FAIL( \
450 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
451 arg, (int)len, RExC_precomp, ellipses))
452
b45f050a 453/*
b45f050a
JF
454 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
455 */
ccb2c380 456#define Simple_vFAIL(m) STMT_START { \
a28509cc 457 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
458 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
459 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
460} STMT_END
b45f050a
JF
461
462/*
463 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
464 */
ccb2c380
MP
465#define vFAIL(m) STMT_START { \
466 if (!SIZE_ONLY) \
288b8c02 467 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
468 Simple_vFAIL(m); \
469} STMT_END
b45f050a
JF
470
471/*
472 * Like Simple_vFAIL(), but accepts two arguments.
473 */
ccb2c380 474#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 475 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
476 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
477 (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
b45f050a
JF
479
480/*
481 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
482 */
ccb2c380
MP
483#define vFAIL2(m,a1) STMT_START { \
484 if (!SIZE_ONLY) \
288b8c02 485 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
486 Simple_vFAIL2(m, a1); \
487} STMT_END
b45f050a
JF
488
489
490/*
491 * Like Simple_vFAIL(), but accepts three arguments.
492 */
ccb2c380 493#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 494 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
b45f050a
JF
498
499/*
500 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
501 */
ccb2c380
MP
502#define vFAIL3(m,a1,a2) STMT_START { \
503 if (!SIZE_ONLY) \
288b8c02 504 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
505 Simple_vFAIL3(m, a1, a2); \
506} STMT_END
b45f050a
JF
507
508/*
509 * Like Simple_vFAIL(), but accepts four arguments.
510 */
ccb2c380 511#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 512 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
513 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
515} STMT_END
b45f050a 516
668c081a 517#define ckWARNreg(loc,m) STMT_START { \
a28509cc 518 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
519 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
520 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
521} STMT_END
522
668c081a 523#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 524 const IV offset = loc - RExC_precomp; \
d1d15184 525 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
526 m REPORT_LOCATION, \
527 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
528} STMT_END
529
2335b3d3
KW
530#define ckWARN2regdep(loc,m, a1) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
533 m REPORT_LOCATION, \
534 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
535} STMT_END
536
668c081a 537#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 538 const IV offset = loc - RExC_precomp; \
668c081a 539 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
540 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
541} STMT_END
542
543#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 544 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
545 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
547} STMT_END
548
668c081a
NC
549#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
553} STMT_END
554
ccb2c380 555#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 556 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
557 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
558 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
559} STMT_END
560
668c081a
NC
561#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
564 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
565} STMT_END
566
ccb2c380 567#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 568 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
569 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
570 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
571} STMT_END
9d1d55b5 572
8615cb43 573
cd439c50 574/* Allow for side effects in s */
ccb2c380
MP
575#define REGC(c,s) STMT_START { \
576 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
577} STMT_END
cd439c50 578
fac92740
MJD
579/* Macros for recording node offsets. 20001227 mjd@plover.com
580 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
581 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
582 * Element 0 holds the number n.
07be1b83 583 * Position is 1 indexed.
fac92740 584 */
7122b237
YO
585#ifndef RE_TRACK_PATTERN_OFFSETS
586#define Set_Node_Offset_To_R(node,byte)
587#define Set_Node_Offset(node,byte)
588#define Set_Cur_Node_Offset
589#define Set_Node_Length_To_R(node,len)
590#define Set_Node_Length(node,len)
591#define Set_Node_Cur_Length(node)
592#define Node_Offset(n)
593#define Node_Length(n)
594#define Set_Node_Offset_Length(node,offset,len)
595#define ProgLen(ri) ri->u.proglen
596#define SetProgLen(ri,x) ri->u.proglen = x
597#else
598#define ProgLen(ri) ri->u.offsets[0]
599#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
600#define Set_Node_Offset_To_R(node,byte) STMT_START { \
601 if (! SIZE_ONLY) { \
602 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 603 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 604 if((node) < 0) { \
551405c4 605 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
606 } else { \
607 RExC_offsets[2*(node)-1] = (byte); \
608 } \
609 } \
610} STMT_END
611
612#define Set_Node_Offset(node,byte) \
613 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
614#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
615
616#define Set_Node_Length_To_R(node,len) STMT_START { \
617 if (! SIZE_ONLY) { \
618 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 619 __LINE__, (int)(node), (int)(len))); \
ccb2c380 620 if((node) < 0) { \
551405c4 621 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
622 } else { \
623 RExC_offsets[2*(node)] = (len); \
624 } \
625 } \
626} STMT_END
627
628#define Set_Node_Length(node,len) \
629 Set_Node_Length_To_R((node)-RExC_emit_start, len)
630#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
631#define Set_Node_Cur_Length(node) \
632 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
633
634/* Get offsets and lengths */
635#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
636#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
637
07be1b83
YO
638#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
639 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
640 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
641} STMT_END
7122b237 642#endif
07be1b83
YO
643
644#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
645#define EXPERIMENTAL_INPLACESCAN
f427392e 646#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 647
304ee84b
YO
648#define DEBUG_STUDYDATA(str,data,depth) \
649DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 650 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
651 "%*s" str "Pos:%"IVdf"/%"IVdf \
652 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
653 (int)(depth)*2, "", \
654 (IV)((data)->pos_min), \
655 (IV)((data)->pos_delta), \
304ee84b 656 (UV)((data)->flags), \
1de06328 657 (IV)((data)->whilem_c), \
304ee84b
YO
658 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
659 is_inf ? "INF " : "" \
1de06328
YO
660 ); \
661 if ((data)->last_found) \
662 PerlIO_printf(Perl_debug_log, \
663 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
664 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
665 SvPVX_const((data)->last_found), \
666 (IV)((data)->last_end), \
667 (IV)((data)->last_start_min), \
668 (IV)((data)->last_start_max), \
669 ((data)->longest && \
670 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
671 SvPVX_const((data)->longest_fixed), \
672 (IV)((data)->offset_fixed), \
673 ((data)->longest && \
674 (data)->longest==&((data)->longest_float)) ? "*" : "", \
675 SvPVX_const((data)->longest_float), \
676 (IV)((data)->offset_float_min), \
677 (IV)((data)->offset_float_max) \
678 ); \
679 PerlIO_printf(Perl_debug_log,"\n"); \
680});
681
acfe0abc 682static void clear_re(pTHX_ void *r);
4327152a 683
653099ff 684/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 685 Update the longest found anchored substring and the longest found
653099ff
GS
686 floating substrings if needed. */
687
4327152a 688STATIC void
304ee84b 689S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 690{
e1ec3a88
AL
691 const STRLEN l = CHR_SVLEN(data->last_found);
692 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 693 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 694
7918f24d
NC
695 PERL_ARGS_ASSERT_SCAN_COMMIT;
696
c277df42 697 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 698 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
699 if (*data->longest == data->longest_fixed) {
700 data->offset_fixed = l ? data->last_start_min : data->pos_min;
701 if (data->flags & SF_BEFORE_EOL)
b81d288d 702 data->flags
c277df42
IZ
703 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
704 else
705 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 706 data->minlen_fixed=minlenp;
1de06328 707 data->lookbehind_fixed=0;
a0ed51b3 708 }
304ee84b 709 else { /* *data->longest == data->longest_float */
c277df42 710 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
711 data->offset_float_max = (l
712 ? data->last_start_max
c277df42 713 : data->pos_min + data->pos_delta);
304ee84b 714 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 715 data->offset_float_max = I32_MAX;
c277df42 716 if (data->flags & SF_BEFORE_EOL)
b81d288d 717 data->flags
c277df42
IZ
718 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
719 else
720 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
721 data->minlen_float=minlenp;
722 data->lookbehind_float=0;
c277df42
IZ
723 }
724 }
725 SvCUR_set(data->last_found, 0);
0eda9292 726 {
a28509cc 727 SV * const sv = data->last_found;
097eb12c
AL
728 if (SvUTF8(sv) && SvMAGICAL(sv)) {
729 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
730 if (mg)
731 mg->mg_len = 0;
732 }
0eda9292 733 }
c277df42
IZ
734 data->last_end = -1;
735 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 736 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
737}
738
653099ff
GS
739/* Can match anything (initialization) */
740STATIC void
3fffb88a 741S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 742{
7918f24d
NC
743 PERL_ARGS_ASSERT_CL_ANYTHING;
744
f8bef550 745 ANYOF_BITMAP_SETALL(cl);
dd58aee1 746 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
3ad98780 747 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
3fffb88a
KW
748
749 /* If any portion of the regex is to operate under locale rules,
750 * initialization includes it. The reason this isn't done for all regexes
751 * is that the optimizer was written under the assumption that locale was
752 * all-or-nothing. Given the complexity and lack of documentation in the
753 * optimizer, and that there are inadequate test cases for locale, so many
754 * parts of it may not work properly, it is safest to avoid locale unless
755 * necessary. */
756 if (RExC_contains_locale) {
9d7a1e63 757 ANYOF_CLASS_SETALL(cl); /* /l uses class */
3fffb88a
KW
758 cl->flags |= ANYOF_LOCALE;
759 }
9d7a1e63
KW
760 else {
761 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
762 }
653099ff
GS
763}
764
765/* Can match anything (initialization) */
766STATIC int
5f66b61c 767S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
768{
769 int value;
770
7918f24d
NC
771 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
772
aaa51d5e 773 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
774 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
775 return 1;
1aa99e6b
IH
776 if (!(cl->flags & ANYOF_UNICODE_ALL))
777 return 0;
10edeb5d 778 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 779 return 0;
653099ff
GS
780 return 1;
781}
782
783/* Can match anything (initialization) */
784STATIC void
e755fd73 785S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 786{
7918f24d
NC
787 PERL_ARGS_ASSERT_CL_INIT;
788
8ecf7187 789 Zero(cl, 1, struct regnode_charclass_class);
653099ff 790 cl->type = ANYOF;
3fffb88a 791 cl_anything(pRExC_state, cl);
1411dba4 792 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
653099ff
GS
793}
794
1051e1c4
KW
795/* These two functions currently do the exact same thing */
796#define cl_init_zero S_cl_init
653099ff 797
dd58aee1
KW
798/* 'AND' a given class with another one. Can create false positives. 'cl'
799 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
800 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 801STATIC void
5f66b61c 802S_cl_and(struct regnode_charclass_class *cl,
a28509cc 803 const struct regnode_charclass_class *and_with)
653099ff 804{
7918f24d 805 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
806
807 assert(and_with->type == ANYOF);
1e6ade67 808
c6b76537 809 /* I (khw) am not sure all these restrictions are necessary XXX */
1e6ade67
KW
810 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
811 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 812 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
813 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
814 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
815 int i;
816
817 if (and_with->flags & ANYOF_INVERT)
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] &= ~and_with->bitmap[i];
820 else
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] &= and_with->bitmap[i];
823 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 824
c6b76537 825 if (and_with->flags & ANYOF_INVERT) {
8951c461 826
c6b76537
KW
827 /* Here, the and'ed node is inverted. Get the AND of the flags that
828 * aren't affected by the inversion. Those that are affected are
829 * handled individually below */
830 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
831 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
832 cl->flags |= affected_flags;
833
834 /* We currently don't know how to deal with things that aren't in the
835 * bitmap, but we know that the intersection is no greater than what
836 * is already in cl, so let there be false positives that get sorted
837 * out after the synthetic start class succeeds, and the node is
838 * matched for real. */
839
840 /* The inversion of these two flags indicate that the resulting
841 * intersection doesn't have them */
842 if (and_with->flags & ANYOF_UNICODE_ALL) {
4713bfe1
KW
843 cl->flags &= ~ANYOF_UNICODE_ALL;
844 }
c6b76537
KW
845 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
846 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 847 }
1aa99e6b 848 }
c6b76537 849 else { /* and'd node is not inverted */
3ad98780
KW
850 U8 outside_bitmap_but_not_utf8; /* Temp variable */
851
137165a6 852 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
853
854 /* Here 'and_with' doesn't match anything outside the bitmap
855 * (except possibly ANYOF_UNICODE_ALL), which means the
856 * intersection can't either, except for ANYOF_UNICODE_ALL, in
857 * which case we don't know what the intersection is, but it's no
858 * greater than what cl already has, so can just leave it alone,
859 * with possible false positives */
860 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
861 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
871d0d1a 862 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 863 }
137165a6 864 }
c6b76537
KW
865 else if (! ANYOF_NONBITMAP(cl)) {
866
867 /* Here, 'and_with' does match something outside the bitmap, and cl
868 * doesn't have a list of things to match outside the bitmap. If
869 * cl can match all code points above 255, the intersection will
3ad98780
KW
870 * be those above-255 code points that 'and_with' matches. If cl
871 * can't match all Unicode code points, it means that it can't
872 * match anything outside the bitmap (since the 'if' that got us
873 * into this block tested for that), so we leave the bitmap empty.
874 */
c6b76537
KW
875 if (cl->flags & ANYOF_UNICODE_ALL) {
876 ARG_SET(cl, ARG(and_with));
3ad98780
KW
877
878 /* and_with's ARG may match things that don't require UTF8.
879 * And now cl's will too, in spite of this being an 'and'. See
880 * the comments below about the kludge */
881 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
c6b76537
KW
882 }
883 }
884 else {
885 /* Here, both 'and_with' and cl match something outside the
886 * bitmap. Currently we do not do the intersection, so just match
887 * whatever cl had at the beginning. */
888 }
889
890
3ad98780
KW
891 /* Take the intersection of the two sets of flags. However, the
892 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
893 * kludge around the fact that this flag is not treated like the others
894 * which are initialized in cl_anything(). The way the optimizer works
895 * is that the synthetic start class (SSC) is initialized to match
896 * anything, and then the first time a real node is encountered, its
897 * values are AND'd with the SSC's with the result being the values of
898 * the real node. However, there are paths through the optimizer where
899 * the AND never gets called, so those initialized bits are set
900 * inappropriately, which is not usually a big deal, as they just cause
901 * false positives in the SSC, which will just mean a probably
902 * imperceptible slow down in execution. However this bit has a
903 * higher false positive consequence in that it can cause utf8.pm,
904 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
905 * bigger slowdown and also causes significant extra memory to be used.
906 * In order to prevent this, the code now takes a different tack. The
907 * bit isn't set unless some part of the regular expression needs it,
908 * but once set it won't get cleared. This means that these extra
909 * modules won't get loaded unless there was some path through the
910 * pattern that would have required them anyway, and so any false
911 * positives that occur by not ANDing them out when they could be
912 * aren't as severe as they would be if we treated this bit like all
913 * the others */
914 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
915 & ANYOF_NONBITMAP_NON_UTF8;
c6b76537 916 cl->flags &= and_with->flags;
3ad98780 917 cl->flags |= outside_bitmap_but_not_utf8;
137165a6 918 }
653099ff
GS
919}
920
dd58aee1
KW
921/* 'OR' a given class with another one. Can create false positives. 'cl'
922 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
923 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 924STATIC void
3fffb88a 925S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 926{
7918f24d
NC
927 PERL_ARGS_ASSERT_CL_OR;
928
653099ff 929 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
930
931 /* Here, the or'd node is to be inverted. This means we take the
932 * complement of everything not in the bitmap, but currently we don't
933 * know what that is, so give up and match anything */
934 if (ANYOF_NONBITMAP(or_with)) {
3fffb88a 935 cl_anything(pRExC_state, cl);
c6b76537 936 }
653099ff
GS
937 /* We do not use
938 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
939 * <= (B1 | !B2) | (CL1 | !CL2)
940 * which is wasteful if CL2 is small, but we ignore CL2:
941 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
942 * XXXX Can we handle case-fold? Unclear:
943 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
944 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
945 */
c6b76537 946 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
947 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
948 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
949 int i;
950
951 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
952 cl->bitmap[i] |= ~or_with->bitmap[i];
953 } /* XXXX: logic is complicated otherwise */
954 else {
3fffb88a 955 cl_anything(pRExC_state, cl);
653099ff 956 }
c6b76537
KW
957
958 /* And, we can just take the union of the flags that aren't affected
959 * by the inversion */
960 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
961
962 /* For the remaining flags:
963 ANYOF_UNICODE_ALL and inverted means to not match anything above
964 255, which means that the union with cl should just be
965 what cl has in it, so can ignore this flag
966 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
967 is 127-255 to match them, but then invert that, so the
968 union with cl should just be what cl has in it, so can
969 ignore this flag
970 */
971 } else { /* 'or_with' is not inverted */
653099ff
GS
972 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
973 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
974 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
975 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
976 int i;
977
978 /* OR char bitmap and class bitmap separately */
979 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
980 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 981 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
982 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
983 cl->classflags[i] |= or_with->classflags[i];
984 cl->flags |= ANYOF_CLASS;
985 }
986 }
987 else { /* XXXX: logic is complicated, leave it along for a moment. */
3fffb88a 988 cl_anything(pRExC_state, cl);
653099ff 989 }
9826f543 990
c6b76537
KW
991 if (ANYOF_NONBITMAP(or_with)) {
992
993 /* Use the added node's outside-the-bit-map match if there isn't a
994 * conflict. If there is a conflict (both nodes match something
995 * outside the bitmap, but what they match outside is not the same
996 * pointer, and hence not easily compared until XXX we extend
997 * inversion lists this far), give up and allow the start class to
d94b1d13
KW
998 * match everything outside the bitmap. If that stuff is all above
999 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
c6b76537
KW
1000 if (! ANYOF_NONBITMAP(cl)) {
1001 ARG_SET(cl, ARG(or_with));
1002 }
1003 else if (ARG(cl) != ARG(or_with)) {
d94b1d13
KW
1004
1005 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1006 cl_anything(pRExC_state, cl);
1007 }
1008 else {
1009 cl->flags |= ANYOF_UNICODE_ALL;
1010 }
c6b76537 1011 }
4c34a693 1012 }
0b9668ee
KW
1013
1014 /* Take the union */
1015 cl->flags |= or_with->flags;
1aa99e6b 1016 }
653099ff
GS
1017}
1018
a3621e74
YO
1019#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1020#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1021#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1022#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1023
3dab1dad
YO
1024
1025#ifdef DEBUGGING
07be1b83 1026/*
2b8b4781
NC
1027 dump_trie(trie,widecharmap,revcharmap)
1028 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1029 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1030
1031 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1032 The _interim_ variants are used for debugging the interim
1033 tables that are used to generate the final compressed
1034 representation which is what dump_trie expects.
1035
486ec47a 1036 Part of the reason for their existence is to provide a form
3dab1dad 1037 of documentation as to how the different representations function.
07be1b83
YO
1038
1039*/
3dab1dad
YO
1040
1041/*
3dab1dad
YO
1042 Dumps the final compressed table form of the trie to Perl_debug_log.
1043 Used for debugging make_trie().
1044*/
b9a59e08 1045
3dab1dad 1046STATIC void
2b8b4781
NC
1047S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1048 AV *revcharmap, U32 depth)
3dab1dad
YO
1049{
1050 U32 state;
ab3bbdeb 1051 SV *sv=sv_newmortal();
55eed653 1052 int colwidth= widecharmap ? 6 : 4;
2e64971a 1053 U16 word;
3dab1dad
YO
1054 GET_RE_DEBUG_FLAGS_DECL;
1055
7918f24d 1056 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1057
3dab1dad
YO
1058 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1059 (int)depth * 2 + 2,"",
1060 "Match","Base","Ofs" );
1061
1062 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1063 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1064 if ( tmp ) {
ab3bbdeb
YO
1065 PerlIO_printf( Perl_debug_log, "%*s",
1066 colwidth,
ddc5bc0f 1067 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1068 PL_colors[0], PL_colors[1],
1069 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1070 PERL_PV_ESCAPE_FIRSTCHAR
1071 )
1072 );
3dab1dad
YO
1073 }
1074 }
1075 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1076 (int)depth * 2 + 2,"");
1077
1078 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1079 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1080 PerlIO_printf( Perl_debug_log, "\n");
1081
1e2e3d02 1082 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1083 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1084
1085 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1086
1087 if ( trie->states[ state ].wordnum ) {
1088 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1089 } else {
1090 PerlIO_printf( Perl_debug_log, "%6s", "" );
1091 }
1092
1093 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1094
1095 if ( base ) {
1096 U32 ofs = 0;
1097
1098 while( ( base + ofs < trie->uniquecharcount ) ||
1099 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1100 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1101 ofs++;
1102
1103 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1104
1105 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1106 if ( ( base + ofs >= trie->uniquecharcount ) &&
1107 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1108 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1109 {
ab3bbdeb
YO
1110 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1111 colwidth,
3dab1dad
YO
1112 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1113 } else {
ab3bbdeb 1114 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1115 }
1116 }
1117
1118 PerlIO_printf( Perl_debug_log, "]");
1119
1120 }
1121 PerlIO_printf( Perl_debug_log, "\n" );
1122 }
2e64971a
DM
1123 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1124 for (word=1; word <= trie->wordcount; word++) {
1125 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1126 (int)word, (int)(trie->wordinfo[word].prev),
1127 (int)(trie->wordinfo[word].len));
1128 }
1129 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1130}
1131/*
3dab1dad
YO
1132 Dumps a fully constructed but uncompressed trie in list form.
1133 List tries normally only are used for construction when the number of
1134 possible chars (trie->uniquecharcount) is very high.
1135 Used for debugging make_trie().
1136*/
1137STATIC void
55eed653 1138S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1139 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1140 U32 depth)
3dab1dad
YO
1141{
1142 U32 state;
ab3bbdeb 1143 SV *sv=sv_newmortal();
55eed653 1144 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1145 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1146
1147 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1148
3dab1dad 1149 /* print out the table precompression. */
ab3bbdeb
YO
1150 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1151 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1152 "------:-----+-----------------\n" );
3dab1dad
YO
1153
1154 for( state=1 ; state < next_alloc ; state ++ ) {
1155 U16 charid;
1156
ab3bbdeb 1157 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1158 (int)depth * 2 + 2,"", (UV)state );
1159 if ( ! trie->states[ state ].wordnum ) {
1160 PerlIO_printf( Perl_debug_log, "%5s| ","");
1161 } else {
1162 PerlIO_printf( Perl_debug_log, "W%4x| ",
1163 trie->states[ state ].wordnum
1164 );
1165 }
1166 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1167 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1168 if ( tmp ) {
1169 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1170 colwidth,
ddc5bc0f 1171 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1172 PL_colors[0], PL_colors[1],
1173 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1174 PERL_PV_ESCAPE_FIRSTCHAR
1175 ) ,
1e2e3d02
YO
1176 TRIE_LIST_ITEM(state,charid).forid,
1177 (UV)TRIE_LIST_ITEM(state,charid).newstate
1178 );
1179 if (!(charid % 10))
664e119d
RGS
1180 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1181 (int)((depth * 2) + 14), "");
1e2e3d02 1182 }
ab3bbdeb
YO
1183 }
1184 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1185 }
1186}
1187
1188/*
3dab1dad
YO
1189 Dumps a fully constructed but uncompressed trie in table form.
1190 This is the normal DFA style state transition table, with a few
1191 twists to facilitate compression later.
1192 Used for debugging make_trie().
1193*/
1194STATIC void
55eed653 1195S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1196 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1197 U32 depth)
3dab1dad
YO
1198{
1199 U32 state;
1200 U16 charid;
ab3bbdeb 1201 SV *sv=sv_newmortal();
55eed653 1202 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1203 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1204
1205 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1206
1207 /*
1208 print out the table precompression so that we can do a visual check
1209 that they are identical.
1210 */
1211
1212 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1213
1214 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1215 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1216 if ( tmp ) {
ab3bbdeb
YO
1217 PerlIO_printf( Perl_debug_log, "%*s",
1218 colwidth,
ddc5bc0f 1219 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1220 PL_colors[0], PL_colors[1],
1221 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1222 PERL_PV_ESCAPE_FIRSTCHAR
1223 )
1224 );
3dab1dad
YO
1225 }
1226 }
1227
1228 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1229
1230 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1231 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1232 }
1233
1234 PerlIO_printf( Perl_debug_log, "\n" );
1235
1236 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1237
1238 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1239 (int)depth * 2 + 2,"",
1240 (UV)TRIE_NODENUM( state ) );
1241
1242 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1243 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1244 if (v)
1245 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1246 else
1247 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1248 }
1249 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1250 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1251 } else {
1252 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1253 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1254 }
1255 }
07be1b83 1256}
3dab1dad
YO
1257
1258#endif
1259
2e64971a 1260
786e8c11
YO
1261/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1262 startbranch: the first branch in the whole branch sequence
1263 first : start branch of sequence of branch-exact nodes.
1264 May be the same as startbranch
1265 last : Thing following the last branch.
1266 May be the same as tail.
1267 tail : item following the branch sequence
1268 count : words in the sequence
1269 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1270 depth : indent depth
3dab1dad 1271
786e8c11 1272Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1273
786e8c11
YO
1274A trie is an N'ary tree where the branches are determined by digital
1275decomposition of the key. IE, at the root node you look up the 1st character and
1276follow that branch repeat until you find the end of the branches. Nodes can be
1277marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1278
786e8c11 1279 /he|she|his|hers/
72f13be8 1280
786e8c11
YO
1281would convert into the following structure. Numbers represent states, letters
1282following numbers represent valid transitions on the letter from that state, if
1283the number is in square brackets it represents an accepting state, otherwise it
1284will be in parenthesis.
07be1b83 1285
786e8c11
YO
1286 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1287 | |
1288 | (2)
1289 | |
1290 (1) +-i->(6)-+-s->[7]
1291 |
1292 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1293
786e8c11
YO
1294 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1295
1296This shows that when matching against the string 'hers' we will begin at state 1
1297read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1298then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1299is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1300single traverse. We store a mapping from accepting to state to which word was
1301matched, and then when we have multiple possibilities we try to complete the
1302rest of the regex in the order in which they occured in the alternation.
1303
1304The only prior NFA like behaviour that would be changed by the TRIE support is
1305the silent ignoring of duplicate alternations which are of the form:
1306
1307 / (DUPE|DUPE) X? (?{ ... }) Y /x
1308
4b714af6 1309Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1310and without the optimisation. With the optimisations dupes will be silently
486ec47a 1311ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1312the following demonstrates:
1313
1314 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1315
1316which prints out 'word' three times, but
1317
1318 'words'=~/(word|word|word)(?{ print $1 })S/
1319
1320which doesnt print it out at all. This is due to other optimisations kicking in.
1321
1322Example of what happens on a structural level:
1323
486ec47a 1324The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1325
1326 1: CURLYM[1] {1,32767}(18)
1327 5: BRANCH(8)
1328 6: EXACT <ac>(16)
1329 8: BRANCH(11)
1330 9: EXACT <ad>(16)
1331 11: BRANCH(14)
1332 12: EXACT <ab>(16)
1333 16: SUCCEED(0)
1334 17: NOTHING(18)
1335 18: END(0)
1336
1337This would be optimizable with startbranch=5, first=5, last=16, tail=16
1338and should turn into:
1339
1340 1: CURLYM[1] {1,32767}(18)
1341 5: TRIE(16)
1342 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1343 <ac>
1344 <ad>
1345 <ab>
1346 16: SUCCEED(0)
1347 17: NOTHING(18)
1348 18: END(0)
1349
1350Cases where tail != last would be like /(?foo|bar)baz/:
1351
1352 1: BRANCH(4)
1353 2: EXACT <foo>(8)
1354 4: BRANCH(7)
1355 5: EXACT <bar>(8)
1356 7: TAIL(8)
1357 8: EXACT <baz>(10)
1358 10: END(0)
1359
1360which would be optimizable with startbranch=1, first=1, last=7, tail=8
1361and would end up looking like:
1362
1363 1: TRIE(8)
1364 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1365 <foo>
1366 <bar>
1367 7: TAIL(8)
1368 8: EXACT <baz>(10)
1369 10: END(0)
1370
1371 d = uvuni_to_utf8_flags(d, uv, 0);
1372
1373is the recommended Unicode-aware way of saying
1374
1375 *(d++) = uv;
1376*/
1377
fab2782b 1378#define TRIE_STORE_REVCHAR(val) \
786e8c11 1379 STMT_START { \
73031816 1380 if (UTF) { \
fab2782b 1381 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1382 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
fab2782b 1383 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
73031816
NC
1384 SvCUR_set(zlopp, kapow - flrbbbbb); \
1385 SvPOK_on(zlopp); \
1386 SvUTF8_on(zlopp); \
1387 av_push(revcharmap, zlopp); \
1388 } else { \
fab2782b 1389 char ooooff = (char)val; \
73031816
NC
1390 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1391 } \
1392 } STMT_END
786e8c11 1393
fab2782b
YO
1394#define TRIE_READ_CHAR STMT_START { \
1395 wordlen++; \
1396 if ( UTF ) { \
1397 /* if it is UTF then it is either already folded, or does not need folding */ \
1398 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1399 } \
1400 else if (folder == PL_fold_latin1) { \
1401 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1402 if ( foldlen > 0 ) { \
1403 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1404 foldlen -= len; \
1405 scan += len; \
1406 len = 0; \
1407 } else { \
1408 len = 1; \
1409 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1410 skiplen = UNISKIP(uvc); \
1411 foldlen -= skiplen; \
1412 scan = foldbuf + skiplen; \
1413 } \
1414 } else { \
1415 /* raw data, will be folded later if needed */ \
1416 uvc = (U32)*uc; \
1417 len = 1; \
1418 } \
786e8c11
YO
1419} STMT_END
1420
1421
1422
1423#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1424 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1425 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1426 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1427 } \
1428 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1429 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1430 TRIE_LIST_CUR( state )++; \
1431} STMT_END
07be1b83 1432
786e8c11
YO
1433#define TRIE_LIST_NEW(state) STMT_START { \
1434 Newxz( trie->states[ state ].trans.list, \
1435 4, reg_trie_trans_le ); \
1436 TRIE_LIST_CUR( state ) = 1; \
1437 TRIE_LIST_LEN( state ) = 4; \
1438} STMT_END
07be1b83 1439
786e8c11
YO
1440#define TRIE_HANDLE_WORD(state) STMT_START { \
1441 U16 dupe= trie->states[ state ].wordnum; \
1442 regnode * const noper_next = regnext( noper ); \
1443 \
786e8c11
YO
1444 DEBUG_r({ \
1445 /* store the word for dumping */ \
1446 SV* tmp; \
1447 if (OP(noper) != NOTHING) \
740cce10 1448 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1449 else \
740cce10 1450 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1451 av_push( trie_words, tmp ); \
786e8c11
YO
1452 }); \
1453 \
1454 curword++; \
2e64971a
DM
1455 trie->wordinfo[curword].prev = 0; \
1456 trie->wordinfo[curword].len = wordlen; \
1457 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1458 \
1459 if ( noper_next < tail ) { \
1460 if (!trie->jump) \
c944940b 1461 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1462 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1463 if (!jumper) \
1464 jumper = noper_next; \
1465 if (!nextbranch) \
1466 nextbranch= regnext(cur); \
1467 } \
1468 \
1469 if ( dupe ) { \
2e64971a
DM
1470 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1471 /* chain, so that when the bits of chain are later */\
1472 /* linked together, the dups appear in the chain */\
1473 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1474 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1475 } else { \
1476 /* we haven't inserted this word yet. */ \
1477 trie->states[ state ].wordnum = curword; \
1478 } \
1479} STMT_END
07be1b83 1480
3dab1dad 1481
786e8c11
YO
1482#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1483 ( ( base + charid >= ucharcount \
1484 && base + charid < ubound \
1485 && state == trie->trans[ base - ucharcount + charid ].check \
1486 && trie->trans[ base - ucharcount + charid ].next ) \
1487 ? trie->trans[ base - ucharcount + charid ].next \
1488 : ( state==1 ? special : 0 ) \
1489 )
3dab1dad 1490
786e8c11
YO
1491#define MADE_TRIE 1
1492#define MADE_JUMP_TRIE 2
1493#define MADE_EXACT_TRIE 4
3dab1dad 1494
a3621e74 1495STATIC I32
786e8c11 1496S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1497{
27da23d5 1498 dVAR;
a3621e74
YO
1499 /* first pass, loop through and scan words */
1500 reg_trie_data *trie;
55eed653 1501 HV *widecharmap = NULL;
2b8b4781 1502 AV *revcharmap = newAV();
a3621e74 1503 regnode *cur;
9f7f3913 1504 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1505 STRLEN len = 0;
1506 UV uvc = 0;
1507 U16 curword = 0;
1508 U32 next_alloc = 0;
786e8c11
YO
1509 regnode *jumper = NULL;
1510 regnode *nextbranch = NULL;
7f69552c 1511 regnode *convert = NULL;
2e64971a 1512 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1513 /* we just use folder as a flag in utf8 */
1e696034 1514 const U8 * folder = NULL;
a3621e74 1515
2b8b4781
NC
1516#ifdef DEBUGGING
1517 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1518 AV *trie_words = NULL;
1519 /* along with revcharmap, this only used during construction but both are
1520 * useful during debugging so we store them in the struct when debugging.
8e11feef 1521 */
2b8b4781
NC
1522#else
1523 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1524 STRLEN trie_charcount=0;
3dab1dad 1525#endif
2b8b4781 1526 SV *re_trie_maxbuff;
a3621e74 1527 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1528
1529 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1530#ifndef DEBUGGING
1531 PERL_UNUSED_ARG(depth);
1532#endif
a3621e74 1533
1e696034 1534 switch (flags) {
c46d03cf 1535 case EXACT: break;
2f7f8cb1 1536 case EXACTFA:
fab2782b
YO
1537 case EXACTFU_SS:
1538 case EXACTFU_TRICKYFOLD:
1e696034
KW
1539 case EXACTFU: folder = PL_fold_latin1; break;
1540 case EXACTF: folder = PL_fold; break;
1541 case EXACTFL: folder = PL_fold_locale; break;
fab2782b 1542 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1543 }
1544
c944940b 1545 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1546 trie->refcount = 1;
3dab1dad 1547 trie->startstate = 1;
786e8c11 1548 trie->wordcount = word_count;
f8fc2ecf 1549 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1550 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1551 if (flags == EXACT)
c944940b 1552 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1553 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1554 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1555
a3621e74 1556 DEBUG_r({
2b8b4781 1557 trie_words = newAV();
a3621e74 1558 });
a3621e74 1559
0111c4fd 1560 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1561 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1562 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1563 }
df826430 1564 DEBUG_TRIE_COMPILE_r({
3dab1dad 1565 PerlIO_printf( Perl_debug_log,
786e8c11 1566 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1567 (int)depth * 2 + 2, "",
1568 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1569 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1570 (int)depth);
3dab1dad 1571 });
7f69552c
YO
1572
1573 /* Find the node we are going to overwrite */
1574 if ( first == startbranch && OP( last ) != BRANCH ) {
1575 /* whole branch chain */
1576 convert = first;
1577 } else {
1578 /* branch sub-chain */
1579 convert = NEXTOPER( first );
1580 }
1581
a3621e74
YO
1582 /* -- First loop and Setup --
1583
1584 We first traverse the branches and scan each word to determine if it
1585 contains widechars, and how many unique chars there are, this is
1586 important as we have to build a table with at least as many columns as we
1587 have unique chars.
1588
1589 We use an array of integers to represent the character codes 0..255
38a44b82 1590 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1591 native representation of the character value as the key and IV's for the
1592 coded index.
1593
1594 *TODO* If we keep track of how many times each character is used we can
1595 remap the columns so that the table compression later on is more
3b753521 1596 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1597 middle and the least common are on the outside. IMO this would be better
1598 than a most to least common mapping as theres a decent chance the most
1599 common letter will share a node with the least common, meaning the node
486ec47a 1600 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1601 case is when we have the least common nodes twice.
1602
1603 */
1604
a3621e74 1605 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 1606 regnode *noper = NEXTOPER( cur );
e1ec3a88 1607 const U8 *uc = (U8*)STRING( noper );
df826430 1608 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
1609 STRLEN foldlen = 0;
1610 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
fab2782b 1611 STRLEN skiplen = 0;
2af232bd 1612 const U8 *scan = (U8*)NULL;
07be1b83 1613 U32 wordlen = 0; /* required init */
02daf0ab
YO
1614 STRLEN chars = 0;
1615 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1616
3dab1dad 1617 if (OP(noper) == NOTHING) {
df826430
YO
1618 regnode *noper_next= regnext(noper);
1619 if (noper_next != tail && OP(noper_next) == flags) {
1620 noper = noper_next;
1621 uc= (U8*)STRING(noper);
1622 e= uc + STR_LEN(noper);
1623 trie->minlen= STR_LEN(noper);
1624 } else {
1625 trie->minlen= 0;
1626 continue;
1627 }
3dab1dad 1628 }
df826430 1629
fab2782b 1630 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
1631 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1632 regardless of encoding */
fab2782b
YO
1633 if (OP( noper ) == EXACTFU_SS) {
1634 /* false positives are ok, so just set this */
1635 TRIE_BITMAP_SET(trie,0xDF);
1636 }
1637 }
a3621e74 1638 for ( ; uc < e ; uc += len ) {
3dab1dad 1639 TRIE_CHARCOUNT(trie)++;
a3621e74 1640 TRIE_READ_CHAR;
3dab1dad 1641 chars++;
a3621e74 1642 if ( uvc < 256 ) {
fab2782b
YO
1643 if ( folder ) {
1644 U8 folded= folder[ (U8) uvc ];
1645 if ( !trie->charmap[ folded ] ) {
1646 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1647 TRIE_STORE_REVCHAR( folded );
1648 }
1649 }
a3621e74
YO
1650 if ( !trie->charmap[ uvc ] ) {
1651 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 1652 TRIE_STORE_REVCHAR( uvc );
a3621e74 1653 }
02daf0ab 1654 if ( set_bit ) {
62012aee
KW
1655 /* store the codepoint in the bitmap, and its folded
1656 * equivalent. */
fab2782b 1657 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
1658
1659 /* store the folded codepoint */
fab2782b 1660 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
1661
1662 if ( !UTF ) {
1663 /* store first byte of utf8 representation of
acdf4139
KW
1664 variant codepoints */
1665 if (! UNI_IS_INVARIANT(uvc)) {
1666 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1667 }
1668 }
02daf0ab
YO
1669 set_bit = 0; /* We've done our bit :-) */
1670 }
a3621e74
YO
1671 } else {
1672 SV** svpp;
55eed653
NC
1673 if ( !widecharmap )
1674 widecharmap = newHV();
a3621e74 1675
55eed653 1676 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1677
1678 if ( !svpp )
e4584336 1679 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1680
1681 if ( !SvTRUE( *svpp ) ) {
1682 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 1683 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
1684 }
1685 }
1686 }
3dab1dad 1687 if( cur == first ) {
fab2782b
YO
1688 trie->minlen = chars;
1689 trie->maxlen = chars;
3dab1dad 1690 } else if (chars < trie->minlen) {
fab2782b 1691 trie->minlen = chars;
3dab1dad 1692 } else if (chars > trie->maxlen) {
fab2782b
YO
1693 trie->maxlen = chars;
1694 }
1695 if (OP( noper ) == EXACTFU_SS) {
1696 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1697 if (trie->minlen > 1)
1698 trie->minlen= 1;
1699 }
1700 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1701 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1702 * - We assume that any such sequence might match a 2 byte string */
1703 if (trie->minlen > 2 )
1704 trie->minlen= 2;
3dab1dad
YO
1705 }
1706
a3621e74
YO
1707 } /* end first pass */
1708 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1709 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1710 (int)depth * 2 + 2,"",
55eed653 1711 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1712 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1713 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1714 );
a3621e74
YO
1715
1716 /*
1717 We now know what we are dealing with in terms of unique chars and
1718 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1719 representation using a flat table will take. If it's over a reasonable
1720 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1721 conservative but potentially much slower representation using an array
1722 of lists.
1723
1724 At the end we convert both representations into the same compressed
1725 form that will be used in regexec.c for matching with. The latter
1726 is a form that cannot be used to construct with but has memory
1727 properties similar to the list form and access properties similar
1728 to the table form making it both suitable for fast searches and
1729 small enough that its feasable to store for the duration of a program.
1730
1731 See the comment in the code where the compressed table is produced
1732 inplace from the flat tabe representation for an explanation of how
1733 the compression works.
1734
1735 */
1736
1737
2e64971a
DM
1738 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1739 prev_states[1] = 0;
1740
3dab1dad 1741 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1742 /*
1743 Second Pass -- Array Of Lists Representation
1744
1745 Each state will be represented by a list of charid:state records
1746 (reg_trie_trans_le) the first such element holds the CUR and LEN
1747 points of the allocated array. (See defines above).
1748
1749 We build the initial structure using the lists, and then convert
1750 it into the compressed table form which allows faster lookups
1751 (but cant be modified once converted).
a3621e74
YO
1752 */
1753
a3621e74
YO
1754 STRLEN transcount = 1;
1755
1e2e3d02
YO
1756 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1757 "%*sCompiling trie using list compiler\n",
1758 (int)depth * 2 + 2, ""));
686b73d4 1759
c944940b
JH
1760 trie->states = (reg_trie_state *)
1761 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1762 sizeof(reg_trie_state) );
a3621e74
YO
1763 TRIE_LIST_NEW(1);
1764 next_alloc = 2;
1765
1766 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1767
df826430 1768 regnode *noper = NEXTOPER( cur );
c445ea15 1769 U8 *uc = (U8*)STRING( noper );
df826430 1770 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
1771 U32 state = 1; /* required init */
1772 U16 charid = 0; /* sanity init */
1773 U8 *scan = (U8*)NULL; /* sanity init */
1774 STRLEN foldlen = 0; /* required init */
07be1b83 1775 U32 wordlen = 0; /* required init */
c445ea15 1776 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
fab2782b 1777 STRLEN skiplen = 0;
c445ea15 1778
df826430
YO
1779 if (OP(noper) == NOTHING) {
1780 regnode *noper_next= regnext(noper);
1781 if (noper_next != tail && OP(noper_next) == flags) {
1782 noper = noper_next;
1783 uc= (U8*)STRING(noper);
1784 e= uc + STR_LEN(noper);
1785 }
1786 }
1787
3dab1dad 1788 if (OP(noper) != NOTHING) {
786e8c11 1789 for ( ; uc < e ; uc += len ) {
c445ea15 1790
786e8c11 1791 TRIE_READ_CHAR;
c445ea15 1792
786e8c11
YO
1793 if ( uvc < 256 ) {
1794 charid = trie->charmap[ uvc ];
c445ea15 1795 } else {
55eed653 1796 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1797 if ( !svpp ) {
1798 charid = 0;
1799 } else {
1800 charid=(U16)SvIV( *svpp );
1801 }
c445ea15 1802 }
786e8c11
YO
1803 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1804 if ( charid ) {
a3621e74 1805
786e8c11
YO
1806 U16 check;
1807 U32 newstate = 0;
a3621e74 1808
786e8c11
YO
1809 charid--;
1810 if ( !trie->states[ state ].trans.list ) {
1811 TRIE_LIST_NEW( state );
c445ea15 1812 }
786e8c11
YO
1813 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1814 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1815 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1816 break;
1817 }
1818 }
1819 if ( ! newstate ) {
1820 newstate = next_alloc++;
2e64971a 1821 prev_states[newstate] = state;
786e8c11
YO
1822 TRIE_LIST_PUSH( state, charid, newstate );
1823 transcount++;
1824 }
1825 state = newstate;
1826 } else {
1827 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1828 }
a28509cc 1829 }
c445ea15 1830 }
3dab1dad 1831 TRIE_HANDLE_WORD(state);
a3621e74
YO
1832
1833 } /* end second pass */
1834
1e2e3d02
YO
1835 /* next alloc is the NEXT state to be allocated */
1836 trie->statecount = next_alloc;
c944940b
JH
1837 trie->states = (reg_trie_state *)
1838 PerlMemShared_realloc( trie->states,
1839 next_alloc
1840 * sizeof(reg_trie_state) );
a3621e74 1841
3dab1dad 1842 /* and now dump it out before we compress it */
2b8b4781
NC
1843 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1844 revcharmap, next_alloc,
1845 depth+1)
1e2e3d02 1846 );
a3621e74 1847
c944940b
JH
1848 trie->trans = (reg_trie_trans *)
1849 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1850 {
1851 U32 state;
a3621e74
YO
1852 U32 tp = 0;
1853 U32 zp = 0;
1854
1855
1856 for( state=1 ; state < next_alloc ; state ++ ) {
1857 U32 base=0;
1858
1859 /*
1860 DEBUG_TRIE_COMPILE_MORE_r(
1861 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1862 );
1863 */
1864
1865 if (trie->states[state].trans.list) {
1866 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1867 U16 maxid=minid;
a28509cc 1868 U16 idx;
a3621e74
YO
1869
1870 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1871 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1872 if ( forid < minid ) {
1873 minid=forid;
1874 } else if ( forid > maxid ) {
1875 maxid=forid;
1876 }
a3621e74
YO
1877 }
1878 if ( transcount < tp + maxid - minid + 1) {
1879 transcount *= 2;
c944940b
JH
1880 trie->trans = (reg_trie_trans *)
1881 PerlMemShared_realloc( trie->trans,
446bd890
NC
1882 transcount
1883 * sizeof(reg_trie_trans) );
a3621e74
YO
1884 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1885 }
1886 base = trie->uniquecharcount + tp - minid;
1887 if ( maxid == minid ) {
1888 U32 set = 0;
1889 for ( ; zp < tp ; zp++ ) {
1890 if ( ! trie->trans[ zp ].next ) {
1891 base = trie->uniquecharcount + zp - minid;
1892 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1893 trie->trans[ zp ].check = state;
1894 set = 1;
1895 break;
1896 }
1897 }
1898 if ( !set ) {
1899 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1900 trie->trans[ tp ].check = state;
1901 tp++;
1902 zp = tp;
1903 }
1904 } else {
1905 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1906 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1907 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1908 trie->trans[ tid ].check = state;
1909 }
1910 tp += ( maxid - minid + 1 );
1911 }
1912 Safefree(trie->states[ state ].trans.list);
1913 }
1914 /*
1915 DEBUG_TRIE_COMPILE_MORE_r(
1916 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1917 );
1918 */
1919 trie->states[ state ].trans.base=base;
1920 }
cc601c31 1921 trie->lasttrans = tp + 1;
a3621e74
YO
1922 }
1923 } else {
1924 /*
1925 Second Pass -- Flat Table Representation.
1926
1927 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1928 We know that we will need Charcount+1 trans at most to store the data
1929 (one row per char at worst case) So we preallocate both structures
1930 assuming worst case.
1931
1932 We then construct the trie using only the .next slots of the entry
1933 structs.
1934
3b753521 1935 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1936 make compression both faster and easier by keeping track of how many non
1937 zero fields are in the node.
1938
1939 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1940 transition.
1941
1942 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1943 number representing the first entry of the node, and state as a
1944 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1945 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1946 are 2 entrys per node. eg:
1947
1948 A B A B
1949 1. 2 4 1. 3 7
1950 2. 0 3 3. 0 5
1951 3. 0 0 5. 0 0
1952 4. 0 0 7. 0 0
1953
1954 The table is internally in the right hand, idx form. However as we also
1955 have to deal with the states array which is indexed by nodenum we have to
1956 use TRIE_NODENUM() to convert.
1957
1958 */
1e2e3d02
YO
1959 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1960 "%*sCompiling trie using table compiler\n",
1961 (int)depth * 2 + 2, ""));
3dab1dad 1962
c944940b
JH
1963 trie->trans = (reg_trie_trans *)
1964 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1965 * trie->uniquecharcount + 1,
1966 sizeof(reg_trie_trans) );
1967 trie->states = (reg_trie_state *)
1968 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1969 sizeof(reg_trie_state) );
a3621e74
YO
1970 next_alloc = trie->uniquecharcount + 1;
1971
3dab1dad 1972
a3621e74
YO
1973 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1974
df826430 1975 regnode *noper = NEXTOPER( cur );
a28509cc 1976 const U8 *uc = (U8*)STRING( noper );
df826430 1977 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
1978
1979 U32 state = 1; /* required init */
1980
1981 U16 charid = 0; /* sanity init */
1982 U32 accept_state = 0; /* sanity init */
1983 U8 *scan = (U8*)NULL; /* sanity init */
1984
1985 STRLEN foldlen = 0; /* required init */
07be1b83 1986 U32 wordlen = 0; /* required init */
fab2782b 1987 STRLEN skiplen = 0;
a3621e74
YO
1988 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1989
df826430
YO
1990 if (OP(noper) == NOTHING) {
1991 regnode *noper_next= regnext(noper);
1992 if (noper_next != tail && OP(noper_next) == flags) {
1993 noper = noper_next;
1994 uc= (U8*)STRING(noper);
1995 e= uc + STR_LEN(noper);
1996 }
1997 }
fab2782b 1998
3dab1dad 1999 if ( OP(noper) != NOTHING ) {
786e8c11 2000 for ( ; uc < e ; uc += len ) {
a3621e74 2001
786e8c11 2002 TRIE_READ_CHAR;
a3621e74 2003
786e8c11
YO
2004 if ( uvc < 256 ) {
2005 charid = trie->charmap[ uvc ];
2006 } else {
55eed653 2007 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 2008 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2009 }
786e8c11
YO
2010 if ( charid ) {
2011 charid--;
2012 if ( !trie->trans[ state + charid ].next ) {
2013 trie->trans[ state + charid ].next = next_alloc;
2014 trie->trans[ state ].check++;
2e64971a
DM
2015 prev_states[TRIE_NODENUM(next_alloc)]
2016 = TRIE_NODENUM(state);
786e8c11
YO
2017 next_alloc += trie->uniquecharcount;
2018 }
2019 state = trie->trans[ state + charid ].next;
2020 } else {
2021 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2022 }
2023 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 2024 }
a3621e74 2025 }
3dab1dad
YO
2026 accept_state = TRIE_NODENUM( state );
2027 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2028
2029 } /* end second pass */
2030
3dab1dad 2031 /* and now dump it out before we compress it */
2b8b4781
NC
2032 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2033 revcharmap,
2034 next_alloc, depth+1));
a3621e74 2035
a3621e74
YO
2036 {
2037 /*
2038 * Inplace compress the table.*
2039
2040 For sparse data sets the table constructed by the trie algorithm will
2041 be mostly 0/FAIL transitions or to put it another way mostly empty.
2042 (Note that leaf nodes will not contain any transitions.)
2043
2044 This algorithm compresses the tables by eliminating most such
2045 transitions, at the cost of a modest bit of extra work during lookup:
2046
2047 - Each states[] entry contains a .base field which indicates the
2048 index in the state[] array wheres its transition data is stored.
2049
3b753521 2050 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2051
2052 - If .base is nonzero then charid is added to it to find an entry in
2053 the trans array.
2054
2055 -If trans[states[state].base+charid].check!=state then the
2056 transition is taken to be a 0/Fail transition. Thus if there are fail
2057 transitions at the front of the node then the .base offset will point
2058 somewhere inside the previous nodes data (or maybe even into a node
2059 even earlier), but the .check field determines if the transition is
2060 valid.
2061
786e8c11 2062 XXX - wrong maybe?
a3621e74 2063 The following process inplace converts the table to the compressed
3b753521 2064 table: We first do not compress the root node 1,and mark all its
a3621e74 2065 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2066 allows us to do a DFA construction from the compressed table later,
2067 and ensures that any .base pointers we calculate later are greater
2068 than 0.
a3621e74
YO
2069
2070 - We set 'pos' to indicate the first entry of the second node.
2071
2072 - We then iterate over the columns of the node, finding the first and
2073 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2074 and set the .check pointers accordingly, and advance pos
2075 appropriately and repreat for the next node. Note that when we copy
2076 the next pointers we have to convert them from the original
2077 NODEIDX form to NODENUM form as the former is not valid post
2078 compression.
2079
2080 - If a node has no transitions used we mark its base as 0 and do not
2081 advance the pos pointer.
2082
2083 - If a node only has one transition we use a second pointer into the
2084 structure to fill in allocated fail transitions from other states.
2085 This pointer is independent of the main pointer and scans forward
2086 looking for null transitions that are allocated to a state. When it
2087 finds one it writes the single transition into the "hole". If the
786e8c11 2088 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2089
2090 - Once compressed we can Renew/realloc the structures to release the
2091 excess space.
2092
2093 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2094 specifically Fig 3.47 and the associated pseudocode.
2095
2096 demq
2097 */
a3b680e6 2098 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2099 U32 state, charid;
a3621e74 2100 U32 pos = 0, zp=0;
1e2e3d02 2101 trie->statecount = laststate;
a3621e74
YO
2102
2103 for ( state = 1 ; state < laststate ; state++ ) {
2104 U8 flag = 0;
a28509cc
AL
2105 const U32 stateidx = TRIE_NODEIDX( state );
2106 const U32 o_used = trie->trans[ stateidx ].check;
2107 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2108 trie->trans[ stateidx ].check = 0;
2109
2110 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2111 if ( flag || trie->trans[ stateidx + charid ].next ) {
2112 if ( trie->trans[ stateidx + charid ].next ) {
2113 if (o_used == 1) {
2114 for ( ; zp < pos ; zp++ ) {
2115 if ( ! trie->trans[ zp ].next ) {
2116 break;
2117 }
2118 }
2119 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2120 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2121 trie->trans[ zp ].check = state;
2122 if ( ++zp > pos ) pos = zp;
2123 break;
2124 }
2125 used--;
2126 }
2127 if ( !flag ) {
2128 flag = 1;
2129 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2130 }
2131 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2132 trie->trans[ pos ].check = state;
2133 pos++;
2134 }
2135 }
2136 }
cc601c31 2137 trie->lasttrans = pos + 1;
c944940b
JH
2138 trie->states = (reg_trie_state *)
2139 PerlMemShared_realloc( trie->states, laststate
2140 * sizeof(reg_trie_state) );
a3621e74 2141 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2142 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2143 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2144 (int)depth * 2 + 2,"",
2145 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2146 (IV)next_alloc,
2147 (IV)pos,
a3621e74
YO
2148 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2149 );
2150
2151 } /* end table compress */
2152 }
1e2e3d02
YO
2153 DEBUG_TRIE_COMPILE_MORE_r(
2154 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2155 (int)depth * 2 + 2, "",
2156 (UV)trie->statecount,
2157 (UV)trie->lasttrans)
2158 );
cc601c31 2159 /* resize the trans array to remove unused space */
c944940b
JH
2160 trie->trans = (reg_trie_trans *)
2161 PerlMemShared_realloc( trie->trans, trie->lasttrans
2162 * sizeof(reg_trie_trans) );
a3621e74 2163
3b753521 2164 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2165 U8 nodetype =(U8)(flags & 0xFF);
2166 char *str=NULL;
786e8c11 2167
07be1b83 2168#ifdef DEBUGGING
e62cc96a 2169 regnode *optimize = NULL;
7122b237
YO
2170#ifdef RE_TRACK_PATTERN_OFFSETS
2171
b57a0404
JH
2172 U32 mjd_offset = 0;
2173 U32 mjd_nodelen = 0;
7122b237
YO
2174#endif /* RE_TRACK_PATTERN_OFFSETS */
2175#endif /* DEBUGGING */
a3621e74 2176 /*
3dab1dad
YO
2177 This means we convert either the first branch or the first Exact,
2178 depending on whether the thing following (in 'last') is a branch
2179 or not and whther first is the startbranch (ie is it a sub part of
2180 the alternation or is it the whole thing.)
3b753521 2181 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2182 the whole branch sequence, including the first.
a3621e74 2183 */
3dab1dad 2184 /* Find the node we are going to overwrite */
7f69552c 2185 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2186 /* branch sub-chain */
3dab1dad 2187 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2188#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2189 DEBUG_r({
2190 mjd_offset= Node_Offset((convert));
2191 mjd_nodelen= Node_Length((convert));
2192 });
7122b237 2193#endif
7f69552c 2194 /* whole branch chain */
7122b237
YO
2195 }
2196#ifdef RE_TRACK_PATTERN_OFFSETS
2197 else {
7f69552c
YO
2198 DEBUG_r({
2199 const regnode *nop = NEXTOPER( convert );
2200 mjd_offset= Node_Offset((nop));
2201 mjd_nodelen= Node_Length((nop));
2202 });
07be1b83
YO
2203 }
2204 DEBUG_OPTIMISE_r(
2205 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2206 (int)depth * 2 + 2, "",
786e8c11 2207 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2208 );
7122b237 2209#endif
3dab1dad
YO
2210 /* But first we check to see if there is a common prefix we can
2211 split out as an EXACT and put in front of the TRIE node. */
2212 trie->startstate= 1;
55eed653 2213 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2214 U32 state;
1e2e3d02 2215 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2216 U32 ofs = 0;
8e11feef
RGS
2217 I32 idx = -1;
2218 U32 count = 0;
2219 const U32 base = trie->states[ state ].trans.base;
a3621e74 2220
3dab1dad 2221 if ( trie->states[state].wordnum )
8e11feef 2222 count = 1;
a3621e74 2223
8e11feef 2224 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2225 if ( ( base + ofs >= trie->uniquecharcount ) &&
2226 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2227 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2228 {
3dab1dad 2229 if ( ++count > 1 ) {
2b8b4781 2230 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2231 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2232 if ( state == 1 ) break;
3dab1dad
YO
2233 if ( count == 2 ) {
2234 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2235 DEBUG_OPTIMISE_r(
8e11feef
RGS
2236 PerlIO_printf(Perl_debug_log,
2237 "%*sNew Start State=%"UVuf" Class: [",
2238 (int)depth * 2 + 2, "",
786e8c11 2239 (UV)state));
be8e71aa 2240 if (idx >= 0) {
2b8b4781 2241 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2242 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2243
3dab1dad 2244 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2245 if ( folder )
2246 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2247 DEBUG_OPTIMISE_r(
f1f66076 2248 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2249 );
8e11feef
RGS
2250 }
2251 }
2252 TRIE_BITMAP_SET(trie,*ch);
2253 if ( folder )
2254 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2255 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2256 }
2257 idx = ofs;
2258 }
3dab1dad
YO
2259 }
2260 if ( count == 1 ) {
2b8b4781 2261 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2262 STRLEN len;
2263 char *ch = SvPV( *tmp, len );
de734bd5
A
2264 DEBUG_OPTIMISE_r({
2265 SV *sv=sv_newmortal();
8e11feef
RGS
2266 PerlIO_printf( Perl_debug_log,
2267 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2268 (int)depth * 2 + 2, "",
de734bd5
A
2269 (UV)state, (UV)idx,
2270 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2271 PL_colors[0], PL_colors[1],
2272 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2273 PERL_PV_ESCAPE_FIRSTCHAR
2274 )
2275 );
2276 });
3dab1dad
YO
2277 if ( state==1 ) {
2278 OP( convert ) = nodetype;
2279 str=STRING(convert);
2280 STR_LEN(convert)=0;
2281 }
c490c714
YO
2282 STR_LEN(convert) += len;
2283 while (len--)
de734bd5 2284 *str++ = *ch++;
8e11feef 2285 } else {
f9049ba1 2286#ifdef DEBUGGING
8e11feef
RGS
2287 if (state>1)
2288 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2289#endif
8e11feef
RGS
2290 break;
2291 }
2292 }
2e64971a 2293 trie->prefixlen = (state-1);
3dab1dad 2294 if (str) {
8e11feef 2295 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2296 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2297 trie->startstate = state;
07be1b83
YO
2298 trie->minlen -= (state - 1);
2299 trie->maxlen -= (state - 1);
33809eae
JH
2300#ifdef DEBUGGING
2301 /* At least the UNICOS C compiler choked on this
2302 * being argument to DEBUG_r(), so let's just have
2303 * it right here. */
2304 if (
2305#ifdef PERL_EXT_RE_BUILD
2306 1
2307#else
2308 DEBUG_r_TEST
2309#endif
2310 ) {
2311 regnode *fix = convert;
2312 U32 word = trie->wordcount;
2313 mjd_nodelen++;
2314 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2315 while( ++fix < n ) {
2316 Set_Node_Offset_Length(fix, 0, 0);
2317 }
2318 while (word--) {
2319 SV ** const tmp = av_fetch( trie_words, word, 0 );
2320 if (tmp) {
2321 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2322 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2323 else
2324 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2325 }
2326 }
2327 }
2328#endif
8e11feef
RGS
2329 if (trie->maxlen) {
2330 convert = n;
2331 } else {
3dab1dad 2332 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2333 DEBUG_r(optimize= n);
3dab1dad
YO
2334 }
2335 }
2336 }
a5ca303d
YO
2337 if (!jumper)
2338 jumper = last;
3dab1dad 2339 if ( trie->maxlen ) {
8e11feef
RGS
2340 NEXT_OFF( convert ) = (U16)(tail - convert);
2341 ARG_SET( convert, data_slot );
786e8c11
YO
2342 /* Store the offset to the first unabsorbed branch in
2343 jump[0], which is otherwise unused by the jump logic.
2344 We use this when dumping a trie and during optimisation. */
2345 if (trie->jump)
7f69552c 2346 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2347
6c48061a
YO
2348 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2349 * and there is a bitmap
2350 * and the first "jump target" node we found leaves enough room
2351 * then convert the TRIE node into a TRIEC node, with the bitmap
2352 * embedded inline in the opcode - this is hypothetically faster.
2353 */
2354 if ( !trie->states[trie->startstate].wordnum
2355 && trie->bitmap
2356 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2357 {
2358 OP( convert ) = TRIEC;
2359 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2360 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2361 trie->bitmap= NULL;
2362 } else
2363 OP( convert ) = TRIE;
a3621e74 2364
3dab1dad
YO
2365 /* store the type in the flags */
2366 convert->flags = nodetype;
a5ca303d
YO
2367 DEBUG_r({
2368 optimize = convert
2369 + NODE_STEP_REGNODE
2370 + regarglen[ OP( convert ) ];
2371 });
2372 /* XXX We really should free up the resource in trie now,
2373 as we won't use them - (which resources?) dmq */
3dab1dad 2374 }
a3621e74 2375 /* needed for dumping*/
e62cc96a 2376 DEBUG_r(if (optimize) {
07be1b83 2377 regnode *opt = convert;
bcdf7404 2378
e62cc96a 2379 while ( ++opt < optimize) {
07be1b83
YO
2380 Set_Node_Offset_Length(opt,0,0);
2381 }
786e8c11
YO
2382 /*
2383 Try to clean up some of the debris left after the
2384 optimisation.
a3621e74 2385 */
786e8c11 2386 while( optimize < jumper ) {
07be1b83 2387 mjd_nodelen += Node_Length((optimize));
a3621e74 2388 OP( optimize ) = OPTIMIZED;
07be1b83 2389 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2390 optimize++;
2391 }
07be1b83 2392 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2393 });
2394 } /* end node insert */
2e64971a
DM
2395
2396 /* Finish populating the prev field of the wordinfo array. Walk back
2397 * from each accept state until we find another accept state, and if
2398 * so, point the first word's .prev field at the second word. If the
2399 * second already has a .prev field set, stop now. This will be the
2400 * case either if we've already processed that word's accept state,
3b753521
FN
2401 * or that state had multiple words, and the overspill words were
2402 * already linked up earlier.
2e64971a
DM
2403 */
2404 {
2405 U16 word;
2406 U32 state;
2407 U16 prev;
2408
2409 for (word=1; word <= trie->wordcount; word++) {
2410 prev = 0;
2411 if (trie->wordinfo[word].prev)
2412 continue;
2413 state = trie->wordinfo[word].accept;
2414 while (state) {
2415 state = prev_states[state];
2416 if (!state)
2417 break;
2418 prev = trie->states[state].wordnum;
2419 if (prev)
2420 break;
2421 }
2422 trie->wordinfo[word].prev = prev;
2423 }
2424 Safefree(prev_states);
2425 }
2426
2427
2428 /* and now dump out the compressed format */
2429 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2430
55eed653 2431 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2432#ifdef DEBUGGING
2433 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2434 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2435#else
2436 SvREFCNT_dec(revcharmap);
07be1b83 2437#endif
786e8c11
YO
2438 return trie->jump
2439 ? MADE_JUMP_TRIE
2440 : trie->startstate>1
2441 ? MADE_EXACT_TRIE
2442 : MADE_TRIE;
2443}
2444
2445STATIC void
2446S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2447{
3b753521 2448/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2449
2450 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2451 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2452 ISBN 0-201-10088-6
2453
2454 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2455 suffix of the current state's 'word' that is also a proper prefix of another word in our
2456 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2457 the DFA not to have to restart after its tried and failed a word at a given point, it
2458 simply continues as though it had been matching the other word in the first place.
2459 Consider
2460 'abcdgu'=~/abcdefg|cdgu/
2461 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2462 fail, which would bring us to the state representing 'd' in the second word where we would
2463 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2464 */
2465 /* add a fail transition */
3251b653
NC
2466 const U32 trie_offset = ARG(source);
2467 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2468 U32 *q;
2469 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2470 const U32 numstates = trie->statecount;
786e8c11
YO
2471 const U32 ubound = trie->lasttrans + ucharcount;
2472 U32 q_read = 0;
2473 U32 q_write = 0;
2474 U32 charid;
2475 U32 base = trie->states[ 1 ].trans.base;
2476 U32 *fail;
2477 reg_ac_data *aho;
2478 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2479 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2480
2481 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2482#ifndef DEBUGGING
2483 PERL_UNUSED_ARG(depth);
2484#endif
2485
2486
2487 ARG_SET( stclass, data_slot );
c944940b 2488 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2489 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2490 aho->trie=trie_offset;
446bd890
NC
2491 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2492 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2493 Newxz( q, numstates, U32);
c944940b 2494 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2495 aho->refcount = 1;
2496 fail = aho->fail;
2497 /* initialize fail[0..1] to be 1 so that we always have
2498 a valid final fail state */
2499 fail[ 0 ] = fail[ 1 ] = 1;
2500
2501 for ( charid = 0; charid < ucharcount ; charid++ ) {
2502 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2503 if ( newstate ) {
2504 q[ q_write ] = newstate;
2505 /* set to point at the root */
2506 fail[ q[ q_write++ ] ]=1;
2507 }
2508 }
2509 while ( q_read < q_write) {
2510 const U32 cur = q[ q_read++ % numstates ];
2511 base = trie->states[ cur ].trans.base;
2512
2513 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2514 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2515 if (ch_state) {
2516 U32 fail_state = cur;
2517 U32 fail_base;
2518 do {
2519 fail_state = fail[ fail_state ];
2520 fail_base = aho->states[ fail_state ].trans.base;
2521 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2522
2523 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2524 fail[ ch_state ] = fail_state;
2525 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2526 {
2527 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2528 }
2529 q[ q_write++ % numstates] = ch_state;
2530 }
2531 }
2532 }
2533 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2534 when we fail in state 1, this allows us to use the
2535 charclass scan to find a valid start char. This is based on the principle
2536 that theres a good chance the string being searched contains lots of stuff
2537 that cant be a start char.
2538 */
2539 fail[ 0 ] = fail[ 1 ] = 0;
2540 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2541 PerlIO_printf(Perl_debug_log,
2542 "%*sStclass Failtable (%"UVuf" states): 0",
2543 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2544 );
786e8c11
YO
2545 for( q_read=1; q_read<numstates; q_read++ ) {
2546 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2547 }
2548 PerlIO_printf(Perl_debug_log, "\n");
2549 });
2550 Safefree(q);
2551 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2552}
2553
786e8c11 2554
a3621e74 2555/*
5d1c421c
JH
2556 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2557 * These need to be revisited when a newer toolchain becomes available.
2558 */
2559#if defined(__sparc64__) && defined(__GNUC__)
2560# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2561# undef SPARC64_GCC_WORKAROUND
2562# define SPARC64_GCC_WORKAROUND 1
2563# endif
2564#endif
2565
07be1b83 2566#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2567 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2568 SV * const mysv=sv_newmortal(); \
2569 regnode *Next = regnext(scan); \
2570 regprop(RExC_rx, mysv, scan); \
7f69552c 2571 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2572 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2573 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2574 }});
07be1b83 2575
1de06328 2576
bb914485
KW
2577/* The below joins as many adjacent EXACTish nodes as possible into a single
2578 * one, and looks for problematic sequences of characters whose folds vs.
2579 * non-folds have sufficiently different lengths, that the optimizer would be
2580 * fooled into rejecting legitimate matches of them, and the trie construction
2581 * code can't cope with them. The joining is only done if:
2582 * 1) there is room in the current conglomerated node to entirely contain the
2583 * next one.
2584 * 2) they are the exact same node type
2585 *
2586 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2587 * these get optimized out
2588 *
9d071ca8
KW
2589 * If there are problematic code sequences, *min_subtract is set to the delta
2590 * that the minimum size of the node can be less than its actual size. And,
2591 * the node type of the result is changed to reflect that it contains these
bb914485
KW
2592 * sequences.
2593 *
a0c4c608
KW
2594 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2595 * and contains LATIN SMALL LETTER SHARP S
f758bddf 2596 *
bb914485
KW
2597 * This is as good a place as any to discuss the design of handling these
2598 * problematic sequences. It's been wrong in Perl for a very long time. There
2599 * are three code points in Unicode whose folded lengths differ so much from
2600 * the un-folded lengths that it causes problems for the optimizer and trie
2601 * construction. Why only these are problematic, and not others where lengths
2602 * also differ is something I (khw) do not understand. New versions of Unicode
2603 * might add more such code points. Hopefully the logic in fold_grind.t that
287722f3 2604 * figures out what to test (in part by verifying that each size-combination
bb914485 2605 * gets tested) will catch any that do come along, so they can be added to the
287722f3
KW
2606 * special handling below. The chances of new ones are actually rather small,
2607 * as most, if not all, of the world's scripts that have casefolding have
2608 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2609 * made to allow compatibility with pre-existing standards, and almost all of
2610 * those have already been dealt with. These would otherwise be the most
2611 * likely candidates for generating further tricky sequences. In other words,
2612 * Unicode by itself is unlikely to add new ones unless it is for compatibility
a0c4c608 2613 * with pre-existing standards, and there aren't many of those left.
bb914485
KW
2614 *
2615 * The previous designs for dealing with these involved assigning a special
2616 * node for them. This approach doesn't work, as evidenced by this example:
a0c4c608 2617 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
bb914485
KW
2618 * Both these fold to "sss", but if the pattern is parsed to create a node of
2619 * that would match just the \xDF, it won't be able to handle the case where a
2620 * successful match would have to cross the node's boundary. The new approach
2621 * that hopefully generally solves the problem generates an EXACTFU_SS node
2622 * that is "sss".
2623 *
2624 * There are a number of components to the approach (a lot of work for just
2625 * three code points!):
2626 * 1) This routine examines each EXACTFish node that could contain the
9d071ca8
KW
2627 * problematic sequences. It returns in *min_subtract how much to
2628 * subtract from the the actual length of the string to get a real minimum
2629 * for one that could match it. This number is usually 0 except for the
2630 * problematic sequences. This delta is used by the caller to adjust the
2631 * min length of the match, and the delta between min and max, so that the
2632 * optimizer doesn't reject these possibilities based on size constraints.
bb914485
KW
2633 * 2) These sequences are not currently correctly handled by the trie code
2634 * either, so it changes the joined node type to ops that are not handled
fab2782b 2635 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
bb914485
KW
2636 * 3) This is sufficient for the two Greek sequences (described below), but
2637 * the one involving the Sharp s (\xDF) needs more. The node type
2638 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2639 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2640 * case where there is a possible fold length change. That means that a
2641 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2642 * itself with length changes, and so can be processed faster. regexec.c
2643 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2644 * is pre-folded by regcomp.c. This saves effort in regex matching.
2645 * However, probably mostly for historical reasons, the pre-folding isn't
a0c4c608
KW
2646 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2647 * nodes, as what they fold to isn't known until runtime.) The fold
2648 * possibilities for the non-UTF8 patterns are quite simple, except for
2649 * the sharp s. All the ones that don't involve a UTF-8 target string
2650 * are members of a fold-pair, and arrays are set up for all of them
2651 * that quickly find the other member of the pair. It might actually
2652 * be faster to pre-fold these, but it isn't currently done, except for
2653 * the sharp s. Code elsewhere in this file makes sure that it gets
2654 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2655 * issues described in the next item.
bb914485
KW
2656 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2657 * 'ss' or not is not knowable at compile time. It will match iff the
2658 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2659 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2660 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2661 * described in item 3). An assumption that the optimizer part of
2662 * regexec.c (probably unwittingly) makes is that a character in the
2663 * pattern corresponds to at most a single character in the target string.
2664 * (And I do mean character, and not byte here, unlike other parts of the
2665 * documentation that have never been updated to account for multibyte
2666 * Unicode.) This assumption is wrong only in this case, as all other
2667 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2668 * virtue of having this file pre-fold UTF-8 patterns. I'm
2669 * reluctant to try to change this assumption, so instead the code punts.
9d071ca8
KW
2670 * This routine examines EXACTF nodes for the sharp s, and returns a
2671 * boolean indicating whether or not the node is an EXACTF node that
2672 * contains a sharp s. When it is true, the caller sets a flag that later
2673 * causes the optimizer in this file to not set values for the floating
2674 * and fixed string lengths, and thus avoids the optimizer code in
2675 * regexec.c that makes the invalid assumption. Thus, there is no
2676 * optimization based on string lengths for EXACTF nodes that contain the
2677 * sharp s. This only happens for /id rules (which means the pattern
2678 * isn't in UTF-8).
bb914485 2679 */
1de06328 2680
9d071ca8 2681#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 2682 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 2683 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 2684
be8e71aa 2685STATIC U32
9d071ca8 2686S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
2687 /* Merge several consecutive EXACTish nodes into one. */
2688 regnode *n = regnext(scan);
2689 U32 stringok = 1;
2690 regnode *next = scan + NODE_SZ_STR(scan);
2691 U32 merged = 0;
2692 U32 stopnow = 0;
2693#ifdef DEBUGGING
2694 regnode *stop = scan;
72f13be8 2695 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2696#else
d47053eb
RGS
2697 PERL_UNUSED_ARG(depth);
2698#endif
7918f24d
NC
2699
2700 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2701#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2702 PERL_UNUSED_ARG(flags);
2703 PERL_UNUSED_ARG(val);
07be1b83 2704#endif
07be1b83 2705 DEBUG_PEEP("join",scan,depth);
bb914485 2706
3f410cf6
KW
2707 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2708 * EXACT ones that are mergeable to the current one. */
2709 while (n
2710 && (PL_regkind[OP(n)] == NOTHING
2711 || (stringok && OP(n) == OP(scan)))
07be1b83 2712 && NEXT_OFF(n)
3f410cf6
KW
2713 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2714 {
07be1b83
YO
2715
2716 if (OP(n) == TAIL || n > next)
2717 stringok = 0;
2718 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2719 DEBUG_PEEP("skip:",n,depth);
2720 NEXT_OFF(scan) += NEXT_OFF(n);
2721 next = n + NODE_STEP_REGNODE;
2722#ifdef DEBUGGING
2723 if (stringok)
2724 stop = n;
2725#endif
2726 n = regnext(n);
2727 }
2728 else if (stringok) {
786e8c11 2729 const unsigned int oldl = STR_LEN(scan);
07be1b83 2730 regnode * const nnext = regnext(n);
b2230d39
KW
2731
2732 if (oldl + STR_LEN(n) > U8_MAX)
2733 break;
07be1b83
YO
2734
2735 DEBUG_PEEP("merg",n,depth);
07be1b83 2736 merged++;
b2230d39 2737
07be1b83
YO
2738 NEXT_OFF(scan) += NEXT_OFF(n);
2739 STR_LEN(scan) += STR_LEN(n);
2740 next = n + NODE_SZ_STR(n);
2741 /* Now we can overwrite *n : */
2742 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2743#ifdef DEBUGGING
2744 stop = next - 1;
2745#endif
2746 n = nnext;
2747 if (stopnow) break;
2748 }
2749
d47053eb
RGS
2750#ifdef EXPERIMENTAL_INPLACESCAN
2751 if (flags && !NEXT_OFF(n)) {
2752 DEBUG_PEEP("atch", val, depth);
2753 if (reg_off_by_arg[OP(n)]) {
2754 ARG_SET(n, val - n);
2755 }
2756 else {
2757 NEXT_OFF(n) = val - n;
2758 }
2759 stopnow = 1;
2760 }
07be1b83
YO
2761#endif
2762 }
2c2b7f86 2763
9d071ca8 2764 *min_subtract = 0;
f758bddf 2765 *has_exactf_sharp_s = FALSE;
f646642f 2766
3f410cf6
KW
2767 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2768 * can now analyze for sequences of problematic code points. (Prior to
2769 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
2770 * hence missed). The sequences only happen in folding, hence for any
2771 * non-EXACT EXACTish node */
86d6fcad 2772 if (OP(scan) != EXACT) {
f758bddf
KW
2773 U8 *s;
2774 U8 * s0 = (U8*) STRING(scan);
2775 U8 * const s_end = s0 + STR_LEN(scan);
2776
2777 /* The below is perhaps overboard, but this allows us to save a test
2778 * each time through the loop at the expense of a mask. This is
2779 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2780 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2781 * This uses an exclusive 'or' to find that bit and then inverts it to
2782 * form a mask, with just a single 0, in the bit position where 'S' and
2783 * 's' differ. */
dbeb8947 2784 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
f758bddf
KW
2785 const U8 s_masked = 's' & S_or_s_mask;
2786
2787 /* One pass is made over the node's string looking for all the
2788 * possibilities. to avoid some tests in the loop, there are two main
2789 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2790 * non-UTF-8 */
2791 if (UTF) {
86d6fcad 2792
f758bddf
KW
2793 /* There are two problematic Greek code points in Unicode
2794 * casefolding
86d6fcad
KW
2795 *
2796 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2797 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2798 *
2799 * which casefold to
2800 *
2801 * Unicode UTF-8
2802 *
2803 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2804 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2805 *
2806 * This means that in case-insensitive matching (or "loose
2807 * matching", as Unicode calls it), an EXACTF of length six (the
2808 * UTF-8 encoded byte length of the above casefolded versions) can
2809 * match a target string of length two (the byte length of UTF-8
2810 * encoded U+0390 or U+03B0). This would rather mess up the
2811 * minimum length computation. (there are other code points that
2812 * also fold to these two sequences, but the delta is smaller)
2813 *
f758bddf
KW
2814 * If these sequences are found, the minimum length is decreased by
2815 * four (six minus two).
86d6fcad 2816 *
f758bddf
KW
2817 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2818 * LETTER SHARP S. We decrease the min length by 1 for each
2819 * occurrence of 'ss' found */
3f410cf6 2820
e294cc5d 2821#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
f758bddf
KW
2822# define U390_first_byte 0xb4
2823 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2824# define U3B0_first_byte 0xb5
2825 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
e294cc5d 2826#else
f758bddf
KW
2827# define U390_first_byte 0xce
2828 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2829# define U3B0_first_byte 0xcf
2830 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
e294cc5d 2831#endif
f758bddf
KW
2832 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2833 yields a net of 0 */
2834 /* Examine the string for one of the problematic sequences */
2835 for (s = s0;
2836 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2837 * sequence we are looking for is 2 */
2838 s += UTF8SKIP(s))
86d6fcad 2839 {
bb914485 2840
f758bddf
KW
2841 /* Look for the first byte in each problematic sequence */
2842 switch (*s) {
2843 /* We don't have to worry about other things that fold to
2844 * 's' (such as the long s, U+017F), as all above-latin1
2845 * code points have been pre-folded */
2846 case 's':
2847 case 'S':
2848
a0c4c608
KW
2849 /* Current character is an 's' or 'S'. If next one is
2850 * as well, we have the dreaded sequence */
f758bddf
KW
2851 if (((*(s+1) & S_or_s_mask) == s_masked)
2852 /* These two node types don't have special handling
2853 * for 'ss' */
2854 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2855 {
9d071ca8 2856 *min_subtract += 1;
f758bddf
KW
2857 OP(scan) = EXACTFU_SS;
2858 s++; /* No need to look at this character again */
2859 }
2860 break;
2861
2862 case U390_first_byte:
2863 if (s_end - s >= len
2864
2865 /* The 1's are because are skipping comparing the
2866 * first byte */
2867 && memEQ(s + 1, U390_tail, len - 1))
2868 {
2869 goto greek_sequence;
2870 }
2871 break;
2872
2873 case U3B0_first_byte:
2874 if (! (s_end - s >= len
2875 && memEQ(s + 1, U3B0_tail, len - 1)))
2876 {
2877 break;
2878 }
2879 greek_sequence:
9d071ca8 2880 *min_subtract += 4;
f758bddf
KW
2881
2882 /* This can't currently be handled by trie's, so change
2883 * the node type to indicate this. If EXACTFA and
2884 * EXACTFL were ever to be handled by trie's, this
2885 * would have to be changed. If this node has already
2886 * been changed to EXACTFU_SS in this loop, leave it as
2887 * is. (I (khw) think it doesn't matter in regexec.c
2888 * for UTF patterns, but no need to change it */
2889 if (OP(scan) == EXACTFU) {
fab2782b 2890 OP(scan) = EXACTFU_TRICKYFOLD;
f758bddf
KW
2891 }
2892 s += 6; /* We already know what this sequence is. Skip
2893 the rest of it */
2894 break;
bb914485
KW
2895 }
2896 }
2897 }
f758bddf 2898 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
bb914485 2899
f758bddf
KW
2900 /* Here, the pattern is not UTF-8. We need to look only for the
2901 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2902 * in the final position. Otherwise we can stop looking 1 byte
2903 * earlier because have to find both the first and second 's' */
2904 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2905
2906 for (s = s0; s < upper; s++) {
2907 switch (*s) {
2908 case 'S':
2909 case 's':
2910 if (s_end - s > 1
2911 && ((*(s+1) & S_or_s_mask) == s_masked))
2912 {
9d071ca8 2913 *min_subtract += 1;
f758bddf
KW
2914
2915 /* EXACTF nodes need to know that the minimum
2916 * length changed so that a sharp s in the string
2917 * can match this ss in the pattern, but they
2918 * remain EXACTF nodes, as they are not trie'able,
2919 * so don't have to invent a new node type to
2920 * exclude them from the trie code */
2921 if (OP(scan) != EXACTF) {
2922 OP(scan) = EXACTFU_SS;
2923 }
2924 s++;
2925 }
2926 break;
2927 case LATIN_SMALL_LETTER_SHARP_S:
2928 if (OP(scan) == EXACTF) {
2929 *has_exactf_sharp_s = TRUE;
2930 }
2931 break;
86d6fcad
KW
2932 }
2933 }
2934 }
07be1b83 2935 }
3f410cf6 2936
07be1b83 2937#ifdef DEBUGGING
bb789b09
DM
2938 /* Allow dumping but overwriting the collection of skipped
2939 * ops and/or strings with fake optimized ops */
07be1b83
YO
2940 n = scan + NODE_SZ_STR(scan);
2941 while (n <= stop) {
bb789b09
DM
2942 OP(n) = OPTIMIZED;
2943 FLAGS(n) = 0;
2944 NEXT_OFF(n) = 0;
07be1b83
YO
2945 n++;
2946 }
2947#endif
2948 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2949 return stopnow;
2950}
2951
486ec47a 2952/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2953 Finds fixed substrings. */
2954
a0288114 2955/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2956 to the position after last scanned or to NULL. */
2957
40d049e4
YO
2958#define INIT_AND_WITHP \
2959 assert(!and_withp); \
2960 Newx(and_withp,1,struct regnode_charclass_class); \
2961 SAVEFREEPV(and_withp)
07be1b83 2962
b515a41d 2963/* this is a chain of data about sub patterns we are processing that
486ec47a 2964 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2965 we can simulate recursion without losing state. */
2966struct scan_frame;
2967typedef struct scan_frame {
2968 regnode *last; /* last node to process in this frame */
2969 regnode *next; /* next node to process when last is reached */
2970 struct scan_frame *prev; /*previous frame*/
2971 I32 stop; /* what stopparen do we use */
2972} scan_frame;
2973
304ee84b
YO
2974
2975#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2976
e1d1eefb
YO
2977#define CASE_SYNST_FNC(nAmE) \
2978case nAmE: \
2979 if (flags & SCF_DO_STCLASS_AND) { \
2980 for (value = 0; value < 256; value++) \
2981 if (!is_ ## nAmE ## _cp(value)) \
2982 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2983 } \
2984 else { \
2985 for (value = 0; value < 256; value++) \
2986 if (is_ ## nAmE ## _cp(value)) \
2987 ANYOF_BITMAP_SET(data->start_class, value); \
2988 } \
2989 break; \
2990case N ## nAmE: \
2991 if (flags & SCF_DO_STCLASS_AND) { \
2992 for (value = 0; value < 256; value++) \
2993 if (is_ ## nAmE ## _cp(value)) \
2994 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2995 } \
2996 else { \
2997 for (value = 0; value < 256; value++) \
2998 if (!is_ ## nAmE ## _cp(value)) \
2999 ANYOF_BITMAP_SET(data->start_class, value); \
3000 } \
3001 break
3002
3003
3004
76e3520e 3005STATIC I32
40d049e4 3006S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 3007 I32 *minlenp, I32 *deltap,
40d049e4
YO
3008 regnode *last,
3009 scan_data_t *data,
3010 I32 stopparen,
3011 U8* recursed,
3012 struct regnode_charclass_class *and_withp,
3013 U32 flags, U32 depth)
c277df42
IZ
3014 /* scanp: Start here (read-write). */
3015 /* deltap: Write maxlen-minlen here. */
3016 /* last: Stop before this one. */
40d049e4
YO
3017 /* data: string data about the pattern */
3018 /* stopparen: treat close N as END */
3019 /* recursed: which subroutines have we recursed into */
3020 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3021{
97aff369 3022 dVAR;
c277df42
IZ
3023 I32 min = 0, pars = 0, code;
3024 regnode *scan = *scanp, *next;
3025 I32 delta = 0;
3026 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3027 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3028 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3029 scan_data_t data_fake;
a3621e74 3030 SV *re_trie_maxbuff = NULL;
786e8c11 3031 regnode *first_non_open = scan;
e2e6a0f1 3032 I32 stopmin = I32_MAX;
8aa23a47 3033 scan_frame *frame = NULL;
a3621e74 3034 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3035
7918f24d
NC
3036 PERL_ARGS_ASSERT_STUDY_CHUNK;
3037
13a24bad 3038#ifdef DEBUGGING
40d049e4 3039 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3040#endif
40d049e4 3041
786e8c11 3042 if ( depth == 0 ) {
40d049e4 3043 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3044 first_non_open=regnext(first_non_open);
3045 }
3046
b81d288d 3047
8aa23a47
YO
3048 fake_study_recurse:
3049 while ( scan && OP(scan) != END && scan < last ){
9d071ca8
KW
3050 UV min_subtract = 0; /* How much to subtract from the minimum node
3051 length to get a real minimum (because the
3052 folded version may be shorter) */
f758bddf 3053 bool has_exactf_sharp_s = FALSE;
8aa23a47 3054 /* Peephole optimizer: */
304ee84b 3055 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47 3056 DEBUG_PEEP("Peep",scan,depth);
a0c4c608
KW
3057
3058 /* Its not clear to khw or hv why this is done here, and not in the
3059 * clauses that deal with EXACT nodes. khw's guess is that it's
3060 * because of a previous design */
9d071ca8 3061 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
3062
3063 /* Follow the next-chain of the current node and optimize
3064 away all the NOTHINGs from it. */
3065 if (OP(scan) != CURLYX) {
3066 const int max = (reg_off_by_arg[OP(scan)]
3067 ? I32_MAX
3068 /* I32 may be smaller than U16 on CRAYs! */
3069 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3070 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3071 int noff;
3072 regnode *n = scan;
686b73d4 3073
8aa23a47
YO
3074 /* Skip NOTHING and LONGJMP. */
3075 while ((n = regnext(n))
3076 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3077 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3078 && off + noff < max)
3079 off += noff;
3080 if (reg_off_by_arg[OP(scan)])
3081 ARG(scan) = off;
3082 else
3083 NEXT_OFF(scan) = off;
3084 }
a3621e74 3085
c277df42 3086
8aa23a47
YO
3087
3088 /* The principal pseudo-switch. Cannot be a switch, since we
3089 look into several different things. */
3090 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3091 || OP(scan) == IFTHEN) {
3092 next = regnext(scan);
3093 code = OP(scan);
3094 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3095
8aa23a47
YO
3096 if (OP(next) == code || code == IFTHEN) {
3097 /* NOTE - There is similar code to this block below for handling
3098 TRIE nodes on a re-study. If you change stuff here check there
3099 too. */
3100 I32 max1 = 0, min1 = I32_MAX, num = 0;
3101 struct regnode_charclass_class accum;
3102 regnode * const startbranch=scan;
686b73d4 3103
8aa23a47 3104 if (flags & SCF_DO_SUBSTR)
304ee84b 3105 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3106 if (flags & SCF_DO_STCLASS)
e755fd73 3107 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
3108
3109 while (OP(scan) == code) {
3110 I32 deltanext, minnext, f = 0, fake;
3111 struct regnode_charclass_class this_class;
3112
3113 num++;
3114 data_fake.flags = 0;
3115 if (data) {
3116 data_fake.whilem_c = data->whilem_c;
3117 data_fake.last_closep = data->last_closep;
3118 }
3119 else
3120 data_fake.last_closep = &fake;
58e23c8d
YO
3121
3122 data_fake.pos_delta = delta;
8aa23a47
YO
3123 next = regnext(scan);
3124 scan = NEXTOPER(scan);
3125 if (code != BRANCH)
c277df42 3126 scan = NEXTOPER(scan);
8aa23a47 3127 if (flags & SCF_DO_STCLASS) {
e755fd73 3128 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3129 data_fake.start_class = &this_class;
3130 f = SCF_DO_STCLASS_AND;
58e23c8d 3131 }
8aa23a47
YO
3132 if (flags & SCF_WHILEM_VISITED_POS)
3133 f |= SCF_WHILEM_VISITED_POS;
3134
3135 /* we suppose the run is continuous, last=next...*/
3136 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3137 next, &data_fake,
3138 stopparen, recursed, NULL, f,depth+1);
3139 if (min1 > minnext)
3140 min1 = minnext;
3141 if (max1 < minnext + deltanext)
3142 max1 = minnext + deltanext;
3143 if (deltanext == I32_MAX)
3144 is_inf = is_inf_internal = 1;
3145 scan = next;
3146 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3147 pars++;
3148 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3149 if ( stopmin > minnext)
3150 stopmin = min + min1;
3151 flags &= ~SCF_DO_SUBSTR;
3152 if (data)
3153 data->flags |= SCF_SEEN_ACCEPT;
3154 }
3155 if (data) {
3156 if (data_fake.flags & SF_HAS_EVAL)
3157 data->flags |= SF_HAS_EVAL;
3158 data->whilem_c = data_fake.whilem_c;
3dab1dad 3159 }
8aa23a47 3160 if (flags & SCF_DO_STCLASS)
3fffb88a 3161 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3162 }
3163 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3164 min1 = 0;
3165 if (flags & SCF_DO_SUBSTR) {
3166 data->pos_min += min1;
3167 data->pos_delta += max1 - min1;
3168 if (max1 != min1 || is_inf)
3169 data->longest = &(data->longest_float);
3170 }
3171 min += min1;
3172 delta += max1 - min1;
3173 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3174 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
3175 if (min1) {
3176 cl_and(data->start_class, and_withp);
3177 flags &= ~SCF_DO_STCLASS;
653099ff 3178 }
8aa23a47
YO
3179 }
3180 else if (flags & SCF_DO_STCLASS_AND) {
3181 if (min1) {
3182 cl_and(data->start_class, &accum);
3183 flags &= ~SCF_DO_STCLASS;
de0c8cb8 3184 }
8aa23a47
YO
3185 else {
3186 /* Switch to OR mode: cache the old value of
3187 * data->start_class */
3188 INIT_AND_WITHP;
3189 StructCopy(data->start_class, and_withp,
3190 struct regnode_charclass_class);
3191 flags &= ~SCF_DO_STCLASS_AND;
3192 StructCopy(&accum, data->start_class,
3193 struct regnode_charclass_class);
3194 flags |= SCF_DO_STCLASS_OR;
3195 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 3196 }
8aa23a47 3197 }
a3621e74 3198
8aa23a47
YO
3199 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3200 /* demq.
a3621e74 3201
8aa23a47
YO
3202 Assuming this was/is a branch we are dealing with: 'scan' now
3203 points at the item that follows the branch sequence, whatever
3204 it is. We now start at the beginning of the sequence and look
3205 for subsequences of
a3621e74 3206
8aa23a47
YO
3207 BRANCH->EXACT=>x1
3208 BRANCH->EXACT=>x2
3209 tail
a3621e74 3210
8aa23a47 3211 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 3212
486ec47a 3213 If we can find such a subsequence we need to turn the first
8aa23a47
YO
3214 element into a trie and then add the subsequent branch exact
3215 strings to the trie.
a3621e74 3216
8aa23a47 3217 We have two cases
a3621e74 3218
3b753521 3219 1. patterns where the whole set of branches can be converted.
a3621e74 3220
8aa23a47 3221 2. patterns where only a subset can be converted.
a3621e74 3222
8aa23a47
YO
3223 In case 1 we can replace the whole set with a single regop
3224 for the trie. In case 2 we need to keep the start and end
3b753521 3225 branches so
a3621e74 3226
8aa23a47
YO
3227 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3228 becomes BRANCH TRIE; BRANCH X;
786e8c11 3229
8aa23a47
YO
3230 There is an additional case, that being where there is a
3231 common prefix, which gets split out into an EXACT like node
3232 preceding the TRIE node.
a3621e74 3233
8aa23a47
YO
3234 If x(1..n)==tail then we can do a simple trie, if not we make
3235 a "jump" trie, such that when we match the appropriate word
486ec47a 3236 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 3237 a nested if into a case structure of sorts.
b515a41d 3238
8aa23a47 3239 */
686b73d4 3240
8aa23a47
YO
3241 int made=0;
3242 if (!re_trie_maxbuff) {
3243 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3244 if (!SvIOK(re_trie_maxbuff))
3245 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3246 }
3247 if ( SvIV(re_trie_maxbuff)>=0 ) {
3248 regnode *cur;
3249 regnode *first = (regnode *)NULL;
3250 regnode *last = (regnode *)NULL;
3251 regnode *tail = scan;
fab2782b 3252 U8 trietype = 0;
8aa23a47 3253 U32 count=0;
a3621e74
YO
3254
3255#ifdef DEBUGGING
8aa23a47 3256 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 3257#endif
8aa23a47
YO
3258 /* var tail is used because there may be a TAIL
3259 regop in the way. Ie, the exacts will point to the
3260 thing following the TAIL, but the last branch will
3261 point at the TAIL. So we advance tail. If we
3262 have nested (?:) we may have to move through several
3263 tails.
3264 */
3265
3266 while ( OP( tail ) == TAIL ) {
3267 /* this is the TAIL generated by (?:) */
3268 tail = regnext( tail );
3269 }
a3621e74 3270
8aa23a47 3271
df826430 3272 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3273 regprop(RExC_rx, mysv, tail );
3274 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3275 (int)depth * 2 + 2, "",
3276 "Looking for TRIE'able sequences. Tail node is: ",
3277 SvPV_nolen_const( mysv )
3278 );
3279 });
3280
3281 /*
3282
fab2782b
YO
3283 Step through the branches
3284 cur represents each branch,
3285 noper is the first thing to be matched as part of that branch
3286 noper_next is the regnext() of that node.
3287
3288 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3289 via a "jump trie" but we also support building with NOJUMPTRIE,
3290 which restricts the trie logic to structures like /FOO|BAR/.
3291
3292 If noper is a trieable nodetype then the branch is a possible optimization
3293 target. If we are building under NOJUMPTRIE then we require that noper_next
3294 is the same as scan (our current position in the regex program).
3295
3296 Once we have two or more consecutive such branches we can create a
3297 trie of the EXACT's contents and stitch it in place into the program.
3298
3299 If the sequence represents all of the branches in the alternation we
3300 replace the entire thing with a single TRIE node.
3301
3302 Otherwise when it is a subsequence we need to stitch it in place and
3303 replace only the relevant branches. This means the first branch has
3304 to remain as it is used by the alternation logic, and its next pointer,
3305 and needs to be repointed at the item on the branch chain following
3306 the last branch we have optimized away.
3307
3308 This could be either a BRANCH, in which case the subsequence is internal,
3309 or it could be the item following the branch sequence in which case the
3310 subsequence is at the end (which does not necessarily mean the first node
3311 is the start of the alternation).
3312
3313 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3314
3315 optype | trietype
3316 ----------------+-----------
3317 NOTHING | NOTHING
3318 EXACT | EXACT
3319 EXACTFU | EXACTFU
3320 EXACTFU_SS | EXACTFU
3321 EXACTFU_TRICKYFOLD | EXACTFU
3322 EXACTFA | 0
3323
8aa23a47
YO
3324
3325 */
fab2782b
YO
3326#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3327 ( EXACT == (X) ) ? EXACT : \
3328 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3329 0 )
8aa23a47
YO
3330
3331 /* dont use tail as the end marker for this traverse */
3332 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3333 regnode * const noper = NEXTOPER( cur );
fab2782b
YO
3334 U8 noper_type = OP( noper );
3335 U8 noper_trietype = TRIE_TYPE( noper_type );
b515a41d 3336#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 3337 regnode * const noper_next = regnext( noper );
df826430
YO
3338 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3339 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
b515a41d
YO
3340#endif
3341
df826430 3342 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3343 regprop(RExC_rx, mysv, cur);
3344 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3345 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3346
3347 regprop(RExC_rx, mysv, noper);
3348 PerlIO_printf( Perl_debug_log, " -> %s",
3349 SvPV_nolen_const(mysv));
3350
3351 if ( noper_next ) {
3352 regprop(RExC_rx, mysv, noper_next );
3353 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3354 SvPV_nolen_const(mysv));
3355 }
df826430
YO
3356 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3357 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3358 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3359 );
8aa23a47 3360 });
fab2782b
YO
3361
3362 /* Is noper a trieable nodetype that can be merged with the
3363 * current trie (if there is one)? */
3364 if ( noper_trietype
3365 &&
3366 (
df826430
YO
3367 ( noper_trietype == NOTHING)
3368 || ( trietype == NOTHING )
a40630bf 3369 || ( trietype == noper_trietype )
fab2782b 3370 )
786e8c11 3371#ifdef NOJUMPTRIE
8aa23a47 3372 && noper_next == tail
786e8c11 3373#endif
8aa23a47
YO
3374 && count < U16_MAX)
3375 {
fab2782b
YO
3376 /* Handle mergable triable node
3377 * Either we are the first node in a new trieable sequence,
3378 * in which case we do some bookkeeping, otherwise we update
3379 * the end pointer. */
fab2782b 3380 if ( !first ) {
3b6759a6 3381 first = cur;
df826430
YO
3382 if ( noper_trietype == NOTHING ) {
3383#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3384 regnode * const noper_next = regnext( noper );
3b6759a6 3385 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
df826430
YO
3386 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3387#endif
3388
190c1910 3389 if ( noper_next_trietype ) {
df826430 3390 trietype = noper_next_trietype;
190c1910
YO
3391 } else if (noper_next_type) {
3392 /* a NOTHING regop is 1 regop wide. We need at least two
3393 * for a trie so we can't merge this in */
3394 first = NULL;
3395 }
3396 } else {
3397 trietype = noper_trietype;
3b6759a6 3398 }
8aa23a47 3399 } else {
fab2782b
YO
3400 if ( trietype == NOTHING )
3401 trietype = noper_trietype;
8aa23a47
YO
3402 last = cur;
3403 }
df826430
YO
3404 if (first)
3405 count++;
fab2782b
YO
3406 } /* end handle mergable triable node */
3407 else {
3408 /* handle unmergable node -
3409 * noper may either be a triable node which can not be tried
3410 * together with the current trie, or a non triable node */
729aaeb5
YO
3411 if ( last ) {
3412 /* If last is set and trietype is not NOTHING then we have found
3413 * at least two triable branch sequences in a row of a similar
3414 * trietype so we can turn them into a trie. If/when we
3415 * allow NOTHING to start a trie sequence this condition will be
3416 * required, and it isn't expensive so we leave it in for now. */
3417 if ( trietype != NOTHING )
3418 make_trie( pRExC_state,
3419 startbranch, first, cur, tail, count,
3420 trietype, depth+1 );
fab2782b 3421 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
8aa23a47 3422 }
fab2782b 3423 if ( noper_trietype
786e8c11 3424#ifdef NOJUMPTRIE
8aa23a47 3425 && noper_next == tail
786e8c11 3426#endif
8aa23a47 3427 ){
fab2782b 3428 /* noper is triable, so we can start a new trie sequence */
8aa23a47
YO
3429 count = 1;
3430 first = cur;
fab2782b
YO
3431 trietype = noper_trietype;
3432 } else if (first) {
3433 /* if we already saw a first but the current node is not triable then we have
3434 * to reset the first information. */
8aa23a47
YO
3435 count = 0;
3436 first = NULL;
fab2782b 3437 trietype = 0;
8aa23a47 3438 }
fab2782b
YO
3439 } /* end handle unmergable node */
3440 } /* loop over branches */
df826430 3441 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3442 regprop(RExC_rx, mysv, cur);
3443 PerlIO_printf( Perl_debug_log,
3444 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3445 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3446
3447 });
3b6759a6
YO
3448 if ( last ) {
3449 if ( trietype != NOTHING ) {
3450 /* the last branch of the sequence was part of a trie,
3451 * so we have to construct it here outside of the loop
3452 */
3453 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
686b73d4 3454#ifdef TRIE_STUDY_OPT
3b6759a6
YO
3455 if ( ((made == MADE_EXACT_TRIE &&
3456 startbranch == first)
3457 || ( first_non_open == first )) &&
3458 depth==0 ) {
3459 flags |= SCF_TRIE_RESTUDY;
3460 if ( startbranch == first
3461 && scan == tail )
3462 {
3463 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3464 }
8aa23a47 3465 }
3dab1dad 3466#endif
3b6759a6
YO
3467 } else {
3468 /* at this point we know whatever we have is a NOTHING sequence/branch
3469 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3470 */
3471 if ( startbranch == first ) {
3472 regnode *opt;
3473 /* the entire thing is a NOTHING sequence, something like this:
3474 * (?:|) So we can turn it into a plain NOTHING op. */
3475 DEBUG_TRIE_COMPILE_r({
3476 regprop(RExC_rx, mysv, cur);
3477 PerlIO_printf( Perl_debug_log,
3478 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3479 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3480
3481 });
3482 OP(startbranch)= NOTHING;
3483 NEXT_OFF(startbranch)= tail - startbranch;
3484 for ( opt= startbranch + 1; opt < tail ; opt++ )
3485 OP(opt)= OPTIMIZED;
3486 }
3487 }
fab2782b
YO
3488 } /* end if ( last) */
3489 } /* TRIE_MAXBUF is non zero */
8aa23a47
YO
3490
3491 } /* do trie */
3492
653099ff 3493 }
8aa23a47
YO
3494 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3495 scan = NEXTOPER(NEXTOPER(scan));
3496 } else /* single branch is optimized. */
3497 scan = NEXTOPER(scan);
3498 continue;
3499 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3500 scan_frame *newframe = NULL;
3501 I32 paren;
3502 regnode *start;
3503 regnode *end;
3504
3505 if (OP(scan) != SUSPEND) {
3506 /* set the pointer */
3507 if (OP(scan) == GOSUB) {
3508 paren = ARG(scan);
3509 RExC_recurse[ARG2L(scan)] = scan;
3510 start = RExC_open_parens[paren-1];
3511 end = RExC_close_parens[paren-1];
3512 } else {
3513 paren = 0;
f8fc2ecf 3514 start = RExC_rxi->program + 1;
8aa23a47
YO
3515 end = RExC_opend;
3516 }
3517 if (!recursed) {
3518 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3519 SAVEFREEPV(recursed);
3520 }
3521 if (!PAREN_TEST(recursed,paren+1)) {
3522 PAREN_SET(recursed,paren+1);
3523 Newx(newframe,1,scan_frame);
3524 } else {
3525 if (flags & SCF_DO_SUBSTR) {
304ee84b 3526 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3527 data->longest = &(data->longest_float);
3528 }
3529 is_inf = is_inf_internal = 1;
3530 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3531 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3532 flags &= ~SCF_DO_STCLASS;
3533 }
3534 } else {
3535 Newx(newframe,1,scan_frame);
3536 paren = stopparen;
3537 start = scan+2;
3538 end = regnext(scan);
3539 }
3540 if (newframe) {
3541 assert(start);
3542 assert(end);
3543 SAVEFREEPV(newframe);
3544 newframe->next = regnext(scan);
3545 newframe->last = last;
3546 newframe->stop = stopparen;
3547 newframe->prev = frame;
3548
3549 frame = newframe;
3550 scan = start;
3551 stopparen = paren;
3552 last = end;
3553
3554 continue;
3555 }
3556 }
3557 else if (OP(scan) == EXACT) {
3558 I32 l = STR_LEN(scan);
3559 UV uc;
3560 if (UTF) {
3561 const U8 * const s = (U8*)STRING(scan);
4b88fb76 3562 uc = utf8_to_uvchr_buf(s, s + l, NULL);
8aa23a47 3563 l = utf8_length(s, s + l);
8aa23a47
YO
3564 } else {
3565 uc = *((U8*)STRING(scan));
3566 }
3567 min += l;
3568 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3569 /* The code below prefers earlier match for fixed
3570 offset, later match for variable offset. */
3571 if (data->last_end == -1) { /* Update the start info. */
3572 data->last_start_min = data->pos_min;
3573 data->last_start_max = is_inf
3574 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3575 }
8aa23a47
YO
3576 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3577 if (UTF)
3578 SvUTF8_on(data->last_found);
3579 {
3580 SV * const sv = data->last_found;
3581 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3582 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3583 if (mg && mg->mg_len >= 0)
3584 mg->mg_len += utf8_length((U8*)STRING(scan),
3585 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3586 }
8aa23a47
YO
3587 data->last_end = data->pos_min + l;
3588 data->pos_min += l; /* As in the first entry. */
3589 data->flags &= ~SF_BEFORE_EOL;
3590 }
3591 if (flags & SCF_DO_STCLASS_AND) {
3592 /* Check whether it is compatible with what we know already! */
3593 int compat = 1;
3594
54251c2e 3595
486ec47a 3596 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3597 * in the bitmp and either 1) its bit or its fold is set, or 2)
3598 * it's for a locale. Even if there isn't unicode semantics
3599 * here, at runtime there may be because of matching against a
3600 * utf8 string, so accept a possible false positive for
3601 * latin1-range folds */
8aa23a47
YO
3602 if (uc >= 0x100 ||
3603 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3604 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3605 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3606 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3607 )
d18bf9dc 3608 {
8aa23a47 3609 compat = 0;
d18bf9dc 3610 }
8aa23a47
YO
3611 ANYOF_CLASS_ZERO(data->start_class);
3612 ANYOF_BITMAP_ZERO(data->start_class);
3613 if (compat)
3614 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3615 else if (uc >= 0x100) {
3616 int i;
3617
3618 /* Some Unicode code points fold to the Latin1 range; as
3619 * XXX temporary code, instead of figuring out if this is
3620 * one, just assume it is and set all the start class bits
3621 * that could be some such above 255 code point's fold
3622 * which will generate fals positives. As the code
3623 * elsewhere that does compute the fold settles down, it
3624 * can be extracted out and re-used here */
3625 for (i = 0; i < 256; i++){
94dc5c2d 3626 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
d18bf9dc
KW
3627 ANYOF_BITMAP_SET(data->start_class, i);
3628 }
3629 }
3630 }
8aa23a47
YO
3631 data->start_class->flags &= ~ANYOF_EOS;
3632 if (uc < 0x100)
3633 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3634 }
3635 else if (flags & SCF_DO_STCLASS_OR) {
3636 /* false positive possible if the class is case-folded */
3637 if (uc < 0x100)
3638 ANYOF_BITMAP_SET(data->start_class, uc);
3639 else
3640 data->start_class->flags |= ANYOF_UNICODE_ALL;
3641 data->start_class->flags &= ~ANYOF_EOS;
3642 cl_and(data->start_class, and_withp);
3643 }
3644 flags &= ~SCF_DO_STCLASS;
3645 }
3646 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3647 I32 l = STR_LEN(scan);
3648 UV uc = *((U8*)STRING(scan));
3649
3650 /* Search for fixed substrings supports EXACT only. */
3651 if (flags & SCF_DO_SUBSTR) {
3652 assert(data);
304ee84b 3653 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3654 }
3655 if (UTF) {
3656 const U8 * const s = (U8 *)STRING(scan);
4b88fb76 3657 uc = utf8_to_uvchr_buf(s, s + l, NULL);
8aa23a47 3658 l = utf8_length(s, s + l);
8aa23a47 3659 }
f758bddf
KW
3660 else if (has_exactf_sharp_s) {
3661 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
bb914485 3662 }
9d071ca8 3663 min += l - min_subtract;
f646642f
KW
3664 if (min < 0) {
3665 min = 0;
3666 }
9d071ca8 3667 delta += min_subtract;
f646642f 3668 if (flags & SCF_DO_SUBSTR) {
9d071ca8 3669 data->pos_min += l - min_subtract;
f646642f
KW
3670 if (data->pos_min < 0) {
3671 data->pos_min = 0;
3672 }
9d071ca8
KW
3673 data->pos_delta += min_subtract;
3674 if (min_subtract) {
d2197104
KW
3675 data->longest = &(data->longest_float);
3676 }
f646642f 3677 }
8aa23a47
YO
3678 if (flags & SCF_DO_STCLASS_AND) {
3679 /* Check whether it is compatible with what we know already! */
3680 int compat = 1;
8aa23a47 3681 if (uc >= 0x100 ||
54251c2e
KW
3682 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3683 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3684 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3685 {
8aa23a47 3686 compat = 0;
54251c2e 3687 }
8aa23a47
YO
3688 ANYOF_CLASS_ZERO(data->start_class);
3689 ANYOF_BITMAP_ZERO(data->start_class);
3690 if (compat) {
3691 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3692 data->start_class->flags &= ~ANYOF_EOS;
39065660 3693 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3694 if (OP(scan) == EXACTFL) {
af302e7f
KW
3695 /* XXX This set is probably no longer necessary, and
3696 * probably wrong as LOCALE now is on in the initial
3697 * state */
8aa23a47 3698 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3699 }
3700 else {
3701
54251c2e
KW
3702 /* Also set the other member of the fold pair. In case
3703 * that unicode semantics is called for at runtime, use
3704 * the full latin1 fold. (Can't do this for locale,
a0c4c608 3705 * because not known until runtime) */
54251c2e 3706 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
e22b340a 3707
a0c4c608
KW
3708 /* All other (EXACTFL handled above) folds except under
3709 * /iaa that include s, S, and sharp_s also may include
3710 * the others */
e22b340a
KW
3711 if (OP(scan) != EXACTFA) {
3712 if (uc == 's' || uc == 'S') {
3713 ANYOF_BITMAP_SET(data->start_class,
3714 LATIN_SMALL_LETTER_SHARP_S);
3715 }
3716 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3717 ANYOF_BITMAP_SET(data->start_class, 's');
3718 ANYOF_BITMAP_SET(data->start_class, 'S');
3719 }
3720 }
970c8436 3721 }
653099ff 3722 }
d18bf9dc
KW
3723 else if (uc >= 0x100) {
3724 int i;
3725 for (i = 0; i < 256; i++){
3726 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3727 ANYOF_BITMAP_SET(data->start_class, i);
3728 }
3729 }
3730 }
8aa23a47
YO
3731 }
3732 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3733 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3734 /* false positive possible if the class is case-folded.
3735 Assume that the locale settings are the same... */
970c8436 3736 if (uc < 0x100) {
1aa99e6b 3737 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3738 if (OP(scan) != EXACTFL) {
3739
3740 /* And set the other member of the fold pair, but
3741 * can't do that in locale because not known until
3742 * run-time */
3743 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3744 PL_fold_latin1[uc]);
e22b340a
KW
3745
3746 /* All folds except under /iaa that include s, S,
3747 * and sharp_s also may include the others */
3748 if (OP(scan) != EXACTFA) {
3749 if (uc == 's' || uc == 'S') {
3750 ANYOF_BITMAP_SET(data->start_class,
3751 LATIN_SMALL_LETTER_SHARP_S);
3752 }
3753 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3754 ANYOF_BITMAP_SET(data->start_class, 's');
3755 ANYOF_BITMAP_SET(data->start_class, 'S');
3756 }
3757 }
970c8436
KW
3758 }
3759 }
653099ff
GS
3760 data->start_class->flags &= ~ANYOF_EOS;
3761 }
8aa23a47 3762 cl_and(data->start_class, and_withp);
653099ff 3763 }
8aa23a47
YO
3764 flags &= ~SCF_DO_STCLASS;
3765 }
e52fc539 3766 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3767 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3768 I32 f = flags, pos_before = 0;
3769 regnode * const oscan = scan;
3770 struct regnode_charclass_class this_class;
3771 struct regnode_charclass_class *oclass = NULL;
3772 I32 next_is_eval = 0;
3773
3774 switch (PL_regkind[OP(scan)]) {
3775 case WHILEM: /* End of (?:...)* . */
3776 scan = NEXTOPER(scan);
3777 goto finish;
3778 case PLUS:
3779 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3780 next = NEXTOPER(scan);
3781 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3782 mincount = 1;
3783 maxcount = REG_INFTY;
3784 next = regnext(scan);
3785 scan = NEXTOPER(scan);
3786 goto do_curly;
3787 }
3788 }
3789 if (flags & SCF_DO_SUBSTR)
3790 data->pos_min++;
3791 min++;
3792 /* Fall through. */
3793 case STAR:
3794 if (flags & SCF_DO_STCLASS) {
3795 mincount = 0;
3796 maxcount = REG_INFTY;
3797 next = regnext(scan);
3798 scan = NEXTOPER(scan);
3799 goto do_curly;
3800 }
3801 is_inf = is_inf_internal = 1;
3802 scan = regnext(scan);
c277df42 3803 if (flags & SCF_DO_SUBSTR) {
304ee84b 3804 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3805 data->longest = &(data->longest_float);
c277df42 3806 }
8aa23a47
YO
3807 goto optimize_curly_tail;
3808 case CURLY:
3809 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3810 && (scan->flags == stopparen))
3811 {
3812 mincount = 1;
3813 maxcount = 1;
3814 } else {
3815 mincount = ARG1(scan);
3816 maxcount = ARG2(scan);
653099ff 3817 }
8aa23a47
YO
3818 next = regnext(scan);
3819 if (OP(scan) == CURLYX) {
3820 I32 lp = (data ? *(data->last_closep) : 0);
3821 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3822 }
8aa23a47
YO
3823 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3824 next_is_eval = (OP(scan) == EVAL);
3825 do_curly:
3826 if (flags & SCF_DO_SUBSTR) {
304ee84b 3827 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3828 pos_before = data->pos_min;
b45f050a 3829 }
8aa23a47
YO
3830 if (data) {
3831 fl = data->flags;
3832 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3833 if (is_inf)
3834 data->flags |= SF_IS_INF;
3835 }
3836 if (flags & SCF_DO_STCLASS) {
e755fd73 3837 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3838 oclass = data->start_class;
3839 data->start_class = &this_class;
3840 f |= SCF_DO_STCLASS_AND;
3841 f &= ~SCF_DO_STCLASS_OR;
3842 }
779bcb7d
NC
3843 /* Exclude from super-linear cache processing any {n,m}
3844 regops for which the combination of input pos and regex
3845 pos is not enough information to determine if a match
3846 will be possible.
3847
3848 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3849 regex pos at the \s*, the prospects for a match depend not
3850 only on the input position but also on how many (bar\s*)
3851 repeats into the {4,8} we are. */
3852 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3853 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3854
8aa23a47
YO
3855 /* This will finish on WHILEM, setting scan, or on NULL: */
3856 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3857 last, data, stopparen, recursed, NULL,
3858 (mincount == 0
3859 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3860
8aa23a47
YO
3861 if (flags & SCF_DO_STCLASS)
3862 data->start_class = oclass;
3863 if (mincount == 0 || minnext == 0) {
3864 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3865 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3866 }
3867 else if (flags & SCF_DO_STCLASS_AND) {
3868 /* Switch to OR mode: cache the old value of
3869 * data->start_class */
3870 INIT_AND_WITHP;
3871 StructCopy(data->start_class, and_withp,
3872 struct regnode_charclass_class);
3873 flags &= ~SCF_DO_STCLASS_AND;
3874 StructCopy(&this_class, data->start_class,
3875 struct regnode_charclass_class);
3876 flags |= SCF_DO_STCLASS_OR;
3877 data->start_class->flags |= ANYOF_EOS;
3878 }
3879 } else { /* Non-zero len */
3880 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3881 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3882 cl_and(data->start_class, and_withp);
3883 }
3884 else if (flags & SCF_DO_STCLASS_AND)
3885 cl_and(data->start_class, &this_class);
3886 flags &= ~SCF_DO_STCLASS;
3887 }
3888 if (!scan) /* It was not CURLYX, but CURLY. */
3889 scan = next;
3890 if ( /* ? quantifier ok, except for (?{ ... }) */
3891 (next_is_eval || !(mincount == 0 && maxcount == 1))
3892 && (minnext == 0) && (deltanext == 0)
3893 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3894 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3895 {
668c081a
NC
3896 ckWARNreg(RExC_parse,
3897 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3898 }
3899
3900 min += minnext * mincount;
3901 is_inf_internal |= ((maxcount == REG_INFTY
3902 && (minnext + deltanext) > 0)
3903 || deltanext == I32_MAX);
3904 is_inf |= is_inf_internal;
3905 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3906
3907 /* Try powerful optimization CURLYX => CURLYN. */
3908 if ( OP(oscan) == CURLYX && data
3909 && data->flags & SF_IN_PAR
3910 && !(data->flags & SF_HAS_EVAL)
3911 && !deltanext && minnext == 1 ) {
3912 /* Try to optimize to CURLYN. */
3913 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3914 regnode * const nxt1 = nxt;
497b47a8 3915#ifdef DEBUGGING
8aa23a47 3916 regnode *nxt2;
497b47a8 3917#endif
c277df42 3918
8aa23a47
YO
3919 /* Skip open. */
3920 nxt = regnext(nxt);
e52fc539 3921 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3922 && !(PL_regkind[OP(nxt)] == EXACT
3923 && STR_LEN(nxt) == 1))
3924 goto nogo;
497b47a8 3925#ifdef DEBUGGING
8aa23a47 3926 nxt2 = nxt;
497b47a8 3927#endif
8aa23a47
YO
3928 nxt = regnext(nxt);
3929 if (OP(nxt) != CLOSE)
3930 goto nogo;
3931 if (RExC_open_parens) {
3932 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3933 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3934 }
3935 /* Now we know that nxt2 is the only contents: */
3936 oscan->flags = (U8)ARG(nxt);
3937 OP(oscan) = CURLYN;
3938 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3939
c277df42 3940#ifdef DEBUGGING
8aa23a47 3941 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3942 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3943 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3944 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3945 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3946 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3947#endif
8aa23a47
YO
3948 }
3949 nogo:
3950
3951 /* Try optimization CURLYX => CURLYM. */
3952 if ( OP(oscan) == CURLYX && data
3953 && !(data->flags & SF_HAS_PAR)
3954 && !(data->flags & SF_HAS_EVAL)
3955 && !deltanext /* atom is fixed width */
3956 && minnext != 0 /* CURLYM can't handle zero width */
3957 ) {
3958 /* XXXX How to optimize if data == 0? */
3959 /* Optimize to a simpler form. */
3960 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3961 regnode *nxt2;
3962
3963 OP(oscan) = CURLYM;
3964 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3965 && (OP(nxt2) != WHILEM))
3966 nxt = nxt2;
3967 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3968 /* Need to optimize away parenths. */
b3c0965f 3969 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3970 /* Set the parenth number. */
3971 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3972
8aa23a47
YO
3973 oscan->flags = (U8)ARG(nxt);
3974 if (RExC_open_parens) {
3975 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3976 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3977 }
8aa23a47
YO
3978 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3979 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3980
c277df42 3981#ifdef DEBUGGING
8aa23a47
YO
3982 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3983 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3984 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3985 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3986#endif
c277df42 3987#if 0
8aa23a47
YO
3988 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3989 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3990 if (nnxt == nxt) {
3991 if (reg_off_by_arg[OP(nxt1)])
3992 ARG_SET(nxt1, nxt2 - nxt1);
3993 else if (nxt2 - nxt1 < U16_MAX)
3994 NEXT_OFF(nxt1) = nxt2 - nxt1;
3995 else
3996 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3997 }
8aa23a47 3998 nxt1 = nnxt;
c277df42 3999 }
5d1c421c 4000#endif
8aa23a47
YO
4001 /* Optimize again: */
4002 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4003 NULL, stopparen, recursed, NULL, 0,depth+1);
4004 }
4005 else
4006 oscan->flags = 0;
4007 }
4008 else if ((OP(oscan) == CURLYX)
4009 && (flags & SCF_WHILEM_VISITED_POS)
4010 /* See the comment on a similar expression above.
3b753521 4011 However, this time it's not a subexpression
8aa23a47
YO
4012 we care about, but the expression itself. */
4013 && (maxcount == REG_INFTY)
4014 && data && ++data->whilem_c < 16) {
4015 /* This stays as CURLYX, we can put the count/of pair. */
4016 /* Find WHILEM (as in regexec.c) */
4017 regnode *nxt = oscan + NEXT_OFF(oscan);
4018
4019 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4020 nxt += ARG(nxt);
4021 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4022 | (RExC_whilem_seen << 4)); /* On WHILEM */
4023 }
4024 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4025 pars++;
4026 if (flags & SCF_DO_SUBSTR) {
4027 SV *last_str = NULL;
4028 int counted = mincount != 0;
a0ed51b3 4029
8aa23a47
YO
4030 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4031#if defined(SPARC64_GCC_WORKAROUND)
4032 I32 b = 0;
4033 STRLEN l = 0;
4034 const char *s = NULL;
4035 I32 old = 0;
b515a41d 4036
8aa23a47
YO
4037 if (pos_before >= data->last_start_min)
4038 b = pos_before;
4039 else
4040 b = data->last_start_min;
b515a41d 4041
8aa23a47
YO
4042 l = 0;
4043 s = SvPV_const(data->last_found, l);
4044 old = b - data->last_start_min;
4045
4046#else
4047 I32 b = pos_before >= data->last_start_min
4048 ? pos_before : data->last_start_min;
4049 STRLEN l;
4050 const char * const s = SvPV_const(data->last_found, l);
4051 I32 old = b - data->last_start_min;
4052#endif
4053
4054 if (UTF)
4055 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
4056 l -= old;
4057 /* Get the added string: */
740cce10 4058 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
4059 if (deltanext == 0 && pos_before == b) {
4060 /* What was added is a constant string */
4061 if (mincount > 1) {
4062 SvGROW(last_str, (mincount * l) + 1);
4063 repeatcpy(SvPVX(last_str) + l,
4064 SvPVX_const(last_str), l, mincount - 1);
4065 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4066 /* Add additional parts. */
4067 SvCUR_set(data->last_found,
4068 SvCUR(data->last_found) - l);
4069 sv_catsv(data->last_found, last_str);
4070 {
4071 SV * sv = data->last_found;
4072 MAGIC *mg =
4073 SvUTF8(sv) && SvMAGICAL(sv) ?
4074 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4075 if (mg && mg->mg_len >= 0)
bd94e887 4076 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 4077 }
8aa23a47 4078 data->last_end += l * (mincount - 1);
b515a41d 4079 }
8aa23a47
YO
4080 } else {
4081 /* start offset must point into the last copy */
4082 data->last_start_min += minnext * (mincount - 1);
4083 data->last_start_max += is_inf ? I32_MAX
4084 : (maxcount - 1) * (minnext + data->pos_delta);
4085 }
c277df42 4086 }
8aa23a47
YO
4087 /* It is counted once already... */
4088 data->pos_min += minnext * (mincount - counted);
4089 data->pos_delta += - counted * deltanext +
4090 (minnext + deltanext) * maxcount - minnext * mincount;
4091 if (mincount != maxcount) {
4092 /* Cannot extend fixed substrings found inside
4093 the group. */
304ee84b 4094 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
4095 if (mincount && last_str) {
4096 SV * const sv = data->last_found;
4097 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4098 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4099
4100 if (mg)
4101 mg->mg_len = -1;
4102 sv_setsv(sv, last_str);
4103 data->last_end = data->pos_min;
4104 data->last_start_min =
4105 data->pos_min - CHR_SVLEN(last_str);
4106 data->last_start_max = is_inf
4107 ? I32_MAX
4108 : data->pos_min + data->pos_delta
4109 - CHR_SVLEN(last_str);
4110 }
4111 data->longest = &(data->longest_float);
4112 }
4113 SvREFCNT_dec(last_str);
c277df42 4114 }
8aa23a47
YO
4115 if (data && (fl & SF_HAS_EVAL))
4116 data->flags |= SF_HAS_EVAL;
4117 optimize_curly_tail:
4118 if (OP(oscan) != CURLYX) {
4119 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4120 && NEXT_OFF(next))
4121 NEXT_OFF(oscan) += NEXT_OFF(next);
4122 }
4123 continue;
f56b6394 4124 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 4125 if (flags & SCF_DO_SUBSTR) {
304ee84b 4126 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4127 data->longest = &(data->longest_float);
4128 }
4129 is_inf = is_inf_internal = 1;
4130 if (flags & SCF_DO_STCLASS_OR)
3fffb88a 4131 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
4132 flags &= ~SCF_DO_STCLASS;
4133 break;
c277df42 4134 }
8aa23a47 4135 }
e1d1eefb
YO
4136 else if (OP(scan) == LNBREAK) {
4137 if (flags & SCF_DO_STCLASS) {
4138 int value = 0;
4139 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4140 if (flags & SCF_DO_STCLASS_AND) {
4141 for (value = 0; value < 256; value++)
e64b1bd1 4142 if (!is_VERTWS_cp(value))
b9a59e08
KW
4143 ANYOF_BITMAP_CLEAR(data->start_class, value);
4144 }
4145 else {
e1d1eefb 4146 for (value = 0; value < 256; value++)
e64b1bd1 4147 if (is_VERTWS_cp(value))
b9a59e08
KW
4148 ANYOF_BITMAP_SET(data->start_class, value);
4149 }
e1d1eefb
YO
4150 if (flags & SCF_DO_STCLASS_OR)
4151 cl_and(data->start_class, and_withp);
4152 flags &= ~SCF_DO_STCLASS;
4153 }
4154 min += 1;
f9a79580 4155 delta += 1;
e1d1eefb
YO
4156 if (flags & SCF_DO_SUBSTR) {
4157 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4158 data->pos_min += 1;
f9a79580 4159 data->pos_delta += 1;
e1d1eefb
YO
4160 data->longest = &(data->longest_float);
4161 }
e1d1eefb 4162 }
e52fc539 4163 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 4164 int value = 0;
653099ff 4165
8aa23a47 4166 if (flags & SCF_DO_SUBSTR) {
304ee84b 4167 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
4168 data->pos_min++;
4169 }
4170 min++;
4171 if (flags & SCF_DO_STCLASS) {
4172 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 4173
8aa23a47
YO
4174 /* Some of the logic below assumes that switching
4175 locale on will only add false positives. */
4176 switch (PL_regkind[OP(scan)]) {
4177 case SANY:
4178 default:
4179 do_default:
4180 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4181 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 4182 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
4183 break;
4184 case REG_ANY:
4185 if (OP(scan) == SANY)
4186 goto do_default;
4187 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4188 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 4189 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3fffb88a 4190 cl_anything(pRExC_state, data->start_class);
653099ff 4191 }
8aa23a47
YO
4192 if (flags & SCF_DO_STCLASS_AND || !value)
4193 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4194 break;
4195 case ANYOF:
4196 if (flags & SCF_DO_STCLASS_AND)
4197 cl_and(data->start_class,
4198 (struct regnode_charclass_class*)scan);
653099ff 4199 else
3fffb88a 4200 cl_or(pRExC_state, data->start_class,
8aa23a47
YO
4201 (struct regnode_charclass_class*)scan);
4202 break;
4203 case ALNUM:
4204 if (flags & SCF_DO_STCLASS_AND) {
4205 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4206 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 4207 if (OP(scan) == ALNUMU) {
a12cf05f
KW
4208 for (value = 0; value < 256; value++) {
4209 if (!isWORDCHAR_L1(value)) {
4210 ANYOF_BITMAP_CLEAR(data->start_class, value);
4211 }
4212 }
4213 } else {
4214 for (value = 0; value < 256; value++) {
4215 if (!isALNUM(value)) {
4216 ANYOF_BITMAP_CLEAR(data->start_class, value);
4217 }
4218 }
4219 }
8aa23a47 4220 }
653099ff 4221 }
8aa23a47
YO
4222 else {
4223 if (data->start_class->flags & ANYOF_LOCALE)
4224 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
af302e7f
KW
4225
4226 /* Even if under locale, set the bits for non-locale
4227 * in case it isn't a true locale-node. This will
4228 * create false positives if it truly is locale */
4229 if (OP(scan) == ALNUMU) {
a12cf05f
KW
4230 for (value = 0; value < 256; value++) {
4231 if (isWORDCHAR_L1(value)) {
4232 ANYOF_BITMAP_SET(data->start_class, value);
4233 }
4234 }
4235 } else {
4236 for (value = 0; value < 256; value++) {
4237 if (isALNUM(value)) {
4238 ANYOF_BITMAP_SET(data->start_class, value);
4239 }
4240 }
4241 }
8aa23a47
YO
4242 }
4243 break;
8aa23a47
YO
4244 case NALNUM:
4245 if (flags & SCF_DO_STCLASS_AND) {
4246 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4247 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 4248 if (OP(scan) == NALNUMU) {
a12cf05f
KW
4249 for (value = 0; value < 256; value++) {
4250 if (isWORDCHAR_L1(value)) {
4251 ANYOF_BITMAP_CLEAR(data->start_class, value);
4252 }
4253 }
4254 } else {
4255 for (value = 0; value < 256; value++) {
4256 if (isALNUM(value)) {
4257 ANYOF_BITMAP_CLEAR(data->start_class, value);
4258 }
4259 }
4260 }
653099ff
GS
4261 }
4262 }
8aa23a47
YO
4263 else {
4264 if (data->start_class->flags & ANYOF_LOCALE)
4265 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
af302e7f 4266
75950e1c
KW
4267 /* Even if under locale, set the bits for non-locale in
4268 * case it isn't a true locale-node. This will create
4269 * false positives if it truly is locale */
4270 if (OP(scan) == NALNUMU) {
4271 for (value = 0; value < 256; value++) {
4272 if (! isWORDCHAR_L1(value)) {
4273 ANYOF_BITMAP_SET(data->start_class, value);
4274 }
e9a9c1bc 4275 }
75950e1c
KW
4276 } else {
4277 for (value = 0; value < 256; value++) {
4278 if (! isALNUM(value)) {
4279 ANYOF_BITMAP_SET(data->start_class, value);
4280 }
4281 }
4282 }
653099ff 4283 }
8aa23a47 4284 break;
8aa23a47
YO
4285 case SPACE:
4286 if (flags & SCF_DO_STCLASS_AND) {
4287 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4288 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
980866de 4289 if (OP(scan) == SPACEU) {
a12cf05f
KW
4290 for (value = 0; value < 256; value++) {
4291 if (!isSPACE_L1(value)) {
4292 ANYOF_BITMAP_CLEAR(data->start_class, value);
4293 }
4294 }
4295 } else {
4296 for (value = 0; value < 256; value++) {
4297 if (!isSPACE(value)) {
4298 ANYOF_BITMAP_CLEAR(data->start_class, value);
4299 }
4300 }
4301 }
653099ff
GS
4302 }
4303 }
8aa23a47 4304 else {
a12cf05f 4305 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 4306 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f 4307 }
af302e7f 4308 if (OP(scan) == SPACEU) {
a12cf05f
KW
4309 for (value = 0; value < 256; value++) {
4310 if (isSPACE_L1(value)) {
4311 ANYOF_BITMAP_SET(data->start_class, value);
4312 }
4313 }
4314 } else {
4315 for (value = 0; value < 256; value++) {
4316 if (isSPACE(value)) {
4317 ANYOF_BITMAP_SET(data->start_class, value);
4318 }
4319 }
8aa23a47 4320 }
653099ff 4321 }
8aa23a47 4322 break;
8aa23a47
YO
4323 case NSPACE:
4324 if (flags & SCF_DO_STCLASS_AND) {
4325 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4326 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
980866de 4327 if (OP(scan) == NSPACEU) {
a12cf05f
KW
4328 for (value = 0; value < 256; value++) {
4329 if (isSPACE_L1(value)) {
4330 ANYOF_BITMAP_CLEAR(data->start_class, value);
4331 }
4332 }
4333 } else {
4334 for (value = 0; value < 256; value++) {
4335 if (isSPACE(value)) {
4336 ANYOF_BITMAP_CLEAR(data->start_class, value);
4337 }
4338 }
4339 }
653099ff 4340 }
8aa23a47
YO
4341 }
4342 else {
4343 if (data->start_class->flags & ANYOF_LOCALE)
4344 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
af302e7f 4345 if (OP(scan) == NSPACEU) {
a12cf05f
KW
4346 for (value = 0; value < 256; value++) {
4347 if (!isSPACE_L1(value)) {
4348 ANYOF_BITMAP_SET(data->start_class, value);
4349 }
4350 }
4351 }
4352 else {
4353 for (value = 0; value < 256; value++) {
4354 if (!isSPACE(value)) {
4355 ANYOF_BITMAP_SET(data->start_class, value);
4356 }
4357 }
4358 }
653099ff 4359 }
8aa23a47 4360 break;
8aa23a47
YO
4361 case DIGIT:
4362 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 4363 if (!(data->start_class->flags & ANYOF_LOCALE)) {
bf3c5c06
KW
4364 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4365 for (value = 0; value < 256; value++)
4366 if (!isDIGIT(value))
4367 ANYOF_BITMAP_CLEAR(data->start_class, value);
bcc0256f 4368 }
8aa23a47
YO
4369 }
4370 else {
4371 if (data->start_class->flags & ANYOF_LOCALE)
4372 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
75950e1c
KW
4373 for (value = 0; value < 256; value++)
4374 if (isDIGIT(value))
4375 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
4376 }
4377 break;
4378 case NDIGIT:
4379 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 4380 if (!(data->start_class->flags & ANYOF_LOCALE))
bf3c5c06 4381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
8aa23a47
YO
4382 for (value = 0; value < 256; value++)
4383 if (isDIGIT(value))
4384 ANYOF_BITMAP_CLEAR(data->start_class, value);
4385 }
4386 else {
4387 if (data->start_class->flags & ANYOF_LOCALE)
4388 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
75950e1c
KW
4389 for (value = 0; value < 256; value++)
4390 if (!isDIGIT(value))
4391 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 4392 }
8aa23a47 4393 break;
e1d1eefb
YO
4394 CASE_SYNST_FNC(VERTWS);
4395 CASE_SYNST_FNC(HORIZWS);
686b73d4 4396
8aa23a47
YO
4397 }
4398 if (flags & SCF_DO_STCLASS_OR)
4399 cl_and(data->start_class, and_withp);
4400 flags &= ~SCF_DO_STCLASS;
4401 }
4402 }
4403 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4404 data->flags |= (OP(scan) == MEOL
4405 ? SF_BEFORE_MEOL
4406 : SF_BEFORE_SEOL);
4407 }
4408 else if ( PL_regkind[OP(scan)] == BRANCHJ
4409 /* Lookbehind, or need to calculate parens/evals/stclass: */
4410 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4411 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3b6759a6
YO
4412 if ( OP(scan) == UNLESSM &&
4413 scan->flags == 0 &&
4414 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4415 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4416 ) {
4417 regnode *opt;
4418 regnode *upto= regnext(scan);
4419 DEBUG_PARSE_r({
4420 SV * const mysv_val=sv_newmortal();
4421 DEBUG_STUDYDATA("OPFAIL",data,depth);
4422
4423 /*DEBUG_PARSE_MSG("opfail");*/
4424 regprop(RExC_rx, mysv_val, upto);
4425 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4426 SvPV_nolen_const(mysv_val),
4427 (IV)REG_NODE_NUM(upto),
4428 (IV)(upto - scan)
4429 );
4430 });
4431 OP(scan) = OPFAIL;
4432 NEXT_OFF(scan) = upto - scan;
4433 for (opt= scan + 1; opt < upto ; opt++)
4434 OP(opt) = OPTIMIZED;
4435 scan= upto;
4436 continue;
4437 }
8aa23a47
YO
4438 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4439 || OP(scan) == UNLESSM )
4440 {
4441 /* Negative Lookahead/lookbehind
4442 In this case we can't do fixed string optimisation.
4443 */
1de06328 4444
8aa23a47
YO
4445 I32 deltanext, minnext, fake = 0;
4446 regnode *nscan;
4447 struct regnode_charclass_class intrnl;
4448 int f = 0;
1de06328 4449
8aa23a47
YO
4450 data_fake.flags = 0;
4451 if (data) {
4452 data_fake.whilem_c = data->whilem_c;
4453 data_fake.last_closep = data->last_closep;
c277df42 4454 }
8aa23a47
YO
4455 else
4456 data_fake.last_closep = &fake;
58e23c8d 4457 data_fake.pos_delta = delta;
8aa23a47
YO
4458 if ( flags & SCF_DO_STCLASS && !scan->flags
4459 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4460 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4461 data_fake.start_class = &intrnl;
4462 f |= SCF_DO_STCLASS_AND;
4463 }
4464 if (flags & SCF_WHILEM_VISITED_POS)
4465 f |= SCF_WHILEM_VISITED_POS;
4466 next = regnext(scan);
4467 nscan = NEXTOPER(NEXTOPER(scan));
4468 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4469 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4470 if (scan->flags) {
4471 if (deltanext) {
58e23c8d 4472 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4473 }
4474 else if (minnext > (I32)U8_MAX) {
58e23c8d 4475 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4476 }
4477 scan->flags = (U8)minnext;
4478 }
4479 if (data) {
4480 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4481 pars++;
4482 if (data_fake.flags & SF_HAS_EVAL)
4483 data->flags |= SF_HAS_EVAL;
4484 data->whilem_c = data_fake.whilem_c;
4485 }
4486 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
4487 if (flags & SCF_DO_STCLASS_OR) {
4488 /* OR before, AND after: ideally we would recurse with
4489 * data_fake to get the AND applied by study of the
4490 * remainder of the pattern, and then derecurse;
4491 * *** HACK *** for now just treat as "no information".
4492 * See [perl #56690].
4493 */
e755fd73 4494 cl_init(pRExC_state, data->start_class);
906cdd2b
HS
4495 } else {
4496 /* AND before and after: combine and continue */
4497 const int was = (data->start_class->flags & ANYOF_EOS);
4498
4499 cl_and(data->start_class, &intrnl);
4500 if (was)
4501 data->start_class->flags |= ANYOF_EOS;
4502 }
8aa23a47 4503 }
cb434fcc 4504 }
8aa23a47
YO
4505#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4506 else {
4507 /* Positive Lookahead/lookbehind
4508 In this case we can do fixed string optimisation,
4509 but we must be careful about it. Note in the case of
4510 lookbehind the positions will be offset by the minimum
4511 length of the pattern, something we won't know about
4512 until after the recurse.
4513 */
4514 I32 deltanext, fake = 0;
4515 regnode *nscan;
4516 struct regnode_charclass_class intrnl;
4517 int f = 0;
4518 /* We use SAVEFREEPV so that when the full compile
4519 is finished perl will clean up the allocated
3b753521 4520 minlens when it's all done. This way we don't
8aa23a47
YO
4521 have to worry about freeing them when we know
4522 they wont be used, which would be a pain.
4523 */
4524 I32 *minnextp;
4525 Newx( minnextp, 1, I32 );
4526 SAVEFREEPV(minnextp);
4527
4528 if (data) {
4529 StructCopy(data, &data_fake, scan_data_t);
4530 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4531 f |= SCF_DO_SUBSTR;
4532 if (scan->flags)
304ee84b 4533 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
4534 data_fake.last_found=newSVsv(data->last_found);
4535 }
4536 }
4537 else
4538 data_fake.last_closep = &fake;
4539 data_fake.flags = 0;
58e23c8d 4540 data_fake.pos_delta = delta;
8aa23a47
YO
4541 if (is_inf)
4542 data_fake.flags |= SF_IS_INF;
4543 if ( flags & SCF_DO_STCLASS && !scan->flags
4544 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4545 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4546 data_fake.start_class = &intrnl;
4547 f |= SCF_DO_STCLASS_AND;
4548 }
4549 if (flags & SCF_WHILEM_VISITED_POS)
4550 f |= SCF_WHILEM_VISITED_POS;
4551 next = regnext(scan);
4552 nscan = NEXTOPER(NEXTOPER(scan));
4553
4554 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4555 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4556 if (scan->flags) {
4557 if (deltanext) {
58e23c8d 4558 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4559 }
4560 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 4561 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4562 }
4563 scan->flags = (U8)*minnextp;
4564 }
4565
4566 *minnextp += min;
4567
4568 if (f & SCF_DO_STCLASS_AND) {
4569 const int was = (data->start_class->flags & ANYOF_EOS);
4570
4571 cl_and(data->start_class, &intrnl);
4572 if (was)
4573 data->start_class->flags |= ANYOF_EOS;
4574 }
4575 if (data) {
4576 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4577 pars++;
4578 if (data_fake.flags & SF_HAS_EVAL)
4579 data->flags |= SF_HAS_EVAL;
4580 data->whilem_c = data_fake.whilem_c;
4581 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4582 if (RExC_rx->minlen<*minnextp)
4583 RExC_rx->minlen=*minnextp;
304ee84b 4584 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4585 SvREFCNT_dec(data_fake.last_found);
4586
4587 if ( data_fake.minlen_fixed != minlenp )
4588 {
4589 data->offset_fixed= data_fake.offset_fixed;
4590 data->minlen_fixed= data_fake.minlen_fixed;
4591 data->lookbehind_fixed+= scan->flags;
4592 }
4593 if ( data_fake.minlen_float != minlenp )
4594 {
4595 data->minlen_float= data_fake.minlen_float;
4596 data->offset_float_min=data_fake.offset_float_min;
4597 data->offset_float_max=data_fake.offset_float_max;
4598 data->lookbehind_float+= scan->flags;
4599 }
4600 }
4601 }
40d049e4 4602 }
8aa23a47
YO
4603#endif
4604 }
4605 else if (OP(scan) == OPEN) {
4606 if (stopparen != (I32)ARG(scan))
4607 pars++;
4608 }
4609 else if (OP(scan) == CLOSE) {
4610 if (stopparen == (I32)ARG(scan)) {
4611 break;
4612 }
4613 if ((I32)ARG(scan) == is_par) {
4614 next = regnext(scan);
b515a41d 4615
8aa23a47
YO
4616 if ( next && (OP(next) != WHILEM) && next < last)
4617 is_par = 0; /* Disable optimization */
40d049e4 4618 }
8aa23a47
YO
4619 if (data)
4620 *(data->last_closep) = ARG(scan);
4621 }
4622 else if (OP(scan) == EVAL) {
c277df42
IZ
4623 if (data)
4624 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4625 }
4626 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4627 if (flags & SCF_DO_SUBSTR) {
304ee84b 4628 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4629 flags &= ~SCF_DO_SUBSTR;
40d049e4 4630 }
8aa23a47
YO
4631 if (data && OP(scan)==ACCEPT) {
4632 data->flags |= SCF_SEEN_ACCEPT;
4633 if (stopmin > min)
4634 stopmin = min;
e2e6a0f1 4635 }
8aa23a47
YO
4636 }
4637 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4638 {
0f5d15d6 4639 if (flags & SCF_DO_SUBSTR) {
304ee84b 4640 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4641 data->longest = &(data->longest_float);
4642 }
4643 is_inf = is_inf_internal = 1;
653099ff 4644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 4645 cl_anything(pRExC_state, data->start_class);
96776eda 4646 flags &= ~SCF_DO_STCLASS;
8aa23a47 4647 }
58e23c8d 4648 else if (OP(scan) == GPOS) {
bbe252da 4649 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4650 !(delta || is_inf || (data && data->pos_delta)))
4651 {
bbe252da
YO
4652 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4653 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4654 if (RExC_rx->gofs < (U32)min)
4655 RExC_rx->gofs = min;
4656 } else {
bbe252da 4657 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4658 RExC_rx->gofs = 0;
4659 }
4660 }
786e8c11 4661#ifdef TRIE_STUDY_OPT
40d049e4 4662#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4663 else if (PL_regkind[OP(scan)] == TRIE) {
4664 /* NOTE - There is similar code to this block above for handling
4665 BRANCH nodes on the initial study. If you change stuff here
4666 check there too. */
4667 regnode *trie_node= scan;
4668 regnode *tail= regnext(scan);
f8fc2ecf 4669 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4670 I32 max1 = 0, min1 = I32_MAX;
4671 struct regnode_charclass_class accum;
4672
4673 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4674 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47 4675 if (flags & SCF_DO_STCLASS)
e755fd73 4676 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
4677
4678 if (!trie->jump) {
4679 min1= trie->minlen;
4680 max1= trie->maxlen;
4681 } else {
4682 const regnode *nextbranch= NULL;
4683 U32 word;
4684
4685 for ( word=1 ; word <= trie->wordcount ; word++)
4686 {
4687 I32 deltanext=0, minnext=0, f = 0, fake;
4688 struct regnode_charclass_class this_class;
4689
4690 data_fake.flags = 0;
4691 if (data) {
4692 data_fake.whilem_c = data->whilem_c;
4693 data_fake.last_closep = data->last_closep;
4694 }
4695 else
4696 data_fake.last_closep = &fake;
58e23c8d 4697 data_fake.pos_delta = delta;
8aa23a47 4698 if (flags & SCF_DO_STCLASS) {
e755fd73 4699 cl_init(pRExC_state, &this_class);
8aa23a47
YO
4700 data_fake.start_class = &this_class;
4701 f = SCF_DO_STCLASS_AND;
4702 }
4703 if (flags & SCF_WHILEM_VISITED_POS)
4704 f |= SCF_WHILEM_VISITED_POS;
4705
4706 if (trie->jump[word]) {
4707 if (!nextbranch)
4708 nextbranch = trie_node + trie->jump[0];
4709 scan= trie_node + trie->jump[word];
4710 /* We go from the jump point to the branch that follows
4711 it. Note this means we need the vestigal unused branches
4712 even though they arent otherwise used.
4713 */
4714 minnext = study_chunk(pRExC_state, &scan, minlenp,
4715 &deltanext, (regnode *)nextbranch, &data_fake,
4716 stopparen, recursed, NULL, f,depth+1);
4717 }
4718 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4719 nextbranch= regnext((regnode*)nextbranch);
4720
4721 if (min1 > (I32)(minnext + trie->minlen))
4722 min1 = minnext + trie->minlen;
4723 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4724 max1 = minnext + deltanext + trie->maxlen;
4725 if (deltanext == I32_MAX)
4726 is_inf = is_inf_internal = 1;
4727
4728 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4729 pars++;
4730 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4731 if ( stopmin > min + min1)
4732 stopmin = min + min1;
4733 flags &= ~SCF_DO_SUBSTR;
4734 if (data)
4735 data->flags |= SCF_SEEN_ACCEPT;
4736 }
4737 if (data) {
4738 if (data_fake.flags & SF_HAS_EVAL)
4739 data->flags |= SF_HAS_EVAL;
4740 data->whilem_c = data_fake.whilem_c;
4741 }
4742 if (flags & SCF_DO_STCLASS)
3fffb88a 4743 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
4744 }
4745 }
4746 if (flags & SCF_DO_SUBSTR) {
4747 data->pos_min += min1;
4748 data->pos_delta += max1 - min1;
4749 if (max1 != min1 || is_inf)
4750 data->longest = &(data->longest_float);
4751 }
4752 min += min1;
4753 delta += max1 - min1;
4754 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 4755 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
4756 if (min1) {
4757 cl_and(data->start_class, and_withp);
4758 flags &= ~SCF_DO_STCLASS;
4759 }
4760 }
4761 else if (flags & SCF_DO_STCLASS_AND) {
4762 if (min1) {
4763 cl_and(data->start_class, &accum);
4764 flags &= ~SCF_DO_STCLASS;
4765 }
4766 else {
4767 /* Switch to OR mode: cache the old value of
4768 * data->start_class */
4769 INIT_AND_WITHP;
4770 StructCopy(data->start_class, and_withp,
4771 struct regnode_charclass_class);
4772 flags &= ~SCF_DO_STCLASS_AND;
4773 StructCopy(&accum, data->start_class,
4774 struct regnode_charclass_class);
4775 flags |= SCF_DO_STCLASS_OR;
4776 data->start_class->flags |= ANYOF_EOS;
4777 }
4778 }
4779 scan= tail;
4780 continue;
4781 }
786e8c11 4782#else
8aa23a47 4783 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4784 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4785 U8*bang=NULL;
4786
4787 min += trie->minlen;
4788 delta += (trie->maxlen - trie->minlen);
4789 flags &= ~SCF_DO_STCLASS; /* xxx */
4790 if (flags & SCF_DO_SUBSTR) {
304ee84b 4791 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4792 data->pos_min += trie->minlen;
4793 data->pos_delta += (trie->maxlen - trie->minlen);
4794 if (trie->maxlen != trie->minlen)
4795 data->longest = &(data->longest_float);
4796 }
4797 if (trie->jump) /* no more substrings -- for now /grr*/
4798 flags &= ~SCF_DO_SUBSTR;
b515a41d 4799 }
8aa23a47 4800#endif /* old or new */
686b73d4 4801#endif /* TRIE_STUDY_OPT */
e1d1eefb 4802
8aa23a47
YO
4803 /* Else: zero-length, ignore. */
4804 scan = regnext(scan);
4805 }
4806 if (frame) {
4807 last = frame->last;
4808 scan = frame->next;
4809 stopparen = frame->stop;
4810 frame = frame->prev;
4811 goto fake_study_recurse;
c277df42
IZ
4812 }
4813
4814 finish:
8aa23a47 4815 assert(!frame);
304ee84b 4816 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4817
c277df42 4818 *scanp = scan;
aca2d497 4819 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4820 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4821 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4822 if (is_par > (I32)U8_MAX)
c277df42
IZ
4823 is_par = 0;
4824 if (is_par && pars==1 && data) {
4825 data->flags |= SF_IN_PAR;
4826 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4827 }
4828 else if (pars && data) {
c277df42
IZ
4829 data->flags |= SF_HAS_PAR;
4830 data->flags &= ~SF_IN_PAR;
4831 }
653099ff 4832 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4833 cl_and(data->start_class, and_withp);
786e8c11
YO
4834 if (flags & SCF_TRIE_RESTUDY)
4835 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4836
304ee84b 4837 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4838
e2e6a0f1 4839 return min < stopmin ? min : stopmin;
c277df42
IZ
4840}
4841
2eccd3b2
NC
4842STATIC U32
4843S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4844{
4a4e7719
NC
4845 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4846
7918f24d
NC
4847 PERL_ARGS_ASSERT_ADD_DATA;
4848
4a4e7719
NC
4849 Renewc(RExC_rxi->data,
4850 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4851 char, struct reg_data);
4852 if(count)
f8fc2ecf 4853 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4854 else
f8fc2ecf 4855 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4856 RExC_rxi->data->count = count + n;
4857 Copy(s, RExC_rxi->data->what + count, n, U8);
4858 return count;
c277df42
IZ
4859}
4860
f8149455 4861/*XXX: todo make this not included in a non debugging perl */
76234dfb 4862#ifndef PERL_IN_XSUB_RE
d88dccdf 4863void
864dbfa3 4864Perl_reginitcolors(pTHX)
d88dccdf 4865{
97aff369 4866 dVAR;
1df70142 4867 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4868 if (s) {
1df70142
AL
4869 char *t = savepv(s);
4870 int i = 0;
4871 PL_colors[0] = t;
d88dccdf 4872 while (++i < 6) {
1df70142
AL
4873 t = strchr(t, '\t');
4874 if (t) {
4875 *t = '\0';
4876 PL_colors[i] = ++t;
d88dccdf
IZ
4877 }
4878 else
1df70142 4879 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4880 }
4881 } else {
1df70142 4882 int i = 0;
b81d288d 4883 while (i < 6)
06b5626a 4884 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4885 }
4886 PL_colorset = 1;
4887}
76234dfb 4888#endif
8615cb43 4889
07be1b83 4890
786e8c11
YO
4891#ifdef TRIE_STUDY_OPT
4892#define CHECK_RESTUDY_GOTO \
4893 if ( \
4894 (data.flags & SCF_TRIE_RESTUDY) \
4895 && ! restudied++ \
4896 ) goto reStudy
4897#else
4898#define CHECK_RESTUDY_GOTO
4899#endif
f9f4320a 4900
a687059c 4901/*
3a21f536 4902 * pregcomp - compile a regular expression into internal code
a687059c 4903 *
3a21f536
DM
4904 * Decides which engine's compiler to call based on the hint currently in
4905 * scope
a687059c 4906 */
b9b4dddf 4907
6d5c990f 4908#ifndef PERL_IN_XSUB_RE
f8b2cf8a 4909
3c13cae6 4910/* return the currently in-scope regex engine (or the default if none) */
f8b2cf8a 4911
3c13cae6 4912regexp_engine const *
f8b2cf8a
DM
4913Perl_current_re_engine(pTHX)
4914{
4915 dVAR;
4916
4917 if (IN_PERL_COMPILETIME) {
4918 HV * const table = GvHV(PL_hintgv);
4919 SV **ptr;
4920
4921 if (!table)
3c13cae6 4922 return &PL_core_reg_engine;
f8b2cf8a
DM
4923 ptr = hv_fetchs(table, "regcomp", FALSE);
4924 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
3c13cae6 4925 return &PL_core_reg_engine;
f8b2cf8a
DM
4926 return INT2PTR(regexp_engine*,SvIV(*ptr));
4927 }
4928 else {
4929 SV *ptr;
4930 if (!PL_curcop->cop_hints_hash)
3c13cae6 4931 return &PL_core_reg_engine;
f8b2cf8a
DM
4932 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4933 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
3c13cae6 4934 return &PL_core_reg_engine;
f8b2cf8a
DM
4935 return INT2PTR(regexp_engine*,SvIV(ptr));
4936 }
4937}
4938
4939
3ab4a224 4940REGEXP *
1593ad57 4941Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4942{
97aff369 4943 dVAR;
3c13cae6
DM
4944 regexp_engine const *eng = current_re_engine();
4945 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4946
4947 PERL_ARGS_ASSERT_PREGCOMP;
4948
f8b2cf8a 4949 /* Dispatch a request to compile a regexp to correct regexp engine. */
3c13cae6
DM
4950 DEBUG_COMPILE_r({
4951 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4952 PTR2UV(eng));
4953 });
4954 return CALLREGCOMP_ENG(eng, pattern, flags);
2a5d9b1d 4955}
6d5c990f 4956#endif
2a5d9b1d 4957
ab442d34 4958/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
74529a43
DM
4959 * pattern rather than a list of OPs */
4960
4961REGEXP *
37acfcba 4962Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
74529a43 4963{
9f141731 4964 SV *pat = pattern; /* defeat constness! */
74529a43 4965 PERL_ARGS_ASSERT_RE_COMPILE;
3c13cae6
DM
4966 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
4967 NULL, NULL, rx_flags, 0);
74529a43
DM
4968}
4969
d24ca0c5
DM
4970/* see if there are any run-time code blocks in the pattern.
4971 * False positives are allowed */
4972
4973static bool
4974S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4975 U32 pm_flags, char *pat, STRLEN plen)
4976{
4977 int n = 0;
4978 STRLEN s;
4979
4980 /* avoid infinitely recursing when we recompile the pattern parcelled up
4981 * as qr'...'. A single constant qr// string can't have have any
4982 * run-time component in it, and thus, no runtime code. (A non-qr
4983 * string, however, can, e.g. $x =~ '(?{})') */
4984 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4985 return 0;
4986
4987 for (s = 0; s < plen; s++) {
4988 if (n < pRExC_state->num_code_blocks
4989 && s == pRExC_state->code_blocks[n].start)
4990 {
4991 s = pRExC_state->code_blocks[n].end;
4992 n++;
4993 continue;
4994 }
4995 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4996 * positives here */
4997 if (pat[s] == '(' && pat[s+1] == '?' &&
4998 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4999 )
5000 return 1;
5001 }
5002 return 0;
5003}
5004
5005/* Handle run-time code blocks. We will already have compiled any direct
5006 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5007 * copy of it, but with any literal code blocks blanked out and
5008 * appropriate chars escaped; then feed it into
5009 *
5010 * eval "qr'modified_pattern'"
5011 *
5012 * For example,
5013 *
5014 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5015 *
5016 * becomes
5017 *
5018 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5019 *
5020 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5021 * and merge them with any code blocks of the original regexp.
5022 *
5023 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5024 * instead, just save the qr and return FALSE; this tells our caller that
5025 * the original pattern needs upgrading to utf8.
5026 */
5027
5028bool
5029S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5030 char *pat, STRLEN plen)
5031{
5032 SV *qr;
5033
5034 GET_RE_DEBUG_FLAGS_DECL;
5035
5036 if (pRExC_state->runtime_code_qr) {
5037 /* this is the second time we've been called; this should
5038 * only happen if the main pattern got upgraded to utf8
5039 * during compilation; re-use the qr we compiled first time
5040 * round (which should be utf8 too)
5041 */
5042 qr = pRExC_state->runtime_code_qr;
5043 pRExC_state->runtime_code_qr = NULL;
5044 assert(RExC_utf8 && SvUTF8(qr));
5045 }
5046 else {
5047 int n = 0;
5048 STRLEN s;
5049 char *p, *newpat;
c8d84f8c 5050 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
d24ca0c5
DM
5051 SV *sv, *qr_ref;
5052 dSP;
5053
5054 /* determine how many extra chars we need for ' and \ escaping */
5055 for (s = 0; s < plen; s++) {
5056 if (pat[s] == '\'' || pat[s] == '\\')
5057 newlen++;
5058 }
5059
5060 Newx(newpat, newlen, char);
5061 p = newpat;
5062 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5063
5064 for (s = 0; s < plen; s++) {
5065 if (n < pRExC_state->num_code_blocks
5066 && s == pRExC_state->code_blocks[n].start)
5067 {
5068 /* blank out literal code block */
5069 assert(pat[s] == '(');
5070 while (s <= pRExC_state->code_blocks[n].end) {
5071 *p++ = ' ';
5072 s++;
5073 }
5074 s--;
5075 n++;
5076 continue;
5077 }
5078 if (pat[s] == '\'' || pat[s] == '\\')
5079 *p++ = '\\';
5080 *p++ = pat[s];
5081 }
5082 *p++ = '\'';
5083 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5084 *p++ = 'x';
5085 *p++ = '\0';
5086 DEBUG_COMPILE_r({
5087 PerlIO_printf(Perl_debug_log,
5088 "%sre-parsing pattern for runtime code:%s %s\n",
5089 PL_colors[4],PL_colors[5],newpat);
5090 });
5091
5092 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5093 Safefree(newpat);
5094
5095 ENTER;
5096 SAVETMPS;
5097 save_re_context();
5098 PUSHSTACKi(PERLSI_REQUIRE);
5099 /* this causes the toker to collapse \\ into \ when parsing
5100 * qr''; normally only q'' does this. It also alters hints
5101 * handling */
5102 PL_reg_state.re_reparsing = TRUE;
5103 eval_sv(sv, G_SCALAR);
5104 SvREFCNT_dec(sv);
5105 SPAGAIN;
5106 qr_ref = POPs;
5107 PUTBACK;
5108 if (SvTRUE(ERRSV))
5109 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5110 assert(SvROK(qr_ref));
5111 qr = SvRV(qr_ref);
5112 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5113 /* the leaving below frees the tmp qr_ref.
5114 * Give qr a life of its own */
5115 SvREFCNT_inc(qr);
5116 POPSTACK;
5117 FREETMPS;
5118 LEAVE;
5119
5120 }
5121
5122 if (!RExC_utf8 && SvUTF8(qr)) {
5123 /* first time through; the pattern got upgraded; save the
5124 * qr for the next time through */
5125 assert(!pRExC_state->runtime_code_qr);
5126 pRExC_state->runtime_code_qr = qr;
5127 return 0;
5128 }
5129
5130
5131 /* extract any code blocks within the returned qr// */
5132
5133
5134 /* merge the main (r1) and run-time (r2) code blocks into one */
5135 {
5136 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5137 struct reg_code_block *new_block, *dst;
5138 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5139 int i1 = 0, i2 = 0;
5140
5141 if (!r2->num_code_blocks) /* we guessed wrong */
5142 return 1;
5143
5144 Newx(new_block,
5145 r1->num_code_blocks + r2->num_code_blocks,
5146 struct reg_code_block);
5147 dst = new_block;
5148
5149 while ( i1 < r1->num_code_blocks
5150 || i2 < r2->num_code_blocks)
5151 {
5152 struct reg_code_block *src;
5153 bool is_qr = 0;
5154
5155 if (i1 == r1->num_code_blocks) {
5156 src = &r2->code_blocks[i2++];
5157 is_qr = 1;
5158 }
5159 else if (i2 == r2->num_code_blocks)
5160 src = &r1->code_blocks[i1++];
5161 else if ( r1->code_blocks[i1].start
5162 < r2->code_blocks[i2].start)
5163 {
5164 src = &r1->code_blocks[i1++];
5165 assert(src->end < r2->code_blocks[i2].start);
5166 }
5167 else {
5168 assert( r1->code_blocks[i1].start
5169 > r2->code_blocks[i2].start);
5170 src = &r2->code_blocks[i2++];
5171 is_qr = 1;
5172 assert(src->end < r1->code_blocks[i1].start);
5173 }
5174
5175 assert(pat[src->start] == '(');
5176 assert(pat[src->end] == ')');
5177 dst->start = src->start;
5178 dst->end = src->end;
5179 dst->block = src->block;
5180 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5181 : src->src_regex;
5182 dst++;
5183 }
5184 r1->num_code_blocks += r2->num_code_blocks;
5185 Safefree(r1->code_blocks);
5186 r1->code_blocks = new_block;
5187 }
5188
5189 SvREFCNT_dec(qr);
5190 return 1;
5191}
5192
68e2671b 5193
3a21f536 5194/*
ab442d34 5195 * Perl_re_op_compile - the perl internal RE engine's function to compile a
74529a43 5196 * regular expression into internal code.
28dc7a3e
DM
5197 * The pattern may be passed either as:
5198 * a list of SVs (patternp plus pat_count)
5199 * a list of OPs (expr)
9f141731 5200 * If both are passed, the SV list is used, but the OP list indicates
ab442d34 5201 * which SVs are actually pre-compiled code blocks
9f141731 5202 *
ab442d34 5203 * The SVs in the list have magic and qr overloading applied to them (and
9f141731
DM
5204 * the list may be modified in-place with replacement SVs in the latter
5205 * case).
5206 *
ab442d34 5207 * If the pattern hasn't changed from old_re, then old_re will be
9f141731
DM
5208 * returned.
5209 *
3c13cae6
DM
5210 * eng is the current engine. If that engine has an op_comp method, then
5211 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5212 * do the initial concatenation of arguments and pass on to the external
ab442d34 5213 * engine.
9f141731 5214 *
ab442d34
DM
5215 * If is_bare_re is not null, set it to a boolean indicating whether the
5216 * arg list reduced (after overloading) to a single bare regex which has
5217 * been returned (i.e. /$qr/).
3a21f536 5218 *
514a91f1
DM
5219 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5220 *
a5ae69f0
DM
5221 * pm_flags contains the PMf_* flags, typically based on those from the
5222 * pm_flags field of the related PMOP. Currently we're only interested in
5223 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
514a91f1 5224 *
3a21f536
DM
5225 * We can't allocate space until we know how big the compiled form will be,
5226 * but we can't compile it (and thus know how big it is) until we've got a
5227 * place to put the code. So we cheat: we compile it twice, once with code
5228 * generation turned off and size counting turned on, and once "for real".
5229 * This also means that we don't allocate space until we are sure that the
5230 * thing really will compile successfully, and we never have to move the
5231 * code and thus invalidate pointers into it. (Note that it has to be in
5232 * one piece because free() must be able to free it all.) [NB: not true in perl]
5233 *
5234 * Beware that the optimization-preparation code in here knows about some
5235 * of the structure of the compiled regexp. [I'll say.]
5236 */
5237
3ab4a224 5238REGEXP *
9f141731 5239Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6ae44cd2 5240 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
76ac488f 5241 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
2a5d9b1d
RGS
5242{
5243 dVAR;
288b8c02
NC
5244 REGEXP *rx;
5245 struct regexp *r;
f8fc2ecf 5246 register regexp_internal *ri;
3ab4a224 5247 STRLEN plen;
21f84aaf 5248 char * VOL exp;
5d51ce98 5249 char* xend;
c277df42 5250 regnode *scan;
a0d0e21e 5251 I32 flags;
a0d0e21e 5252 I32 minlen = 0;
37acfcba 5253 U32 rx_flags;
e174e89a 5254 SV * VOL pat;
e7f38d0f
YO
5255
5256 /* these are all flags - maybe they should be turned
5257 * into a single int with different bit masks */
5258 I32 sawlookahead = 0;
a0d0e21e
LW
5259 I32 sawplus = 0;
5260 I32 sawopen = 0;
29b09c41 5261 bool used_setjump = FALSE;
37acfcba 5262 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
83dd4485 5263 bool code_is_utf8 = 0;
23f4026a 5264 bool VOL recompile = 0;
d24ca0c5 5265 bool runtime_code = 0;
bbd61b5f
KW
5266 U8 jump_ret = 0;
5267 dJMPENV;
2c2d71f5 5268 scan_data_t data;
830247a4 5269 RExC_state_t RExC_state;
be8e71aa 5270 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 5271#ifdef TRIE_STUDY_OPT
5d51ce98 5272 int restudied;
07be1b83
YO
5273 RExC_state_t copyRExC_state;
5274#endif
2a5d9b1d 5275 GET_RE_DEBUG_FLAGS_DECL;
7918f24d 5276
3c13cae6
DM
5277 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5278
6d5c990f 5279 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 5280
370b8f2f 5281#ifndef PERL_IN_XSUB_RE
a3e1f3a6
KW
5282 /* Initialize these here instead of as-needed, as is quick and avoids
5283 * having to test them each time otherwise */
5284 if (! PL_AboveLatin1) {
5285 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5286 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5287 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
3f427fd9
KW
5288
5289 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5290 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5291
5292 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5293 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5294
5295 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5296 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5297
dab0c3e7
KW
5298 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5299
3f427fd9
KW
5300 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5301 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5302
5303 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5304
5305 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5306 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5307
5308 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5309 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5310
3f427fd9
KW
5311 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5312 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5313
5314 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5315 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5316
5317 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5318 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5319
5320 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5321 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5322
5323 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5324 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5325
5326 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5327 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5328
5329 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5330
5331 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5332 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5333
5334 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5335 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
a3e1f3a6 5336 }
370b8f2f 5337#endif
a3e1f3a6 5338
b1603ef8
DM
5339 pRExC_state->code_blocks = NULL;
5340 pRExC_state->num_code_blocks = 0;
9f141731
DM
5341
5342 if (is_bare_re)
76ac488f 5343 *is_bare_re = FALSE;
9f141731 5344
b1603ef8
DM
5345 if (expr && (expr->op_type == OP_LIST ||
5346 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5347
83dd4485
DM
5348 /* is the source UTF8, and how many code blocks are there? */
5349 OP *o;
5350 int ncode = 0;
5351
5352 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5353 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5354 code_is_utf8 = 1;
5355 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5356 /* count of DO blocks */
5357 ncode++;
5358 }
83dd4485 5359 if (ncode) {
b1603ef8 5360 pRExC_state->num_code_blocks = ncode;
3d2bd50a 5361 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
83dd4485
DM
5362 }
5363 }
5364
9f141731
DM
5365 if (pat_count) {
5366 /* handle a list of SVs */
5367
5368 SV **svp;
5369
5370 /* apply magic and RE overloading to each arg */
5371 for (svp = patternp; svp < patternp + pat_count; svp++) {
5372 SV *rx = *svp;
5373 SvGETMAGIC(rx);
5374 if (SvROK(rx) && SvAMAGIC(rx)) {
5375 SV *sv = AMG_CALLunary(rx, regexp_amg);
5376 if (sv) {
5377 if (SvROK(sv))
5378 sv = SvRV(sv);
5379 if (SvTYPE(sv) != SVt_REGEXP)
5380 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5381 *svp = sv;
5382 }
5383 }
5384 }
5385
5386 if (pat_count > 1) {
346d3070
DM
5387 /* concat multiple args and find any code block indexes */
5388
5389 OP *o = NULL;
5390 int n = 0;
947535e3 5391 bool utf8 = 0;
e03b874a 5392 STRLEN orig_patlen = 0;
346d3070
DM
5393
5394 if (pRExC_state->num_code_blocks) {
5395 o = cLISTOPx(expr)->op_first;
5396 assert(o->op_type == OP_PUSHMARK);
5397 o = o->op_sibling;
5398 }
b1603ef8 5399
9f141731
DM
5400 pat = newSVpvn("", 0);
5401 SAVEFREESV(pat);
947535e3
DM
5402
5403 /* determine if the pattern is going to be utf8 (needed
5404 * in advance to align code block indices correctly).
5405 * XXX This could fail to be detected for an arg with
5406 * overloading but not concat overloading; but the main effect
5407 * in this obscure case is to need a 'use re eval' for a
5408 * literal code block */
5409 for (svp = patternp; svp < patternp + pat_count; svp++) {
5410 if (SvUTF8(*svp))
5411 utf8 = 1;
5412 }
5413 if (utf8)
5414 SvUTF8_on(pat);
5415
9f141731
DM
5416 for (svp = patternp; svp < patternp + pat_count; svp++) {
5417 SV *sv, *msv = *svp;
b30fcab9 5418 SV *rx;
346d3070
DM
5419 bool code = 0;
5420 if (o) {
5421 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
b30fcab9
DM
5422 assert(n < pRExC_state->num_code_blocks);
5423 pRExC_state->code_blocks[n].start = SvCUR(pat);
5424 pRExC_state->code_blocks[n].block = o;
5425 pRExC_state->code_blocks[n].src_regex = NULL;
346d3070 5426 n++;
346d3070
DM
5427 code = 1;
5428 o = o->op_sibling; /* skip CONST */
5429 assert(o);
5430 }
5431 o = o->op_sibling;;
5432 }
5433
e03b874a
DM
5434 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5435 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5436 {
5437 sv_setsv(pat, sv);
5438 /* overloading involved: all bets are off over literal
5439 * code. Pretend we haven't seen it */
5440 pRExC_state->num_code_blocks -= n;
5441 n = 0;
5442 rx = NULL;
5443
5444 }
5445 else {
5446 while (SvAMAGIC(msv)
5447 && (sv = AMG_CALLunary(msv, string_amg))
5448 && sv != msv)
5449 {
5450 msv = sv;
5451 SvGETMAGIC(msv);
5452 }
5453 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5454 msv = SvRV(msv);
5455 orig_patlen = SvCUR(pat);
5456 sv_catsv_nomg(pat, msv);
5457 rx = msv;
5458 if (code)
5459 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5460 }
5461
b30fcab9 5462 /* extract any code blocks within any embedded qr//'s */
e03b874a 5463 if (rx && SvTYPE(rx) == SVt_REGEXP
3c13cae6 5464 && RX_ENGINE((REGEXP*)rx)->op_comp)
b30fcab9
DM
5465 {
5466
5467 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5468 if (ri->num_code_blocks) {
5469 int i;
629cd4f3
DM
5470 /* the presence of an embedded qr// with code means
5471 * we should always recompile: the text of the
5472 * qr// may not have changed, but it may be a
5473 * different closure than last time */
5474 recompile = 1;
b30fcab9
DM
5475 Renew(pRExC_state->code_blocks,
5476 pRExC_state->num_code_blocks + ri->num_code_blocks,
5477 struct reg_code_block);
5478 pRExC_state->num_code_blocks += ri->num_code_blocks;
5479 for (i=0; i < ri->num_code_blocks; i++) {
5480 struct reg_code_block *src, *dst;
e03b874a 5481 STRLEN offset = orig_patlen
b30fcab9
DM
5482 + ((struct regexp *)SvANY(rx))->pre_prefix;
5483 assert(n < pRExC_state->num_code_blocks);
5484 src = &ri->code_blocks[i];
5485 dst = &pRExC_state->code_blocks[n];
5486 dst->start = src->start + offset;
5487 dst->end = src->end + offset;
5488 dst->block = src->block;
5489 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5490 src->src_regex
5491 ? src->src_regex
5492 : (REGEXP*)rx);
5493 n++;
5494 }
5495 }
5496 }
9f141731
DM
5497 }
5498 SvSETMAGIC(pat);
5499 }
e03b874a
DM
5500 else {
5501 SV *sv;
9f141731 5502 pat = *patternp;
e03b874a
DM
5503 while (SvAMAGIC(pat)
5504 && (sv = AMG_CALLunary(pat, string_amg))
5505 && sv != pat)
5506 {
5507 pat = sv;
5508 SvGETMAGIC(pat);
5509 }
5510 }
9f141731
DM
5511
5512 /* handle bare regex: foo =~ $re */
5513 {
5514 SV *re = pat;
5515 if (SvROK(re))
5516 re = SvRV(re);
5517 if (SvTYPE(re) == SVt_REGEXP) {
5518 if (is_bare_re)
76ac488f 5519 *is_bare_re = TRUE;
9f141731 5520 SvREFCNT_inc(re);
3d2bd50a 5521 Safefree(pRExC_state->code_blocks);
9f141731
DM
5522 return (REGEXP*)re;
5523 }
5524 }
5525 }
5526 else {
5527 /* not a list of SVs, so must be a list of OPs */
5528 assert(expr);
68e2671b 5529 if (expr->op_type == OP_LIST) {
1eacd84c
DM
5530 int i = -1;
5531 bool is_code = 0;
5532 OP *o;
5533
68e2671b
DM
5534 pat = newSVpvn("", 0);
5535 SAVEFREESV(pat);
83dd4485 5536 if (code_is_utf8)
68e2671b 5537 SvUTF8_on(pat);
1eacd84c
DM
5538
5539 /* given a list of CONSTs and DO blocks in expr, append all
5540 * the CONSTs to pat, and record the start and end of each
5541 * code block in code_blocks[] (each DO{} op is followed by an
5542 * OP_CONST containing the corresponding literal '(?{...})
5543 * text)
5544 */
5545 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5546 if (o->op_type == OP_CONST) {
5547 sv_catsv(pat, cSVOPo_sv);
5548 if (is_code) {
5549 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5550 is_code = 0;
5551 }
5552 }
5553 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5554 assert(i+1 < pRExC_state->num_code_blocks);
5555 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5556 pRExC_state->code_blocks[i].block = o;
b30fcab9 5557 pRExC_state->code_blocks[i].src_regex = NULL;
1eacd84c
DM
5558 is_code = 1;
5559 }
5560 }
68e2671b
DM
5561 }
5562 else {
5563 assert(expr->op_type == OP_CONST);
5564 pat = cSVOPx_sv(expr);
74529a43 5565 }
74529a43 5566 }
74529a43 5567
9f141731 5568 exp = SvPV_nomg(pat, plen);
1b34bc43 5569
3c13cae6 5570 if (!eng->op_comp) {
9f141731
DM
5571 if ((SvUTF8(pat) && IN_BYTES)
5572 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5573 {
5574 /* make a temporary copy; either to convert to bytes,
5575 * or to avoid repeating get-magic / overloaded stringify */
5576 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5577 (IN_BYTES ? 0 : SvUTF8(pat)));
5578 }
3d2bd50a 5579 Safefree(pRExC_state->code_blocks);
37acfcba 5580 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1b34bc43 5581 }
9f141731 5582
9f141731
DM
5583 /* ignore the utf8ness if the pattern is 0 length */
5584 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
e40e74fe 5585 RExC_uni_semantics = 0;
4624b182 5586 RExC_contains_locale = 0;
d24ca0c5 5587 pRExC_state->runtime_code_qr = NULL;
7b597bb8 5588
d6bd454d 5589 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
5590 /* Longjmp back to here if have to switch in midstream to utf8 */
5591 if (! RExC_orig_utf8) {
5592 JMPENV_PUSH(jump_ret);
29b09c41 5593 used_setjump = TRUE;
bbd61b5f
KW
5594 }
5595
5d51ce98 5596 if (jump_ret == 0) { /* First time through */
29b09c41 5597 xend = exp + plen;
29b09c41 5598
5d51ce98
KW
5599 DEBUG_COMPILE_r({
5600 SV *dsv= sv_newmortal();
5601 RE_PV_QUOTED_DECL(s, RExC_utf8,
5602 dsv, exp, plen, 60);
5603 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5604 PL_colors[4],PL_colors[5],s);
5605 });
5606 }
5607 else { /* longjumped back */
2bd8e0da
DM
5608 U8 *src, *dst;
5609 int n=0;
5610 STRLEN s = 0, d = 0;
5611 bool do_end = 0;
bbd61b5f 5612
5d51ce98
KW
5613 /* If the cause for the longjmp was other than changing to utf8, pop
5614 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
5615 if (jump_ret != UTF8_LONGJMP) {
5616 JMPENV_POP;
5617 JMPENV_JUMP(jump_ret);
5618 }
5619
595598ee
KW
5620 GET_RE_DEBUG_FLAGS;
5621
bbd61b5f
KW
5622 /* It's possible to write a regexp in ascii that represents Unicode
5623 codepoints outside of the byte range, such as via \x{100}. If we
5624 detect such a sequence we have to convert the entire pattern to utf8
5625 and then recompile, as our sizing calculation will have been based
5626 on 1 byte == 1 character, but we will need to use utf8 to encode
5627 at least some part of the pattern, and therefore must convert the whole
5628 thing.
5629 -- dmq */
5630 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5631 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
68e2671b 5632
2bd8e0da
DM
5633 /* upgrade pattern to UTF8, and if there are code blocks,
5634 * recalculate the indices.
5635 * This is essentially an unrolled Perl_bytes_to_utf8() */
5636
5637 src = (U8*)SvPV_nomg(pat, plen);
5638 Newx(dst, plen * 2 + 1, U8);
5639
5640 while (s < plen) {
5641 const UV uv = NATIVE_TO_ASCII(src[s]);
5642 if (UNI_IS_INVARIANT(uv))
5643 dst[d] = (U8)UTF_TO_NATIVE(uv);
5644 else {
5645 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5646 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5647 }
5648 if (n < pRExC_state->num_code_blocks) {
5649 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5650 pRExC_state->code_blocks[n].start = d;
5651 assert(dst[d] == '(');
5652 do_end = 1;
5653 }
5654 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5655 pRExC_state->code_blocks[n].end = d;
5656 assert(dst[d] == ')');
5657 do_end = 0;
5658 n++;
5659 }
5660 }
5661 s++;
5662 d++;
68e2671b 5663 }
2bd8e0da
DM
5664 dst[d] = '\0';
5665 plen = d;
5666 exp = (char*) dst;
5667 xend = exp + plen;
5668 SAVEFREEPV(exp);
68e2671b 5669 RExC_orig_utf8 = RExC_utf8 = 1;
3b16d10d 5670 }
6ae44cd2 5671
3b16d10d 5672 /* return old regex if pattern hasn't changed */
6ae44cd2 5673
3b16d10d 5674 if ( old_re
629cd4f3 5675 && !recompile
3b16d10d
DM
5676 && !!RX_UTF8(old_re) == !!RExC_utf8
5677 && RX_PRECOMP(old_re)
5678 && RX_PRELEN(old_re) == plen
5679 && memEQ(RX_PRECOMP(old_re), exp, plen))
5680 {
629cd4f3 5681 /* with runtime code, always recompile */
d24ca0c5
DM
5682 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5683 exp, plen);
5684 if (!runtime_code) {
629cd4f3
DM
5685 ReREFCNT_inc(old_re);
5686 if (used_setjump) {
5687 JMPENV_POP;
5688 }
5689 Safefree(pRExC_state->code_blocks);
5690 return old_re;
3b16d10d 5691 }
bbd61b5f 5692 }
d24ca0c5
DM
5693 else if ((pm_flags & PMf_USE_RE_EVAL)
5694 /* this second condition covers the non-regex literal case,
5695 * i.e. $foo =~ '(?{})'. */
5696 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5697 && (PL_hints & HINT_RE_EVAL))
5698 )
5699 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5700 exp, plen);
bbd61b5f 5701
5d51ce98
KW
5702#ifdef TRIE_STUDY_OPT
5703 restudied = 0;
5704#endif
5705
37acfcba 5706 rx_flags = orig_rx_flags;
a62b1201 5707
4624b182
KW
5708 if (initial_charset == REGEX_LOCALE_CHARSET) {
5709 RExC_contains_locale = 1;
5710 }
5711 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5712
5713 /* Set to use unicode semantics if the pattern is in utf8 and has the
5714 * 'depends' charset specified, as it means unicode when utf8 */
37acfcba 5715 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
29b09c41
KW
5716 }
5717
02daf0ab 5718 RExC_precomp = exp;
37acfcba 5719 RExC_flags = rx_flags;
514a91f1 5720 RExC_pm_flags = pm_flags;
d24ca0c5
DM
5721
5722 if (runtime_code) {
5723 if (PL_tainting && PL_tainted)
5724 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5725
5726 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5727 /* whoops, we have a non-utf8 pattern, whilst run-time code
5728 * got compiled as utf8. Try again with a utf8 pattern */
5729 JMPENV_JUMP(UTF8_LONGJMP);
5730 }
5731 }
5732 assert(!pRExC_state->runtime_code_qr);
5733
830247a4 5734 RExC_sawback = 0;
bbce6d69 5735
830247a4 5736 RExC_seen = 0;
b57e4118 5737 RExC_in_lookbehind = 0;
830247a4 5738 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
830247a4 5739 RExC_extralen = 0;
e2a7e165 5740 RExC_override_recoding = 0;
c277df42 5741
bbce6d69 5742 /* First pass: determine size, legality. */
830247a4 5743 RExC_parse = exp;
fac92740 5744 RExC_start = exp;
830247a4
IZ
5745 RExC_end = xend;
5746 RExC_naughty = 0;
5747 RExC_npar = 1;
e2e6a0f1 5748 RExC_nestroot = 0;
830247a4
IZ
5749 RExC_size = 0L;
5750 RExC_emit = &PL_regdummy;
5751 RExC_whilem_seen = 0;
40d049e4
YO
5752 RExC_open_parens = NULL;
5753 RExC_close_parens = NULL;
5754 RExC_opend = NULL;
81714fb9 5755 RExC_paren_names = NULL;
1f1031fe
YO
5756#ifdef DEBUGGING
5757 RExC_paren_name_list = NULL;
5758#endif
40d049e4
YO
5759 RExC_recurse = NULL;
5760 RExC_recurse_count = 0;
b1603ef8 5761 pRExC_state->code_index = 0;
81714fb9 5762
85ddcde9
JH
5763#if 0 /* REGC() is (currently) a NOP at the first pass.
5764 * Clever compilers notice this and complain. --jhi */
830247a4 5765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 5766#endif
5a415bbc
KW
5767 DEBUG_PARSE_r(
5768 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5769 RExC_lastnum=0;
5770 RExC_lastparse=NULL;
5771 );
3dab1dad 5772 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 5773 RExC_precomp = NULL;
3d2bd50a 5774 Safefree(pRExC_state->code_blocks);
a0d0e21e
LW
5775 return(NULL);
5776 }
bbd61b5f 5777
29b09c41
KW
5778 /* Here, finished first pass. Get rid of any added setjmp */
5779 if (used_setjump) {
bbd61b5f 5780 JMPENV_POP;
02daf0ab 5781 }
e40e74fe 5782
07be1b83 5783 DEBUG_PARSE_r({
81714fb9
YO
5784 PerlIO_printf(Perl_debug_log,
5785 "Required size %"IVdf" nodes\n"
5786 "Starting second pass (creation)\n",
5787 (IV)RExC_size);
07be1b83
YO
5788 RExC_lastnum=0;
5789 RExC_lastparse=NULL;
5790 });
e40e74fe
KW
5791
5792 /* The first pass could have found things that force Unicode semantics */
5793 if ((RExC_utf8 || RExC_uni_semantics)
37acfcba 5794 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
e40e74fe 5795 {
37acfcba 5796 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
e40e74fe
KW
5797 }
5798
c277df42
IZ
5799 /* Small enough for pointer-storage convention?
5800 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
5801 if (RExC_size >= 0x10000L && RExC_extralen)
5802 RExC_size += RExC_extralen;
c277df42 5803 else
830247a4
IZ
5804 RExC_extralen = 0;
5805 if (RExC_whilem_seen > 15)
5806 RExC_whilem_seen = 15;
a0d0e21e 5807
f9f4320a
YO
5808 /* Allocate space and zero-initialize. Note, the two step process
5809 of zeroing when in debug mode, thus anything assigned has to
5810 happen after that */
d2f13c59 5811 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 5812 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
5813 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5814 char, regexp_internal);
5815 if ( r == NULL || ri == NULL )
b45f050a 5816 FAIL("Regexp out of space");
0f79a09d
GS
5817#ifdef DEBUGGING
5818 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 5819 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 5820#else
f8fc2ecf
YO
5821 /* bulk initialize base fields with 0. */
5822 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 5823#endif
58e23c8d
YO
5824
5825 /* non-zero initialization begins here */
f8fc2ecf 5826 RXi_SET( r, ri );
3c13cae6 5827 r->engine= eng;
37acfcba 5828 r->extflags = rx_flags;
514a91f1 5829 if (pm_flags & PMf_IS_QR) {
3d2bd50a
DM
5830 ri->code_blocks = pRExC_state->code_blocks;
5831 ri->num_code_blocks = pRExC_state->num_code_blocks;
5832 }
5833 else
5834 SAVEFREEPV(pRExC_state->code_blocks);
5835
bcdf7404 5836 {
f7819f85 5837 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
a62b1201 5838 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
c5ea2ffa
KW
5839
5840 /* The caret is output if there are any defaults: if not all the STD
5841 * flags are set, or if no character set specifier is needed */
5842 bool has_default =
5843 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5844 || ! has_charset);
bcdf7404 5845 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
5846 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5847 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
5848 const char *fptr = STD_PAT_MODS; /*"msix"*/
5849 char *p;
fb85c044 5850 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
5851 * on. If more precision is desired, we could do a population count of
5852 * the flags set. This could be done with a small lookup table, or by
5853 * shifting, masking and adding, or even, when available, assembly
5854 * language for a machine-language population count.
5855 * We never output a minus, as all those are defaults, so are
5856 * covered by the caret */
fb85c044 5857 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa 5858 + has_default /* If needs a caret */
a62b1201
KW
5859
5860 /* If needs a character set specifier */
5861 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
bcdf7404
YO
5862 + (sizeof(STD_PAT_MODS) - 1)
5863 + (sizeof("(?:)") - 1);
5864
c5ea2ffa 5865 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 5866 SvPOK_on(rx);
9f141731
DM
5867 if (RExC_utf8)
5868 SvFLAGS(rx) |= SVf_UTF8;
bcdf7404 5869 *p++='('; *p++='?';
9de15fec
KW
5870
5871 /* If a default, cover it using the caret */
c5ea2ffa 5872 if (has_default) {
85508812 5873 *p++= DEFAULT_PAT_MOD;
fb85c044 5874 }
c5ea2ffa 5875 if (has_charset) {
a62b1201
KW
5876 STRLEN len;
5877 const char* const name = get_regex_charset_name(r->extflags, &len);
5878 Copy(name, p, len, char);
5879 p += len;
9de15fec 5880 }
f7819f85
A
5881 if (has_p)
5882 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 5883 {
bcdf7404 5884 char ch;
bcdf7404
YO
5885 while((ch = *fptr++)) {
5886 if(reganch & 1)
5887 *p++ = ch;
bcdf7404
YO
5888 reganch >>= 1;
5889 }
bcdf7404
YO
5890 }
5891
28d8d7f4 5892 *p++ = ':';
bb661a58 5893 Copy(RExC_precomp, p, plen, char);
efd26800
NC
5894 assert ((RX_WRAPPED(rx) - p) < 16);
5895 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 5896 p += plen;
bcdf7404 5897 if (has_runon)
28d8d7f4
YO
5898 *p++ = '\n';
5899 *p++ = ')';
5900 *p = 0;
fb85c044 5901 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
5902 }
5903
bbe252da 5904 r->intflags = 0;
830247a4 5905 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 5906
6bda09f9 5907 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
5908 Newxz(RExC_open_parens, RExC_npar,regnode *);
5909 SAVEFREEPV(RExC_open_parens);
5910 Newxz(RExC_close_parens,RExC_npar,regnode *);
5911 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
5912 }
5913
5914 /* Useful during FAIL. */
7122b237
YO
5915#ifdef RE_TRACK_PATTERN_OFFSETS
5916 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 5917 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 5918 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 5919 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 5920 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
5921#endif
5922 SetProgLen(ri,RExC_size);
288b8c02 5923 RExC_rx_sv = rx;
830247a4 5924 RExC_rx = r;
f8fc2ecf 5925 RExC_rxi = ri;
bbce6d69 5926
5927 /* Second pass: emit code. */
37acfcba 5928 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
514a91f1 5929 RExC_pm_flags = pm_flags;
830247a4
IZ
5930 RExC_parse = exp;
5931 RExC_end = xend;
5932 RExC_naughty = 0;
5933 RExC_npar = 1;
f8fc2ecf
YO
5934 RExC_emit_start = ri->program;
5935 RExC_emit = ri->program;
3b57cd43 5936 RExC_emit_bound = ri->program + RExC_size + 1;
68e2671b 5937 pRExC_state->code_index = 0;
3b57cd43 5938
830247a4 5939 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 5940 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 5941 ReREFCNT_dec(rx);
a0d0e21e 5942 return(NULL);
80757612 5943 }
07be1b83
YO
5944 /* XXXX To minimize changes to RE engine we always allocate
5945 3-units-long substrs field. */
5946 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
5947 if (RExC_recurse_count) {
5948 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5949 SAVEFREEPV(RExC_recurse);
5950 }
a0d0e21e 5951
07be1b83 5952reStudy:
e7f38d0f 5953 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 5954 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 5955
07be1b83 5956#ifdef TRIE_STUDY_OPT
0934c9d9
SH
5957 if (!restudied) {
5958 StructCopy(&zero_scan_data, &data, scan_data_t);
5959 copyRExC_state = RExC_state;
5960 } else {
5d458dd8 5961 U32 seen=RExC_seen;
07be1b83 5962 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
5963
5964 RExC_state = copyRExC_state;
5965 if (seen & REG_TOP_LEVEL_BRANCHES)
5966 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5967 else
5968 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 5969 if (data.last_found) {
07be1b83 5970 SvREFCNT_dec(data.longest_fixed);
07be1b83 5971 SvREFCNT_dec(data.longest_float);
07be1b83 5972 SvREFCNT_dec(data.last_found);
1de06328 5973 }
40d049e4 5974 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 5975 }
40d049e4
YO
5976#else
5977 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 5978#endif
fc8cd66c 5979
a0d0e21e 5980 /* Dig out information for optimizations. */
f7819f85 5981 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
5982 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5983
a0ed51b3 5984 if (UTF)
8f6ae13c 5985 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 5986 ri->regstclass = NULL;
830247a4 5987 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 5988 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 5989 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 5990
1de06328
YO
5991 /* testing for BRANCH here tells us whether there is "must appear"
5992 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 5993 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 5994 I32 fake;
c5254dd6 5995 STRLEN longest_float_length, longest_fixed_length;
07be1b83 5996 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 5997 int stclass_flag;
07be1b83 5998 I32 last_close = 0; /* pointed to by data */
5339e136
YO
5999 regnode *first= scan;
6000 regnode *first_next= regnext(first);
639081d6
YO
6001 /*
6002 * Skip introductions and multiplicators >= 1
6003 * so that we can extract the 'meat' of the pattern that must
6004 * match in the large if() sequence following.
6005 * NOTE that EXACT is NOT covered here, as it is normally
6006 * picked up by the optimiser separately.
6007 *
6008 * This is unfortunate as the optimiser isnt handling lookahead
6009 * properly currently.
6010 *
6011 */
a0d0e21e 6012 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 6013 /* An OR of *one* alternative - should not happen now. */
5339e136 6014 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 6015 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 6016 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
6017 (OP(first) == PLUS) ||
6018 (OP(first) == MINMOD) ||
653099ff 6019 /* An {n,m} with n>0 */
5339e136
YO
6020 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6021 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 6022 {
639081d6
YO
6023 /*
6024 * the only op that could be a regnode is PLUS, all the rest
6025 * will be regnode_1 or regnode_2.
6026 *
6027 */
a0d0e21e
LW
6028 if (OP(first) == PLUS)
6029 sawplus = 1;
6030 else
3dab1dad 6031 first += regarglen[OP(first)];
686b73d4 6032
639081d6 6033 first = NEXTOPER(first);
5339e136 6034 first_next= regnext(first);
a687059c
LW
6035 }
6036
a0d0e21e
LW
6037 /* Starting-point info. */
6038 again:
786e8c11 6039 DEBUG_PEEP("first:",first,0);
07be1b83 6040 /* Ignore EXACT as we deal with it later. */
3dab1dad 6041 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 6042 if (OP(first) == EXACT)
6f207bd3 6043 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 6044 else
f8fc2ecf 6045 ri->regstclass = first;
b3c9acc1 6046 }
686b73d4 6047#ifdef TRIE_STCLASS
786e8c11 6048 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 6049 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 6050 {
786e8c11 6051 regnode *trie_op;
07be1b83 6052 /* this can happen only on restudy */
786e8c11 6053 if ( OP(first) == TRIE ) {
c944940b 6054 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 6055 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
6056 StructCopy(first,trieop,struct regnode_1);
6057 trie_op=(regnode *)trieop;
6058 } else {
c944940b 6059 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 6060 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
6061 StructCopy(first,trieop,struct regnode_charclass);
6062 trie_op=(regnode *)trieop;
6063 }
1de06328 6064 OP(trie_op)+=2;
786e8c11 6065 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 6066 ri->regstclass = trie_op;
07be1b83 6067 }
686b73d4 6068#endif
e52fc539 6069 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 6070 ri->regstclass = first;
3dab1dad
YO
6071 else if (PL_regkind[OP(first)] == BOUND ||
6072 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 6073 ri->regstclass = first;
3dab1dad 6074 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
6075 r->extflags |= (OP(first) == MBOL
6076 ? RXf_ANCH_MBOL
cad2e5aa 6077 : (OP(first) == SBOL
bbe252da
YO
6078 ? RXf_ANCH_SBOL
6079 : RXf_ANCH_BOL));
a0d0e21e 6080 first = NEXTOPER(first);
774d564b 6081 goto again;
6082 }
6083 else if (OP(first) == GPOS) {
bbe252da 6084 r->extflags |= RXf_ANCH_GPOS;
774d564b 6085 first = NEXTOPER(first);
6086 goto again;
a0d0e21e 6087 }
cf2a2b69
YO
6088 else if ((!sawopen || !RExC_sawback) &&
6089 (OP(first) == STAR &&
3dab1dad 6090 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
471f5387 6091 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
a0d0e21e
LW
6092 {
6093 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
6094 const int type =
6095 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
6096 ? RXf_ANCH_MBOL
6097 : RXf_ANCH_SBOL;
6098 r->extflags |= type;
6099 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 6100 first = NEXTOPER(first);
774d564b 6101 goto again;
a0d0e21e 6102 }
e7f38d0f 6103 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
471f5387 6104 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
cad2e5aa 6105 /* x+ must match at the 1st pos of run of x's */
bbe252da 6106 r->intflags |= PREGf_SKIP;
a0d0e21e 6107
c277df42 6108 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 6109#ifdef TRIE_STUDY_OPT
81714fb9 6110 DEBUG_PARSE_r(
be8e71aa
YO
6111 if (!restudied)
6112 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6113 (IV)(first - scan + 1))
6114 );
6115#else
81714fb9 6116 DEBUG_PARSE_r(
be8e71aa
YO
6117 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6118 (IV)(first - scan + 1))
6119 );
6120#endif
6121
6122
a0d0e21e
LW
6123 /*
6124 * If there's something expensive in the r.e., find the
6125 * longest literal string that must appear and make it the
6126 * regmust. Resolve ties in favor of later strings, since
6127 * the regstart check works with the beginning of the r.e.
6128 * and avoiding duplication strengthens checking. Not a
6129 * strong reason, but sufficient in the absence of others.
6130 * [Now we resolve ties in favor of the earlier string if
c277df42 6131 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
6132 * earlier string may buy us something the later one won't.]
6133 */
686b73d4 6134
396482e1
GA
6135 data.longest_fixed = newSVpvs("");
6136 data.longest_float = newSVpvs("");
6137 data.last_found = newSVpvs("");
c277df42
IZ
6138 data.longest = &(data.longest_fixed);
6139 first = scan;
f8fc2ecf 6140 if (!ri->regstclass) {
e755fd73 6141 cl_init(pRExC_state, &ch_class);
653099ff
GS
6142 data.start_class = &ch_class;
6143 stclass_flag = SCF_DO_STCLASS_AND;
6144 } else /* XXXX Check for BOUND? */
6145 stclass_flag = 0;
cb434fcc 6146 data.last_closep = &last_close;
de8c5301 6147
1de06328 6148 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
6149 &data, -1, NULL, NULL,
6150 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 6151
686b73d4 6152
786e8c11
YO
6153 CHECK_RESTUDY_GOTO;
6154
6155
830247a4 6156 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 6157 && data.last_start_min == 0 && data.last_end > 0
830247a4 6158 && !RExC_seen_zerolen
2bf803e2 6159 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
6160 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6161 r->extflags |= RXf_CHECK_ALL;
304ee84b 6162 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
6163 SvREFCNT_dec(data.last_found);
6164
1de06328
YO
6165 /* Note that code very similar to this but for anchored string
6166 follows immediately below, changes may need to be made to both.
6167 Be careful.
6168 */
a0ed51b3 6169 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 6170 if (longest_float_length
c277df42
IZ
6171 || (data.flags & SF_FL_BEFORE_EOL
6172 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 6173 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 6174 {
1182767e 6175 I32 t,ml;
cf93c79d 6176
a0c4c608 6177 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
bb914485
KW
6178 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6179 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6180 && data.offset_fixed == data.offset_float_min
6181 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
aca2d497
IZ
6182 goto remove_float; /* As in (a)+. */
6183
1de06328
YO
6184 /* copy the information about the longest float from the reg_scan_data
6185 over to the program. */
33b8afdf
JH
6186 if (SvUTF8(data.longest_float)) {
6187 r->float_utf8 = data.longest_float;
c445ea15 6188 r->float_substr = NULL;
33b8afdf
JH
6189 } else {
6190 r->float_substr = data.longest_float;
c445ea15 6191 r->float_utf8 = NULL;
33b8afdf 6192 }
1de06328
YO
6193 /* float_end_shift is how many chars that must be matched that
6194 follow this item. We calculate it ahead of time as once the
6195 lookbehind offset is added in we lose the ability to correctly
6196 calculate it.*/
6197 ml = data.minlen_float ? *(data.minlen_float)
1182767e 6198 : (I32)longest_float_length;
1de06328
YO
6199 r->float_end_shift = ml - data.offset_float_min
6200 - longest_float_length + (SvTAIL(data.longest_float) != 0)
6201 + data.lookbehind_float;
6202 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 6203 r->float_max_offset = data.offset_float_max;
1182767e 6204 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
6205 r->float_max_offset -= data.lookbehind_float;
6206
cf93c79d
IZ
6207 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
6208 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 6209 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 6210 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
6211 }
6212 else {
aca2d497 6213 remove_float:
c445ea15 6214 r->float_substr = r->float_utf8 = NULL;
c277df42 6215 SvREFCNT_dec(data.longest_float);
c5254dd6 6216 longest_float_length = 0;
a0d0e21e 6217 }
c277df42 6218
1de06328
YO
6219 /* Note that code very similar to this but for floating string
6220 is immediately above, changes may need to be made to both.
6221 Be careful.
6222 */
a0ed51b3 6223 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
a0c4c608
KW
6224
6225 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
bb914485
KW
6226 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6227 && (longest_fixed_length
6228 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
6229 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6230 || (RExC_flags & RXf_PMf_MULTILINE)))) )
1de06328 6231 {
1182767e 6232 I32 t,ml;
cf93c79d 6233
1de06328
YO
6234 /* copy the information about the longest fixed
6235 from the reg_scan_data over to the program. */
33b8afdf
JH
6236 if (SvUTF8(data.longest_fixed)) {
6237 r->anchored_utf8 = data.longest_fixed;
c445ea15 6238 r->anchored_substr = NULL;
33b8afdf
JH
6239 } else {
6240 r->anchored_substr = data.longest_fixed;
c445ea15 6241 r->anchored_utf8 = NULL;
33b8afdf 6242 }
1de06328
YO
6243 /* fixed_end_shift is how many chars that must be matched that
6244 follow this item. We calculate it ahead of time as once the
6245 lookbehind offset is added in we lose the ability to correctly
6246 calculate it.*/
6247 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 6248 : (I32)longest_fixed_length;
1de06328
YO
6249 r->anchored_end_shift = ml - data.offset_fixed
6250 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
6251 + data.lookbehind_fixed;
6252 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6253
cf93c79d
IZ
6254 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
6255 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 6256 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 6257 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
6258 }
6259 else {
c445ea15 6260 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 6261 SvREFCNT_dec(data.longest_fixed);
c5254dd6 6262 longest_fixed_length = 0;
a0d0e21e 6263 }
f8fc2ecf
YO
6264 if (ri->regstclass
6265 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6266 ri->regstclass = NULL;
f4244008 6267
33b8afdf
JH
6268 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6269 && stclass_flag
653099ff 6270 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
6271 && !cl_is_anything(data.start_class))
6272 {
2eccd3b2 6273 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 6274 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 6275
f8fc2ecf 6276 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
6277 struct regnode_charclass_class);
6278 StructCopy(data.start_class,
f8fc2ecf 6279 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 6280 struct regnode_charclass_class);
f8fc2ecf 6281 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 6282 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 6283 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 6284 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 6285 PerlIO_printf(Perl_debug_log,
a0288114 6286 "synthetic stclass \"%s\".\n",
3f7c398e 6287 SvPVX_const(sv));});
653099ff 6288 }
c277df42
IZ
6289
6290 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 6291 if (longest_fixed_length > longest_float_length) {
1de06328 6292 r->check_end_shift = r->anchored_end_shift;
c277df42 6293 r->check_substr = r->anchored_substr;
33b8afdf 6294 r->check_utf8 = r->anchored_utf8;
c277df42 6295 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
6296 if (r->extflags & RXf_ANCH_SINGLE)
6297 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
6298 }
6299 else {
1de06328 6300 r->check_end_shift = r->float_end_shift;
c277df42 6301 r->check_substr = r->float_substr;
33b8afdf 6302 r->check_utf8 = r->float_utf8;
1de06328
YO
6303 r->check_offset_min = r->float_min_offset;
6304 r->check_offset_max = r->float_max_offset;
a0d0e21e 6305 }
30382c73
IZ
6306 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6307 This should be changed ASAP! */
bbe252da
YO
6308 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6309 r->extflags |= RXf_USE_INTUIT;
33b8afdf 6310 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 6311 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 6312 }
1de06328
YO
6313 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6314 if ( (STRLEN)minlen < longest_float_length )
6315 minlen= longest_float_length;
6316 if ( (STRLEN)minlen < longest_fixed_length )
6317 minlen= longest_fixed_length;
6318 */
a0ed51b3
LW
6319 }
6320 else {
c277df42
IZ
6321 /* Several toplevels. Best we can is to set minlen. */
6322 I32 fake;
653099ff 6323 struct regnode_charclass_class ch_class;
cb434fcc 6324 I32 last_close = 0;
686b73d4 6325
5d458dd8 6326 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 6327
f8fc2ecf 6328 scan = ri->program + 1;
e755fd73 6329 cl_init(pRExC_state, &ch_class);
653099ff 6330 data.start_class = &ch_class;
cb434fcc 6331 data.last_closep = &last_close;
07be1b83 6332
de8c5301 6333
1de06328 6334 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 6335 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 6336
786e8c11 6337 CHECK_RESTUDY_GOTO;
07be1b83 6338
33b8afdf 6339 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 6340 = r->float_substr = r->float_utf8 = NULL;
f4244008 6341
653099ff 6342 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
6343 && !cl_is_anything(data.start_class))
6344 {
2eccd3b2 6345 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 6346 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 6347
f8fc2ecf 6348 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
6349 struct regnode_charclass_class);
6350 StructCopy(data.start_class,
f8fc2ecf 6351 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 6352 struct regnode_charclass_class);
f8fc2ecf 6353 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 6354 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 6355 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 6356 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 6357 PerlIO_printf(Perl_debug_log,
a0288114 6358 "synthetic stclass \"%s\".\n",
3f7c398e 6359 SvPVX_const(sv));});
653099ff 6360 }
a0d0e21e
LW
6361 }
6362
1de06328
YO
6363 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6364 the "real" pattern. */
cf9788e3
RGS
6365 DEBUG_OPTIMISE_r({
6366 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 6367 (IV)minlen, (IV)r->minlen);
cf9788e3 6368 });
de8c5301 6369 r->minlenret = minlen;
1de06328
YO
6370 if (r->minlen < minlen)
6371 r->minlen = minlen;
6372
b81d288d 6373 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 6374 r->extflags |= RXf_GPOS_SEEN;
830247a4 6375 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 6376 r->extflags |= RXf_LOOKBEHIND_SEEN;
471f5387 6377 if (pRExC_state->num_code_blocks)
bbe252da 6378 r->extflags |= RXf_EVAL_SEEN;
f33976b4 6379 if (RExC_seen & REG_SEEN_CANY)
bbe252da 6380 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 6381 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 6382 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 6383 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 6384 r->intflags |= PREGf_CUTGROUP_SEEN;
732caac7
DM
6385 if (pm_flags & PMf_USE_RE_EVAL)
6386 r->intflags |= PREGf_USE_RE_EVAL;
81714fb9 6387 if (RExC_paren_names)
85fbaab2 6388 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 6389 else
5daac39c 6390 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 6391
7bd1e614 6392#ifdef STUPID_PATTERN_CHECKS
5509d87a 6393 if (RX_PRELEN(rx) == 0)
640f820d 6394 r->extflags |= RXf_NULL;
5509d87a 6395 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
6396 /* XXX: this should happen BEFORE we compile */
6397 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 6398 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 6399 r->extflags |= RXf_WHITE;
5509d87a 6400 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 6401 r->extflags |= RXf_START_ONLY;
f1b875a0 6402#else
5509d87a 6403 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
6404 /* XXX: this should happen BEFORE we compile */
6405 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6406 else {
6407 regnode *first = ri->program + 1;
39aa8307 6408 U8 fop = OP(first);
f6d9469c
DM
6409
6410 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
640f820d 6411 r->extflags |= RXf_NULL;
f6d9469c 6412 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
7bd1e614 6413 r->extflags |= RXf_START_ONLY;
f6d9469c
DM
6414 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6415 && OP(regnext(first)) == END)
7bd1e614
YO
6416 r->extflags |= RXf_WHITE;
6417 }
f1b875a0 6418#endif
1f1031fe
YO
6419#ifdef DEBUGGING
6420 if (RExC_paren_names) {
af534a04 6421 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
6422 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6423 } else
1f1031fe 6424#endif
cde0cee5 6425 ri->name_list_idx = 0;
1f1031fe 6426
40d049e4
YO
6427 if (RExC_recurse_count) {
6428 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6429 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6430 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6431 }
6432 }
f0ab9afb 6433 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
6434 /* assume we don't need to swap parens around before we match */
6435
be8e71aa
YO
6436 DEBUG_DUMP_r({
6437 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
6438 regdump(r);
6439 });
7122b237
YO
6440#ifdef RE_TRACK_PATTERN_OFFSETS
6441 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6442 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
6443 U32 i;
6444 GET_RE_DEBUG_FLAGS_DECL;
7122b237 6445 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 6446 for (i = 1; i <= len; i++) {
7122b237 6447 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 6448 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 6449 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
6450 }
6451 PerlIO_printf(Perl_debug_log, "\n");
6452 });
7122b237 6453#endif
288b8c02 6454 return rx;
a687059c
LW
6455}
6456
93b32b6d 6457
81714fb9 6458SV*
192b9cd1
AB
6459Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6460 const U32 flags)
6461{
7918f24d
NC
6462 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6463
192b9cd1
AB
6464 PERL_UNUSED_ARG(value);
6465
f1b875a0 6466 if (flags & RXapif_FETCH) {
192b9cd1 6467 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 6468 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 6469 Perl_croak_no_modify(aTHX);
192b9cd1 6470 return NULL;
f1b875a0 6471 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
6472 return reg_named_buff_exists(rx, key, flags)
6473 ? &PL_sv_yes
6474 : &PL_sv_no;
f1b875a0 6475 } else if (flags & RXapif_REGNAMES) {
192b9cd1 6476 return reg_named_buff_all(rx, flags);
f1b875a0 6477 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
6478 return reg_named_buff_scalar(rx, flags);
6479 } else {
6480 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6481 return NULL;
6482 }
6483}
6484
6485SV*
6486Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6487 const U32 flags)
6488{
7918f24d 6489 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
6490 PERL_UNUSED_ARG(lastkey);
6491
f1b875a0 6492 if (flags & RXapif_FIRSTKEY)
192b9cd1 6493 return reg_named_buff_firstkey(rx, flags);
f1b875a0 6494 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
6495 return reg_named_buff_nextkey(rx, flags);
6496 else {
6497 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6498 return NULL;
6499 }
6500}
6501
6502SV*
288b8c02
NC
6503Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6504 const U32 flags)
81714fb9 6505{
44a2ac75
YO
6506 AV *retarray = NULL;
6507 SV *ret;
288b8c02 6508 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
6509
6510 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6511
f1b875a0 6512 if (flags & RXapif_ALL)
44a2ac75 6513 retarray=newAV();
93b32b6d 6514
5daac39c
NC
6515 if (rx && RXp_PAREN_NAMES(rx)) {
6516 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
6517 if (he_str) {
6518 IV i;
6519 SV* sv_dat=HeVAL(he_str);
6520 I32 *nums=(I32*)SvPVX(sv_dat);
6521 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
6522 if ((I32)(rx->nparens) >= nums[i]
6523 && rx->offs[nums[i]].start != -1
6524 && rx->offs[nums[i]].end != -1)
93b32b6d 6525 {
49d7dfbc 6526 ret = newSVpvs("");
288b8c02 6527 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
6528 if (!retarray)
6529 return ret;
6530 } else {
7402016d
AB
6531 if (retarray)
6532 ret = newSVsv(&PL_sv_undef);
93b32b6d 6533 }
ec83ea38 6534 if (retarray)
93b32b6d 6535 av_push(retarray, ret);
81714fb9 6536 }
93b32b6d 6537 if (retarray)
ad64d0ec 6538 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
6539 }
6540 }
6541 return NULL;
6542}
6543
6544bool
288b8c02 6545Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
6546 const U32 flags)
6547{
288b8c02 6548 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
6549
6550 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6551
5daac39c 6552 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 6553 if (flags & RXapif_ALL) {
5daac39c 6554 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 6555 } else {
288b8c02 6556 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
6557 if (sv) {
6558 SvREFCNT_dec(sv);
192b9cd1
AB
6559 return TRUE;
6560 } else {
6561 return FALSE;
6562 }
6563 }
6564 } else {
6565 return FALSE;
6566 }
6567}
6568
6569SV*
288b8c02 6570Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 6571{
288b8c02 6572 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
6573
6574 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6575
5daac39c
NC
6576 if ( rx && RXp_PAREN_NAMES(rx) ) {
6577 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 6578
288b8c02 6579 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
6580 } else {
6581 return FALSE;
6582 }
192b9cd1
AB
6583}
6584
6585SV*
288b8c02 6586Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 6587{
288b8c02 6588 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 6589 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6590
6591 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6592
5daac39c
NC
6593 if (rx && RXp_PAREN_NAMES(rx)) {
6594 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
6595 HE *temphe;
6596 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6597 IV i;
6598 IV parno = 0;
6599 SV* sv_dat = HeVAL(temphe);
6600 I32 *nums = (I32*)SvPVX(sv_dat);
6601 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 6602 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
6603 rx->offs[nums[i]].start != -1 &&
6604 rx->offs[nums[i]].end != -1)
6605 {
6606 parno = nums[i];
6607 break;
6608 }
6609 }
f1b875a0 6610 if (parno || flags & RXapif_ALL) {
a663657d 6611 return newSVhek(HeKEY_hek(temphe));
192b9cd1 6612 }
81714fb9
YO
6613 }
6614 }
44a2ac75
YO
6615 return NULL;
6616}
6617
192b9cd1 6618SV*
288b8c02 6619Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
6620{
6621 SV *ret;
6622 AV *av;
6623 I32 length;
288b8c02 6624 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 6625
7918f24d
NC
6626 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6627
5daac39c 6628 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 6629 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 6630 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 6631 } else if (flags & RXapif_ONE) {
288b8c02 6632 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 6633 av = MUTABLE_AV(SvRV(ret));
192b9cd1 6634 length = av_len(av);
ec83ea38 6635 SvREFCNT_dec(ret);
192b9cd1
AB
6636 return newSViv(length + 1);
6637 } else {
6638 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6639 return NULL;
6640 }
6641 }
6642 return &PL_sv_undef;
6643}
6644
6645SV*
288b8c02 6646Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 6647{
288b8c02 6648 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
6649 AV *av = newAV();
6650
7918f24d
NC
6651 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6652
5daac39c
NC
6653 if (rx && RXp_PAREN_NAMES(rx)) {
6654 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
6655 HE *temphe;
6656 (void)hv_iterinit(hv);
6657 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6658 IV i;
6659 IV parno = 0;
6660 SV* sv_dat = HeVAL(temphe);
6661 I32 *nums = (I32*)SvPVX(sv_dat);
6662 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 6663 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
6664 rx->offs[nums[i]].start != -1 &&
6665 rx->offs[nums[i]].end != -1)
6666 {
6667 parno = nums[i];
6668 break;
6669 }
6670 }
f1b875a0 6671 if (parno || flags & RXapif_ALL) {
a663657d 6672 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
6673 }
6674 }
6675 }
6676
ad64d0ec 6677 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
6678}
6679
49d7dfbc 6680void
288b8c02
NC
6681Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6682 SV * const sv)
44a2ac75 6683{
288b8c02 6684 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 6685 char *s = NULL;
a9d504c3 6686 I32 i = 0;
44a2ac75 6687 I32 s1, t1;
7918f24d
NC
6688
6689 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 6690
cde0cee5
YO
6691 if (!rx->subbeg) {
6692 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 6693 return;
cde0cee5
YO
6694 }
6695 else
f1b875a0 6696 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 6697 /* $` */
f0ab9afb 6698 i = rx->offs[0].start;
cde0cee5 6699 s = rx->subbeg;
44a2ac75
YO
6700 }
6701 else
f1b875a0 6702 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 6703 /* $' */
f0ab9afb
NC
6704 s = rx->subbeg + rx->offs[0].end;
6705 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
6706 }
6707 else
6708 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
6709 (s1 = rx->offs[paren].start) != -1 &&
6710 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
6711 {
6712 /* $& $1 ... */
6713 i = t1 - s1;
6714 s = rx->subbeg + s1;
cde0cee5
YO
6715 } else {
6716 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 6717 return;
cde0cee5
YO
6718 }
6719 assert(rx->sublen >= (s - rx->subbeg) + i );
6720 if (i >= 0) {
6721 const int oldtainted = PL_tainted;
6722 TAINT_NOT;
6723 sv_setpvn(sv, s, i);
6724 PL_tainted = oldtainted;
6725 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 6726 ? (RXp_MATCH_UTF8(rx)
cde0cee5 6727 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 6728 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
6729 {
6730 SvUTF8_on(sv);
6731 }
6732 else
6733 SvUTF8_off(sv);
6734 if (PL_tainting) {
07bc277f 6735 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
6736 if (SvTYPE(sv) >= SVt_PVMG) {
6737 MAGIC* const mg = SvMAGIC(sv);
6738 MAGIC* mgt;
6739 PL_tainted = 1;
6740 SvMAGIC_set(sv, mg->mg_moremagic);
6741 SvTAINT(sv);
6742 if ((mgt = SvMAGIC(sv))) {
6743 mg->mg_moremagic = mgt;
6744 SvMAGIC_set(sv, mg);
44a2ac75 6745 }
cde0cee5
YO
6746 } else {
6747 PL_tainted = 1;
6748 SvTAINT(sv);
6749 }
6750 } else
6751 SvTAINTED_off(sv);
44a2ac75 6752 }
81714fb9 6753 } else {
44a2ac75 6754 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 6755 return;
81714fb9
YO
6756 }
6757}
93b32b6d 6758
2fdbfb4d
AB
6759void
6760Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6761 SV const * const value)
6762{
7918f24d
NC
6763 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6764
2fdbfb4d
AB
6765 PERL_UNUSED_ARG(rx);
6766 PERL_UNUSED_ARG(paren);
6767 PERL_UNUSED_ARG(value);
6768
6769 if (!PL_localizing)
6ad8f254 6770 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
6771}
6772
6773I32
288b8c02 6774Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
6775 const I32 paren)
6776{
288b8c02 6777 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
6778 I32 i;
6779 I32 s1, t1;
6780
7918f24d
NC
6781 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6782
2fdbfb4d
AB
6783 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6784 switch (paren) {
192b9cd1 6785 /* $` / ${^PREMATCH} */
f1b875a0 6786 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
6787 if (rx->offs[0].start != -1) {
6788 i = rx->offs[0].start;
6789 if (i > 0) {
6790 s1 = 0;
6791 t1 = i;
6792 goto getlen;
6793 }
6794 }
6795 return 0;
192b9cd1 6796 /* $' / ${^POSTMATCH} */
f1b875a0 6797 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
6798 if (rx->offs[0].end != -1) {
6799 i = rx->sublen - rx->offs[0].end;
6800 if (i > 0) {
6801 s1 = rx->offs[0].end;
6802 t1 = rx->sublen;
6803 goto getlen;
6804 }
6805 }
6806 return 0;
192b9cd1
AB
6807 /* $& / ${^MATCH}, $1, $2, ... */
6808 default:
2fdbfb4d
AB
6809 if (paren <= (I32)rx->nparens &&
6810 (s1 = rx->offs[paren].start) != -1 &&
6811 (t1 = rx->offs[paren].end) != -1)
6812 {
6813 i = t1 - s1;
6814 goto getlen;
6815 } else {
6816 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 6817 report_uninit((const SV *)sv);
2fdbfb4d
AB
6818 return 0;
6819 }
6820 }
6821 getlen:
07bc277f 6822 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
6823 const char * const s = rx->subbeg + s1;
6824 const U8 *ep;
6825 STRLEN el;
6826
6827 i = t1 - s1;
6828 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6829 i = el;
6830 }
6831 return i;
6832}
6833
fe578d7f 6834SV*
49d7dfbc 6835Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 6836{
7918f24d 6837 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 6838 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
6839 if (0)
6840 return NULL;
6841 else
6842 return newSVpvs("Regexp");
fe578d7f 6843}
0a4db386 6844
894be9b7 6845/* Scans the name of a named buffer from the pattern.
0a4db386
YO
6846 * If flags is REG_RSN_RETURN_NULL returns null.
6847 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6848 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6849 * to the parsed name as looked up in the RExC_paren_names hash.
6850 * If there is an error throws a vFAIL().. type exception.
894be9b7 6851 */
0a4db386
YO
6852
6853#define REG_RSN_RETURN_NULL 0
6854#define REG_RSN_RETURN_NAME 1
6855#define REG_RSN_RETURN_DATA 2
6856
894be9b7 6857STATIC SV*
7918f24d
NC
6858S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6859{
894be9b7 6860 char *name_start = RExC_parse;
1f1031fe 6861
7918f24d
NC
6862 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6863
1f1031fe
YO
6864 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6865 /* skip IDFIRST by using do...while */
6866 if (UTF)
6867 do {
6868 RExC_parse += UTF8SKIP(RExC_parse);
6869 } while (isALNUM_utf8((U8*)RExC_parse));
6870 else
6871 do {
6872 RExC_parse++;
6873 } while (isALNUM(*RExC_parse));
1f4f6bf1
YO
6874 } else {
6875 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6876 vFAIL("Group name must start with a non-digit word character");
894be9b7 6877 }
0a4db386 6878 if ( flags ) {
59cd0e26
NC
6879 SV* sv_name
6880 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6881 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
6882 if ( flags == REG_RSN_RETURN_NAME)
6883 return sv_name;
6884 else if (flags==REG_RSN_RETURN_DATA) {
6885 HE *he_str = NULL;
6886 SV *sv_dat = NULL;
6887 if ( ! sv_name ) /* should not happen*/
6888 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6889 if (RExC_paren_names)
6890 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6891 if ( he_str )
6892 sv_dat = HeVAL(he_str);
6893 if ( ! sv_dat )
6894 vFAIL("Reference to nonexistent named group");
6895 return sv_dat;
6896 }
6897 else {
5637ef5b
NC
6898 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6899 (unsigned long) flags);
0a4db386 6900 }
118e2215 6901 assert(0); /* NOT REACHED */
894be9b7 6902 }
0a4db386 6903 return NULL;
894be9b7
YO
6904}
6905
3dab1dad
YO
6906#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6907 int rem=(int)(RExC_end - RExC_parse); \
6908 int cut; \
6909 int num; \
6910 int iscut=0; \
6911 if (rem>10) { \
6912 rem=10; \
6913 iscut=1; \
6914 } \
6915 cut=10-rem; \
6916 if (RExC_lastparse!=RExC_parse) \
6917 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6918 rem, RExC_parse, \
6919 cut + 4, \
6920 iscut ? "..." : "<" \
6921 ); \
6922 else \
6923 PerlIO_printf(Perl_debug_log,"%16s",""); \
6924 \
6925 if (SIZE_ONLY) \
3b57cd43 6926 num = RExC_size + 1; \
3dab1dad
YO
6927 else \
6928 num=REG_NODE_NUM(RExC_emit); \
6929 if (RExC_lastnum!=num) \
0a4db386 6930 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 6931 else \
0a4db386 6932 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
6933 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6934 (int)((depth*2)), "", \
3dab1dad
YO
6935 (funcname) \
6936 ); \
6937 RExC_lastnum=num; \
6938 RExC_lastparse=RExC_parse; \
6939})
6940
07be1b83
YO
6941
6942
3dab1dad
YO
6943#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6944 DEBUG_PARSE_MSG((funcname)); \
6945 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6946})
6bda09f9
YO
6947#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6948 DEBUG_PARSE_MSG((funcname)); \
6949 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6950})
d764b54e
KW
6951
6952/* This section of code defines the inversion list object and its methods. The
6953 * interfaces are highly subject to change, so as much as possible is static to
fa2d2a23
KW
6954 * this file. An inversion list is here implemented as a malloc'd C UV array
6955 * with some added info that is placed as UVs at the beginning in a header
6956 * portion. An inversion list for Unicode is an array of code points, sorted
6957 * by ordinal number. The zeroth element is the first code point in the list.
6958 * The 1th element is the first element beyond that not in the list. In other
6959 * words, the first range is
6960 * invlist[0]..(invlist[1]-1)
dbe7a391
KW
6961 * The other ranges follow. Thus every element whose index is divisible by two
6962 * marks the beginning of a range that is in the list, and every element not
fa2d2a23
KW
6963 * divisible by two marks the beginning of a range not in the list. A single
6964 * element inversion list that contains the single code point N generally
6965 * consists of two elements
6966 * invlist[0] == N
6967 * invlist[1] == N+1
6968 * (The exception is when N is the highest representable value on the
6969 * machine, in which case the list containing just it would be a single
6970 * element, itself. By extension, if the last range in the list extends to
6971 * infinity, then the first element of that range will be in the inversion list
6972 * at a position that is divisible by two, and is the final element in the
6973 * list.)
f1b67122
KW
6974 * Taking the complement (inverting) an inversion list is quite simple, if the
6975 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6976 * This implementation reserves an element at the beginning of each inversion list
6977 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6978 * beginning of the list is either that element if 0, or the next one if 1.
6979 *
fa2d2a23
KW
6980 * More about inversion lists can be found in "Unicode Demystified"
6981 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
97b14ce7 6982 * More will be coming when functionality is added later.
d764b54e 6983 *
fa2d2a23
KW
6984 * The inversion list data structure is currently implemented as an SV pointing
6985 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6986 * array of UV whose memory management is automatically handled by the existing
6987 * facilities for SV's.
62672576 6988 *
d764b54e
KW
6989 * Some of the methods should always be private to the implementation, and some
6990 * should eventually be made public */
6991
fa2d2a23
KW
6992#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6993#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6994
f59ff194
KW
6995/* This is a combination of a version and data structure type, so that one
6996 * being passed in can be validated to be an inversion list of the correct
6997 * vintage. When the structure of the header is changed, a new random number
6998 * in the range 2**31-1 should be generated and the new() method changed to
6999 * insert that at this location. Then, if an auxiliary program doesn't change
7000 * correspondingly, it will be discovered immediately */
7001#define INVLIST_VERSION_ID_OFFSET 2
7002#define INVLIST_VERSION_ID 1064334010
7003
7004/* For safety, when adding new elements, remember to #undef them at the end of
7005 * the inversion list code section */
7006
7007#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
f1b67122
KW
7008/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
7009 * contains the code point U+00000, and begins here. If 1, the inversion list
7010 * doesn't contain U+0000, and it begins at the next UV in the array.
7011 * Inverting an inversion list consists of adding or removing the 0 at the
7012 * beginning of it. By reserving a space for that 0, inversion can be made
7013 * very fast */
7014
7015#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
97b14ce7
KW
7016
7017/* Internally things are UVs */
7018#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7019#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7020
d764b54e 7021#define INVLIST_INITIAL_LEN 10
d764b54e
KW
7022
7023PERL_STATIC_INLINE UV*
f1b67122
KW
7024S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7025{
7026 /* Returns a pointer to the first element in the inversion list's array.
7027 * This is called upon initialization of an inversion list. Where the
7028 * array begins depends on whether the list has the code point U+0000
7029 * in it or not. The other parameter tells it whether the code that
7030 * follows this call is about to put a 0 in the inversion list or not.
7031 * The first element is either the element with 0, if 0, or the next one,
7032 * if 1 */
7033
7034 UV* zero = get_invlist_zero_addr(invlist);
7035
7036 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7037
7038 /* Must be empty */
7039 assert(! *get_invlist_len_addr(invlist));
7040
7041 /* 1^1 = 0; 1^0 = 1 */
7042 *zero = 1 ^ will_have_0;
7043 return zero + *zero;
7044}
7045
7046PERL_STATIC_INLINE UV*
a25abddc 7047S_invlist_array(pTHX_ SV* const invlist)
d764b54e
KW
7048{
7049 /* Returns the pointer to the inversion list's array. Every time the
7050 * length changes, this needs to be called in case malloc or realloc moved
7051 * it */
7052
d764b54e
KW
7053 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7054
dbe7a391
KW
7055 /* Must not be empty. If these fail, you probably didn't check for <len>
7056 * being non-zero before trying to get the array */
f1b67122
KW
7057 assert(*get_invlist_len_addr(invlist));
7058 assert(*get_invlist_zero_addr(invlist) == 0
7059 || *get_invlist_zero_addr(invlist) == 1);
7060
7061 /* The array begins either at the element reserved for zero if the
7062 * list contains 0 (that element will be set to 0), or otherwise the next
7063 * element (in which case the reserved element will be set to 1). */
7064 return (UV *) (get_invlist_zero_addr(invlist)
7065 + *get_invlist_zero_addr(invlist));
d764b54e
KW
7066}
7067
61bdbf38
KW
7068PERL_STATIC_INLINE UV*
7069S_get_invlist_len_addr(pTHX_ SV* invlist)
7070{
7071 /* Return the address of the UV that contains the current number
7072 * of used elements in the inversion list */
7073
7074 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7075
7076 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7077}
7078
d764b54e 7079PERL_STATIC_INLINE UV
a25abddc 7080S_invlist_len(pTHX_ SV* const invlist)
d764b54e 7081{
dbe7a391
KW
7082 /* Returns the current number of elements stored in the inversion list's
7083 * array */
d764b54e 7084
d764b54e
KW
7085 PERL_ARGS_ASSERT_INVLIST_LEN;
7086
61bdbf38 7087 return *get_invlist_len_addr(invlist);
d764b54e
KW
7088}
7089
c56a880b
KW
7090PERL_STATIC_INLINE void
7091S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7092{
7093 /* Sets the current number of elements stored in the inversion list */
7094
7095 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7096
c56a880b 7097 *get_invlist_len_addr(invlist) = len;
f1b67122 7098
32f89ef6
KW
7099 assert(len <= SvLEN(invlist));
7100
f1b67122
KW
7101 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7102 /* If the list contains U+0000, that element is part of the header,
7103 * and should not be counted as part of the array. It will contain
7104 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7105 * subtract:
7106 * SvCUR_set(invlist,
7107 * TO_INTERNAL_SIZE(len
7108 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7109 * But, this is only valid if len is not 0. The consequences of not doing
9479a769
KW
7110 * this is that the memory allocation code may think that 1 more UV is
7111 * being used than actually is, and so might do an unnecessary grow. That
7112 * seems worth not bothering to make this the precise amount.
25e94a65
KW
7113 *
7114 * Note that when inverting, SvCUR shouldn't change */
c56a880b
KW
7115}
7116
d764b54e 7117PERL_STATIC_INLINE UV
a25abddc 7118S_invlist_max(pTHX_ SV* const invlist)
d764b54e
KW
7119{
7120 /* Returns the maximum number of elements storable in the inversion list's
7121 * array, without having to realloc() */
7122
d764b54e
KW
7123 PERL_ARGS_ASSERT_INVLIST_MAX;
7124
005b65ed 7125 return FROM_INTERNAL_SIZE(SvLEN(invlist));
d764b54e
KW
7126}
7127
f1b67122
KW
7128PERL_STATIC_INLINE UV*
7129S_get_invlist_zero_addr(pTHX_ SV* invlist)
7130{
7131 /* Return the address of the UV that is reserved to hold 0 if the inversion
7132 * list contains 0. This has to be the last element of the heading, as the
7133 * list proper starts with either it if 0, or the next element if not.
7134 * (But we force it to contain either 0 or 1) */
7135
7136 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7137
7138 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7139}
d764b54e 7140
8d69a883 7141#ifndef PERL_IN_XSUB_RE
a25abddc 7142SV*
d764b54e
KW
7143Perl__new_invlist(pTHX_ IV initial_size)
7144{
7145
7146 /* Return a pointer to a newly constructed inversion list, with enough
7147 * space to store 'initial_size' elements. If that number is negative, a
7148 * system default is used instead */
7149
97b14ce7
KW
7150 SV* new_list;
7151
d764b54e
KW
7152 if (initial_size < 0) {
7153 initial_size = INVLIST_INITIAL_LEN;
7154 }
7155
7156 /* Allocate the initial space */
97b14ce7
KW
7157 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7158 invlist_set_len(new_list, 0);
7159
f3dc70d1
KW
7160 /* Force iterinit() to be used to get iteration to work */
7161 *get_invlist_iter_addr(new_list) = UV_MAX;
7162
f1b67122
KW
7163 /* This should force a segfault if a method doesn't initialize this
7164 * properly */
7165 *get_invlist_zero_addr(new_list) = UV_MAX;
7166
f59ff194
KW
7167 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7168#if HEADER_LENGTH != 4
7169# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7170#endif
7171
97b14ce7 7172 return new_list;
d764b54e 7173}
8d69a883 7174#endif
d764b54e 7175
f59ff194
KW
7176STATIC SV*
7177S__new_invlist_C_array(pTHX_ UV* list)
7178{
7179 /* Return a pointer to a newly constructed inversion list, initialized to
7180 * point to <list>, which has to be in the exact correct inversion list
7181 * form, including internal fields. Thus this is a dangerous routine that
7182 * should not be used in the wrong hands */
7183
7184 SV* invlist = newSV_type(SVt_PV);
7185
7186 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7187
7188 SvPV_set(invlist, (char *) list);
7189 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7190 shouldn't touch it */
7191 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7192
7193 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7194 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7195 }
7196
7197 return invlist;
7198}
7199
d764b54e 7200STATIC void
a25abddc 7201S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
d764b54e 7202{
62672576 7203 /* Grow the maximum size of an inversion list */
d764b54e
KW
7204
7205 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7206
005b65ed 7207 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
d764b54e
KW
7208}
7209
7210PERL_STATIC_INLINE void
a25abddc 7211S_invlist_trim(pTHX_ SV* const invlist)
d764b54e
KW
7212{
7213 PERL_ARGS_ASSERT_INVLIST_TRIM;
7214
7215 /* Change the length of the inversion list to how many entries it currently
7216 * has */
7217
62672576 7218 SvPV_shrink_to_cur((SV *) invlist);
d764b54e
KW
7219}
7220
7221/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7222 * etc */
4096c37b
KW
7223#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7224#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
d764b54e 7225
8dc9348a
KW
7226#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7227
9d501133
KW
7228STATIC void
7229S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
d764b54e
KW
7230{
7231 /* Subject to change or removal. Append the range from 'start' to 'end' at
7232 * the end of the inversion list. The range must be above any existing
7233 * ones. */
7234
f1b67122 7235 UV* array;
d764b54e
KW
7236 UV max = invlist_max(invlist);
7237 UV len = invlist_len(invlist);
7238
7239 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7240
f1b67122
KW
7241 if (len == 0) { /* Empty lists must be initialized */
7242 array = _invlist_array_init(invlist, start == 0);
7243 }
7244 else {
d764b54e
KW
7245 /* Here, the existing list is non-empty. The current max entry in the
7246 * list is generally the first value not in the set, except when the
7247 * set extends to the end of permissible values, in which case it is
7248 * the first entry in that final set, and so this call is an attempt to
7249 * append out-of-order */
7250
7251 UV final_element = len - 1;
f1b67122 7252 array = invlist_array(invlist);
d764b54e 7253 if (array[final_element] > start
4096c37b 7254 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
d764b54e 7255 {
5637ef5b
NC
7256 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7257 array[final_element], start,
7258 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
d764b54e
KW
7259 }
7260
7261 /* Here, it is a legal append. If the new range begins with the first
7262 * value not in the set, it is extending the set, so the new first
7263 * value not in the set is one greater than the newly extended range.
7264 * */
7265 if (array[final_element] == start) {
7266 if (end != UV_MAX) {
7267 array[final_element] = end + 1;
7268 }
7269 else {
7270 /* But if the end is the maximum representable on the machine,
dbe7a391 7271 * just let the range that this would extend to have no end */
d764b54e
KW
7272 invlist_set_len(invlist, len - 1);
7273 }
7274 return;
7275 }
7276 }
7277
7278 /* Here the new range doesn't extend any existing set. Add it */
7279
7280 len += 2; /* Includes an element each for the start and end of range */
7281
7282 /* If overflows the existing space, extend, which may cause the array to be
7283 * moved */
7284 if (max < len) {
7285 invlist_extend(invlist, len);
f1b67122
KW
7286 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7287 failure in invlist_array() */
d764b54e
KW
7288 array = invlist_array(invlist);
7289 }
f1b67122
KW
7290 else {
7291 invlist_set_len(invlist, len);
7292 }
d764b54e
KW
7293
7294 /* The next item on the list starts the range, the one after that is
7295 * one past the new range. */
7296 array[len - 2] = start;
7297 if (end != UV_MAX) {
7298 array[len - 1] = end + 1;
7299 }
7300 else {
7301 /* But if the end is the maximum representable on the machine, just let
7302 * the range have no end */
7303 invlist_set_len(invlist, len - 1);
7304 }
7305}
7306
9d501133
KW
7307#ifndef PERL_IN_XSUB_RE
7308
d5e82ecc
KW
7309STATIC IV
7310S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7311{
7312 /* Searches the inversion list for the entry that contains the input code
7313 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7314 * return value is the index into the list's array of the range that
7315 * contains <cp> */
7316
7317 IV low = 0;
7318 IV high = invlist_len(invlist);
7319 const UV * const array = invlist_array(invlist);
7320
7321 PERL_ARGS_ASSERT_INVLIST_SEARCH;
7322
7323 /* If list is empty or the code point is before the first element, return
7324 * failure. */
7325 if (high == 0 || cp < array[0]) {
7326 return -1;
7327 }
7328
7329 /* Binary search. What we are looking for is <i> such that
7330 * array[i] <= cp < array[i+1]
7331 * The loop below converges on the i+1. */
7332 while (low < high) {
7333 IV mid = (low + high) / 2;
7334 if (array[mid] <= cp) {
7335 low = mid + 1;
7336
7337 /* We could do this extra test to exit the loop early.
7338 if (cp < array[low]) {
7339 return mid;
7340 }
7341 */
7342 }
7343 else { /* cp < array[mid] */
7344 high = mid;
7345 }
7346 }
7347
7348 return high - 1;
7349}
7350
86f766ab 7351void
b6a0ff33
KW
7352Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7353{
7354 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7355 * but is used when the swash has an inversion list. This makes this much
7356 * faster, as it uses a binary search instead of a linear one. This is
7357 * intimately tied to that function, and perhaps should be in utf8.c,
7358 * except it is intimately tied to inversion lists as well. It assumes
7359 * that <swatch> is all 0's on input */
7360
7361 UV current = start;
7362 const IV len = invlist_len(invlist);
7363 IV i;
7364 const UV * array;
7365
7366 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7367
7368 if (len == 0) { /* Empty inversion list */
7369 return;
7370 }
7371
7372 array = invlist_array(invlist);
7373
7374 /* Find which element it is */
7375 i = invlist_search(invlist, start);
7376
7377 /* We populate from <start> to <end> */
7378 while (current < end) {
7379 UV upper;
7380
7381 /* The inversion list gives the results for every possible code point
7382 * after the first one in the list. Only those ranges whose index is
7383 * even are ones that the inversion list matches. For the odd ones,
7384 * and if the initial code point is not in the list, we have to skip
7385 * forward to the next element */
7386 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7387 i++;
7388 if (i >= len) { /* Finished if beyond the end of the array */
7389 return;
7390 }
7391 current = array[i];
7392 if (current >= end) { /* Finished if beyond the end of what we
7393 are populating */
7394 return;
7395 }
7396 }
7397 assert(current >= start);
7398
7399 /* The current range ends one below the next one, except don't go past
7400 * <end> */
7401 i++;
7402 upper = (i < len && array[i] < end) ? array[i] : end;
7403
7404 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7405 * for each code point in it */
7406 for (; current < upper; current++) {
7407 const STRLEN offset = (STRLEN)(current - start);
7408 swatch[offset >> 3] |= 1 << (offset & 7);
7409 }
7410
7411 /* Quit if at the end of the list */
7412 if (i >= len) {
7413
7414 /* But first, have to deal with the highest possible code point on
7415 * the platform. The previous code assumes that <end> is one
7416 * beyond where we want to populate, but that is impossible at the
7417 * platform's infinity, so have to handle it specially */
7418 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7419 {
7420 const STRLEN offset = (STRLEN)(end - start);
7421 swatch[offset >> 3] |= 1 << (offset & 7);
7422 }
7423 return;
7424 }
7425
7426 /* Advance to the next range, which will be for code points not in the
7427 * inversion list */
7428 current = array[i];
7429 }
7430
7431 return;
7432}
7433
8dc9348a 7434
b6a0ff33 7435void
164173a2 7436Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
d764b54e 7437{
4065ba03
KW
7438 /* Take the union of two inversion lists and point <output> to it. *output
7439 * should be defined upon input, and if it points to one of the two lists,
f45adb79
KW
7440 * the reference count to that list will be decremented. The first list,
7441 * <a>, may be NULL, in which case a copy of the second list is returned.
164173a2
KW
7442 * If <complement_b> is TRUE, the union is taken of the complement
7443 * (inversion) of <b> instead of b itself.
f45adb79 7444 *
d764b54e
KW
7445 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7446 * Richard Gillam, published by Addison-Wesley, and explained at some
7447 * length there. The preface says to incorporate its examples into your
7448 * code at your own risk.
7449 *
7450 * The algorithm is like a merge sort.
7451 *
7452 * XXX A potential performance improvement is to keep track as we go along
7453 * if only one of the inputs contributes to the result, meaning the other
7454 * is a subset of that one. In that case, we can skip the final copy and
a2995b7f
KW
7455 * return the larger of the input lists, but then outside code might need
7456 * to keep track of whether to free the input list or not */
d764b54e 7457
f1b67122
KW
7458 UV* array_a; /* a's array */
7459 UV* array_b;
7460 UV len_a; /* length of a's array */
7461 UV len_b;
d764b54e 7462
a25abddc 7463 SV* u; /* the resulting union */
d764b54e
KW
7464 UV* array_u;
7465 UV len_u;
7466
7467 UV i_a = 0; /* current index into a's array */
7468 UV i_b = 0;
7469 UV i_u = 0;
7470
7471 /* running count, as explained in the algorithm source book; items are
7472 * stopped accumulating and are output when the count changes to/from 0.
7473 * The count is incremented when we start a range that's in the set, and
7474 * decremented when we start a range that's not in the set. So its range
7475 * is 0 to 2. Only when the count is zero is something not in the set.
7476 */
7477 UV count = 0;
7478
164173a2 7479 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
60825692 7480 assert(a != b);
d764b54e 7481
f1b67122 7482 /* If either one is empty, the union is the other one */
f45adb79 7483 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
4065ba03 7484 if (*output == a) {
f45adb79
KW
7485 if (a != NULL) {
7486 SvREFCNT_dec(a);
7487 }
f1b67122 7488 }
4065ba03 7489 if (*output != b) {
f1b67122 7490 *output = invlist_clone(b);
164173a2
KW
7491 if (complement_b) {
7492 _invlist_invert(*output);
7493 }
dbe7a391 7494 } /* else *output already = b; */
f1b67122
KW
7495 return;
7496 }
7497 else if ((len_b = invlist_len(b)) == 0) {
4065ba03 7498 if (*output == b) {
f1b67122
KW
7499 SvREFCNT_dec(b);
7500 }
164173a2
KW
7501
7502 /* The complement of an empty list is a list that has everything in it,
7503 * so the union with <a> includes everything too */
7504 if (complement_b) {
7505 if (a == *output) {
7506 SvREFCNT_dec(a);
7507 }
7508 *output = _new_invlist(1);
7509 _append_range_to_invlist(*output, 0, UV_MAX);
7510 }
7511 else if (*output != a) {
7512 *output = invlist_clone(a);
7513 }
7514 /* else *output already = a; */
f1b67122
KW
7515 return;
7516 }
7517
7518 /* Here both lists exist and are non-empty */
7519 array_a = invlist_array(a);
7520 array_b = invlist_array(b);
7521
164173a2
KW
7522 /* If are to take the union of 'a' with the complement of b, set it
7523 * up so are looking at b's complement. */
7524 if (complement_b) {
7525
7526 /* To complement, we invert: if the first element is 0, remove it. To
7527 * do this, we just pretend the array starts one later, and clear the
7528 * flag as we don't have to do anything else later */
7529 if (array_b[0] == 0) {
7530 array_b++;
7531 len_b--;
7532 complement_b = FALSE;
7533 }
7534 else {
7535
7536 /* But if the first element is not zero, we unshift a 0 before the
7537 * array. The data structure reserves a space for that 0 (which
7538 * should be a '1' right now), so physical shifting is unneeded,
7539 * but temporarily change that element to 0. Before exiting the
7540 * routine, we must restore the element to '1' */
7541 array_b--;
7542 len_b++;
7543 array_b[0] = 0;
7544 }
7545 }
7546
d764b54e
KW
7547 /* Size the union for the worst case: that the sets are completely
7548 * disjoint */
7549 u = _new_invlist(len_a + len_b);
f1b67122
KW
7550
7551 /* Will contain U+0000 if either component does */
7552 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7553 || (len_b > 0 && array_b[0] == 0));
d764b54e
KW
7554
7555 /* Go through each list item by item, stopping when exhausted one of
7556 * them */
7557 while (i_a < len_a && i_b < len_b) {
7558 UV cp; /* The element to potentially add to the union's array */
7559 bool cp_in_set; /* is it in the the input list's set or not */
7560
7561 /* We need to take one or the other of the two inputs for the union.
7562 * Since we are merging two sorted lists, we take the smaller of the
7563 * next items. In case of a tie, we take the one that is in its set
7564 * first. If we took one not in the set first, it would decrement the
7565 * count, possibly to 0 which would cause it to be output as ending the
7566 * range, and the next time through we would take the same number, and
7567 * output it again as beginning the next range. By doing it the
7568 * opposite way, there is no possibility that the count will be
7569 * momentarily decremented to 0, and thus the two adjoining ranges will
7570 * be seamlessly merged. (In a tie and both are in the set or both not
7571 * in the set, it doesn't matter which we take first.) */
7572 if (array_a[i_a] < array_b[i_b]
4096c37b
KW
7573 || (array_a[i_a] == array_b[i_b]
7574 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
d764b54e 7575 {
4096c37b 7576 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
d764b54e
KW
7577 cp= array_a[i_a++];
7578 }
7579 else {
4096c37b 7580 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
d764b54e
KW
7581 cp= array_b[i_b++];
7582 }
7583
7584 /* Here, have chosen which of the two inputs to look at. Only output
7585 * if the running count changes to/from 0, which marks the
7586 * beginning/end of a range in that's in the set */
7587 if (cp_in_set) {
7588 if (count == 0) {
7589 array_u[i_u++] = cp;
7590 }
7591 count++;
7592 }
7593 else {
7594 count--;
7595 if (count == 0) {
7596 array_u[i_u++] = cp;
7597 }
7598 }
7599 }
7600
7601 /* Here, we are finished going through at least one of the lists, which
7602 * means there is something remaining in at most one. We check if the list
7603 * that hasn't been exhausted is positioned such that we are in the middle
bac5f0ae
KW
7604 * of a range in its set or not. (i_a and i_b point to the element beyond
7605 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7606 * is potentially more to output.
d764b54e
KW
7607 * There are four cases:
7608 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7609 * in the union is entirely from the non-exhausted set.
7610 * 2) Both were in their sets, count is 2. Nothing further should
7611 * be output, as everything that remains will be in the exhausted
7612 * list's set, hence in the union; decrementing to 1 but not 0 insures
7613 * that
7614 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7615 * Nothing further should be output because the union includes
bac5f0ae 7616 * everything from the exhausted set. Not decrementing ensures that.
d764b54e
KW
7617 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7618 * decrementing to 0 insures that we look at the remainder of the
7619 * non-exhausted set */
4096c37b
KW
7620 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7621 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
d764b54e
KW
7622 {
7623 count--;
7624 }
7625
7626 /* The final length is what we've output so far, plus what else is about to
7627 * be output. (If 'count' is non-zero, then the input list we exhausted
7628 * has everything remaining up to the machine's limit in its set, and hence
7629 * in the union, so there will be no further output. */
7630 len_u = i_u;
7631 if (count == 0) {
7632 /* At most one of the subexpressions will be non-zero */
7633 len_u += (len_a - i_a) + (len_b - i_b);
7634 }
7635
7636 /* Set result to final length, which can change the pointer to array_u, so
7637 * re-find it */
7638 if (len_u != invlist_len(u)) {
7639 invlist_set_len(u, len_u);
7640 invlist_trim(u);
7641 array_u = invlist_array(u);
7642 }
7643
7644 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7645 * the other) ended with everything above it not in its set. That means
7646 * that the remaining part of the union is precisely the same as the
7647 * non-exhausted list, so can just copy it unchanged. (If both list were
7648 * exhausted at the same time, then the operations below will be both 0.)
7649 */
7650 if (count == 0) {
7651 IV copy_count; /* At most one will have a non-zero copy count */
7652 if ((copy_count = len_a - i_a) > 0) {
7653 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7654 }
7655 else if ((copy_count = len_b - i_b) > 0) {
7656 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7657 }
7658 }
7659
a2995b7f 7660 /* We may be removing a reference to one of the inputs */
4065ba03 7661 if (a == *output || b == *output) {
a2995b7f
KW
7662 SvREFCNT_dec(*output);
7663 }
7664
164173a2
KW
7665 /* If we've changed b, restore it */
7666 if (complement_b) {
7667 array_b[0] = 1;
7668 }
7669
a2995b7f
KW
7670 *output = u;
7671 return;
d764b54e
KW
7672}
7673
86f766ab 7674void
52ae8f7e 7675Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
d764b54e 7676{
4065ba03
KW
7677 /* Take the intersection of two inversion lists and point <i> to it. *i
7678 * should be defined upon input, and if it points to one of the two lists,
7679 * the reference count to that list will be decremented.
52ae8f7e
KW
7680 * If <complement_b> is TRUE, the result will be the intersection of <a>
7681 * and the complement (or inversion) of <b> instead of <b> directly.
7682 *
a2995b7f
KW
7683 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7684 * Richard Gillam, published by Addison-Wesley, and explained at some
7685 * length there. The preface says to incorporate its examples into your
7686 * code at your own risk. In fact, it had bugs
d764b54e
KW
7687 *
7688 * The algorithm is like a merge sort, and is essentially the same as the
7689 * union above
7690 */
7691
f1b67122
KW
7692 UV* array_a; /* a's array */
7693 UV* array_b;
7694 UV len_a; /* length of a's array */
7695 UV len_b;
d764b54e 7696
a25abddc 7697 SV* r; /* the resulting intersection */
d764b54e
KW
7698 UV* array_r;
7699 UV len_r;
7700
7701 UV i_a = 0; /* current index into a's array */
7702 UV i_b = 0;
7703 UV i_r = 0;
7704
7705 /* running count, as explained in the algorithm source book; items are
7706 * stopped accumulating and are output when the count changes to/from 2.
7707 * The count is incremented when we start a range that's in the set, and
7708 * decremented when we start a range that's not in the set. So its range
7709 * is 0 to 2. Only when the count is 2 is something in the intersection.
7710 */
7711 UV count = 0;
7712
52ae8f7e 7713 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
60825692 7714 assert(a != b);
d764b54e 7715
52ae8f7e 7716 /* Special case if either one is empty */
f1b67122
KW
7717 len_a = invlist_len(a);
7718 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
f1b67122 7719
52ae8f7e
KW
7720 if (len_a != 0 && complement_b) {
7721
7722 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7723 * be empty. Here, also we are using 'b's complement, which hence
7724 * must be every possible code point. Thus the intersection is
7725 * simply 'a'. */
7726 if (*i != a) {
7727 *i = invlist_clone(a);
7728
7729 if (*i == b) {
7730 SvREFCNT_dec(b);
7731 }
7732 }
7733 /* else *i is already 'a' */
7734 return;
7735 }
7736
7737 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7738 * intersection must be empty */
4065ba03 7739 if (*i == a) {
f1b67122
KW
7740 SvREFCNT_dec(a);
7741 }
4065ba03 7742 else if (*i == b) {
f1b67122
KW
7743 SvREFCNT_dec(b);
7744 }
2ea86699 7745 *i = _new_invlist(0);
f1b67122
KW
7746 return;
7747 }
7748
7749 /* Here both lists exist and are non-empty */
7750 array_a = invlist_array(a);
7751 array_b = invlist_array(b);
7752
52ae8f7e
KW
7753 /* If are to take the intersection of 'a' with the complement of b, set it
7754 * up so are looking at b's complement. */
7755 if (complement_b) {
7756
7757 /* To complement, we invert: if the first element is 0, remove it. To
7758 * do this, we just pretend the array starts one later, and clear the
7759 * flag as we don't have to do anything else later */
7760 if (array_b[0] == 0) {
7761 array_b++;
7762 len_b--;
7763 complement_b = FALSE;
7764 }
7765 else {
7766
7767 /* But if the first element is not zero, we unshift a 0 before the
7768 * array. The data structure reserves a space for that 0 (which
7769 * should be a '1' right now), so physical shifting is unneeded,
7770 * but temporarily change that element to 0. Before exiting the
7771 * routine, we must restore the element to '1' */
7772 array_b--;
7773 len_b++;
7774 array_b[0] = 0;
7775 }
7776 }
7777
d764b54e
KW
7778 /* Size the intersection for the worst case: that the intersection ends up
7779 * fragmenting everything to be completely disjoint */
7780 r= _new_invlist(len_a + len_b);
f1b67122
KW
7781
7782 /* Will contain U+0000 iff both components do */
7783 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7784 && len_b > 0 && array_b[0] == 0);
d764b54e
KW
7785
7786 /* Go through each list item by item, stopping when exhausted one of
7787 * them */
7788 while (i_a < len_a && i_b < len_b) {
7789 UV cp; /* The element to potentially add to the intersection's
7790 array */
7791 bool cp_in_set; /* Is it in the input list's set or not */
7792
c4a30257
KW
7793 /* We need to take one or the other of the two inputs for the
7794 * intersection. Since we are merging two sorted lists, we take the
7795 * smaller of the next items. In case of a tie, we take the one that
7796 * is not in its set first (a difference from the union algorithm). If
7797 * we took one in the set first, it would increment the count, possibly
7798 * to 2 which would cause it to be output as starting a range in the
7799 * intersection, and the next time through we would take that same
7800 * number, and output it again as ending the set. By doing it the
7801 * opposite of this, there is no possibility that the count will be
7802 * momentarily incremented to 2. (In a tie and both are in the set or
7803 * both not in the set, it doesn't matter which we take first.) */
d764b54e 7804 if (array_a[i_a] < array_b[i_b]
4096c37b
KW
7805 || (array_a[i_a] == array_b[i_b]
7806 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
d764b54e 7807 {
4096c37b 7808 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
d764b54e
KW
7809 cp= array_a[i_a++];
7810 }
7811 else {
4096c37b 7812 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
d764b54e
KW
7813 cp= array_b[i_b++];
7814 }
7815
7816 /* Here, have chosen which of the two inputs to look at. Only output
7817 * if the running count changes to/from 2, which marks the
7818 * beginning/end of a range that's in the intersection */
7819 if (cp_in_set) {
7820 count++;
7821 if (count == 2) {
7822 array_r[i_r++] = cp;
7823 }
7824 }
7825 else {
7826 if (count == 2) {
7827 array_r[i_r++] = cp;
7828 }
7829 count--;
7830 }
7831 }
7832
c4a30257
KW
7833 /* Here, we are finished going through at least one of the lists, which
7834 * means there is something remaining in at most one. We check if the list
7835 * that has been exhausted is positioned such that we are in the middle
7836 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7837 * the ones we care about.) There are four cases:
7838 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7839 * nothing left in the intersection.
7840 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7841 * above 2. What should be output is exactly that which is in the
7842 * non-exhausted set, as everything it has is also in the intersection
7843 * set, and everything it doesn't have can't be in the intersection
7844 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7845 * gets incremented to 2. Like the previous case, the intersection is
7846 * everything that remains in the non-exhausted set.
7847 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7848 * remains 1. And the intersection has nothing more. */
4096c37b
KW
7849 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7850 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
d764b54e 7851 {
c4a30257 7852 count++;
d764b54e
KW
7853 }
7854
7855 /* The final length is what we've output so far plus what else is in the
c4a30257 7856 * intersection. At most one of the subexpressions below will be non-zero */
d764b54e 7857 len_r = i_r;
c4a30257 7858 if (count >= 2) {
d764b54e
KW
7859 len_r += (len_a - i_a) + (len_b - i_b);
7860 }
7861
7862 /* Set result to final length, which can change the pointer to array_r, so
7863 * re-find it */
7864 if (len_r != invlist_len(r)) {
7865 invlist_set_len(r, len_r);
7866 invlist_trim(r);
7867 array_r = invlist_array(r);
7868 }
7869
7870 /* Finish outputting any remaining */
c4a30257 7871 if (count >= 2) { /* At most one will have a non-zero copy count */
d764b54e
KW
7872 IV copy_count;
7873 if ((copy_count = len_a - i_a) > 0) {
7874 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7875 }
7876 else if ((copy_count = len_b - i_b) > 0) {
7877 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7878 }
7879 }
7880
a2995b7f 7881 /* We may be removing a reference to one of the inputs */
4065ba03 7882 if (a == *i || b == *i) {
a2995b7f
KW
7883 SvREFCNT_dec(*i);
7884 }
7885
52ae8f7e
KW
7886 /* If we've changed b, restore it */
7887 if (complement_b) {
7888 array_b[0] = 1;
7889 }
7890
a2995b7f
KW
7891 *i = r;
7892 return;
d764b54e
KW
7893}
7894
9d501133
KW
7895SV*
7896Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
d764b54e
KW
7897{
7898 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7899 * set. A pointer to the inversion list is returned. This may actually be
c52a3e71
KW
7900 * a new list, in which case the passed in one has been destroyed. The
7901 * passed in inversion list can be NULL, in which case a new one is created
7902 * with just the one range in it */
d764b54e 7903
a25abddc 7904 SV* range_invlist;
c52a3e71 7905 UV len;
d764b54e 7906
c52a3e71
KW
7907 if (invlist == NULL) {
7908 invlist = _new_invlist(2);
7909 len = 0;
7910 }
7911 else {
7912 len = invlist_len(invlist);
7913 }
d764b54e
KW
7914
7915 /* If comes after the final entry, can just append it to the end */
7916 if (len == 0
7917 || start >= invlist_array(invlist)
7918 [invlist_len(invlist) - 1])
7919 {
7920 _append_range_to_invlist(invlist, start, end);
7921 return invlist;
7922 }
7923
7924 /* Here, can't just append things, create and return a new inversion list
7925 * which is the union of this range and the existing inversion list */
7926 range_invlist = _new_invlist(2);
7927 _append_range_to_invlist(range_invlist, start, end);
7928
37e85ffe 7929 _invlist_union(invlist, range_invlist, &invlist);
d764b54e 7930
0a89af2f 7931 /* The temporary can be freed */
318c430e 7932 SvREFCNT_dec(range_invlist);
d764b54e 7933
6d63a9fb 7934 return invlist;
d764b54e
KW
7935}
7936
9d501133
KW
7937#endif
7938
a25abddc
KW
7939PERL_STATIC_INLINE SV*
7940S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9d501133 7941 return _add_range_to_invlist(invlist, cp, cp);
c229b64c
KW
7942}
7943
3c234b35 7944#ifndef PERL_IN_XSUB_RE
86f766ab
KW
7945void
7946Perl__invlist_invert(pTHX_ SV* const invlist)
25e94a65
KW
7947{
7948 /* Complement the input inversion list. This adds a 0 if the list didn't
7949 * have a zero; removes it otherwise. As described above, the data
7950 * structure is set up so that this is very efficient */
7951
7952 UV* len_pos = get_invlist_len_addr(invlist);
7953
37e85ffe 7954 PERL_ARGS_ASSERT__INVLIST_INVERT;
25e94a65
KW
7955
7956 /* The inverse of matching nothing is matching everything */
7957 if (*len_pos == 0) {
7958 _append_range_to_invlist(invlist, 0, UV_MAX);
7959 return;
7960 }
7961
7962 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7963 * zero element was a 0, so it is being removed, so the length decrements
7964 * by 1; and vice-versa. SvCUR is unaffected */
7965 if (*get_invlist_zero_addr(invlist) ^= 1) {
7966 (*len_pos)--;
7967 }
7968 else {
7969 (*len_pos)++;
7970 }
7971}
89302fc2
KW
7972
7973void
7974Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7975{
7976 /* Complement the input inversion list (which must be a Unicode property,
7977 * all of which don't match above the Unicode maximum code point.) And
7978 * Perl has chosen to not have the inversion match above that either. This
7979 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7980 */
7981
7982 UV len;
7983 UV* array;
7984
7985 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7986
7987 _invlist_invert(invlist);
7988
7989 len = invlist_len(invlist);
7990
7991 if (len != 0) { /* If empty do nothing */
7992 array = invlist_array(invlist);
7993 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7994 /* Add 0x110000. First, grow if necessary */
7995 len++;
7996 if (invlist_max(invlist) < len) {
7997 invlist_extend(invlist, len);
7998 array = invlist_array(invlist);
7999 }
8000 invlist_set_len(invlist, len);
8001 array[len - 1] = PERL_UNICODE_MAX + 1;
8002 }
8003 else { /* Remove the 0x110000 */
8004 invlist_set_len(invlist, len - 1);
8005 }
8006 }
8007
8008 return;
8009}
3c234b35 8010#endif
25e94a65
KW
8011
8012PERL_STATIC_INLINE SV*
8013S_invlist_clone(pTHX_ SV* const invlist)
8014{
8015
8016 /* Return a new inversion list that is a copy of the input one, which is
8017 * unchanged */
8018
6c6c83ac
KW
8019 /* Need to allocate extra space to accommodate Perl's addition of a
8020 * trailing NUL to SvPV's, since it thinks they are always strings */
8021 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
6d47fb3d 8022 STRLEN length = SvCUR(invlist);
25e94a65
KW
8023
8024 PERL_ARGS_ASSERT_INVLIST_CLONE;
8025
6d47fb3d
KW
8026 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8027 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8028
25e94a65
KW
8029 return new_invlist;
8030}
8031
f3dc70d1
KW
8032PERL_STATIC_INLINE UV*
8033S_get_invlist_iter_addr(pTHX_ SV* invlist)
8034{
8035 /* Return the address of the UV that contains the current iteration
8036 * position */
8037
8038 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8039
8040 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8041}
8042
f59ff194
KW
8043PERL_STATIC_INLINE UV*
8044S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8045{
8046 /* Return the address of the UV that contains the version id. */
8047
8048 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8049
8050 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8051}
8052
f3dc70d1
KW
8053PERL_STATIC_INLINE void
8054S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8055{
8056 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8057
8058 *get_invlist_iter_addr(invlist) = 0;
8059}
8060
8061STATIC bool
8062S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8063{
dbe7a391
KW
8064 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8065 * This call sets in <*start> and <*end>, the next range in <invlist>.
8066 * Returns <TRUE> if successful and the next call will return the next
8067 * range; <FALSE> if was already at the end of the list. If the latter,
8068 * <*start> and <*end> are unchanged, and the next call to this function
8069 * will start over at the beginning of the list */
8070
f3dc70d1
KW
8071 UV* pos = get_invlist_iter_addr(invlist);
8072 UV len = invlist_len(invlist);
8073 UV *array;
8074
8075 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8076
8077 if (*pos >= len) {
8078 *pos = UV_MAX; /* Force iternit() to be required next time */
8079 return FALSE;
8080 }
8081
8082 array = invlist_array(invlist);
8083
8084 *start = array[(*pos)++];
8085
8086 if (*pos >= len) {
8087 *end = UV_MAX;
8088 }
8089 else {
8090 *end = array[(*pos)++] - 1;
8091 }
8092
8093 return TRUE;
8094}
8095
b2b97e77
KW
8096#ifndef PERL_IN_XSUB_RE
8097SV *
8098Perl__invlist_contents(pTHX_ SV* const invlist)
8099{
8100 /* Get the contents of an inversion list into a string SV so that they can
8101 * be printed out. It uses the format traditionally done for debug tracing
8102 */
8103
8104 UV start, end;
8105 SV* output = newSVpvs("\n");
8106
8107 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8108
8109 invlist_iterinit(invlist);
8110 while (invlist_iternext(invlist, &start, &end)) {
8111 if (end == UV_MAX) {
8112 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8113 }
8114 else if (end != start) {
8115 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8116 start, end);
8117 }
8118 else {
8119 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8120 }
8121 }
8122
8123 return output;
8124}
8125#endif
8126
768318b8
KW
8127#if 0
8128void
8129S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8130{
8131 /* Dumps out the ranges in an inversion list. The string 'header'
8132 * if present is output on a line before the first range */
8133
8134 UV start, end;
8135
8136 if (header && strlen(header)) {
8137 PerlIO_printf(Perl_debug_log, "%s\n", header);
8138 }
8139 invlist_iterinit(invlist);
8140 while (invlist_iternext(invlist, &start, &end)) {
8141 if (end == UV_MAX) {
8142 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8143 }
8144 else {
8145 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8146 }
8147 }
8148}
8149#endif
8150
97b14ce7 8151#undef HEADER_LENGTH
060b7a35 8152#undef INVLIST_INITIAL_LENGTH
005b65ed
KW
8153#undef TO_INTERNAL_SIZE
8154#undef FROM_INTERNAL_SIZE
f1b67122
KW
8155#undef INVLIST_LEN_OFFSET
8156#undef INVLIST_ZERO_OFFSET
f3dc70d1 8157#undef INVLIST_ITER_OFFSET
f59ff194 8158#undef INVLIST_VERSION_ID
060b7a35 8159
d764b54e
KW
8160/* End of inversion list object */
8161
a687059c
LW
8162/*
8163 - reg - regular expression, i.e. main body or parenthesized thing
8164 *
8165 * Caller must absorb opening parenthesis.
8166 *
8167 * Combining parenthesis handling with the base level of regular expression
8168 * is a trifle forced, but the need to tie the tails of the branches to what
8169 * follows makes it hard to avoid.
8170 */
07be1b83
YO
8171#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8172#ifdef DEBUGGING
8173#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8174#else
8175#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8176#endif
3dab1dad 8177
76e3520e 8178STATIC regnode *
3dab1dad 8179S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 8180 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 8181{
27da23d5 8182 dVAR;
c277df42
IZ
8183 register regnode *ret; /* Will be the head of the group. */
8184 register regnode *br;
8185 register regnode *lastbr;
cbbf8932 8186 register regnode *ender = NULL;
a0d0e21e 8187 register I32 parno = 0;
cbbf8932 8188 I32 flags;
f7819f85 8189 U32 oregflags = RExC_flags;
6136c704
AL
8190 bool have_branch = 0;
8191 bool is_open = 0;
594d7033
YO
8192 I32 freeze_paren = 0;
8193 I32 after_freeze = 0;
9d1d55b5
JP
8194
8195 /* for (?g), (?gc), and (?o) warnings; warning
8196 about (?c) will warn about (?g) -- japhy */
8197
6136c704
AL
8198#define WASTED_O 0x01
8199#define WASTED_G 0x02
8200#define WASTED_C 0x04
8201#define WASTED_GC (0x02|0x04)
cbbf8932 8202 I32 wastedflags = 0x00;
9d1d55b5 8203
fac92740 8204 char * parse_start = RExC_parse; /* MJD */
a28509cc 8205 char * const oregcomp_parse = RExC_parse;
a0d0e21e 8206
3dab1dad 8207 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8208
8209 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
8210 DEBUG_PARSE("reg ");
8211
821b33a5 8212 *flagp = 0; /* Tentatively. */
a0d0e21e 8213
9d1d55b5 8214
a0d0e21e
LW
8215 /* Make an OPEN node, if parenthesized. */
8216 if (paren) {
e2e6a0f1
YO
8217 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8218 char *start_verb = RExC_parse;
8219 STRLEN verb_len = 0;
8220 char *start_arg = NULL;
8221 unsigned char op = 0;
8222 int argok = 1;
8223 int internal_argval = 0; /* internal_argval is only useful if !argok */
8224 while ( *RExC_parse && *RExC_parse != ')' ) {
8225 if ( *RExC_parse == ':' ) {
8226 start_arg = RExC_parse + 1;
8227 break;
8228 }
8229 RExC_parse++;
8230 }
8231 ++start_verb;
8232 verb_len = RExC_parse - start_verb;
8233 if ( start_arg ) {
8234 RExC_parse++;
8235 while ( *RExC_parse && *RExC_parse != ')' )
8236 RExC_parse++;
8237 if ( *RExC_parse != ')' )
8238 vFAIL("Unterminated verb pattern argument");
8239 if ( RExC_parse == start_arg )
8240 start_arg = NULL;
8241 } else {
8242 if ( *RExC_parse != ')' )
8243 vFAIL("Unterminated verb pattern");
8244 }
5d458dd8 8245
e2e6a0f1
YO
8246 switch ( *start_verb ) {
8247 case 'A': /* (*ACCEPT) */
568a785a 8248 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
8249 op = ACCEPT;
8250 internal_argval = RExC_nestroot;
8251 }
8252 break;
8253 case 'C': /* (*COMMIT) */
568a785a 8254 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 8255 op = COMMIT;
e2e6a0f1
YO
8256 break;
8257 case 'F': /* (*FAIL) */
568a785a 8258 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
8259 op = OPFAIL;
8260 argok = 0;
8261 }
8262 break;
5d458dd8
YO
8263 case ':': /* (*:NAME) */
8264 case 'M': /* (*MARK:NAME) */
568a785a 8265 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 8266 op = MARKPOINT;
5d458dd8
YO
8267 argok = -1;
8268 }
8269 break;
8270 case 'P': /* (*PRUNE) */
568a785a 8271 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 8272 op = PRUNE;
e2e6a0f1 8273 break;
5d458dd8 8274 case 'S': /* (*SKIP) */
568a785a 8275 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
8276 op = SKIP;
8277 break;
8278 case 'T': /* (*THEN) */
8279 /* [19:06] <TimToady> :: is then */
568a785a 8280 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
8281 op = CUTGROUP;
8282 RExC_seen |= REG_SEEN_CUTGROUP;
8283 }
e2e6a0f1
YO
8284 break;
8285 }
8286 if ( ! op ) {
8287 RExC_parse++;
8288 vFAIL3("Unknown verb pattern '%.*s'",
8289 verb_len, start_verb);
8290 }
8291 if ( argok ) {
8292 if ( start_arg && internal_argval ) {
8293 vFAIL3("Verb pattern '%.*s' may not have an argument",
8294 verb_len, start_verb);
8295 } else if ( argok < 0 && !start_arg ) {
8296 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8297 verb_len, start_verb);
8298 } else {
8299 ret = reganode(pRExC_state, op, internal_argval);
8300 if ( ! internal_argval && ! SIZE_ONLY ) {
8301 if (start_arg) {
8302 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8303 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 8304 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
8305 ret->flags = 0;
8306 } else {
8307 ret->flags = 1;
8308 }
8309 }
8310 }
8311 if (!internal_argval)
8312 RExC_seen |= REG_SEEN_VERBARG;
8313 } else if ( start_arg ) {
8314 vFAIL3("Verb pattern '%.*s' may not have an argument",
8315 verb_len, start_verb);
8316 } else {
8317 ret = reg_node(pRExC_state, op);
8318 }
8319 nextchar(pRExC_state);
8320 return ret;
8321 } else
fac92740 8322 if (*RExC_parse == '?') { /* (?...) */
6136c704 8323 bool is_logical = 0;
a28509cc 8324 const char * const seqstart = RExC_parse;
fb85c044 8325 bool has_use_defaults = FALSE;
ca9dfc88 8326
830247a4
IZ
8327 RExC_parse++;
8328 paren = *RExC_parse++;
c277df42 8329 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 8330 switch (paren) {
894be9b7 8331
1f1031fe
YO
8332 case 'P': /* (?P...) variants for those used to PCRE/Python */
8333 paren = *RExC_parse++;
8334 if ( paren == '<') /* (?P<...>) named capture */
8335 goto named_capture;
8336 else if (paren == '>') { /* (?P>name) named recursion */
8337 goto named_recursion;
8338 }
8339 else if (paren == '=') { /* (?P=...) named backref */
8340 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8341 you change this make sure you change that */
8342 char* name_start = RExC_parse;
8343 U32 num = 0;
8344 SV *sv_dat = reg_scan_name(pRExC_state,
8345 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8346 if (RExC_parse == name_start || *RExC_parse != ')')
8347 vFAIL2("Sequence %.3s... not terminated",parse_start);
8348
8349 if (!SIZE_ONLY) {
8350 num = add_data( pRExC_state, 1, "S" );
8351 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 8352 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
8353 }
8354 RExC_sawback = 1;
4444fd9f
KW
8355 ret = reganode(pRExC_state,
8356 ((! FOLD)
8357 ? NREF
2f7f8cb1
KW
8358 : (MORE_ASCII_RESTRICTED)
8359 ? NREFFA
8360 : (AT_LEAST_UNI_SEMANTICS)
8361 ? NREFFU
8362 : (LOC)
8363 ? NREFFL
8364 : NREFF),
4444fd9f 8365 num);
1f1031fe
YO
8366 *flagp |= HASWIDTH;
8367
8368 Set_Node_Offset(ret, parse_start+1);
8369 Set_Node_Cur_Length(ret); /* MJD */
8370
8371 nextchar(pRExC_state);
8372 return ret;
8373 }
57b84237
YO
8374 RExC_parse++;
8375 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8376 /*NOTREACHED*/
8377 case '<': /* (?<...) */
b81d288d 8378 if (*RExC_parse == '!')
c277df42 8379 paren = ',';
0a4db386 8380 else if (*RExC_parse != '=')
1f1031fe 8381 named_capture:
0a4db386 8382 { /* (?<...>) */
81714fb9 8383 char *name_start;
894be9b7 8384 SV *svname;
81714fb9
YO
8385 paren= '>';
8386 case '\'': /* (?'...') */
8387 name_start= RExC_parse;
0a4db386
YO
8388 svname = reg_scan_name(pRExC_state,
8389 SIZE_ONLY ? /* reverse test from the others */
8390 REG_RSN_RETURN_NAME :
8391 REG_RSN_RETURN_NULL);
57b84237
YO
8392 if (RExC_parse == name_start) {
8393 RExC_parse++;
8394 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8395 /*NOTREACHED*/
8396 }
81714fb9
YO
8397 if (*RExC_parse != paren)
8398 vFAIL2("Sequence (?%c... not terminated",
8399 paren=='>' ? '<' : paren);
8400 if (SIZE_ONLY) {
e62cc96a
YO
8401 HE *he_str;
8402 SV *sv_dat = NULL;
486ec47a 8403 if (!svname) /* shouldn't happen */
894be9b7
YO
8404 Perl_croak(aTHX_
8405 "panic: reg_scan_name returned NULL");
81714fb9
YO
8406 if (!RExC_paren_names) {
8407 RExC_paren_names= newHV();
ad64d0ec 8408 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
8409#ifdef DEBUGGING
8410 RExC_paren_name_list= newAV();
ad64d0ec 8411 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 8412#endif
81714fb9
YO
8413 }
8414 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 8415 if ( he_str )
81714fb9 8416 sv_dat = HeVAL(he_str);
e62cc96a 8417 if ( ! sv_dat ) {
81714fb9 8418 /* croak baby croak */
e62cc96a
YO
8419 Perl_croak(aTHX_
8420 "panic: paren_name hash element allocation failed");
8421 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
8422 /* (?|...) can mean we have dupes so scan to check
8423 its already been stored. Maybe a flag indicating
8424 we are inside such a construct would be useful,
8425 but the arrays are likely to be quite small, so
8426 for now we punt -- dmq */
8427 IV count = SvIV(sv_dat);
8428 I32 *pv = (I32*)SvPVX(sv_dat);
8429 IV i;
8430 for ( i = 0 ; i < count ; i++ ) {
8431 if ( pv[i] == RExC_npar ) {
8432 count = 0;
8433 break;
8434 }
8435 }
8436 if ( count ) {
8437 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8438 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8439 pv[count] = RExC_npar;
3a92e6ae 8440 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 8441 }
81714fb9
YO
8442 } else {
8443 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8444 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8445 SvIOK_on(sv_dat);
3ec35e0f 8446 SvIV_set(sv_dat, 1);
e62cc96a 8447 }
1f1031fe 8448#ifdef DEBUGGING
17a3c617 8449 /* Yes this does cause a memory leak in debugging Perls */
1f1031fe
YO
8450 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8451 SvREFCNT_dec(svname);
8452#endif
e62cc96a 8453
81714fb9
YO
8454 /*sv_dump(sv_dat);*/
8455 }
8456 nextchar(pRExC_state);
8457 paren = 1;
8458 goto capturing_parens;
8459 }
8460 RExC_seen |= REG_SEEN_LOOKBEHIND;
b57e4118 8461 RExC_in_lookbehind++;
830247a4 8462 RExC_parse++;
fac92740 8463 case '=': /* (?=...) */
89c6a13e 8464 RExC_seen_zerolen++;
5c3fa2e7 8465 break;
fac92740 8466 case '!': /* (?!...) */
830247a4 8467 RExC_seen_zerolen++;
e2e6a0f1
YO
8468 if (*RExC_parse == ')') {
8469 ret=reg_node(pRExC_state, OPFAIL);
8470 nextchar(pRExC_state);
8471 return ret;
8472 }
594d7033
YO
8473 break;
8474 case '|': /* (?|...) */
8475 /* branch reset, behave like a (?:...) except that
8476 buffers in alternations share the same numbers */
8477 paren = ':';
8478 after_freeze = freeze_paren = RExC_npar;
8479 break;
fac92740
MJD
8480 case ':': /* (?:...) */
8481 case '>': /* (?>...) */
a0d0e21e 8482 break;
fac92740
MJD
8483 case '$': /* (?$...) */
8484 case '@': /* (?@...) */
8615cb43 8485 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 8486 break;
fac92740 8487 case '#': /* (?#...) */
830247a4
IZ
8488 while (*RExC_parse && *RExC_parse != ')')
8489 RExC_parse++;
8490 if (*RExC_parse != ')')
c277df42 8491 FAIL("Sequence (?#... not terminated");
830247a4 8492 nextchar(pRExC_state);
a0d0e21e
LW
8493 *flagp = TRYAGAIN;
8494 return NULL;
894be9b7
YO
8495 case '0' : /* (?0) */
8496 case 'R' : /* (?R) */
8497 if (*RExC_parse != ')')
6bda09f9 8498 FAIL("Sequence (?R) not terminated");
1a147d38 8499 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 8500 *flagp |= POSTPONED;
7f69552c
YO
8501 nextchar(pRExC_state);
8502 return ret;
8503 /*notreached*/
894be9b7
YO
8504 { /* named and numeric backreferences */
8505 I32 num;
894be9b7
YO
8506 case '&': /* (?&NAME) */
8507 parse_start = RExC_parse - 1;
1f1031fe 8508 named_recursion:
894be9b7 8509 {
0a4db386
YO
8510 SV *sv_dat = reg_scan_name(pRExC_state,
8511 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8512 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
8513 }
8514 goto gen_recurse_regop;
118e2215 8515 assert(0); /* NOT REACHED */
542fa716
YO
8516 case '+':
8517 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8518 RExC_parse++;
8519 vFAIL("Illegal pattern");
8520 }
8521 goto parse_recursion;
8522 /* NOT REACHED*/
8523 case '-': /* (?-1) */
8524 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8525 RExC_parse--; /* rewind to let it be handled later */
8526 goto parse_flags;
8527 }
8528 /*FALLTHROUGH */
6bda09f9
YO
8529 case '1': case '2': case '3': case '4': /* (?1) */
8530 case '5': case '6': case '7': case '8': case '9':
8531 RExC_parse--;
542fa716 8532 parse_recursion:
894be9b7
YO
8533 num = atoi(RExC_parse);
8534 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
8535 if (*RExC_parse == '-')
8536 RExC_parse++;
6bda09f9
YO
8537 while (isDIGIT(*RExC_parse))
8538 RExC_parse++;
8539 if (*RExC_parse!=')')
8540 vFAIL("Expecting close bracket");
686b73d4 8541
894be9b7 8542 gen_recurse_regop:
542fa716
YO
8543 if ( paren == '-' ) {
8544 /*
8545 Diagram of capture buffer numbering.
8546 Top line is the normal capture buffer numbers
3b753521 8547 Bottom line is the negative indexing as from
542fa716
YO
8548 the X (the (?-2))
8549
8550 + 1 2 3 4 5 X 6 7
8551 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8552 - 5 4 3 2 1 X x x
8553
8554 */
8555 num = RExC_npar + num;
8556 if (num < 1) {
8557 RExC_parse++;
8558 vFAIL("Reference to nonexistent group");
8559 }
8560 } else if ( paren == '+' ) {
8561 num = RExC_npar + num - 1;
8562 }
8563
1a147d38 8564 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
8565 if (!SIZE_ONLY) {
8566 if (num > (I32)RExC_rx->nparens) {
8567 RExC_parse++;
8568 vFAIL("Reference to nonexistent group");
8569 }
40d049e4 8570 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 8571 RExC_emit++;
226de585 8572 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 8573 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 8574 } else {
6bda09f9 8575 RExC_size++;
6bda09f9 8576 }
0a4db386 8577 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 8578 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
8579 Set_Node_Offset(ret, parse_start); /* MJD */
8580
a3b492c3 8581 *flagp |= POSTPONED;
6bda09f9
YO
8582 nextchar(pRExC_state);
8583 return ret;
894be9b7 8584 } /* named and numeric backreferences */
118e2215 8585 assert(0); /* NOT REACHED */
894be9b7 8586
fac92740 8587 case '?': /* (??...) */
6136c704 8588 is_logical = 1;
57b84237
YO
8589 if (*RExC_parse != '{') {
8590 RExC_parse++;
8591 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8592 /*NOTREACHED*/
8593 }
a3b492c3 8594 *flagp |= POSTPONED;
830247a4 8595 paren = *RExC_parse++;
0f5d15d6 8596 /* FALL THROUGH */
fac92740 8597 case '{': /* (?{...}) */
c277df42 8598 {
2eccd3b2 8599 U32 n = 0;
d24ca0c5 8600 struct reg_code_block *cb;
c277df42 8601
830247a4 8602 RExC_seen_zerolen++;
68e2671b 8603
d24ca0c5
DM
8604 if ( !pRExC_state->num_code_blocks
8605 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8606 || pRExC_state->code_blocks[pRExC_state->code_index].start
8607 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
68e2671b
DM
8608 - RExC_start)
8609 ) {
d24ca0c5
DM
8610 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8611 FAIL("panic: Sequence (?{...}): no code block found\n");
8612 FAIL("Eval-group not allowed at runtime, use re 'eval'");
c277df42 8613 }
d24ca0c5
DM
8614 /* this is a pre-compiled code block (?{...}) */
8615 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8616 RExC_parse = RExC_start + cb->end;
5f616ea7 8617 if (!SIZE_ONLY) {
d24ca0c5
DM
8618 OP *o = cb->block;
8619 if (cb->src_regex) {
8620 n = add_data(pRExC_state, 2, "rl");
8621 RExC_rxi->data->data[n] =
8622 (void*)SvREFCNT_inc((SV*)cb->src_regex);
2e2e3f36 8623 RExC_rxi->data->data[n+1] = (void*)o;
68e2671b 8624 }
d24ca0c5
DM
8625 else {
8626 n = add_data(pRExC_state, 1,
8627 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
2e2e3f36 8628 RExC_rxi->data->data[n] = (void*)o;
68e2671b 8629 }
c277df42 8630 }
d24ca0c5 8631 pRExC_state->code_index++;
830247a4 8632 nextchar(pRExC_state);
68e2671b 8633
6136c704 8634 if (is_logical) {
ec841a27 8635 regnode *eval;
830247a4 8636 ret = reg_node(pRExC_state, LOGICAL);
ec841a27
DM
8637 eval = reganode(pRExC_state, EVAL, n);
8638 if (!SIZE_ONLY) {
0f5d15d6 8639 ret->flags = 2;
ec841a27 8640 /* for later propagation into (??{}) return value */
db703679 8641 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
ec841a27
DM
8642 }
8643 REGTAIL(pRExC_state, ret, eval);
fac92740 8644 /* deal with the length of this later - MJD */
0f5d15d6
IZ
8645 return ret;
8646 }
ccb2c380
MP
8647 ret = reganode(pRExC_state, EVAL, n);
8648 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8649 Set_Node_Offset(ret, parse_start);
8650 return ret;
c277df42 8651 }
fac92740 8652 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 8653 {
0a4db386 8654 int is_define= 0;
fac92740 8655 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
8656 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8657 || RExC_parse[1] == '<'
830247a4 8658 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42 8659 I32 flag;
686b73d4 8660
830247a4 8661 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
8662 if (!SIZE_ONLY)
8663 ret->flags = 1;
3dab1dad 8664 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 8665 goto insert_if;
b81d288d 8666 }
a0ed51b3 8667 }
0a4db386
YO
8668 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8669 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8670 {
8671 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8672 char *name_start= RExC_parse++;
2eccd3b2 8673 U32 num = 0;
0a4db386
YO
8674 SV *sv_dat=reg_scan_name(pRExC_state,
8675 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8676 if (RExC_parse == name_start || *RExC_parse != ch)
8677 vFAIL2("Sequence (?(%c... not terminated",
8678 (ch == '>' ? '<' : ch));
8679 RExC_parse++;
8680 if (!SIZE_ONLY) {
8681 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 8682 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 8683 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
8684 }
8685 ret = reganode(pRExC_state,NGROUPP,num);
8686 goto insert_if_check_paren;
8687 }
8688 else if (RExC_parse[0] == 'D' &&
8689 RExC_parse[1] == 'E' &&
8690 RExC_parse[2] == 'F' &&
8691 RExC_parse[3] == 'I' &&
8692 RExC_parse[4] == 'N' &&
8693 RExC_parse[5] == 'E')
8694 {
8695 ret = reganode(pRExC_state,DEFINEP,0);
8696 RExC_parse +=6 ;
8697 is_define = 1;
8698 goto insert_if_check_paren;
8699 }
8700 else if (RExC_parse[0] == 'R') {
8701 RExC_parse++;
8702 parno = 0;
8703 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8704 parno = atoi(RExC_parse++);
8705 while (isDIGIT(*RExC_parse))
8706 RExC_parse++;
8707 } else if (RExC_parse[0] == '&') {
8708 SV *sv_dat;
8709 RExC_parse++;
8710 sv_dat = reg_scan_name(pRExC_state,
8711 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8712 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8713 }
1a147d38 8714 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
8715 goto insert_if_check_paren;
8716 }
830247a4 8717 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 8718 /* (?(1)...) */
6136c704 8719 char c;
830247a4 8720 parno = atoi(RExC_parse++);
c277df42 8721
830247a4
IZ
8722 while (isDIGIT(*RExC_parse))
8723 RExC_parse++;
fac92740 8724 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 8725
0a4db386 8726 insert_if_check_paren:
830247a4 8727 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 8728 vFAIL("Switch condition not recognized");
c277df42 8729 insert_if:
3dab1dad
YO
8730 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8731 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 8732 if (br == NULL)
830247a4 8733 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 8734 else
3dab1dad 8735 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 8736 c = *nextchar(pRExC_state);
d1b80229
IZ
8737 if (flags&HASWIDTH)
8738 *flagp |= HASWIDTH;
c277df42 8739 if (c == '|') {
0a4db386
YO
8740 if (is_define)
8741 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 8742 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
8743 regbranch(pRExC_state, &flags, 1,depth+1);
8744 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
8745 if (flags&HASWIDTH)
8746 *flagp |= HASWIDTH;
830247a4 8747 c = *nextchar(pRExC_state);
a0ed51b3
LW
8748 }
8749 else
c277df42
IZ
8750 lastbr = NULL;
8751 if (c != ')')
8615cb43 8752 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 8753 ender = reg_node(pRExC_state, TAIL);
3dab1dad 8754 REGTAIL(pRExC_state, br, ender);
c277df42 8755 if (lastbr) {
3dab1dad
YO
8756 REGTAIL(pRExC_state, lastbr, ender);
8757 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
8758 }
8759 else
3dab1dad 8760 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
8761 RExC_size++; /* XXX WHY do we need this?!!
8762 For large programs it seems to be required
8763 but I can't figure out why. -- dmq*/
c277df42 8764 return ret;
a0ed51b3
LW
8765 }
8766 else {
830247a4 8767 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
8768 }
8769 }
1b1626e4 8770 case 0:
830247a4 8771 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 8772 vFAIL("Sequence (? incomplete");
1b1626e4 8773 break;
85508812
KW
8774 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8775 that follow */
fb85c044
KW
8776 has_use_defaults = TRUE;
8777 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
e40e74fe
KW
8778 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8779 ? REGEX_UNICODE_CHARSET
8780 : REGEX_DEPENDS_CHARSET);
fb85c044 8781 goto parse_flags;
a0d0e21e 8782 default:
cde0cee5
YO
8783 --RExC_parse;
8784 parse_flags: /* (?i) */
8785 {
8786 U32 posflags = 0, negflags = 0;
8787 U32 *flagsp = &posflags;
f6a766d5 8788 char has_charset_modifier = '\0';
96f54887
KW
8789 regex_charset cs = get_regex_charset(RExC_flags);
8790 if (cs == REGEX_DEPENDS_CHARSET
8791 && (RExC_utf8 || RExC_uni_semantics))
8792 {
8793 cs = REGEX_UNICODE_CHARSET;
8794 }
cde0cee5
YO
8795
8796 while (*RExC_parse) {
8797 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
8798 /* (?g), (?gc) and (?o) are useless here
8799 and must be globally applied -- japhy */
cde0cee5
YO
8800 switch (*RExC_parse) {
8801 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec 8802 case LOCALE_PAT_MOD:
f6a766d5
KW
8803 if (has_charset_modifier) {
8804 goto excess_modifier;
8805 }
8806 else if (flagsp == &negflags) {
9442e3b8 8807 goto neg_modifier;
9de15fec 8808 }
a62b1201 8809 cs = REGEX_LOCALE_CHARSET;
f6a766d5 8810 has_charset_modifier = LOCALE_PAT_MOD;
4624b182 8811 RExC_contains_locale = 1;
9de15fec
KW
8812 break;
8813 case UNICODE_PAT_MOD:
f6a766d5
KW
8814 if (has_charset_modifier) {
8815 goto excess_modifier;
8816 }
8817 else if (flagsp == &negflags) {
9442e3b8 8818 goto neg_modifier;
9de15fec 8819 }
a62b1201 8820 cs = REGEX_UNICODE_CHARSET;
f6a766d5 8821 has_charset_modifier = UNICODE_PAT_MOD;
9de15fec 8822 break;
cfaf538b 8823 case ASCII_RESTRICT_PAT_MOD:
f6a766d5 8824 if (flagsp == &negflags) {
9442e3b8 8825 goto neg_modifier;
cfaf538b 8826 }
f6a766d5
KW
8827 if (has_charset_modifier) {
8828 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8829 goto excess_modifier;
8830 }
2f7f8cb1 8831 /* Doubled modifier implies more restricted */
f6a766d5
KW
8832 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8833 }
2f7f8cb1
KW
8834 else {
8835 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8836 }
f6a766d5 8837 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
cfaf538b 8838 break;
50e91148 8839 case DEPENDS_PAT_MOD:
9442e3b8 8840 if (has_use_defaults) {
9de15fec 8841 goto fail_modifiers;
f6a766d5 8842 }
9442e3b8
KW
8843 else if (flagsp == &negflags) {
8844 goto neg_modifier;
8845 }
f6a766d5
KW
8846 else if (has_charset_modifier) {
8847 goto excess_modifier;
9de15fec 8848 }
7b98bc43
KW
8849
8850 /* The dual charset means unicode semantics if the
8851 * pattern (or target, not known until runtime) are
e40e74fe
KW
8852 * utf8, or something in the pattern indicates unicode
8853 * semantics */
8854 cs = (RExC_utf8 || RExC_uni_semantics)
a62b1201
KW
8855 ? REGEX_UNICODE_CHARSET
8856 : REGEX_DEPENDS_CHARSET;
f6a766d5 8857 has_charset_modifier = DEPENDS_PAT_MOD;
9de15fec 8858 break;
f6a766d5
KW
8859 excess_modifier:
8860 RExC_parse++;
8861 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
0c96c706 8862 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
f6a766d5
KW
8863 }
8864 else if (has_charset_modifier == *(RExC_parse - 1)) {
0c96c706 8865 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
f6a766d5
KW
8866 }
8867 else {
0c96c706 8868 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
f6a766d5
KW
8869 }
8870 /*NOTREACHED*/
9442e3b8
KW
8871 neg_modifier:
8872 RExC_parse++;
8873 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8874 /*NOTREACHED*/
f7819f85
A
8875 case ONCE_PAT_MOD: /* 'o' */
8876 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 8877 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 8878 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
8879 if (! (wastedflags & wflagbit) ) {
8880 wastedflags |= wflagbit;
8881 vWARN5(
8882 RExC_parse + 1,
8883 "Useless (%s%c) - %suse /%c modifier",
8884 flagsp == &negflags ? "?-" : "?",
8885 *RExC_parse,
8886 flagsp == &negflags ? "don't " : "",
8887 *RExC_parse
8888 );
8889 }
8890 }
cde0cee5
YO
8891 break;
8892
f7819f85 8893 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 8894 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
8895 if (! (wastedflags & WASTED_C) ) {
8896 wastedflags |= WASTED_GC;
9d1d55b5
JP
8897 vWARN3(
8898 RExC_parse + 1,
8899 "Useless (%sc) - %suse /gc modifier",
8900 flagsp == &negflags ? "?-" : "?",
8901 flagsp == &negflags ? "don't " : ""
8902 );
8903 }
8904 }
cde0cee5 8905 break;
f7819f85 8906 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 8907 if (flagsp == &negflags) {
668c081a
NC
8908 if (SIZE_ONLY)
8909 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
8910 } else {
8911 *flagsp |= RXf_PMf_KEEPCOPY;
8912 }
8913 break;
8914 case '-':
3b753521 8915 /* A flag is a default iff it is following a minus, so
fb85c044
KW
8916 * if there is a minus, it means will be trying to
8917 * re-specify a default which is an error */
8918 if (has_use_defaults || flagsp == &negflags) {
9de15fec 8919 fail_modifiers:
57b84237
YO
8920 RExC_parse++;
8921 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8922 /*NOTREACHED*/
8923 }
cde0cee5
YO
8924 flagsp = &negflags;
8925 wastedflags = 0; /* reset so (?g-c) warns twice */
8926 break;
8927 case ':':
8928 paren = ':';
8929 /*FALLTHROUGH*/
8930 case ')':
8931 RExC_flags |= posflags;
8932 RExC_flags &= ~negflags;
a62b1201 8933 set_regex_charset(&RExC_flags, cs);
f7819f85
A
8934 if (paren != ':') {
8935 oregflags |= posflags;
8936 oregflags &= ~negflags;
a62b1201 8937 set_regex_charset(&oregflags, cs);
f7819f85 8938 }
cde0cee5
YO
8939 nextchar(pRExC_state);
8940 if (paren != ':') {
8941 *flagp = TRYAGAIN;
8942 return NULL;
8943 } else {
8944 ret = NULL;
8945 goto parse_rest;
8946 }
8947 /*NOTREACHED*/
8948 default:
cde0cee5
YO
8949 RExC_parse++;
8950 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8951 /*NOTREACHED*/
8952 }
830247a4 8953 ++RExC_parse;
48c036b1 8954 }
cde0cee5 8955 }} /* one for the default block, one for the switch */
a0d0e21e 8956 }
fac92740 8957 else { /* (...) */
81714fb9 8958 capturing_parens:
830247a4
IZ
8959 parno = RExC_npar;
8960 RExC_npar++;
e2e6a0f1 8961
830247a4 8962 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
8963 if (!SIZE_ONLY ){
8964 if (!RExC_nestroot)
8965 RExC_nestroot = parno;
c009da3d
YO
8966 if (RExC_seen & REG_SEEN_RECURSE
8967 && !RExC_open_parens[parno-1])
8968 {
e2e6a0f1 8969 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
8970 "Setting open paren #%"IVdf" to %d\n",
8971 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
8972 RExC_open_parens[parno-1]= ret;
8973 }
6bda09f9 8974 }
fac92740
MJD
8975 Set_Node_Length(ret, 1); /* MJD */
8976 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 8977 is_open = 1;
a0d0e21e 8978 }
a0ed51b3 8979 }
fac92740 8980 else /* ! paren */
a0d0e21e 8981 ret = NULL;
cde0cee5
YO
8982
8983 parse_rest:
a0d0e21e 8984 /* Pick up the branches, linking them together. */
fac92740 8985 parse_start = RExC_parse; /* MJD */
3dab1dad 8986 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e 8987
fac92740 8988 /* branch_len = (paren != 0); */
2af232bd 8989
a0d0e21e
LW
8990 if (br == NULL)
8991 return(NULL);
830247a4
IZ
8992 if (*RExC_parse == '|') {
8993 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 8994 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 8995 }
fac92740 8996 else { /* MJD */
6bda09f9 8997 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
8998 Set_Node_Length(br, paren != 0);
8999 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9000 }
c277df42
IZ
9001 have_branch = 1;
9002 if (SIZE_ONLY)
830247a4 9003 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
9004 }
9005 else if (paren == ':') {
c277df42
IZ
9006 *flagp |= flags&SIMPLE;
9007 }
6136c704 9008 if (is_open) { /* Starts with OPEN. */
3dab1dad 9009 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
9010 }
9011 else if (paren != '?') /* Not Conditional */
a0d0e21e 9012 ret = br;
8ae10a67 9013 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 9014 lastbr = br;
830247a4
IZ
9015 while (*RExC_parse == '|') {
9016 if (!SIZE_ONLY && RExC_extralen) {
9017 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 9018 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
9019 }
9020 if (SIZE_ONLY)
830247a4
IZ
9021 RExC_extralen += 2; /* Account for LONGJMP. */
9022 nextchar(pRExC_state);
594d7033
YO
9023 if (freeze_paren) {
9024 if (RExC_npar > after_freeze)
9025 after_freeze = RExC_npar;
9026 RExC_npar = freeze_paren;
9027 }
3dab1dad 9028 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 9029
a687059c 9030 if (br == NULL)
a0d0e21e 9031 return(NULL);
3dab1dad 9032 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 9033 lastbr = br;
8ae10a67 9034 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
9035 }
9036
c277df42
IZ
9037 if (have_branch || paren != ':') {
9038 /* Make a closing node, and hook it on the end. */
9039 switch (paren) {
9040 case ':':
830247a4 9041 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
9042 break;
9043 case 1:
830247a4 9044 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
9045 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9046 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9047 "Setting close paren #%"IVdf" to %d\n",
9048 (IV)parno, REG_NODE_NUM(ender)));
9049 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
9050 if (RExC_nestroot == parno)
9051 RExC_nestroot = 0;
40d049e4 9052 }
fac92740
MJD
9053 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9054 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
9055 break;
9056 case '<':
c277df42
IZ
9057 case ',':
9058 case '=':
9059 case '!':
c277df42 9060 *flagp &= ~HASWIDTH;
821b33a5
IZ
9061 /* FALL THROUGH */
9062 case '>':
830247a4 9063 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
9064 break;
9065 case 0:
830247a4 9066 ender = reg_node(pRExC_state, END);
40d049e4
YO
9067 if (!SIZE_ONLY) {
9068 assert(!RExC_opend); /* there can only be one! */
9069 RExC_opend = ender;
9070 }
c277df42
IZ
9071 break;
9072 }
3b6759a6
YO
9073 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9074 SV * const mysv_val1=sv_newmortal();
9075 SV * const mysv_val2=sv_newmortal();
9076 DEBUG_PARSE_MSG("lsbr");
9077 regprop(RExC_rx, mysv_val1, lastbr);
9078 regprop(RExC_rx, mysv_val2, ender);
9079 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9080 SvPV_nolen_const(mysv_val1),
9081 (IV)REG_NODE_NUM(lastbr),
9082 SvPV_nolen_const(mysv_val2),
9083 (IV)REG_NODE_NUM(ender),
9084 (IV)(ender - lastbr)
9085 );
9086 });
eaf3ca90 9087 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 9088
9674d46a 9089 if (have_branch && !SIZE_ONLY) {
3b6759a6 9090 char is_nothing= 1;
eaf3ca90
YO
9091 if (depth==1)
9092 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9093
c277df42 9094 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
9095 for (br = ret; br; br = regnext(br)) {
9096 const U8 op = PL_regkind[OP(br)];
9097 if (op == BRANCH) {
07be1b83 9098 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
3b6759a6
YO
9099 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9100 is_nothing= 0;
9674d46a
AL
9101 }
9102 else if (op == BRANCHJ) {
07be1b83 9103 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
3b6759a6
YO
9104 /* for now we always disable this optimisation * /
9105 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9106 */
9107 is_nothing= 0;
9674d46a 9108 }
c277df42 9109 }
3b6759a6
YO
9110 if (is_nothing) {
9111 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9112 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9113 SV * const mysv_val1=sv_newmortal();
9114 SV * const mysv_val2=sv_newmortal();
9115 DEBUG_PARSE_MSG("NADA");
9116 regprop(RExC_rx, mysv_val1, ret);
9117 regprop(RExC_rx, mysv_val2, ender);
9118 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9119 SvPV_nolen_const(mysv_val1),
9120 (IV)REG_NODE_NUM(ret),
9121 SvPV_nolen_const(mysv_val2),
9122 (IV)REG_NODE_NUM(ender),
9123 (IV)(ender - ret)
9124 );
9125 });
9126 OP(br)= NOTHING;
9127 if (OP(ender) == TAIL) {
9128 NEXT_OFF(br)= 0;
9129 RExC_emit= br + 1;
9130 } else {
9131 regnode *opt;
9132 for ( opt= br + 1; opt < ender ; opt++ )
9133 OP(opt)= OPTIMIZED;
9134 NEXT_OFF(br)= ender - br;
9135 }
9136 }
c277df42 9137 }
a0d0e21e 9138 }
c277df42
IZ
9139
9140 {
e1ec3a88
AL
9141 const char *p;
9142 static const char parens[] = "=!<,>";
c277df42
IZ
9143
9144 if (paren && (p = strchr(parens, paren))) {
eb160463 9145 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
9146 int flag = (p - parens) > 1;
9147
9148 if (paren == '>')
9149 node = SUSPEND, flag = 0;
6bda09f9 9150 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
9151 Set_Node_Cur_Length(ret);
9152 Set_Node_Offset(ret, parse_start + 1);
c277df42 9153 ret->flags = flag;
07be1b83 9154 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 9155 }
a0d0e21e
LW
9156 }
9157
9158 /* Check for proper termination. */
ce3e6498 9159 if (paren) {
e2509266 9160 RExC_flags = oregflags;
830247a4
IZ
9161 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9162 RExC_parse = oregcomp_parse;
380a0633 9163 vFAIL("Unmatched (");
ce3e6498 9164 }
a0ed51b3 9165 }
830247a4
IZ
9166 else if (!paren && RExC_parse < RExC_end) {
9167 if (*RExC_parse == ')') {
9168 RExC_parse++;
380a0633 9169 vFAIL("Unmatched )");
a0ed51b3
LW
9170 }
9171 else
b45f050a 9172 FAIL("Junk on end of regexp"); /* "Can't happen". */
118e2215 9173 assert(0); /* NOTREACHED */
a0d0e21e 9174 }
b57e4118
KW
9175
9176 if (RExC_in_lookbehind) {
9177 RExC_in_lookbehind--;
9178 }
fd4be6f0 9179 if (after_freeze > RExC_npar)
594d7033 9180 RExC_npar = after_freeze;
a0d0e21e 9181 return(ret);
a687059c
LW
9182}
9183
9184/*
9185 - regbranch - one alternative of an | operator
9186 *
9187 * Implements the concatenation operator.
9188 */
76e3520e 9189STATIC regnode *
3dab1dad 9190S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 9191{
97aff369 9192 dVAR;
c277df42
IZ
9193 register regnode *ret;
9194 register regnode *chain = NULL;
9195 register regnode *latest;
9196 I32 flags = 0, c = 0;
3dab1dad 9197 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9198
9199 PERL_ARGS_ASSERT_REGBRANCH;
9200
3dab1dad 9201 DEBUG_PARSE("brnc");
02daf0ab 9202
b81d288d 9203 if (first)
c277df42
IZ
9204 ret = NULL;
9205 else {
b81d288d 9206 if (!SIZE_ONLY && RExC_extralen)
830247a4 9207 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 9208 else {
830247a4 9209 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
9210 Set_Node_Length(ret, 1);
9211 }
c277df42 9212 }
686b73d4 9213
b81d288d 9214 if (!first && SIZE_ONLY)
830247a4 9215 RExC_extralen += 1; /* BRANCHJ */
b81d288d 9216
c277df42 9217 *flagp = WORST; /* Tentatively. */
a0d0e21e 9218
830247a4
IZ
9219 RExC_parse--;
9220 nextchar(pRExC_state);
9221 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 9222 flags &= ~TRYAGAIN;
3dab1dad 9223 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
9224 if (latest == NULL) {
9225 if (flags & TRYAGAIN)
9226 continue;
9227 return(NULL);
a0ed51b3
LW
9228 }
9229 else if (ret == NULL)
c277df42 9230 ret = latest;
8ae10a67 9231 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 9232 if (chain == NULL) /* First piece. */
a0d0e21e
LW
9233 *flagp |= flags&SPSTART;
9234 else {
830247a4 9235 RExC_naughty++;
3dab1dad 9236 REGTAIL(pRExC_state, chain, latest);
a687059c 9237 }
a0d0e21e 9238 chain = latest;
c277df42
IZ
9239 c++;
9240 }
9241 if (chain == NULL) { /* Loop ran zero times. */
830247a4 9242 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
9243 if (ret == NULL)
9244 ret = chain;
9245 }
9246 if (c == 1) {
9247 *flagp |= flags&SIMPLE;
a0d0e21e 9248 }
a687059c 9249
d4c19fe8 9250 return ret;
a687059c
LW
9251}
9252
9253/*
9254 - regpiece - something followed by possible [*+?]
9255 *
9256 * Note that the branching code sequences used for ? and the general cases
9257 * of * and + are somewhat optimized: they use the same NOTHING node as
9258 * both the endmarker for their branch list and the body of the last branch.
9259 * It might seem that this node could be dispensed with entirely, but the
9260 * endmarker role is not redundant.
9261 */
76e3520e 9262STATIC regnode *
3dab1dad 9263S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 9264{
97aff369 9265 dVAR;
c277df42 9266 register regnode *ret;
a0d0e21e
LW
9267 register char op;
9268 register char *next;
9269 I32 flags;
1df70142 9270 const char * const origparse = RExC_parse;
a0d0e21e 9271 I32 min;
c277df42 9272 I32 max = REG_INFTY;
f19a8d85 9273#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9274 char *parse_start;
f19a8d85 9275#endif
10edeb5d 9276 const char *maxpos = NULL;
3dab1dad 9277 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9278
9279 PERL_ARGS_ASSERT_REGPIECE;
9280
3dab1dad 9281 DEBUG_PARSE("piec");
a0d0e21e 9282
3dab1dad 9283 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
9284 if (ret == NULL) {
9285 if (flags & TRYAGAIN)
9286 *flagp |= TRYAGAIN;
9287 return(NULL);
9288 }
9289
830247a4 9290 op = *RExC_parse;
a0d0e21e 9291
830247a4 9292 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 9293 maxpos = NULL;
f19a8d85 9294#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9295 parse_start = RExC_parse; /* MJD */
f19a8d85 9296#endif
830247a4 9297 next = RExC_parse + 1;
a0d0e21e
LW
9298 while (isDIGIT(*next) || *next == ',') {
9299 if (*next == ',') {
9300 if (maxpos)
9301 break;
9302 else
9303 maxpos = next;
a687059c 9304 }
a0d0e21e
LW
9305 next++;
9306 }
9307 if (*next == '}') { /* got one */
9308 if (!maxpos)
9309 maxpos = next;
830247a4
IZ
9310 RExC_parse++;
9311 min = atoi(RExC_parse);
a0d0e21e
LW
9312 if (*maxpos == ',')
9313 maxpos++;
9314 else
830247a4 9315 maxpos = RExC_parse;
a0d0e21e
LW
9316 max = atoi(maxpos);
9317 if (!max && *maxpos != '0')
c277df42
IZ
9318 max = REG_INFTY; /* meaning "infinity" */
9319 else if (max >= REG_INFTY)
8615cb43 9320 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
9321 RExC_parse = next;
9322 nextchar(pRExC_state);
a0d0e21e
LW
9323
9324 do_curly:
9325 if ((flags&SIMPLE)) {
830247a4 9326 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 9327 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
9328 Set_Node_Offset(ret, parse_start+1); /* MJD */
9329 Set_Node_Cur_Length(ret);
a0d0e21e
LW
9330 }
9331 else {
3dab1dad 9332 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
9333
9334 w->flags = 0;
3dab1dad 9335 REGTAIL(pRExC_state, ret, w);
830247a4 9336 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
9337 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9338 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
9339 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9340 }
6bda09f9 9341 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
9342 /* MJD hk */
9343 Set_Node_Offset(ret, parse_start+1);
2af232bd 9344 Set_Node_Length(ret,
fac92740 9345 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 9346
830247a4 9347 if (!SIZE_ONLY && RExC_extralen)
c277df42 9348 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 9349 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 9350 if (SIZE_ONLY)
830247a4
IZ
9351 RExC_whilem_seen++, RExC_extralen += 3;
9352 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 9353 }
c277df42 9354 ret->flags = 0;
a0d0e21e
LW
9355
9356 if (min > 0)
821b33a5
IZ
9357 *flagp = WORST;
9358 if (max > 0)
9359 *flagp |= HASWIDTH;
8fa23287 9360 if (max < min)
8615cb43 9361 vFAIL("Can't do {n,m} with n > m");
c277df42 9362 if (!SIZE_ONLY) {
eb160463
GS
9363 ARG1_SET(ret, (U16)min);
9364 ARG2_SET(ret, (U16)max);
a687059c 9365 }
a687059c 9366
a0d0e21e 9367 goto nest_check;
a687059c 9368 }
a0d0e21e 9369 }
a687059c 9370
a0d0e21e
LW
9371 if (!ISMULT1(op)) {
9372 *flagp = flags;
a687059c 9373 return(ret);
a0d0e21e 9374 }
bb20fd44 9375
c277df42 9376#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
9377
9378 /* if this is reinstated, don't forget to put this back into perldiag:
9379
9380 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9381
9382 (F) The part of the regexp subject to either the * or + quantifier
9383 could match an empty string. The {#} shows in the regular
9384 expression about where the problem was discovered.
9385
9386 */
9387
bb20fd44 9388 if (!(flags&HASWIDTH) && op != '?')
b45f050a 9389 vFAIL("Regexp *+ operand could be empty");
b81d288d 9390#endif
bb20fd44 9391
f19a8d85 9392#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9393 parse_start = RExC_parse;
f19a8d85 9394#endif
830247a4 9395 nextchar(pRExC_state);
a0d0e21e 9396
821b33a5 9397 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
9398
9399 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 9400 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 9401 ret->flags = 0;
830247a4 9402 RExC_naughty += 4;
a0d0e21e
LW
9403 }
9404 else if (op == '*') {
9405 min = 0;
9406 goto do_curly;
a0ed51b3
LW
9407 }
9408 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 9409 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 9410 ret->flags = 0;
830247a4 9411 RExC_naughty += 3;
a0d0e21e
LW
9412 }
9413 else if (op == '+') {
9414 min = 1;
9415 goto do_curly;
a0ed51b3
LW
9416 }
9417 else if (op == '?') {
a0d0e21e
LW
9418 min = 0; max = 1;
9419 goto do_curly;
9420 }
9421 nest_check:
668c081a
NC
9422 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9423 ckWARN3reg(RExC_parse,
9424 "%.*s matches null string many times",
9425 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9426 origparse);
a0d0e21e
LW
9427 }
9428
b9b4dddf 9429 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 9430 nextchar(pRExC_state);
6bda09f9 9431 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 9432 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 9433 }
b9b4dddf
YO
9434#ifndef REG_ALLOW_MINMOD_SUSPEND
9435 else
9436#endif
9437 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9438 regnode *ender;
9439 nextchar(pRExC_state);
9440 ender = reg_node(pRExC_state, SUCCEED);
9441 REGTAIL(pRExC_state, ret, ender);
9442 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9443 ret->flags = 0;
9444 ender = reg_node(pRExC_state, TAIL);
9445 REGTAIL(pRExC_state, ret, ender);
9446 /*ret= ender;*/
9447 }
9448
9449 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 9450 RExC_parse++;
b45f050a
JF
9451 vFAIL("Nested quantifiers");
9452 }
a0d0e21e
LW
9453
9454 return(ret);
a687059c
LW
9455}
9456
fc8cd66c 9457
9d64099b 9458/* reg_namedseq(pRExC_state,UVp, UV depth)
fc8cd66c
YO
9459
9460 This is expected to be called by a parser routine that has
afefe6bf 9461 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
9462 expected to point at the first char following the N at the time
9463 of the call.
ff3f963a
KW
9464
9465 The \N may be inside (indicated by valuep not being NULL) or outside a
9466 character class.
9467
9468 \N may begin either a named sequence, or if outside a character class, mean
9469 to match a non-newline. For non single-quoted regexes, the tokenizer has
9470 attempted to decide which, and in the case of a named sequence converted it
9471 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9472 where c1... are the characters in the sequence. For single-quoted regexes,
9473 the tokenizer passes the \N sequence through unchanged; this code will not
9474 attempt to determine this nor expand those. The net effect is that if the
9475 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9476 signals that this \N occurrence means to match a non-newline.
9477
9478 Only the \N{U+...} form should occur in a character class, for the same
9479 reason that '.' inside a character class means to just match a period: it
9480 just doesn't make sense.
fc8cd66c
YO
9481
9482 If valuep is non-null then it is assumed that we are parsing inside
9483 of a charclass definition and the first codepoint in the resolved
9484 string is returned via *valuep and the routine will return NULL.
9485 In this mode if a multichar string is returned from the charnames
ff3f963a 9486 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
9487 sequence will be examined. If the string returned is zero length
9488 then the value of *valuep is undefined and NON-NULL will
9489 be returned to indicate failure. (This will NOT be a valid pointer
9490 to a regnode.)
9491
ff3f963a
KW
9492 If valuep is null then it is assumed that we are parsing normal text and a
9493 new EXACT node is inserted into the program containing the resolved string,
9494 and a pointer to the new node is returned. But if the string is zero length
9495 a NOTHING node is emitted instead.
afefe6bf 9496
fc8cd66c 9497 On success RExC_parse is set to the char following the endbrace.
ff3f963a 9498 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
9499 */
9500STATIC regnode *
9d64099b 9501S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
fc8cd66c 9502{
c3c41406 9503 char * endbrace; /* '}' following the name */
fc8cd66c 9504 regnode *ret = NULL;
c3c41406 9505 char* p;
ff3f963a
KW
9506
9507 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9508
9509 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
9510
9511 GET_RE_DEBUG_FLAGS;
c3c41406
KW
9512
9513 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9514 * modifier. The other meaning does not */
9515 p = (RExC_flags & RXf_PMf_EXTENDED)
9516 ? regwhite( pRExC_state, RExC_parse )
9517 : RExC_parse;
7918f24d 9518
ff3f963a 9519 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
9520 * [^\n]. The former is assumed when it can't be the latter. */
9521 if (*p != '{' || regcurly(p)) {
9522 RExC_parse = p;
ff3f963a 9523 if (valuep) {
afefe6bf 9524 /* no bare \N in a charclass */
ff3f963a
KW
9525 vFAIL("\\N in a character class must be a named character: \\N{...}");
9526 }
afefe6bf
RGS
9527 nextchar(pRExC_state);
9528 ret = reg_node(pRExC_state, REG_ANY);
9529 *flagp |= HASWIDTH|SIMPLE;
9530 RExC_naughty++;
9531 RExC_parse--;
9532 Set_Node_Length(ret, 1); /* MJD */
9533 return ret;
fc8cd66c 9534 }
a4893424 9535
c3c41406
KW
9536 /* Here, we have decided it should be a named sequence */
9537
9538 /* The test above made sure that the next real character is a '{', but
9539 * under the /x modifier, it could be separated by space (or a comment and
9540 * \n) and this is not allowed (for consistency with \x{...} and the
9541 * tokenizer handling of \N{NAME}). */
9542 if (*RExC_parse != '{') {
9543 vFAIL("Missing braces on \\N{}");
9544 }
9545
ff3f963a 9546 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
9547
9548 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9549 || ! (endbrace == RExC_parse /* nothing between the {} */
9550 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9551 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9552 {
9553 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9554 vFAIL("\\N{NAME} must be resolved by the lexer");
9555 }
9556
ff3f963a
KW
9557 if (endbrace == RExC_parse) { /* empty: \N{} */
9558 if (! valuep) {
9559 RExC_parse = endbrace + 1;
9560 return reg_node(pRExC_state,NOTHING);
a4893424 9561 }
fc8cd66c 9562
ff3f963a
KW
9563 if (SIZE_ONLY) {
9564 ckWARNreg(RExC_parse,
9565 "Ignoring zero length \\N{} in character class"
9566 );
9567 RExC_parse = endbrace + 1;
9568 }
9569 *valuep = 0;
9570 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 9571 }
ff3f963a 9572
62fed28b 9573 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
9574 RExC_parse += 2; /* Skip past the 'U+' */
9575
9576 if (valuep) { /* In a bracketed char class */
9577 /* We only pay attention to the first char of
9578 multichar strings being returned. I kinda wonder
9579 if this makes sense as it does change the behaviour
9580 from earlier versions, OTOH that behaviour was broken
9581 as well. XXX Solution is to recharacterize as
9582 [rest-of-class]|multi1|multi2... */
9583
9584 STRLEN length_of_hex;
9585 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9586 | PERL_SCAN_DISALLOW_PREFIX
9587 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9588
37820adc
KW
9589 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9590 if (endchar < endbrace) {
ff3f963a
KW
9591 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9592 }
ff3f963a
KW
9593
9594 length_of_hex = (STRLEN)(endchar - RExC_parse);
9595 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9596
9597 /* The tokenizer should have guaranteed validity, but it's possible to
9598 * bypass it by using single quoting, so check */
c3c41406
KW
9599 if (length_of_hex == 0
9600 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9601 {
9602 RExC_parse += length_of_hex; /* Includes all the valid */
9603 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9604 ? UTF8SKIP(RExC_parse)
9605 : 1;
9606 /* Guard against malformed utf8 */
9607 if (RExC_parse >= endchar) RExC_parse = endchar;
9608 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
9609 }
9610
9611 RExC_parse = endbrace + 1;
9612 if (endchar == endbrace) return NULL;
9613
9614 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 9615 }
ff3f963a 9616 else { /* Not a char class */
e2a7e165
KW
9617
9618 /* What is done here is to convert this to a sub-pattern of the form
9619 * (?:\x{char1}\x{char2}...)
9620 * and then call reg recursively. That way, it retains its atomicness,
9621 * while not having to worry about special handling that some code
9622 * points may have. toke.c has converted the original Unicode values
9623 * to native, so that we can just pass on the hex values unchanged. We
9624 * do have to set a flag to keep recoding from happening in the
9625 * recursion */
9626
9627 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9628 STRLEN len;
ff3f963a
KW
9629 char *endchar; /* Points to '.' or '}' ending cur char in the input
9630 stream */
e2a7e165
KW
9631 char *orig_end = RExC_end;
9632
9633 while (RExC_parse < endbrace) {
ff3f963a
KW
9634
9635 /* Code points are separated by dots. If none, there is only one
9636 * code point, and is terminated by the brace */
37820adc 9637 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a 9638
e2a7e165
KW
9639 /* Convert to notation the rest of the code understands */
9640 sv_catpv(substitute_parse, "\\x{");
9641 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9642 sv_catpv(substitute_parse, "}");
ff3f963a
KW
9643
9644 /* Point to the beginning of the next character in the sequence. */
9645 RExC_parse = endchar + 1;
ff3f963a 9646 }
e2a7e165 9647 sv_catpv(substitute_parse, ")");
ff3f963a 9648
e2a7e165 9649 RExC_parse = SvPV(substitute_parse, len);
ff3f963a 9650
e2a7e165
KW
9651 /* Don't allow empty number */
9652 if (len < 8) {
9653 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a 9654 }
e2a7e165 9655 RExC_end = RExC_parse + len;
ff3f963a 9656
e2a7e165
KW
9657 /* The values are Unicode, and therefore not subject to recoding */
9658 RExC_override_recoding = 1;
9659
9660 ret = reg(pRExC_state, 1, flagp, depth+1);
9661
9662 RExC_parse = endbrace;
9663 RExC_end = orig_end;
9664 RExC_override_recoding = 0;
ff3f963a 9665
ff3f963a
KW
9666 nextchar(pRExC_state);
9667 }
9668
9669 return ret;
fc8cd66c
YO
9670}
9671
9672
9e08bc66
TS
9673/*
9674 * reg_recode
9675 *
9676 * It returns the code point in utf8 for the value in *encp.
9677 * value: a code value in the source encoding
9678 * encp: a pointer to an Encode object
9679 *
9680 * If the result from Encode is not a single character,
9681 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9682 */
9683STATIC UV
9684S_reg_recode(pTHX_ const char value, SV **encp)
9685{
9686 STRLEN numlen = 1;
59cd0e26 9687 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 9688 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
9689 const STRLEN newlen = SvCUR(sv);
9690 UV uv = UNICODE_REPLACEMENT;
9691
7918f24d
NC
9692 PERL_ARGS_ASSERT_REG_RECODE;
9693
9e08bc66
TS
9694 if (newlen)
9695 uv = SvUTF8(sv)
9696 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9697 : *(U8*)s;
9698
9699 if (!newlen || numlen != newlen) {
9700 uv = UNICODE_REPLACEMENT;
c86f7df5 9701 *encp = NULL;
9e08bc66
TS
9702 }
9703 return uv;
9704}
9705
fc8cd66c 9706
a687059c
LW
9707/*
9708 - regatom - the lowest level
ee9b8eae
YO
9709
9710 Try to identify anything special at the start of the pattern. If there
9711 is, then handle it as required. This may involve generating a single regop,
9712 such as for an assertion; or it may involve recursing, such as to
9713 handle a () structure.
9714
9715 If the string doesn't start with something special then we gobble up
9716 as much literal text as we can.
9717
9718 Once we have been able to handle whatever type of thing started the
9719 sequence, we return.
9720
9721 Note: we have to be careful with escapes, as they can be both literal
639c2774
KW
9722 and special, and in the case of \10 and friends, context determines which.
9723
9724 A summary of the code structure is:
9725
9726 switch (first_byte) {
9727 cases for each special:
9728 handle this special;
9729 break;
9730 case '\\':
9731 switch (2nd byte) {
9732 cases for each unambiguous special:
9733 handle this special;
9734 break;
9735 cases for each ambigous special/literal:
9736 disambiguate;
9737 if (special) handle here
9738 else goto defchar;
9739 default: // unambiguously literal:
9740 goto defchar;
9741 }
9742 default: // is a literal char
9743 // FALL THROUGH
9744 defchar:
9745 create EXACTish node for literal;
9746 while (more input and node isn't full) {
9747 switch (input_byte) {
9748 cases for each special;
9749 make sure parse pointer is set so that the next call to
9750 regatom will see this special first
9751 goto loopdone; // EXACTish node terminated by prev. char
9752 default:
9753 append char to EXACTISH node;
9754 }
9755 get next input byte;
9756 }
9757 loopdone:
9758 }
9759 return the generated node;
9760
9761 Specifically there are two separate switches for handling
ee9b8eae
YO
9762 escape sequences, with the one for handling literal escapes requiring
9763 a dummy entry for all of the special escapes that are actually handled
9764 by the other.
9765*/
9766
76e3520e 9767STATIC regnode *
3dab1dad 9768S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 9769{
97aff369 9770 dVAR;
cbbf8932 9771 register regnode *ret = NULL;
a0d0e21e 9772 I32 flags;
45948336 9773 char *parse_start = RExC_parse;
980866de 9774 U8 op;
3dab1dad
YO
9775 GET_RE_DEBUG_FLAGS_DECL;
9776 DEBUG_PARSE("atom");
a0d0e21e
LW
9777 *flagp = WORST; /* Tentatively. */
9778
7918f24d 9779 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 9780
a0d0e21e 9781tryagain:
f9a79580 9782 switch ((U8)*RExC_parse) {
a0d0e21e 9783 case '^':
830247a4
IZ
9784 RExC_seen_zerolen++;
9785 nextchar(pRExC_state);
bbe252da 9786 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 9787 ret = reg_node(pRExC_state, MBOL);
bbe252da 9788 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 9789 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 9790 else
830247a4 9791 ret = reg_node(pRExC_state, BOL);
fac92740 9792 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
9793 break;
9794 case '$':
830247a4 9795 nextchar(pRExC_state);
b81d288d 9796 if (*RExC_parse)
830247a4 9797 RExC_seen_zerolen++;
bbe252da 9798 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 9799 ret = reg_node(pRExC_state, MEOL);
bbe252da 9800 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 9801 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 9802 else
830247a4 9803 ret = reg_node(pRExC_state, EOL);
fac92740 9804 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
9805 break;
9806 case '.':
830247a4 9807 nextchar(pRExC_state);
bbe252da 9808 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
9809 ret = reg_node(pRExC_state, SANY);
9810 else
9811 ret = reg_node(pRExC_state, REG_ANY);
9812 *flagp |= HASWIDTH|SIMPLE;
830247a4 9813 RExC_naughty++;
fac92740 9814 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
9815 break;
9816 case '[':
b45f050a 9817 {
3dab1dad
YO
9818 char * const oregcomp_parse = ++RExC_parse;
9819 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
9820 if (*RExC_parse != ']') {
9821 RExC_parse = oregcomp_parse;
b45f050a
JF
9822 vFAIL("Unmatched [");
9823 }
830247a4 9824 nextchar(pRExC_state);
a0d0e21e 9825 *flagp |= HASWIDTH|SIMPLE;
fac92740 9826 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 9827 break;
b45f050a 9828 }
a0d0e21e 9829 case '(':
830247a4 9830 nextchar(pRExC_state);
3dab1dad 9831 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 9832 if (ret == NULL) {
bf93d4cc 9833 if (flags & TRYAGAIN) {
830247a4 9834 if (RExC_parse == RExC_end) {
bf93d4cc
GS
9835 /* Make parent create an empty node if needed. */
9836 *flagp |= TRYAGAIN;
9837 return(NULL);
9838 }
a0d0e21e 9839 goto tryagain;
bf93d4cc 9840 }
a0d0e21e
LW
9841 return(NULL);
9842 }
a3b492c3 9843 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
9844 break;
9845 case '|':
9846 case ')':
9847 if (flags & TRYAGAIN) {
9848 *flagp |= TRYAGAIN;
9849 return NULL;
9850 }
b45f050a 9851 vFAIL("Internal urp");
a0d0e21e
LW
9852 /* Supposed to be caught earlier. */
9853 break;
9854 case '?':
9855 case '+':
9856 case '*':
830247a4 9857 RExC_parse++;
b45f050a 9858 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
9859 break;
9860 case '\\':
ee9b8eae
YO
9861 /* Special Escapes
9862
9863 This switch handles escape sequences that resolve to some kind
9864 of special regop and not to literal text. Escape sequnces that
9865 resolve to literal text are handled below in the switch marked
9866 "Literal Escapes".
9867
9868 Every entry in this switch *must* have a corresponding entry
9869 in the literal escape switch. However, the opposite is not
9870 required, as the default for this switch is to jump to the
9871 literal text handling code.
9872 */
a0a388a1 9873 switch ((U8)*++RExC_parse) {
ee9b8eae 9874 /* Special Escapes */
a0d0e21e 9875 case 'A':
830247a4
IZ
9876 RExC_seen_zerolen++;
9877 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 9878 *flagp |= SIMPLE;
ee9b8eae 9879 goto finish_meta_pat;
a0d0e21e 9880 case 'G':
830247a4
IZ
9881 ret = reg_node(pRExC_state, GPOS);
9882 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 9883 *flagp |= SIMPLE;
ee9b8eae
YO
9884 goto finish_meta_pat;
9885 case 'K':
9886 RExC_seen_zerolen++;
9887 ret = reg_node(pRExC_state, KEEPS);
9888 *flagp |= SIMPLE;
37923168
RGS
9889 /* XXX:dmq : disabling in-place substitution seems to
9890 * be necessary here to avoid cases of memory corruption, as
9891 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9892 */
9893 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 9894 goto finish_meta_pat;
a0d0e21e 9895 case 'Z':
830247a4 9896 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 9897 *flagp |= SIMPLE;
a1917ab9 9898 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 9899 goto finish_meta_pat;
b85d18e9 9900 case 'z':
830247a4 9901 ret = reg_node(pRExC_state, EOS);
b85d18e9 9902 *flagp |= SIMPLE;
830247a4 9903 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 9904 goto finish_meta_pat;
4a2d328f 9905 case 'C':
f33976b4
DB
9906 ret = reg_node(pRExC_state, CANY);
9907 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 9908 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9909 goto finish_meta_pat;
a0ed51b3 9910 case 'X':
830247a4 9911 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 9912 *flagp |= HASWIDTH;
ee9b8eae 9913 goto finish_meta_pat;
a0d0e21e 9914 case 'w':
693fefec
KW
9915 op = ALNUM + get_regex_charset(RExC_flags);
9916 if (op > ALNUMA) { /* /aa is same as /a */
9917 op = ALNUMA;
a12cf05f 9918 }
980866de 9919 ret = reg_node(pRExC_state, op);
a0d0e21e 9920 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9921 goto finish_meta_pat;
a0d0e21e 9922 case 'W':
693fefec
KW
9923 op = NALNUM + get_regex_charset(RExC_flags);
9924 if (op > NALNUMA) { /* /aa is same as /a */
9925 op = NALNUMA;
a12cf05f 9926 }
980866de 9927 ret = reg_node(pRExC_state, op);
a0d0e21e 9928 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9929 goto finish_meta_pat;
a0d0e21e 9930 case 'b':
830247a4
IZ
9931 RExC_seen_zerolen++;
9932 RExC_seen |= REG_SEEN_LOOKBEHIND;
693fefec
KW
9933 op = BOUND + get_regex_charset(RExC_flags);
9934 if (op > BOUNDA) { /* /aa is same as /a */
9935 op = BOUNDA;
a12cf05f 9936 }
63ac0dad 9937 ret = reg_node(pRExC_state, op);
b988e673 9938 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 9939 *flagp |= SIMPLE;
ee9b8eae 9940 goto finish_meta_pat;
a0d0e21e 9941 case 'B':
830247a4
IZ
9942 RExC_seen_zerolen++;
9943 RExC_seen |= REG_SEEN_LOOKBEHIND;
693fefec
KW
9944 op = NBOUND + get_regex_charset(RExC_flags);
9945 if (op > NBOUNDA) { /* /aa is same as /a */
9946 op = NBOUNDA;
a12cf05f 9947 }
63ac0dad 9948 ret = reg_node(pRExC_state, op);
b988e673 9949 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 9950 *flagp |= SIMPLE;
ee9b8eae 9951 goto finish_meta_pat;
a0d0e21e 9952 case 's':
693fefec
KW
9953 op = SPACE + get_regex_charset(RExC_flags);
9954 if (op > SPACEA) { /* /aa is same as /a */
9955 op = SPACEA;
a12cf05f 9956 }
980866de 9957 ret = reg_node(pRExC_state, op);
a0d0e21e 9958 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9959 goto finish_meta_pat;
a0d0e21e 9960 case 'S':
693fefec
KW
9961 op = NSPACE + get_regex_charset(RExC_flags);
9962 if (op > NSPACEA) { /* /aa is same as /a */
9963 op = NSPACEA;
6ab9ea91 9964 }
56ae17b4 9965 ret = reg_node(pRExC_state, op);
a0d0e21e 9966 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9967 goto finish_meta_pat;
a0d0e21e 9968 case 'D':
693fefec
KW
9969 op = NDIGIT;
9970 goto join_D_and_d;
9971 case 'd':
9972 op = DIGIT;
9973 join_D_and_d:
9974 {
9975 U8 offset = get_regex_charset(RExC_flags);
9976 if (offset == REGEX_UNICODE_CHARSET) {
9977 offset = REGEX_DEPENDS_CHARSET;
9978 }
9979 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
9980 offset = REGEX_ASCII_RESTRICTED_CHARSET;
9981 }
9982 op += offset;
6ab9ea91 9983 }
56ae17b4 9984 ret = reg_node(pRExC_state, op);
a0d0e21e 9985 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9986 goto finish_meta_pat;
e1d1eefb
YO
9987 case 'R':
9988 ret = reg_node(pRExC_state, LNBREAK);
9989 *flagp |= HASWIDTH|SIMPLE;
9990 goto finish_meta_pat;
9991 case 'h':
9992 ret = reg_node(pRExC_state, HORIZWS);
9993 *flagp |= HASWIDTH|SIMPLE;
9994 goto finish_meta_pat;
9995 case 'H':
9996 ret = reg_node(pRExC_state, NHORIZWS);
9997 *flagp |= HASWIDTH|SIMPLE;
9998 goto finish_meta_pat;
ee9b8eae 9999 case 'v':
e1d1eefb
YO
10000 ret = reg_node(pRExC_state, VERTWS);
10001 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
10002 goto finish_meta_pat;
10003 case 'V':
e1d1eefb
YO
10004 ret = reg_node(pRExC_state, NVERTWS);
10005 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 10006 finish_meta_pat:
830247a4 10007 nextchar(pRExC_state);
fac92740 10008 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 10009 break;
a14b48bc
LW
10010 case 'p':
10011 case 'P':
686b73d4 10012 {
3dab1dad 10013 char* const oldregxend = RExC_end;
d008bc60 10014#ifdef DEBUGGING
ccb2c380 10015 char* parse_start = RExC_parse - 2;
d008bc60 10016#endif
a14b48bc 10017
830247a4 10018 if (RExC_parse[1] == '{') {
3568d838 10019 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
10020 RExC_end = strchr(RExC_parse, '}');
10021 if (!RExC_end) {
3dab1dad 10022 const U8 c = (U8)*RExC_parse;
830247a4
IZ
10023 RExC_parse += 2;
10024 RExC_end = oldregxend;
0da60cf5 10025 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 10026 }
830247a4 10027 RExC_end++;
a14b48bc 10028 }
af6f566e 10029 else {
830247a4 10030 RExC_end = RExC_parse + 2;
af6f566e
HS
10031 if (RExC_end > oldregxend)
10032 RExC_end = oldregxend;
10033 }
830247a4 10034 RExC_parse--;
a14b48bc 10035
3dab1dad 10036 ret = regclass(pRExC_state,depth+1);
a14b48bc 10037
830247a4
IZ
10038 RExC_end = oldregxend;
10039 RExC_parse--;
ccb2c380
MP
10040
10041 Set_Node_Offset(ret, parse_start + 2);
10042 Set_Node_Cur_Length(ret);
830247a4 10043 nextchar(pRExC_state);
a14b48bc
LW
10044 *flagp |= HASWIDTH|SIMPLE;
10045 }
10046 break;
fc8cd66c 10047 case 'N':
afefe6bf 10048 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
10049 multicharacter. join_exact() will join them up later on.
10050 Also this makes sure that things like /\N{BLAH}+/ and
10051 \N{BLAH} being multi char Just Happen. dmq*/
10052 ++RExC_parse;
9d64099b 10053 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
fc8cd66c 10054 break;
0a4db386 10055 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 10056 parse_named_seq:
81714fb9
YO
10057 {
10058 char ch= RExC_parse[1];
1f1031fe
YO
10059 if (ch != '<' && ch != '\'' && ch != '{') {
10060 RExC_parse++;
10061 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 10062 } else {
1f1031fe
YO
10063 /* this pretty much dupes the code for (?P=...) in reg(), if
10064 you change this make sure you change that */
81714fb9 10065 char* name_start = (RExC_parse += 2);
2eccd3b2 10066 U32 num = 0;
0a4db386
YO
10067 SV *sv_dat = reg_scan_name(pRExC_state,
10068 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 10069 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 10070 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
10071 vFAIL2("Sequence %.3s... not terminated",parse_start);
10072
10073 if (!SIZE_ONLY) {
10074 num = add_data( pRExC_state, 1, "S" );
10075 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 10076 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
10077 }
10078
81714fb9
YO
10079 RExC_sawback = 1;
10080 ret = reganode(pRExC_state,
4444fd9f
KW
10081 ((! FOLD)
10082 ? NREF
2f7f8cb1
KW
10083 : (MORE_ASCII_RESTRICTED)
10084 ? NREFFA
10085 : (AT_LEAST_UNI_SEMANTICS)
10086 ? NREFFU
10087 : (LOC)
10088 ? NREFFL
10089 : NREFF),
4444fd9f 10090 num);
81714fb9 10091 *flagp |= HASWIDTH;
1f1031fe 10092
81714fb9
YO
10093 /* override incorrect value set in reganode MJD */
10094 Set_Node_Offset(ret, parse_start+1);
10095 Set_Node_Cur_Length(ret); /* MJD */
10096 nextchar(pRExC_state);
1f1031fe 10097
81714fb9
YO
10098 }
10099 break;
1f1031fe 10100 }
2bf803e2 10101 case 'g':
a0d0e21e
LW
10102 case '1': case '2': case '3': case '4':
10103 case '5': case '6': case '7': case '8': case '9':
10104 {
c74340f9 10105 I32 num;
2bf803e2
YO
10106 bool isg = *RExC_parse == 'g';
10107 bool isrel = 0;
10108 bool hasbrace = 0;
10109 if (isg) {
c74340f9 10110 RExC_parse++;
2bf803e2
YO
10111 if (*RExC_parse == '{') {
10112 RExC_parse++;
10113 hasbrace = 1;
10114 }
10115 if (*RExC_parse == '-') {
10116 RExC_parse++;
10117 isrel = 1;
10118 }
1f1031fe
YO
10119 if (hasbrace && !isDIGIT(*RExC_parse)) {
10120 if (isrel) RExC_parse--;
10121 RExC_parse -= 2;
10122 goto parse_named_seq;
10123 } }
c74340f9 10124 num = atoi(RExC_parse);
b72d83b2
RGS
10125 if (isg && num == 0)
10126 vFAIL("Reference to invalid group 0");
c74340f9 10127 if (isrel) {
5624f11d 10128 num = RExC_npar - num;
c74340f9
YO
10129 if (num < 1)
10130 vFAIL("Reference to nonexistent or unclosed group");
10131 }
2bf803e2 10132 if (!isg && num > 9 && num >= RExC_npar)
639c2774 10133 /* Probably a character specified in octal, e.g. \35 */
a0d0e21e
LW
10134 goto defchar;
10135 else {
3dab1dad 10136 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
10137 while (isDIGIT(*RExC_parse))
10138 RExC_parse++;
1f1031fe
YO
10139 if (parse_start == RExC_parse - 1)
10140 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
10141 if (hasbrace) {
10142 if (*RExC_parse != '}')
10143 vFAIL("Unterminated \\g{...} pattern");
10144 RExC_parse++;
10145 }
c74340f9
YO
10146 if (!SIZE_ONLY) {
10147 if (num > (I32)RExC_rx->nparens)
10148 vFAIL("Reference to nonexistent group");
c74340f9 10149 }
830247a4 10150 RExC_sawback = 1;
eb160463 10151 ret = reganode(pRExC_state,
4444fd9f
KW
10152 ((! FOLD)
10153 ? REF
2f7f8cb1
KW
10154 : (MORE_ASCII_RESTRICTED)
10155 ? REFFA
10156 : (AT_LEAST_UNI_SEMANTICS)
10157 ? REFFU
10158 : (LOC)
10159 ? REFFL
10160 : REFF),
4444fd9f 10161 num);
a0d0e21e 10162 *flagp |= HASWIDTH;
2af232bd 10163
fac92740 10164 /* override incorrect value set in reganode MJD */
2af232bd 10165 Set_Node_Offset(ret, parse_start+1);
fac92740 10166 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
10167 RExC_parse--;
10168 nextchar(pRExC_state);
a0d0e21e
LW
10169 }
10170 }
10171 break;
10172 case '\0':
830247a4 10173 if (RExC_parse >= RExC_end)
b45f050a 10174 FAIL("Trailing \\");
a0d0e21e
LW
10175 /* FALL THROUGH */
10176 default:
a0288114 10177 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 10178 back into the quick-grab loop below */
45948336 10179 parse_start--;
a0d0e21e
LW
10180 goto defchar;
10181 }
10182 break;
4633a7c4
LW
10183
10184 case '#':
bbe252da 10185 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 10186 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
10187 goto tryagain;
10188 }
10189 /* FALL THROUGH */
10190
f9a79580 10191 default:
561784a5
KW
10192
10193 parse_start = RExC_parse - 1;
10194
10195 RExC_parse++;
10196
10197 defchar: {
ba210ebe 10198 register STRLEN len;
58ae7d3f 10199 register UV ender;
a0d0e21e 10200 register char *p;
3dab1dad 10201 char *s;
80aecb99 10202 STRLEN foldlen;
89ebb4a3 10203 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
d82f9944 10204 U8 node_type;
f06dbbb7 10205
bb914485
KW
10206 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
10207 * it is folded to 'ss' even if not utf8 */
10208 bool is_exactfu_sharp_s;
10209
58ae7d3f 10210 ender = 0;
693fefec
KW
10211 if (! FOLD) {
10212 node_type = EXACT;
10213 }
10214 else {
10215 node_type = get_regex_charset(RExC_flags);
10216 if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) {
10217 node_type--; /* /a is same as /u, and map /aa's offset to
10218 what /a's would have been, so there is no
10219 hole */
10220 }
10221 node_type += EXACTF;
10222 }
d82f9944 10223 ret = reg_node(pRExC_state, node_type);
cd439c50 10224 s = STRING(ret);
3f410cf6
KW
10225
10226 /* XXX The node can hold up to 255 bytes, yet this only goes to
10227 * 127. I (khw) do not know why. Keeping it somewhat less than
10228 * 255 allows us to not have to worry about overflow due to
10229 * converting to utf8 and fold expansion, but that value is
10230 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10231 * split up by this limit into a single one using the real max of
10232 * 255. Even at 127, this breaks under rare circumstances. If
10233 * folding, we do not want to split a node at a character that is a
10234 * non-final in a multi-char fold, as an input string could just
10235 * happen to want to match across the node boundary. The join
10236 * would solve that problem if the join actually happens. But a
10237 * series of more than two nodes in a row each of 127 would cause
10238 * the first join to succeed to get to 254, but then there wouldn't
10239 * be room for the next one, which could at be one of those split
10240 * multi-char folds. I don't know of any fool-proof solution. One
10241 * could back off to end with only a code point that isn't such a
10242 * non-final, but it is possible for there not to be any in the
10243 * entire node. */
830247a4 10244 for (len = 0, p = RExC_parse - 1;
3f410cf6
KW
10245 len < 127 && p < RExC_end;
10246 len++)
a0d0e21e 10247 {
3dab1dad 10248 char * const oldp = p;
5b5a24f7 10249
bbe252da 10250 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 10251 p = regwhite( pRExC_state, p );
f9a79580 10252 switch ((U8)*p) {
a0d0e21e
LW
10253 case '^':
10254 case '$':
10255 case '.':
10256 case '[':
10257 case '(':
10258 case ')':
10259 case '|':
10260 goto loopdone;
10261 case '\\':
ee9b8eae
YO
10262 /* Literal Escapes Switch
10263
10264 This switch is meant to handle escape sequences that
10265 resolve to a literal character.
10266
10267 Every escape sequence that represents something
10268 else, like an assertion or a char class, is handled
10269 in the switch marked 'Special Escapes' above in this
10270 routine, but also has an entry here as anything that
10271 isn't explicitly mentioned here will be treated as
10272 an unescaped equivalent literal.
10273 */
10274
a0a388a1 10275 switch ((U8)*++p) {
ee9b8eae
YO
10276 /* These are all the special escapes. */
10277 case 'A': /* Start assertion */
10278 case 'b': case 'B': /* Word-boundary assertion*/
10279 case 'C': /* Single char !DANGEROUS! */
10280 case 'd': case 'D': /* digit class */
10281 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 10282 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
10283 case 'k': case 'K': /* named backref, keep marker */
10284 case 'N': /* named char sequence */
38a44b82 10285 case 'p': case 'P': /* Unicode property */
e1d1eefb 10286 case 'R': /* LNBREAK */
ee9b8eae 10287 case 's': case 'S': /* space class */
e1d1eefb 10288 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
10289 case 'w': case 'W': /* word class */
10290 case 'X': /* eXtended Unicode "combining character sequence" */
10291 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
10292 --p;
10293 goto loopdone;
ee9b8eae
YO
10294
10295 /* Anything after here is an escape that resolves to a
10296 literal. (Except digits, which may or may not)
10297 */
a0d0e21e
LW
10298 case 'n':
10299 ender = '\n';
10300 p++;
a687059c 10301 break;
a0d0e21e
LW
10302 case 'r':
10303 ender = '\r';
10304 p++;
a687059c 10305 break;
a0d0e21e
LW
10306 case 't':
10307 ender = '\t';
10308 p++;
a687059c 10309 break;
a0d0e21e
LW
10310 case 'f':
10311 ender = '\f';
10312 p++;
a687059c 10313 break;
a0d0e21e 10314 case 'e':
c7f1f016 10315 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 10316 p++;
a687059c 10317 break;
a0d0e21e 10318 case 'a':
c7f1f016 10319 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 10320 p++;
a687059c 10321 break;
f0a2b745
KW
10322 case 'o':
10323 {
10324 STRLEN brace_len = len;
00c0cb6d 10325 UV result;
454155d9
KW
10326 const char* error_msg;
10327
10328 bool valid = grok_bslash_o(p,
10329 &result,
10330 &brace_len,
10331 &error_msg,
10332 1);
10333 p += brace_len;
10334 if (! valid) {
10335 RExC_parse = p; /* going to die anyway; point
10336 to exact spot of failure */
f0a2b745
KW
10337 vFAIL(error_msg);
10338 }
00c0cb6d
DG
10339 else
10340 {
10341 ender = result;
10342 }
f0a2b745
KW
10343 if (PL_encoding && ender < 0x100) {
10344 goto recode_encoding;
10345 }
10346 if (ender > 0xff) {
62fed28b 10347 REQUIRE_UTF8;
f0a2b745
KW
10348 }
10349 break;
10350 }
a0d0e21e 10351 case 'x':
a0481293
KW
10352 {
10353 STRLEN brace_len = len;
10354 UV result;
10355 const char* error_msg;
686b73d4 10356
a0481293
KW
10357 bool valid = grok_bslash_x(p,
10358 &result,
10359 &brace_len,
10360 &error_msg,
10361 1);
10362 p += brace_len;
10363 if (! valid) {
10364 RExC_parse = p; /* going to die anyway; point
10365 to exact spot of failure */
10366 vFAIL(error_msg);
b45f050a 10367 }
de5f0749 10368 else {
a0481293 10369 ender = result;
a0ed51b3 10370 }
a0481293
KW
10371 if (PL_encoding && ender < 0x100) {
10372 goto recode_encoding;
10373 }
10374 if (ender > 0xff) {
10375 REQUIRE_UTF8;
10376 }
10377 break;
a0ed51b3 10378 }
a0d0e21e
LW
10379 case 'c':
10380 p++;
17a3df4c 10381 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
a687059c 10382 break;
a0d0e21e 10383 case '0': case '1': case '2': case '3':case '4':
726ee55d 10384 case '5': case '6': case '7':
a0d0e21e 10385 if (*p == '0' ||
ca67da41 10386 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
10387 {
10388 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 10389 STRLEN numlen = 3;
53305cf1 10390 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 10391 if (ender > 0xff) {
62fed28b 10392 REQUIRE_UTF8;
609122bd 10393 }
a0d0e21e
LW
10394 p += numlen;
10395 }
10396 else {
10397 --p;
10398 goto loopdone;
a687059c 10399 }
9e08bc66
TS
10400 if (PL_encoding && ender < 0x100)
10401 goto recode_encoding;
10402 break;
10403 recode_encoding:
e2a7e165 10404 if (! RExC_override_recoding) {
9e08bc66
TS
10405 SV* enc = PL_encoding;
10406 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
10407 if (!enc && SIZE_ONLY)
10408 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 10409 REQUIRE_UTF8;
9e08bc66 10410 }
a687059c 10411 break;
a0d0e21e 10412 case '\0':
830247a4 10413 if (p >= RExC_end)
b45f050a 10414 FAIL("Trailing \\");
a687059c 10415 /* FALL THROUGH */
a0d0e21e 10416 default:
726ee55d 10417 if (!SIZE_ONLY&& isALNUMC(*p)) {
2a53d331 10418 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
216bfc0a 10419 }
a0ed51b3 10420 goto normal_default;
a0d0e21e
LW
10421 }
10422 break;
2a53d331
KW
10423 case '{':
10424 /* Currently we don't warn when the lbrace is at the start
10425 * of a construct. This catches it in the middle of a
10426 * literal string, or when its the first thing after
10427 * something like "\b" */
10428 if (! SIZE_ONLY
10429 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10430 {
10431 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10432 }
10433 /*FALLTHROUGH*/
a687059c 10434 default:
a0ed51b3 10435 normal_default:
fd400ab9 10436 if (UTF8_IS_START(*p) && UTF) {
1df70142 10437 STRLEN numlen;
5e12f4fb 10438 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 10439 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
10440 p += numlen;
10441 }
10442 else
5b67c30a 10443 ender = (U8) *p++;
a0d0e21e 10444 break;
7e2509c1
KW
10445 } /* End of switch on the literal */
10446
bb914485
KW
10447 is_exactfu_sharp_s = (node_type == EXACTFU
10448 && ender == LATIN_SMALL_LETTER_SHARP_S);
bcdf7404
YO
10449 if ( RExC_flags & RXf_PMf_EXTENDED)
10450 p = regwhite( pRExC_state, p );
bb914485 10451 if ((UTF && FOLD) || is_exactfu_sharp_s) {
17580e7a
KW
10452 /* Prime the casefolded buffer. Locale rules, which apply
10453 * only to code points < 256, aren't known until execution,
10454 * so for them, just output the original character using
a0c4c608
KW
10455 * utf8. If we start to fold non-UTF patterns, be sure to
10456 * update join_exact() */
17580e7a
KW
10457 if (LOC && ender < 256) {
10458 if (UNI_IS_INVARIANT(ender)) {
10459 *tmpbuf = (U8) ender;
10460 foldlen = 1;
10461 } else {
10462 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10463 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10464 foldlen = 2;
10465 }
10466 }
10467 else if (isASCII(ender)) { /* Note: Here can't also be LOC
10468 */
2f7f8cb1 10469 ender = toLOWER(ender);
cd64649c 10470 *tmpbuf = (U8) ender;
2f7f8cb1
KW
10471 foldlen = 1;
10472 }
17580e7a
KW
10473 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10474
10475 /* Locale and /aa require more selectivity about the
10476 * fold, so are handled below. Otherwise, here, just
10477 * use the fold */
2f7f8cb1
KW
10478 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10479 }
10480 else {
17580e7a
KW
10481 /* Under locale rules or /aa we are not to mix,
10482 * respectively, ords < 256 or ASCII with non-. So
10483 * reject folds that mix them, using only the
10484 * non-folded code point. So do the fold to a
10485 * temporary, and inspect each character in it. */
2f7f8cb1
KW
10486 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10487 U8* s = trialbuf;
10488 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10489 U8* e = s + foldlen;
10490 bool fold_ok = TRUE;
10491
10492 while (s < e) {
17580e7a
KW
10493 if (isASCII(*s)
10494 || (LOC && (UTF8_IS_INVARIANT(*s)
10495 || UTF8_IS_DOWNGRADEABLE_START(*s))))
10496 {
2f7f8cb1
KW
10497 fold_ok = FALSE;
10498 break;
10499 }
10500 s += UTF8SKIP(s);
10501 }
10502 if (fold_ok) {
10503 Copy(trialbuf, tmpbuf, foldlen, U8);
10504 ender = tmpender;
10505 }
10506 else {
10507 uvuni_to_utf8(tmpbuf, ender);
10508 foldlen = UNISKIP(ender);
10509 }
10510 }
60a8b682 10511 }
bcdf7404 10512 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
10513 if (len)
10514 p = oldp;
bb914485 10515 else if (UTF || is_exactfu_sharp_s) {
80aecb99 10516 if (FOLD) {
60a8b682 10517 /* Emit all the Unicode characters. */
1df70142 10518 STRLEN numlen;
80aecb99
JH
10519 for (foldbuf = tmpbuf;
10520 foldlen;
10521 foldlen -= numlen) {
4b88fb76
KW
10522
10523 /* tmpbuf has been constructed by us, so we
10524 * know it is valid utf8 */
10525 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 10526 if (numlen > 0) {
71207a34 10527 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
10528 s += unilen;
10529 len += unilen;
10530 /* In EBCDIC the numlen
10531 * and unilen can differ. */
9dc45d57 10532 foldbuf += numlen;
47654450
JH
10533 if (numlen >= foldlen)
10534 break;
9dc45d57
JH
10535 }
10536 else
10537 break; /* "Can't happen." */
80aecb99
JH
10538 }
10539 }
10540 else {
71207a34 10541 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 10542 if (unilen > 0) {
0ebc6274
JH
10543 s += unilen;
10544 len += unilen;
9dc45d57 10545 }
80aecb99 10546 }
a0ed51b3 10547 }
a0d0e21e
LW
10548 else {
10549 len++;
eb160463 10550 REGC((char)ender, s++);
a0d0e21e
LW
10551 }
10552 break;
a687059c 10553 }
bb914485 10554 if (UTF || is_exactfu_sharp_s) {
80aecb99 10555 if (FOLD) {
60a8b682 10556 /* Emit all the Unicode characters. */
1df70142 10557 STRLEN numlen;
80aecb99
JH
10558 for (foldbuf = tmpbuf;
10559 foldlen;
10560 foldlen -= numlen) {
4b88fb76 10561 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 10562 if (numlen > 0) {
71207a34 10563 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
10564 len += unilen;
10565 s += unilen;
10566 /* In EBCDIC the numlen
10567 * and unilen can differ. */
9dc45d57 10568 foldbuf += numlen;
47654450
JH
10569 if (numlen >= foldlen)
10570 break;
9dc45d57
JH
10571 }
10572 else
10573 break;
80aecb99
JH
10574 }
10575 }
10576 else {
71207a34 10577 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 10578 if (unilen > 0) {
0ebc6274
JH
10579 s += unilen;
10580 len += unilen;
9dc45d57 10581 }
80aecb99
JH
10582 }
10583 len--;
a0ed51b3 10584 }
d669c36c 10585 else {
eb160463 10586 REGC((char)ender, s++);
d669c36c 10587 }
a0d0e21e 10588 }
7e2509c1
KW
10589 loopdone: /* Jumped to when encounters something that shouldn't be in
10590 the node */
830247a4 10591 RExC_parse = p - 1;
fac92740 10592 Set_Node_Cur_Length(ret); /* MJD */
830247a4 10593 nextchar(pRExC_state);
793db0cb
JH
10594 {
10595 /* len is STRLEN which is unsigned, need to copy to signed */
10596 IV iv = len;
10597 if (iv < 0)
10598 vFAIL("Internal disaster");
10599 }
a0d0e21e
LW
10600 if (len > 0)
10601 *flagp |= HASWIDTH;
090f7165 10602 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 10603 *flagp |= SIMPLE;
686b73d4 10604
cd439c50 10605 if (SIZE_ONLY)
830247a4 10606 RExC_size += STR_SZ(len);
3dab1dad
YO
10607 else {
10608 STR_LEN(ret) = len;
830247a4 10609 RExC_emit += STR_SZ(len);
07be1b83 10610 }
3dab1dad 10611 }
a0d0e21e
LW
10612 break;
10613 }
a687059c 10614
a0d0e21e 10615 return(ret);
a687059c
LW
10616}
10617
873ef191 10618STATIC char *
bcdf7404 10619S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 10620{
bcdf7404 10621 const char *e = RExC_end;
7918f24d
NC
10622
10623 PERL_ARGS_ASSERT_REGWHITE;
10624
5b5a24f7
CS
10625 while (p < e) {
10626 if (isSPACE(*p))
10627 ++p;
10628 else if (*p == '#') {
bcdf7404 10629 bool ended = 0;
5b5a24f7 10630 do {
bcdf7404
YO
10631 if (*p++ == '\n') {
10632 ended = 1;
10633 break;
10634 }
10635 } while (p < e);
10636 if (!ended)
10637 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
10638 }
10639 else
10640 break;
10641 }
10642 return p;
10643}
10644
b8c5462f
JH
10645/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10646 Character classes ([:foo:]) can also be negated ([:^foo:]).
10647 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10648 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 10649 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
10650
10651#define POSIXCC_DONE(c) ((c) == ':')
10652#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10653#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10654
b8c5462f 10655STATIC I32
830247a4 10656S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 10657{
97aff369 10658 dVAR;
936ed897 10659 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 10660
7918f24d
NC
10661 PERL_ARGS_ASSERT_REGPPOSIXCC;
10662
830247a4 10663 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 10664 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 10665 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 10666 const char c = UCHARAT(RExC_parse);
097eb12c 10667 char* const s = RExC_parse++;
686b73d4 10668
9a86a77b 10669 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
10670 RExC_parse++;
10671 if (RExC_parse == RExC_end)
620e46c5 10672 /* Grandfather lone [:, [=, [. */
830247a4 10673 RExC_parse = s;
620e46c5 10674 else {
3dab1dad 10675 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
10676 assert(*t == c);
10677
9a86a77b 10678 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 10679 const char *posixcc = s + 1;
830247a4 10680 RExC_parse++; /* skip over the ending ] */
3dab1dad 10681
b8c5462f 10682 if (*s == ':') {
1df70142
AL
10683 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10684 const I32 skip = t - posixcc;
80916619
NC
10685
10686 /* Initially switch on the length of the name. */
10687 switch (skip) {
10688 case 4:
3dab1dad
YO
10689 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10690 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 10691 break;
80916619
NC
10692 case 5:
10693 /* Names all of length 5. */
10694 /* alnum alpha ascii blank cntrl digit graph lower
10695 print punct space upper */
10696 /* Offset 4 gives the best switch position. */
10697 switch (posixcc[4]) {
10698 case 'a':
3dab1dad
YO
10699 if (memEQ(posixcc, "alph", 4)) /* alpha */
10700 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
10701 break;
10702 case 'e':
3dab1dad
YO
10703 if (memEQ(posixcc, "spac", 4)) /* space */
10704 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
10705 break;
10706 case 'h':
3dab1dad
YO
10707 if (memEQ(posixcc, "grap", 4)) /* graph */
10708 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
10709 break;
10710 case 'i':
3dab1dad
YO
10711 if (memEQ(posixcc, "asci", 4)) /* ascii */
10712 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
10713 break;
10714 case 'k':
3dab1dad
YO
10715 if (memEQ(posixcc, "blan", 4)) /* blank */
10716 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
10717 break;
10718 case 'l':
3dab1dad
YO
10719 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10720 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
10721 break;
10722 case 'm':
3dab1dad
YO
10723 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10724 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
10725 break;
10726 case 'r':
3dab1dad
YO
10727 if (memEQ(posixcc, "lowe", 4)) /* lower */
10728 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10729 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10730 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
10731 break;
10732 case 't':
3dab1dad
YO
10733 if (memEQ(posixcc, "digi", 4)) /* digit */
10734 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10735 else if (memEQ(posixcc, "prin", 4)) /* print */
10736 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10737 else if (memEQ(posixcc, "punc", 4)) /* punct */
10738 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 10739 break;
b8c5462f
JH
10740 }
10741 break;
80916619 10742 case 6:
3dab1dad
YO
10743 if (memEQ(posixcc, "xdigit", 6))
10744 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
10745 break;
10746 }
80916619
NC
10747
10748 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
10749 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10750 t - s - 1, s + 1);
80916619
NC
10751 assert (posixcc[skip] == ':');
10752 assert (posixcc[skip+1] == ']');
b45f050a 10753 } else if (!SIZE_ONLY) {
b8c5462f 10754 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 10755
830247a4 10756 /* adjust RExC_parse so the warning shows after
b45f050a 10757 the class closes */
9a86a77b 10758 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 10759 RExC_parse++;
b45f050a
JF
10760 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10761 }
b8c5462f
JH
10762 } else {
10763 /* Maternal grandfather:
10764 * "[:" ending in ":" but not in ":]" */
830247a4 10765 RExC_parse = s;
767d463e 10766 }
620e46c5
JH
10767 }
10768 }
10769
b8c5462f
JH
10770 return namedclass;
10771}
10772
10773STATIC void
830247a4 10774S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 10775{
97aff369 10776 dVAR;
7918f24d
NC
10777
10778 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10779
3dab1dad 10780 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
10781 const char *s = RExC_parse;
10782 const char c = *s++;
b8c5462f 10783
3dab1dad 10784 while (isALNUM(*s))
b8c5462f
JH
10785 s++;
10786 if (*s && c == *s && s[1] == ']') {
668c081a
NC
10787 ckWARN3reg(s+2,
10788 "POSIX syntax [%c %c] belongs inside character classes",
10789 c, c);
b45f050a
JF
10790
10791 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 10792 if (POSIXCC_NOTYET(c)) {
830247a4 10793 /* adjust RExC_parse so the error shows after
b45f050a 10794 the class closes */
9a86a77b 10795 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 10796 NOOP;
b45f050a
JF
10797 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10798 }
b8c5462f
JH
10799 }
10800 }
620e46c5
JH
10801}
10802
ea317ccb
KW
10803/* Generate the code to add a full posix character <class> to the bracketed
10804 * character class given by <node>. (<node> is needed only under locale rules)
10805 * destlist is the inversion list for non-locale rules that this class is
10806 * to be added to
10807 * sourcelist is the ASCII-range inversion list to add under /a rules
10808 * Xsourcelist is the full Unicode range list to use otherwise. */
10809#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10810 if (LOC) { \
10811 SV* scratch_list = NULL; \
10812 \
10813 /* Set this class in the node for runtime matching */ \
10814 ANYOF_CLASS_SET(node, class); \
10815 \
10816 /* For above Latin1 code points, we use the full Unicode range */ \
10817 _invlist_intersection(PL_AboveLatin1, \
10818 Xsourcelist, \
10819 &scratch_list); \
10820 /* And set the output to it, adding instead if there already is an \
10821 * output. Checking if <destlist> is NULL first saves an extra \
10822 * clone. Its reference count will be decremented at the next \
10823 * union, etc, or if this is the only instance, at the end of the \
10824 * routine */ \
10825 if (! destlist) { \
10826 destlist = scratch_list; \
10827 } \
10828 else { \
10829 _invlist_union(destlist, scratch_list, &destlist); \
10830 SvREFCNT_dec(scratch_list); \
10831 } \
10832 } \
10833 else { \
10834 /* For non-locale, just add it to any existing list */ \
10835 _invlist_union(destlist, \
10836 (AT_LEAST_ASCII_RESTRICTED) \
10837 ? sourcelist \
10838 : Xsourcelist, \
10839 &destlist); \
10840 }
10841
10842/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10843 */
10844#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10845 if (LOC) { \
10846 SV* scratch_list = NULL; \
10847 ANYOF_CLASS_SET(node, class); \
10848 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10849 if (! destlist) { \
10850 destlist = scratch_list; \
10851 } \
10852 else { \
10853 _invlist_union(destlist, scratch_list, &destlist); \
10854 SvREFCNT_dec(scratch_list); \
10855 } \
10856 } \
10857 else { \
10858 _invlist_union_complement_2nd(destlist, \
10859 (AT_LEAST_ASCII_RESTRICTED) \
10860 ? sourcelist \
10861 : Xsourcelist, \
10862 &destlist); \
10863 /* Under /d, everything in the upper half of the Latin1 range \
10864 * matches this complement */ \
10865 if (DEPENDS_SEMANTICS) { \
10866 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10867 } \
10868 }
10869
10870/* Generate the code to add a posix character <class> to the bracketed
10871 * character class given by <node>. (<node> is needed only under locale rules)
10872 * destlist is the inversion list for non-locale rules that this class is
10873 * to be added to
10874 * sourcelist is the ASCII-range inversion list to add under /a rules
10875 * l1_sourcelist is the Latin1 range list to use otherwise.
10876 * Xpropertyname is the name to add to <run_time_list> of the property to
10877 * specify the code points above Latin1 that will have to be
10878 * determined at run-time
10879 * run_time_list is a SV* that contains text names of properties that are to
10880 * be computed at run time. This concatenates <Xpropertyname>
10881 * to it, apppropriately
10882 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10883 * time */
10884#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10885 l1_sourcelist, Xpropertyname, run_time_list) \
4f6289a3
RB
10886 /* First, resolve whether to use the ASCII-only list or the L1 \
10887 * list */ \
10888 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10889 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10890 Xpropertyname, run_time_list)
10891
10892#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10893 Xpropertyname, run_time_list) \
ea317ccb
KW
10894 /* If not /a matching, there are going to be code points we will have \
10895 * to defer to runtime to look-up */ \
10896 if (! AT_LEAST_ASCII_RESTRICTED) { \
10897 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10898 } \
10899 if (LOC) { \
10900 ANYOF_CLASS_SET(node, class); \
10901 } \
10902 else { \
4f6289a3 10903 _invlist_union(destlist, sourcelist, &destlist); \
ea317ccb
KW
10904 }
10905
10906/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10907 * this and DO_N_POSIX */
10908#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10909 l1_sourcelist, Xpropertyname, run_time_list) \
10910 if (AT_LEAST_ASCII_RESTRICTED) { \
10911 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10912 } \
10913 else { \
10914 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10915 if (LOC) { \
10916 ANYOF_CLASS_SET(node, namedclass); \
10917 } \
10918 else { \
10919 SV* scratch_list = NULL; \
10920 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10921 if (! destlist) { \
10922 destlist = scratch_list; \
10923 } \
10924 else { \
10925 _invlist_union(destlist, scratch_list, &destlist); \
10926 SvREFCNT_dec(scratch_list); \
10927 } \
10928 if (DEPENDS_SEMANTICS) { \
10929 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10930 } \
10931 } \
10932 }
a12cf05f 10933
c8453963
KW
10934STATIC void
10935S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10936{
10937 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10938 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10939 * the multi-character folds of characters in the node */
10940 SV *sv;
10941
10942 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10943
10944 if (! *alternate_ptr) {
10945 *alternate_ptr = newAV();
10946 }
10947 sv = newSVpvn_utf8((char*)string, len, TRUE);
10948 av_push(*alternate_ptr, sv);
10949 return;
10950}
10951
7f6f358c
YO
10952/*
10953 parse a class specification and produce either an ANYOF node that
ddad5e0b 10954 matches the pattern or perhaps will be optimized into an EXACTish node
679d1424
KW
10955 instead. The node contains a bit map for the first 256 characters, with the
10956 corresponding bit set if that character is in the list. For characters
10957 above 255, a range list is used */
89836f1f 10958
76e3520e 10959STATIC regnode *
3dab1dad 10960S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 10961{
97aff369 10962 dVAR;
9a86a77b 10963 register UV nextvalue;
3568d838 10964 register IV prevvalue = OOB_UNICODE;
ffc61ed2 10965 register IV range = 0;
e1d1eefb 10966 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 10967 register regnode *ret;
ba210ebe 10968 STRLEN numlen;
ffc61ed2 10969 IV namedclass;
cbbf8932 10970 char *rangebegin = NULL;
936ed897 10971 bool need_class = 0;
827f5bb8 10972 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
c445ea15 10973 SV *listsv = NULL;
137165a6
KW
10974 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10975 than just initialized. */
dc415b4a
KW
10976 SV* properties = NULL; /* Code points that match \p{} \P{} */
10977 UV element_count = 0; /* Number of distinct elements in the class.
10978 Optimizations may be possible if this is tiny */
ffc61ed2 10979 UV n;
53742956 10980
3172e3fd
KW
10981 /* Certain named classes have equivalents that can appear outside a
10982 * character class, e.g. \w. These flags are set for these classes. The
10983 * first flag indicates the op depends on the character set modifier, like
10984 * /d, /u.... The second is for those that don't have this dependency. */
10985 bool has_special_charset_op = FALSE;
10986 bool has_special_non_charset_op = FALSE;
10987
dc415b4a 10988 /* Unicode properties are stored in a swash; this holds the current one
88d45d28
KW
10989 * being parsed. If this swash is the only above-latin1 component of the
10990 * character class, an optimization is to pass it directly on to the
10991 * execution engine. Otherwise, it is set to NULL to indicate that there
10992 * are other things in the class that have to be dealt with at execution
10993 * time */
dc415b4a
KW
10994 SV* swash = NULL; /* Code points that match \p{} \P{} */
10995
10996 /* Set if a component of this character class is user-defined; just passed
10997 * on to the engine */
10998 UV has_user_defined_property = 0;
10999
68823f48
KW
11000 /* inversion list of code points this node matches only when the target
11001 * string is in UTF-8. (Because is under /d) */
11002 SV* depends_list = NULL;
11003
cfbb2758
KW
11004 /* inversion list of code points this node matches. For much of the
11005 * function, it includes only those that match regardless of the utf8ness
11006 * of the target string */
11007 SV* cp_list = NULL;
11008
53742956 11009 /* List of multi-character folds that are matched by this node */
cbbf8932 11010 AV* unicode_alternate = NULL;
1b2d223b 11011#ifdef EBCDIC
8f850557
KW
11012 /* In a range, counts how many 0-2 of the ends of it came from literals,
11013 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
1b2d223b
JH
11014 UV literal_endpoint = 0;
11015#endif
ffc130aa 11016 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 11017
3dab1dad 11018 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 11019 case we need to change the emitted regop to an EXACT. */
07be1b83 11020 const char * orig_parse = RExC_parse;
72f13be8 11021 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11022
11023 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
11024#ifndef DEBUGGING
11025 PERL_UNUSED_ARG(depth);
11026#endif
72f13be8 11027
3dab1dad 11028 DEBUG_PARSE("clas");
7f6f358c
YO
11029
11030 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
11031 ret = reganode(pRExC_state, ANYOF, 0);
11032
56ca34ca
KW
11033
11034 if (!SIZE_ONLY) {
ffc61ed2 11035 ANYOF_FLAGS(ret) = 0;
56ca34ca 11036 }
ffc61ed2 11037
9a86a77b 11038 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
11039 RExC_naughty++;
11040 RExC_parse++;
11041 if (!SIZE_ONLY)
11042 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
827f5bb8
KW
11043
11044 /* We have decided to not allow multi-char folds in inverted character
ac455f4c
KW
11045 * classes, due to the confusion that can happen, especially with
11046 * classes that are designed for a non-Unicode world: You have the
11047 * peculiar case that:
827f5bb8
KW
11048 "s s" =~ /^[^\xDF]+$/i => Y
11049 "ss" =~ /^[^\xDF]+$/i => N
11050 *
11051 * See [perl #89750] */
11052 allow_full_fold = FALSE;
ffc61ed2 11053 }
a0d0e21e 11054
73060fc4 11055 if (SIZE_ONLY) {
830247a4 11056 RExC_size += ANYOF_SKIP;
73060fc4
JH
11057 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11058 }
936ed897 11059 else {
830247a4 11060 RExC_emit += ANYOF_SKIP;
3a15e693 11061 if (LOC) {
936ed897 11062 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693 11063 }
ffc61ed2 11064 ANYOF_BITMAP_ZERO(ret);
396482e1 11065 listsv = newSVpvs("# comment\n");
137165a6 11066 initial_listsv_len = SvCUR(listsv);
a0d0e21e 11067 }
b8c5462f 11068
9a86a77b
JH
11069 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11070
b938889d 11071 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 11072 checkposixcc(pRExC_state);
b8c5462f 11073
f064b6ad
HS
11074 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11075 if (UCHARAT(RExC_parse) == ']')
11076 goto charclassloop;
ffc61ed2 11077
fc8cd66c 11078parseit:
9a86a77b 11079 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
11080
11081 charclassloop:
11082
11083 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11084
dc415b4a 11085 if (!range) {
830247a4 11086 rangebegin = RExC_parse;
dc415b4a
KW
11087 element_count++;
11088 }
ffc61ed2 11089 if (UTF) {
5e12f4fb 11090 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 11091 RExC_end - RExC_parse,
9f7f3913 11092 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
11093 RExC_parse += numlen;
11094 }
11095 else
11096 value = UCHARAT(RExC_parse++);
7f6f358c 11097
9a86a77b
JH
11098 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11099 if (value == '[' && POSIXCC(nextvalue))
830247a4 11100 namedclass = regpposixcc(pRExC_state, value);
620e46c5 11101 else if (value == '\\') {
ffc61ed2 11102 if (UTF) {
5e12f4fb 11103 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 11104 RExC_end - RExC_parse,
9f7f3913 11105 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
11106 RExC_parse += numlen;
11107 }
11108 else
11109 value = UCHARAT(RExC_parse++);
470c3474 11110 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 11111 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
11112 * be a problem later if we want switch on Unicode.
11113 * A similar issue a little bit later when switching on
11114 * namedclass. --jhi */
ffc61ed2 11115 switch ((I32)value) {
b8c5462f
JH
11116 case 'w': namedclass = ANYOF_ALNUM; break;
11117 case 'W': namedclass = ANYOF_NALNUM; break;
11118 case 's': namedclass = ANYOF_SPACE; break;
11119 case 'S': namedclass = ANYOF_NSPACE; break;
11120 case 'd': namedclass = ANYOF_DIGIT; break;
11121 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
11122 case 'v': namedclass = ANYOF_VERTWS; break;
11123 case 'V': namedclass = ANYOF_NVERTWS; break;
11124 case 'h': namedclass = ANYOF_HORIZWS; break;
11125 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
11126 case 'N': /* Handle \N{NAME} in class */
11127 {
11128 /* We only pay attention to the first char of
11129 multichar strings being returned. I kinda wonder
11130 if this makes sense as it does change the behaviour
11131 from earlier versions, OTOH that behaviour was broken
11132 as well. */
11133 UV v; /* value is register so we cant & it /grrr */
9d64099b 11134 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
fc8cd66c
YO
11135 goto parseit;
11136 }
11137 value= v;
11138 }
11139 break;
ffc61ed2
JH
11140 case 'p':
11141 case 'P':
3dab1dad
YO
11142 {
11143 char *e;
af6f566e 11144 if (RExC_parse >= RExC_end)
2a4859cd 11145 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 11146 if (*RExC_parse == '{') {
1df70142 11147 const U8 c = (U8)value;
ffc61ed2
JH
11148 e = strchr(RExC_parse++, '}');
11149 if (!e)
0da60cf5 11150 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
11151 while (isSPACE(UCHARAT(RExC_parse)))
11152 RExC_parse++;
11153 if (e == RExC_parse)
0da60cf5 11154 vFAIL2("Empty \\%c{}", c);
ffc61ed2 11155 n = e - RExC_parse;
ab13f0c7
JH
11156 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11157 n--;
ffc61ed2
JH
11158 }
11159 else {
11160 e = RExC_parse;
11161 n = 1;
11162 }
ee410026 11163 if (!SIZE_ONLY) {
dc415b4a
KW
11164 SV** invlistsvp;
11165 SV* invlist;
11166 char* name;
ab13f0c7
JH
11167 if (UCHARAT(RExC_parse) == '^') {
11168 RExC_parse++;
11169 n--;
11170 value = value == 'p' ? 'P' : 'p'; /* toggle */
11171 while (isSPACE(UCHARAT(RExC_parse))) {
11172 RExC_parse++;
11173 n--;
11174 }
11175 }
dc415b4a
KW
11176 /* Try to get the definition of the property into
11177 * <invlist>. If /i is in effect, the effective property
11178 * will have its name be <__NAME_i>. The design is
11179 * discussed in commit
11180 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11181 Newx(name, n + sizeof("_i__\n"), char);
11182
11183 sprintf(name, "%s%.*s%s\n",
11184 (FOLD) ? "__" : "",
11185 (int)n,
11186 RExC_parse,
11187 (FOLD) ? "_i" : ""
11188 );
11189
11190 /* Look up the property name, and get its swash and
11191 * inversion list, if the property is found */
dc415b4a
KW
11192 if (swash) {
11193 SvREFCNT_dec(swash);
11194 }
11195 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11196 1, /* binary */
11197 0, /* not tr/// */
11198 TRUE, /* this routine will handle
11199 undefined properties */
11200 NULL, FALSE /* No inversion list */
11201 );
b6c46382 11202 if ( ! swash
dc415b4a
KW
11203 || ! SvROK(swash)
11204 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11205 || ! (invlistsvp =
11206 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11207 "INVLIST", FALSE))
11208 || ! (invlist = *invlistsvp))
11209 {
11210 if (swash) {
11211 SvREFCNT_dec(swash);
11212 swash = NULL;
11213 }
11214
11215 /* Here didn't find it. It could be a user-defined
11216 * property that will be available at run-time. Add it
11217 * to the list to look up then */
11218 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11219 (value == 'p' ? '+' : '!'),
11220 name);
11221 has_user_defined_property = 1;
11222
11223 /* We don't know yet, so have to assume that the
11224 * property could match something in the Latin1 range,
11225 * hence something that isn't utf8 */
11226 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11227 }
11228 else {
11229
11230 /* Here, did get the swash and its inversion list. If
11231 * the swash is from a user-defined property, then this
11232 * whole character class should be regarded as such */
11233 SV** user_defined_svp =
11234 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11235 "USER_DEFINED", FALSE);
11236 if (user_defined_svp) {
11237 has_user_defined_property
11238 |= SvUV(*user_defined_svp);
11239 }
11240
11241 /* Invert if asking for the complement */
11242 if (value == 'P') {
8dc9348a 11243 _invlist_union_complement_2nd(properties, invlist, &properties);
2f833f52 11244
dc415b4a
KW
11245 /* The swash can't be used as-is, because we've
11246 * inverted things; delay removing it to here after
11247 * have copied its invlist above */
11248 SvREFCNT_dec(swash);
11249 swash = NULL;
11250 }
11251 else {
112b0fc6 11252 _invlist_union(properties, invlist, &properties);
dc415b4a
KW
11253 }
11254 }
11255 Safefree(name);
ffc61ed2
JH
11256 }
11257 RExC_parse = e + 1;
f81125e2 11258 namedclass = ANYOF_MAX; /* no official name, but it's named */
e40e74fe
KW
11259
11260 /* \p means they want Unicode semantics */
11261 RExC_uni_semantics = 1;
3dab1dad 11262 }
f81125e2 11263 break;
b8c5462f
JH
11264 case 'n': value = '\n'; break;
11265 case 'r': value = '\r'; break;
11266 case 't': value = '\t'; break;
11267 case 'f': value = '\f'; break;
11268 case 'b': value = '\b'; break;
c7f1f016
NIS
11269 case 'e': value = ASCII_TO_NATIVE('\033');break;
11270 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
11271 case 'o':
11272 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
11273 {
11274 const char* error_msg;
11275 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
11276 &value,
11277 &numlen,
454155d9
KW
11278 &error_msg,
11279 SIZE_ONLY);
11280 RExC_parse += numlen;
11281 if (! valid) {
11282 vFAIL(error_msg);
11283 }
f0a2b745 11284 }
f0a2b745
KW
11285 if (PL_encoding && value < 0x100) {
11286 goto recode_encoding;
11287 }
11288 break;
b8c5462f 11289 case 'x':
a0481293
KW
11290 RExC_parse--; /* function expects to be pointed at the 'x' */
11291 {
11292 const char* error_msg;
11293 bool valid = grok_bslash_x(RExC_parse,
11294 &value,
11295 &numlen,
11296 &error_msg,
11297 1);
ffc61ed2 11298 RExC_parse += numlen;
a0481293
KW
11299 if (! valid) {
11300 vFAIL(error_msg);
11301 }
ffc61ed2 11302 }
9e08bc66
TS
11303 if (PL_encoding && value < 0x100)
11304 goto recode_encoding;
b8c5462f
JH
11305 break;
11306 case 'c':
17a3df4c 11307 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
b8c5462f
JH
11308 break;
11309 case '0': case '1': case '2': case '3': case '4':
c99e91e9 11310 case '5': case '6': case '7':
9e08bc66 11311 {
c99e91e9
KW
11312 /* Take 1-3 octal digits */
11313 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
11314 numlen = 3;
11315 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11316 RExC_parse += numlen;
11317 if (PL_encoding && value < 0x100)
11318 goto recode_encoding;
11319 break;
11320 }
11321 recode_encoding:
e2a7e165 11322 if (! RExC_override_recoding) {
9e08bc66
TS
11323 SV* enc = PL_encoding;
11324 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
11325 if (!enc && SIZE_ONLY)
11326 ckWARNreg(RExC_parse,
11327 "Invalid escape in the specified encoding");
9e08bc66
TS
11328 break;
11329 }
1028017a 11330 default:
c99e91e9
KW
11331 /* Allow \_ to not give an error */
11332 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
11333 ckWARN2reg(RExC_parse,
11334 "Unrecognized escape \\%c in character class passed through",
11335 (int)value);
c99e91e9 11336 }
1028017a 11337 break;
b8c5462f 11338 }
ffc61ed2 11339 } /* end of \blah */
1b2d223b
JH
11340#ifdef EBCDIC
11341 else
11342 literal_endpoint++;
11343#endif
ffc61ed2
JH
11344
11345 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11346
2c63ecad
KW
11347 /* What matches in a locale is not known until runtime, so need to
11348 * (one time per class) allocate extra space to pass to regexec.
11349 * The space will contain a bit for each named class that is to be
11350 * matched against. This isn't needed for \p{} and pseudo-classes,
11351 * as they are not affected by locale, and hence are dealt with
11352 * separately */
11353 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11354 need_class = 1;
11355 if (SIZE_ONLY) {
dd58aee1 11356 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
11357 }
11358 else {
dd58aee1 11359 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
11360 ANYOF_CLASS_ZERO(ret);
11361 }
9051cfd9 11362 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 11363 }
ffc61ed2 11364
d5788240 11365 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
1d791ab2
KW
11366 * literal, as is the character that began the false range, i.e.
11367 * the 'a' in the examples */
ffc61ed2 11368 if (range) {
73b437c8 11369 if (!SIZE_ONLY) {
668c081a
NC
11370 const int w =
11371 RExC_parse >= rangebegin ?
11372 RExC_parse - rangebegin : 0;
11373 ckWARN4reg(RExC_parse,
b45f050a 11374 "False [] range \"%*.*s\"",
097eb12c 11375 w, w, rangebegin);
cfbb2758
KW
11376 cp_list = add_cp_to_invlist(cp_list, '-');
11377 cp_list = add_cp_to_invlist(cp_list, prevvalue);
b8c5462f 11378 }
ffc61ed2
JH
11379
11380 range = 0; /* this was not a true range */
3172e3fd 11381 element_count += 2; /* So counts for three values */
73b437c8 11382 }
ffc61ed2 11383
3172e3fd
KW
11384 if (SIZE_ONLY) {
11385
11386 /* In the first pass, do a little extra work so below can
11387 * possibly optimize the whole node to one of the nodes that
11388 * correspond to the classes given below */
11389
11390 /* The optimization will only take place if there is a single
11391 * element in the class, so can skip if there is more than one
11392 */
11393 if (element_count == 1) {
c49a72a9 11394
e2962f66
JH
11395 /* Possible truncation here but in some 64-bit environments
11396 * the compiler gets heartburn about switch on 64-bit values.
11397 * A similar issue a little earlier when switching on value.
98f323fa 11398 * --jhi */
3172e3fd
KW
11399 switch ((I32)namedclass) {
11400 case ANYOF_ALNUM:
11401 case ANYOF_NALNUM:
11402 case ANYOF_DIGIT:
11403 case ANYOF_NDIGIT:
11404 case ANYOF_SPACE:
11405 case ANYOF_NSPACE:
11406 has_special_charset_op = TRUE;
11407 break;
11408
11409 case ANYOF_HORIZWS:
11410 case ANYOF_NHORIZWS:
11411 case ANYOF_VERTWS:
11412 case ANYOF_NVERTWS:
11413 has_special_non_charset_op = TRUE;
11414 break;
11415 }
11416 }
11417 }
11418 else {
e2962f66 11419 switch ((I32)namedclass) {
ea317ccb
KW
11420
11421 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11422 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11423 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11424 break;
11425 case ANYOF_NALNUMC:
11426 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11427 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11428 break;
11429 case ANYOF_ALPHA:
11430 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11431 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11432 break;
11433 case ANYOF_NALPHA:
11434 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11435 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11436 break;
73b437c8 11437 case ANYOF_ASCII:
ea317ccb
KW
11438 if (LOC) {
11439 ANYOF_CLASS_SET(ret, namedclass);
73b437c8 11440 }
ea317ccb
KW
11441 else {
11442 _invlist_union(properties, PL_ASCII, &properties);
11443 }
73b437c8
JH
11444 break;
11445 case ANYOF_NASCII:
ea317ccb
KW
11446 if (LOC) {
11447 ANYOF_CLASS_SET(ret, namedclass);
73b437c8 11448 }
ea317ccb
KW
11449 else {
11450 _invlist_union_complement_2nd(properties,
11451 PL_ASCII, &properties);
11452 if (DEPENDS_SEMANTICS) {
11453 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11454 }
11455 }
11456 break;
11457 case ANYOF_BLANK:
11458 DO_POSIX(ret, namedclass, properties,
11459 PL_PosixBlank, PL_XPosixBlank);
11460 break;
11461 case ANYOF_NBLANK:
11462 DO_N_POSIX(ret, namedclass, properties,
11463 PL_PosixBlank, PL_XPosixBlank);
11464 break;
11465 case ANYOF_CNTRL:
11466 DO_POSIX(ret, namedclass, properties,
11467 PL_PosixCntrl, PL_XPosixCntrl);
11468 break;
11469 case ANYOF_NCNTRL:
11470 DO_N_POSIX(ret, namedclass, properties,
11471 PL_PosixCntrl, PL_XPosixCntrl);
11472 break;
ffc61ed2 11473 case ANYOF_DIGIT:
4f6289a3
RB
11474 /* There are no digits in the Latin1 range outside of
11475 * ASCII, so call the macro that doesn't have to resolve
11476 * them */
11477 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11478 PL_PosixDigit, "XPosixDigit", listsv);
3172e3fd 11479 has_special_charset_op = TRUE;
ea317ccb
KW
11480 break;
11481 case ANYOF_NDIGIT:
11482 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11483 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
3172e3fd 11484 has_special_charset_op = TRUE;
ea317ccb
KW
11485 break;
11486 case ANYOF_GRAPH:
11487 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11488 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11489 break;
11490 case ANYOF_NGRAPH:
11491 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11492 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11493 break;
11494 case ANYOF_HORIZWS:
cfbb2758 11495 /* For these, we use the cp_list, as /d doesn't make a
ea317ccb
KW
11496 * difference in what these match. There would be problems
11497 * if these characters had folds other than themselves, as
cfbb2758 11498 * cp_list is subject to folding. It turns out that \h
24caacbc 11499 * is just a synonym for XPosixBlank */
cfbb2758 11500 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
3172e3fd 11501 has_special_non_charset_op = TRUE;
ea317ccb
KW
11502 break;
11503 case ANYOF_NHORIZWS:
cfbb2758
KW
11504 _invlist_union_complement_2nd(cp_list,
11505 PL_XPosixBlank, &cp_list);
3172e3fd 11506 has_special_non_charset_op = TRUE;
ea317ccb
KW
11507 break;
11508 case ANYOF_LOWER:
11509 case ANYOF_NLOWER:
11510 { /* These require special handling, as they differ under
dab0c3e7
KW
11511 folding, matching Cased there (which in the ASCII range
11512 is the same as Alpha */
ea317ccb
KW
11513
11514 SV* ascii_source;
11515 SV* l1_source;
11516 const char *Xname;
11517
11518 if (FOLD && ! LOC) {
11519 ascii_source = PL_PosixAlpha;
dab0c3e7
KW
11520 l1_source = PL_L1Cased;
11521 Xname = "Cased";
ea317ccb 11522 }
ffc61ed2 11523 else {
ea317ccb
KW
11524 ascii_source = PL_PosixLower;
11525 l1_source = PL_L1PosixLower;
11526 Xname = "XPosixLower";
11527 }
11528 if (namedclass == ANYOF_LOWER) {
11529 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11530 ascii_source, l1_source, Xname, listsv);
11531 }
11532 else {
11533 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11534 properties, ascii_source, l1_source, Xname, listsv);
ffc61ed2 11535 }
ffc61ed2 11536 break;
ea317ccb
KW
11537 }
11538 case ANYOF_PRINT:
11539 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11540 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11541 break;
11542 case ANYOF_NPRINT:
11543 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11544 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11545 break;
11546 case ANYOF_PUNCT:
11547 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11548 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11549 break;
11550 case ANYOF_NPUNCT:
11551 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11552 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11553 break;
11554 case ANYOF_PSXSPC:
11555 DO_POSIX(ret, namedclass, properties,
11556 PL_PosixSpace, PL_XPosixSpace);
11557 break;
11558 case ANYOF_NPSXSPC:
11559 DO_N_POSIX(ret, namedclass, properties,
11560 PL_PosixSpace, PL_XPosixSpace);
11561 break;
11562 case ANYOF_SPACE:
11563 DO_POSIX(ret, namedclass, properties,
11564 PL_PerlSpace, PL_XPerlSpace);
3172e3fd 11565 has_special_charset_op = TRUE;
ea317ccb
KW
11566 break;
11567 case ANYOF_NSPACE:
11568 DO_N_POSIX(ret, namedclass, properties,
11569 PL_PerlSpace, PL_XPerlSpace);
3172e3fd 11570 has_special_charset_op = TRUE;
ea317ccb
KW
11571 break;
11572 case ANYOF_UPPER: /* Same as LOWER, above */
11573 case ANYOF_NUPPER:
11574 {
11575 SV* ascii_source;
11576 SV* l1_source;
11577 const char *Xname;
11578
11579 if (FOLD && ! LOC) {
11580 ascii_source = PL_PosixAlpha;
dab0c3e7
KW
11581 l1_source = PL_L1Cased;
11582 Xname = "Cased";
ea317ccb 11583 }
ffc61ed2 11584 else {
ea317ccb
KW
11585 ascii_source = PL_PosixUpper;
11586 l1_source = PL_L1PosixUpper;
11587 Xname = "XPosixUpper";
ffc61ed2 11588 }
ea317ccb
KW
11589 if (namedclass == ANYOF_UPPER) {
11590 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11591 ascii_source, l1_source, Xname, listsv);
cfaf538b 11592 }
ea317ccb
KW
11593 else {
11594 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11595 properties, ascii_source, l1_source, Xname, listsv);
11596 }
11597 break;
11598 }
11599 case ANYOF_ALNUM: /* Really is 'Word' */
11600 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11601 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
3172e3fd 11602 has_special_charset_op = TRUE;
ea317ccb
KW
11603 break;
11604 case ANYOF_NALNUM:
11605 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11606 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
3172e3fd 11607 has_special_charset_op = TRUE;
ea317ccb
KW
11608 break;
11609 case ANYOF_VERTWS:
cfbb2758 11610 /* For these, we use the cp_list, as /d doesn't make a
ea317ccb
KW
11611 * difference in what these match. There would be problems
11612 * if these characters had folds other than themselves, as
cfbb2758
KW
11613 * cp_list is subject to folding */
11614 _invlist_union(cp_list, PL_VertSpace, &cp_list);
3172e3fd 11615 has_special_non_charset_op = TRUE;
ea317ccb
KW
11616 break;
11617 case ANYOF_NVERTWS:
cfbb2758
KW
11618 _invlist_union_complement_2nd(cp_list,
11619 PL_VertSpace, &cp_list);
3172e3fd 11620 has_special_non_charset_op = TRUE;
ea317ccb
KW
11621 break;
11622 case ANYOF_XDIGIT:
11623 DO_POSIX(ret, namedclass, properties,
11624 PL_PosixXDigit, PL_XPosixXDigit);
11625 break;
11626 case ANYOF_NXDIGIT:
11627 DO_N_POSIX(ret, namedclass, properties,
11628 PL_PosixXDigit, PL_XPosixXDigit);
11629 break;
f81125e2
JP
11630 case ANYOF_MAX:
11631 /* this is to handle \p and \P */
11632 break;
73b437c8 11633 default:
b45f050a 11634 vFAIL("Invalid [::] class");
73b437c8 11635 break;
b8c5462f 11636 }
ce1c68b2 11637
73b437c8 11638 continue;
a0d0e21e 11639 }
ffc61ed2
JH
11640 } /* end of namedclass \blah */
11641
a0d0e21e 11642 if (range) {
eb160463 11643 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
11644 const int w = RExC_parse - rangebegin;
11645 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 11646 range = 0; /* not a valid range */
73b437c8 11647 }
a0d0e21e
LW
11648 }
11649 else {
3568d838 11650 prevvalue = value; /* save the beginning of the range */
646253b5
KW
11651 if (RExC_parse+1 < RExC_end
11652 && *RExC_parse == '-'
11653 && RExC_parse[1] != ']')
11654 {
830247a4 11655 RExC_parse++;
ffc61ed2
JH
11656
11657 /* a bad range like \w-, [:word:]- ? */
11658 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 11659 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 11660 const int w =
afd78fd5
JH
11661 RExC_parse >= rangebegin ?
11662 RExC_parse - rangebegin : 0;
830247a4 11663 vWARN4(RExC_parse,
b45f050a 11664 "False [] range \"%*.*s\"",
097eb12c 11665 w, w, rangebegin);
afd78fd5 11666 }
8f850557 11667 if (!SIZE_ONLY)
cfbb2758 11668 cp_list = add_cp_to_invlist(cp_list, '-');
73b437c8 11669 } else
ffc61ed2
JH
11670 range = 1; /* yeah, it's a range! */
11671 continue; /* but do it the next time */
a0d0e21e 11672 }
a687059c 11673 }
ffc61ed2 11674
046c4055
KW
11675 /* non-Latin1 code point implies unicode semantics. Must be set in
11676 * pass1 so is there for the whole of pass 2 */
56ca34ca
KW
11677 if (value > 255) {
11678 RExC_uni_semantics = 1;
11679 }
11680
93733859 11681 /* now is the next time */
ae5c130c 11682 if (!SIZE_ONLY) {
68823f48 11683#ifndef EBCDIC
cfbb2758 11684 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
68823f48
KW
11685#else
11686 UV* this_range = _new_invlist(1);
11687 _append_range_to_invlist(this_range, prevvalue, value);
11688
11689 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
11690 * If this range was specified using something like 'i-j', we want
11691 * to include only the 'i' and the 'j', and not anything in
11692 * between, so exclude non-ASCII, non-alphabetics from it.
11693 * However, if the range was specified with something like
11694 * [\x89-\x91] or [\x89-j], all code points within it should be
11695 * included. literal_endpoint==2 means both ends of the range used
11696 * a literal character, not \x{foo} */
11697 if (literal_endpoint == 2
11698 && (prevvalue >= 'a' && value <= 'z')
11699 || (prevvalue >= 'A' && value <= 'Z'))
11700 {
11701 _invlist_intersection(this_range, PL_ASCII, &this_range, );
11702 _invlist_intersection(this_range, PL_Alpha, &this_range, );
68823f48 11703 }
cfbb2758 11704 _invlist_union(cp_list, this_range, &cp_list);
68823f48 11705 literal_endpoint = 0;
1b2d223b 11706#endif
8ada0baa 11707 }
ffc61ed2
JH
11708
11709 range = 0; /* this range (if it was one) is done now */
a0d0e21e 11710 }
ffc61ed2 11711
3172e3fd 11712 /* [\w] can be optimized into \w, but not if there is anything else in the
3a64b515
KW
11713 * brackets (except for an initial '^' which indictes omplementing). We
11714 * also can optimize the common special case /[0-9]/ into /\d/a */
11715 if (element_count == 1 &&
11716 (has_special_charset_op
11717 || has_special_non_charset_op
11718 || (prevvalue == '0' && value == '9')))
11719 {
3172e3fd
KW
11720 U8 op;
11721 bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT;
11722 const char * cur_parse = RExC_parse;
11723
11724 if (has_special_charset_op) {
11725 U8 offset = get_regex_charset(RExC_flags);
11726
11727 /* /aa is the same as /a for these */
11728 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
11729 offset = REGEX_ASCII_RESTRICTED_CHARSET;
11730 }
11731 switch ((I32)namedclass) {
11732 case ANYOF_NALNUM:
11733 invert = ! invert;
11734 /* FALLTHROUGH */
11735 case ANYOF_ALNUM:
11736 op = ALNUM;
11737 break;
11738 case ANYOF_NSPACE:
11739 invert = ! invert;
11740 /* FALLTHROUGH */
11741 case ANYOF_SPACE:
11742 op = SPACE;
11743 break;
11744 case ANYOF_NDIGIT:
11745 invert = ! invert;
11746 /* FALLTHROUGH */
11747 case ANYOF_DIGIT:
11748 op = DIGIT;
11749
11750 /* There is no DIGITU */
11751 if (offset == REGEX_UNICODE_CHARSET) {
11752 offset = REGEX_DEPENDS_CHARSET;
11753 }
11754 break;
11755 default:
11756 Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
11757 }
11758
11759 /* The number of varieties of each of these is the same, hence, so
11760 * is the delta between the normal and complemented nodes */
11761 if (invert) {
11762 offset += NALNUM - ALNUM;
11763 }
11764
11765 op += offset;
11766 }
11767 else if (has_special_non_charset_op) {
11768 switch ((I32)namedclass) {
11769 case ANYOF_NHORIZWS:
11770 invert = ! invert;
11771 /* FALLTHROUGH */
11772 case ANYOF_HORIZWS:
11773 op = HORIZWS;
11774 break;
11775 case ANYOF_NVERTWS:
11776 invert = ! invert;
11777 /* FALLTHROUGH */
11778 case ANYOF_VERTWS:
11779 op = VERTWS;
11780 break;
11781 default:
11782 Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
11783 }
11784
11785 /* The complement version of each of these nodes is adjacently next
11786 * */
11787 if (invert) {
11788 op++;
11789 }
11790 }
3a64b515
KW
11791 else { /* The remaining possibility is [0-9] */
11792 op = (invert) ? NDIGITA : DIGITA;
11793 }
3172e3fd
KW
11794
11795 /* Throw away this ANYOF regnode, and emit the calculated one, which
11796 * should correspond to the beginning, not current, state of the parse
11797 */
11798 RExC_parse = (char *)orig_parse;
11799 RExC_emit = (regnode *)orig_emit;
11800 ret = reg_node(pRExC_state, op);
11801 RExC_parse = (char *) cur_parse;
11802
11803 SvREFCNT_dec(listsv);
11804 return ret;
11805 }
11806
7f6f358c
YO
11807 if (SIZE_ONLY)
11808 return ret;
11809 /****** !SIZE_ONLY AFTER HERE *********/
11810
68823f48
KW
11811 /* If folding, we calculate all characters that could fold to or from the
11812 * ones already on the list */
cfbb2758 11813 if (FOLD && cp_list) {
0d527bf8 11814 UV start, end; /* End points of code point ranges */
56ca34ca 11815
4065ba03 11816 SV* fold_intersection = NULL;
93e5bb1c 11817
cfbb2758 11818 const UV highest_index = invlist_len(cp_list) - 1;
68823f48
KW
11819
11820 /* In the Latin1 range, the characters that can be folded-to or -from
11821 * are precisely the alphabetic characters. If the highest code point
11822 * is within Latin1, we can use the compiled-in list, and not have to
11823 * go out to disk. If the last element in the array is in the
11824 * inversion list set, it starts a range that goes to infinity, so the
11825 * maximum of the inversion list is definitely above Latin1.
11826 * Otherwise, it starts a range that isn't in the set, so the max is
11827 * one less than it */
11828 if (! ELEMENT_RANGE_MATCHES_INVLIST(highest_index)
cfbb2758 11829 && invlist_array(cp_list)[highest_index] <= 256)
68823f48 11830 {
cfbb2758 11831 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
68823f48
KW
11832 }
11833 else {
11834
8f850557
KW
11835 /* This is a list of all the characters that participate in folds
11836 * (except marks, etc in multi-char folds */
11837 if (! PL_utf8_foldable) {
11838 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11839 PL_utf8_foldable = _swash_to_invlist(swash);
11840 SvREFCNT_dec(swash);
11841 }
68823f48 11842
8f850557
KW
11843 /* This is a hash that for a particular fold gives all characters
11844 * that are involved in it */
11845 if (! PL_utf8_foldclosures) {
11846
11847 /* If we were unable to find any folds, then we likely won't be
11848 * able to find the closures. So just create an empty list.
11849 * Folding will effectively be restricted to the non-Unicode
11850 * rules hard-coded into Perl. (This case happens legitimately
11851 * during compilation of Perl itself before the Unicode tables
11852 * are generated) */
11853 if (invlist_len(PL_utf8_foldable) == 0) {
11854 PL_utf8_foldclosures = newHV();
11855 }
11856 else {
11857 /* If the folds haven't been read in, call a fold function
11858 * to force that */
11859 if (! PL_utf8_tofold) {
11860 U8 dummy[UTF8_MAXBYTES+1];
11861 STRLEN dummy_len;
11862
11863 /* This particular string is above \xff in both UTF-8
11864 * and UTFEBCDIC */
11865 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11866 assert(PL_utf8_tofold); /* Verify that worked */
11867 }
11868 PL_utf8_foldclosures =
11869 _swash_inversion_hash(PL_utf8_tofold);
11870 }
11871 }
93e5bb1c 11872
8f850557
KW
11873 /* Only the characters in this class that participate in folds need
11874 * be checked. Get the intersection of this class and all the
11875 * possible characters that are foldable. This can quickly narrow
11876 * down a large class */
cfbb2758 11877 _invlist_intersection(PL_utf8_foldable, cp_list,
8f850557 11878 &fold_intersection);
68823f48 11879 }
93e5bb1c
KW
11880
11881 /* Now look at the foldable characters in this class individually */
0d527bf8
KW
11882 invlist_iterinit(fold_intersection);
11883 while (invlist_iternext(fold_intersection, &start, &end)) {
93e5bb1c
KW
11884 UV j;
11885
68823f48
KW
11886 /* Locale folding for Latin1 characters is deferred until runtime */
11887 if (LOC && start < 256) {
11888 start = 256;
11889 }
11890
93e5bb1c
KW
11891 /* Look at every character in the range */
11892 for (j = start; j <= end; j++) {
11893
93e5bb1c
KW
11894 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11895 STRLEN foldlen;
68823f48
KW
11896 UV f;
11897
11898 if (j < 256) {
11899
11900 /* We have the latin1 folding rules hard-coded here so that
11901 * an innocent-looking character class, like /[ks]/i won't
11902 * have to go out to disk to find the possible matches.
11903 * XXX It would be better to generate these via regen, in
11904 * case a new version of the Unicode standard adds new
11905 * mappings, though that is not really likely, and may be
11906 * caught by the default: case of the switch below. */
11907
11908 if (PL_fold_latin1[j] != j) {
11909
11910 /* ASCII is always matched; non-ASCII is matched only
11911 * under Unicode rules */
11912 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
cfbb2758
KW
11913 cp_list =
11914 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
68823f48
KW
11915 }
11916 else {
11917 depends_list =
11918 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
11919 }
11920 }
11921
11922 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
11923 && (! isASCII(j) || ! MORE_ASCII_RESTRICTED))
11924 {
11925 /* Certain Latin1 characters have matches outside
11926 * Latin1, or are multi-character. To get here, 'j' is
11927 * one of those characters. None of these matches is
11928 * valid for ASCII characters under /aa, which is why
11929 * the 'if' just above excludes those. The matches
11930 * fall into three categories:
11931 * 1) They are singly folded-to or -from an above 255
11932 * character, e.g., LATIN SMALL LETTER Y WITH
11933 * DIAERESIS and LATIN CAPITAL LETTER Y WITH
11934 * DIAERESIS;
11935 * 2) They are part of a multi-char fold with another
11936 * latin1 character; only LATIN SMALL LETTER
11937 * SHARP S => "ss" fits this;
11938 * 3) They are part of a multi-char fold with a
11939 * character outside of Latin1, such as various
11940 * ligatures.
11941 * We aren't dealing fully with multi-char folds, except
11942 * we do deal with the pattern containing a character
11943 * that has a multi-char fold (not so much the inverse).
11944 * For types 1) and 3), the matches only happen when the
11945 * target string is utf8; that's not true for 2), and we
11946 * set a flag for it.
11947 *
11948 * The code below adds the single fold closures for 'j'
11949 * to the inversion list. */
11950 switch (j) {
11951 case 'k':
11952 case 'K':
11953 /* KELVIN SIGN */
cfbb2758
KW
11954 cp_list =
11955 add_cp_to_invlist(cp_list, 0x212A);
68823f48
KW
11956 break;
11957 case 's':
11958 case 'S':
11959 /* LATIN SMALL LETTER LONG S */
cfbb2758
KW
11960 cp_list =
11961 add_cp_to_invlist(cp_list, 0x017F);
68823f48
KW
11962 break;
11963 case MICRO_SIGN:
cfbb2758 11964 cp_list = add_cp_to_invlist(cp_list,
68823f48 11965 GREEK_SMALL_LETTER_MU);
cfbb2758 11966 cp_list = add_cp_to_invlist(cp_list,
68823f48
KW
11967 GREEK_CAPITAL_LETTER_MU);
11968 break;
11969 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
11970 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
11971 /* ANGSTROM SIGN */
cfbb2758
KW
11972 cp_list =
11973 add_cp_to_invlist(cp_list, 0x212B);
68823f48
KW
11974 break;
11975 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
cfbb2758 11976 cp_list = add_cp_to_invlist(cp_list,
68823f48
KW
11977 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
11978 break;
11979 case LATIN_SMALL_LETTER_SHARP_S:
cfbb2758 11980 cp_list = add_cp_to_invlist(cp_list,
68823f48
KW
11981 LATIN_CAPITAL_LETTER_SHARP_S);
11982
11983 /* Under /a, /d, and /u, this can match the two
11984 * chars "ss" */
11985 if (! MORE_ASCII_RESTRICTED) {
11986 add_alternate(&unicode_alternate,
11987 (U8 *) "ss", 2);
11988
11989 /* And under /u or /a, it can match even if
11990 * the target is not utf8 */
11991 if (AT_LEAST_UNI_SEMANTICS) {
11992 ANYOF_FLAGS(ret) |=
11993 ANYOF_NONBITMAP_NON_UTF8;
11994 }
11995 }
11996 break;
11997 case 'F': case 'f':
11998 case 'I': case 'i':
11999 case 'L': case 'l':
12000 case 'T': case 't':
12001 case 'A': case 'a':
12002 case 'H': case 'h':
12003 case 'J': case 'j':
12004 case 'N': case 'n':
12005 case 'W': case 'w':
12006 case 'Y': case 'y':
12007 /* These all are targets of multi-character
12008 * folds from code points that require UTF8 to
12009 * express, so they can't match unless the
12010 * target string is in UTF-8, so no action here
12011 * is necessary, as regexec.c properly handles
12012 * the general case for UTF-8 matching */
12013 break;
12014 default:
12015 /* Use deprecated warning to increase the
12016 * chances of this being output */
12017 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12018 break;
12019 }
12020 }
12021 continue;
12022 }
12023
12024 /* Here is an above Latin1 character. We don't have the rules
12025 * hard-coded for it. First, get its fold */
12026 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
c2df36c4
KW
12027 ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12028 | ((LOC)
12029 ? FOLD_FLAGS_LOCALE
12030 : (MORE_ASCII_RESTRICTED)
12031 ? FOLD_FLAGS_NOMIX_ASCII
12032 : 0));
93e5bb1c
KW
12033
12034 if (foldlen > (STRLEN)UNISKIP(f)) {
12035
dbe7a391
KW
12036 /* Any multicharacter foldings (disallowed in lookbehind
12037 * patterns) require the following transform: [ABCDEF] ->
12038 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12039 * folds into "rst", all other characters fold to single
12040 * characters. We save away these multicharacter foldings,
12041 * to be later saved as part of the additional "s" data. */
93e5bb1c
KW
12042 if (! RExC_in_lookbehind) {
12043 U8* loc = foldbuf;
12044 U8* e = foldbuf + foldlen;
12045
dbe7a391
KW
12046 /* If any of the folded characters of this are in the
12047 * Latin1 range, tell the regex engine that this can
68823f48 12048 * match a non-utf8 target string. */
8f850557 12049 while (loc < e) {
8f850557
KW
12050 if (UTF8_IS_INVARIANT(*loc)
12051 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12052 {
8f850557
KW
12053 ANYOF_FLAGS(ret)
12054 |= ANYOF_NONBITMAP_NON_UTF8;
12055 break;
12056 }
12057 loc += UTF8SKIP(loc);
12058 }
17580e7a 12059
93e5bb1c 12060 add_alternate(&unicode_alternate, foldbuf, foldlen);
93e5bb1c
KW
12061 }
12062 }
68823f48
KW
12063 else {
12064 /* Single character fold of above Latin1. Add everything
12065 * in its fold closure to the list that this node should
12066 * match */
93e5bb1c
KW
12067 SV** listp;
12068
dbe7a391
KW
12069 /* The fold closures data structure is a hash with the keys
12070 * being every character that is folded to, like 'k', and
12071 * the values each an array of everything that folds to its
12072 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
93e5bb1c
KW
12073 if ((listp = hv_fetch(PL_utf8_foldclosures,
12074 (char *) foldbuf, foldlen, FALSE)))
12075 {
12076 AV* list = (AV*) *listp;
12077 IV k;
12078 for (k = 0; k <= av_len(list); k++) {
12079 SV** c_p = av_fetch(list, k, FALSE);
12080 UV c;
12081 if (c_p == NULL) {
12082 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12083 }
12084 c = SvUV(*c_p);
12085
dbe7a391
KW
12086 /* /aa doesn't allow folds between ASCII and non-;
12087 * /l doesn't allow them between above and below
12088 * 256 */
8f850557
KW
12089 if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
12090 || (LOC && ((c < 256) != (j < 256))))
93e5bb1c
KW
12091 {
12092 continue;
12093 }
56ca34ca 12094
68823f48
KW
12095 /* Folds involving non-ascii Latin1 characters
12096 * under /d are added to a separate list */
8f850557
KW
12097 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12098 {
cfbb2758 12099 cp_list = add_cp_to_invlist(cp_list, c);
68823f48
KW
12100 }
12101 else {
12102 depends_list = add_cp_to_invlist(depends_list, c);
56ca34ca
KW
12103 }
12104 }
12105 }
12106 }
8f850557 12107 }
93e5bb1c 12108 }
318c430e 12109 SvREFCNT_dec(fold_intersection);
56ca34ca
KW
12110 }
12111
dc415b4a
KW
12112 /* And combine the result (if any) with any inversion list from properties.
12113 * The lists are kept separate up to now because we don't want to fold the
12114 * properties */
12115 if (properties) {
68823f48 12116 if (AT_LEAST_UNI_SEMANTICS) {
cfbb2758
KW
12117 if (cp_list) {
12118 _invlist_union(cp_list, properties, &cp_list);
8f850557
KW
12119 SvREFCNT_dec(properties);
12120 }
12121 else {
cfbb2758 12122 cp_list = properties;
8f850557 12123 }
68823f48
KW
12124 }
12125 else {
12126
12127 /* Under /d, we put the things that match only when the target
12128 * string is utf8, into a separate list */
12129 SV* nonascii_but_latin1_properties = NULL;
8f850557
KW
12130 _invlist_intersection(properties, PL_Latin1,
12131 &nonascii_but_latin1_properties);
12132 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12133 &nonascii_but_latin1_properties);
12134 _invlist_subtract(properties, nonascii_but_latin1_properties,
12135 &properties);
cfbb2758
KW
12136 if (cp_list) {
12137 _invlist_union(cp_list, properties, &cp_list);
68823f48
KW
12138 SvREFCNT_dec(properties);
12139 }
12140 else {
cfbb2758 12141 cp_list = properties;
68823f48
KW
12142 }
12143
12144 if (depends_list) {
12145 _invlist_union(depends_list, nonascii_but_latin1_properties,
12146 &depends_list);
12147 SvREFCNT_dec(nonascii_but_latin1_properties);
12148 }
12149 else {
12150 depends_list = nonascii_but_latin1_properties;
12151 }
12152 }
dc415b4a
KW
12153 }
12154
ea364ff5
KW
12155 /* Here, we have calculated what code points should be in the character
12156 * class.
12157 *
12158 * Now we can see about various optimizations. Fold calculation (which we
12159 * did above) needs to take place before inversion. Otherwise /[^k]/i
12160 * would invert to include K, which under /i would match k, which it
12161 * shouldn't. */
12162
12163 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
12164 * set the FOLD flag yet, so this does optimize those. It doesn't
12165 * optimize locale. Doing so perhaps could be done as long as there is
12166 * nothing like \w in it; some thought also would have to be given to the
12167 * interaction with above 0x100 chars */
12168 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
12169 && ! LOC
12170 && ! depends_list
12171 && ! unicode_alternate
12172 && SvCUR(listsv) == initial_listsv_len)
12173 {
12174 _invlist_invert(cp_list);
12175
12176 /* Any swash can't be used as-is, because we've inverted things */
12177 if (swash) {
12178 SvREFCNT_dec(swash);
12179 swash = NULL;
12180 }
12181
12182 /* Clear the invert flag since have just done it here */
12183 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12184 }
12185
cfbb2758 12186 /* Here, <cp_list> contains all the code points we can determine at
68823f48 12187 * compile time that match under all conditions. Go through it, and
e4e94b48 12188 * for things that belong in the bitmap, put them there, and delete from
cfbb2758
KW
12189 * <cp_list> */
12190 if (cp_list) {
e4e94b48 12191
e4e94b48
KW
12192 /* This gets set if we actually need to modify things */
12193 bool change_invlist = FALSE;
12194
12195 UV start, end;
12196
cfbb2758
KW
12197 /* Start looking through <cp_list> */
12198 invlist_iterinit(cp_list);
12199 while (invlist_iternext(cp_list, &start, &end)) {
e4e94b48
KW
12200 UV high;
12201 int i;
12202
12203 /* Quit if are above what we should change */
68823f48 12204 if (start > 255) {
e4e94b48
KW
12205 break;
12206 }
12207
12208 change_invlist = TRUE;
12209
12210 /* Set all the bits in the range, up to the max that we are doing */
68823f48 12211 high = (end < 255) ? end : 255;
e4e94b48
KW
12212 for (i = start; i <= (int) high; i++) {
12213 if (! ANYOF_BITMAP_TEST(ret, i)) {
12214 ANYOF_BITMAP_SET(ret, i);
12215 stored++;
12216 prevvalue = value;
12217 value = i;
12218 }
12219 }
12220 }
12221
a3e1f3a6 12222 /* Done with loop; remove any code points that are in the bitmap from
cfbb2758 12223 * <cp_list> */
e4e94b48 12224 if (change_invlist) {
cfbb2758 12225 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
e4e94b48
KW
12226 }
12227
12228 /* If have completely emptied it, remove it completely */
cfbb2758
KW
12229 if (invlist_len(cp_list) == 0) {
12230 SvREFCNT_dec(cp_list);
12231 cp_list = NULL;
e4e94b48
KW
12232 }
12233 }
dc415b4a 12234
68823f48
KW
12235 /* Combine the two lists into one. */
12236 if (depends_list) {
cfbb2758
KW
12237 if (cp_list) {
12238 _invlist_union(cp_list, depends_list, &cp_list);
68823f48
KW
12239 SvREFCNT_dec(depends_list);
12240 }
12241 else {
cfbb2758 12242 cp_list = depends_list;
68823f48
KW
12243 }
12244 }
12245
0222889f
KW
12246 /* Folding in the bitmap is taken care of above, but not for locale (for
12247 * which we have to wait to see what folding is in effect at runtime), and
d9105c95
KW
12248 * for some things not in the bitmap (only the upper latin folds in this
12249 * case, as all other single-char folding has been set above). Set
12250 * run-time fold flag for these */
12251 if (FOLD && (LOC
12252 || (DEPENDS_SEMANTICS
cfbb2758 12253 && cp_list
d9105c95
KW
12254 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12255 || unicode_alternate))
12256 {
0222889f 12257 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
f56b6394
KW
12258 }
12259
2786be71
KW
12260 /* A single character class can be "optimized" into an EXACTish node.
12261 * Note that since we don't currently count how many characters there are
12262 * outside the bitmap, we are XXX missing optimization possibilities for
12263 * them. This optimization can't happen unless this is a truly single
12264 * character class, which means that it can't be an inversion into a
12265 * many-character class, and there must be no possibility of there being
12266 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
12267 * \w, etc, so have to make a special test that they aren't present
12268 *
12269 * Similarly A 2-character class of the very special form like [bB] can be
12270 * optimized into an EXACTFish node, but only for non-locales, and for
12271 * characters which only have the two folds; so things like 'fF' and 'Ii'
12272 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12273 * FI'. */
cfbb2758 12274 if (! cp_list
53742956 12275 && ! unicode_alternate
137165a6
KW
12276 && SvCUR(listsv) == initial_listsv_len
12277 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
12278 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12279 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12280 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12281 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12282 /* If the latest code point has a fold whose
12283 * bit is set, it must be the only other one */
2dcac756 12284 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 12285 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
12286 {
12287 /* Note that the information needed to decide to do this optimization
12288 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
12289 * used EXACTish node takes less space than the calculated ANYOF node,
12290 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
12291 * than actually used, so this optimization doesn't gain us any space.
12292 * But an EXACT node is faster than an ANYOF node, and can be combined
12293 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
12294 * gains. The speed of executing an EXACTF is similar to an ANYOF
12295 * node, so the optimization advantage comes from the ability to join
12296 * it to adjacent EXACT nodes */
2786be71 12297
07be1b83 12298 const char * cur_parse= RExC_parse;
6da63e10 12299 U8 op;
07be1b83
YO
12300 RExC_emit = (regnode *)orig_emit;
12301 RExC_parse = (char *)orig_parse;
2786be71 12302
6da63e10
KW
12303 if (stored == 1) {
12304
12305 /* A locale node with one point can be folded; all the other cases
12306 * with folding will have two points, since we calculate them above
12307 */
39065660 12308 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
12309 op = EXACTFL;
12310 }
12311 else {
12312 op = EXACT;
12313 }
b36527fc
KW
12314 }
12315 else { /* else 2 chars in the bit map: the folds of each other */
12316
12317 /* Use the folded value, which for the cases where we get here,
12318 * is just the lower case of the current one (which may resolve to
12319 * itself, or to the other one */
12320 value = toLOWER_LATIN1(value);
6da63e10 12321
bf4c00b4
KW
12322 /* To join adjacent nodes, they must be the exact EXACTish type.
12323 * Try to use the most likely type, by using EXACTFA if possible,
12324 * then EXACTFU if the regex calls for it, or is required because
12325 * the character is non-ASCII. (If <value> is ASCII, its fold is
12326 * also ASCII for the cases where we get here.) */
12327 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12328 op = EXACTFA;
12329 }
12330 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
e62862f6
KW
12331 op = EXACTFU;
12332 }
12333 else { /* Otherwise, more likely to be EXACTF type */
12334 op = EXACTF;
12335 }
b36527fc 12336 }
6da63e10
KW
12337
12338 ret = reg_node(pRExC_state, op);
07be1b83 12339 RExC_parse = (char *)cur_parse;
2786be71
KW
12340 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12341 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12342 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12343 STR_LEN(ret)= 2;
12344 RExC_emit += STR_SZ(2);
12345 }
12346 else {
12347 *STRING(ret)= (char)value;
12348 STR_LEN(ret)= 1;
12349 RExC_emit += STR_SZ(1);
12350 }
ef8d46e8 12351 SvREFCNT_dec(listsv);
7f6f358c
YO
12352 return ret;
12353 }
ffc61ed2 12354
dc415b4a
KW
12355 /* If there is a swash and more than one element, we can't use the swash in
12356 * the optimization below. */
12357 if (swash && element_count > 1) {
12358 SvREFCNT_dec(swash);
12359 swash = NULL;
12360 }
cfbb2758 12361 if (! cp_list
c16787fd
KW
12362 && SvCUR(listsv) == initial_listsv_len
12363 && ! unicode_alternate)
12364 {
137165a6
KW
12365 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12366 SvREFCNT_dec(listsv);
12367 SvREFCNT_dec(unicode_alternate);
12368 }
12369 else {
0bd1039c
KW
12370 /* av[0] stores the character class description in its textual form:
12371 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12372 * appropriate swash, and is also useful for dumping the regnode.
12373 * av[1] if NULL, is a placeholder to later contain the swash computed
12374 * from av[0]. But if no further computation need be done, the
12375 * swash is stored there now.
12376 * av[2] stores the multicharacter foldings, used later in
12377 * regexec.c:S_reginclass().
cfbb2758 12378 * av[3] stores the cp_list inversion list for use in addition or
0bd1039c
KW
12379 * instead of av[0]; not used if av[1] isn't NULL
12380 * av[4] is set if any component of the class is from a user-defined
12381 * property; not used if av[1] isn't NULL */
097eb12c 12382 AV * const av = newAV();
ffc61ed2 12383 SV *rv;
0bd1039c 12384
c16787fd
KW
12385 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12386 ? &PL_sv_undef
12387 : listsv);
88d45d28
KW
12388 if (swash) {
12389 av_store(av, 1, swash);
cfbb2758 12390 SvREFCNT_dec(cp_list);
88d45d28
KW
12391 }
12392 else {
12393 av_store(av, 1, NULL);
cfbb2758
KW
12394 if (cp_list) {
12395 av_store(av, 3, cp_list);
dc415b4a 12396 av_store(av, 4, newSVuv(has_user_defined_property));
c16787fd 12397 }
88d45d28 12398 }
827f5bb8
KW
12399
12400 /* Store any computed multi-char folds only if we are allowing
12401 * them */
12402 if (allow_full_fold) {
7b4a7e58
KW
12403 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12404 if (unicode_alternate) { /* This node is variable length */
12405 OP(ret) = ANYOFV;
12406 }
827f5bb8
KW
12407 }
12408 else {
12409 av_store(av, 2, NULL);
12410 }
ad64d0ec 12411 rv = newRV_noinc(MUTABLE_SV(av));
19860706 12412 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 12413 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 12414 ARG_SET(ret, n);
a0ed51b3 12415 }
a0ed51b3
LW
12416 return ret;
12417}
89836f1f 12418
a0ed51b3 12419
bcdf7404
YO
12420/* reg_skipcomment()
12421
12422 Absorbs an /x style # comments from the input stream.
12423 Returns true if there is more text remaining in the stream.
12424 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12425 terminates the pattern without including a newline.
12426
12427 Note its the callers responsibility to ensure that we are
12428 actually in /x mode
12429
12430*/
12431
12432STATIC bool
12433S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12434{
12435 bool ended = 0;
7918f24d
NC
12436
12437 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12438
bcdf7404
YO
12439 while (RExC_parse < RExC_end)
12440 if (*RExC_parse++ == '\n') {
12441 ended = 1;
12442 break;
12443 }
12444 if (!ended) {
12445 /* we ran off the end of the pattern without ending
12446 the comment, so we have to add an \n when wrapping */
12447 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12448 return 0;
12449 } else
12450 return 1;
12451}
12452
12453/* nextchar()
12454
3b753521 12455 Advances the parse position, and optionally absorbs
bcdf7404
YO
12456 "whitespace" from the inputstream.
12457
12458 Without /x "whitespace" means (?#...) style comments only,
12459 with /x this means (?#...) and # comments and whitespace proper.
12460
12461 Returns the RExC_parse point from BEFORE the scan occurs.
12462
12463 This is the /x friendly way of saying RExC_parse++.
12464*/
12465
76e3520e 12466STATIC char*
830247a4 12467S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 12468{
097eb12c 12469 char* const retval = RExC_parse++;
a0d0e21e 12470
7918f24d
NC
12471 PERL_ARGS_ASSERT_NEXTCHAR;
12472
4633a7c4 12473 for (;;) {
d224c965
KW
12474 if (RExC_end - RExC_parse >= 3
12475 && *RExC_parse == '('
12476 && RExC_parse[1] == '?'
12477 && RExC_parse[2] == '#')
12478 {
e994fd66
AE
12479 while (*RExC_parse != ')') {
12480 if (RExC_parse == RExC_end)
12481 FAIL("Sequence (?#... not terminated");
830247a4 12482 RExC_parse++;
e994fd66 12483 }
830247a4 12484 RExC_parse++;
4633a7c4
LW
12485 continue;
12486 }
bbe252da 12487 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
12488 if (isSPACE(*RExC_parse)) {
12489 RExC_parse++;
748a9306
LW
12490 continue;
12491 }
830247a4 12492 else if (*RExC_parse == '#') {
bcdf7404
YO
12493 if ( reg_skipcomment( pRExC_state ) )
12494 continue;
748a9306 12495 }
748a9306 12496 }
4633a7c4 12497 return retval;
a0d0e21e 12498 }
a687059c
LW
12499}
12500
12501/*
c277df42 12502- reg_node - emit a node
a0d0e21e 12503*/
76e3520e 12504STATIC regnode * /* Location. */
830247a4 12505S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 12506{
97aff369 12507 dVAR;
c277df42 12508 register regnode *ptr;
504618e9 12509 regnode * const ret = RExC_emit;
07be1b83 12510 GET_RE_DEBUG_FLAGS_DECL;
a687059c 12511
7918f24d
NC
12512 PERL_ARGS_ASSERT_REG_NODE;
12513
c277df42 12514 if (SIZE_ONLY) {
830247a4
IZ
12515 SIZE_ALIGN(RExC_size);
12516 RExC_size += 1;
a0d0e21e
LW
12517 return(ret);
12518 }
3b57cd43 12519 if (RExC_emit >= RExC_emit_bound)
5637ef5b
NC
12520 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12521 op, RExC_emit, RExC_emit_bound);
3b57cd43 12522
c277df42 12523 NODE_ALIGN_FILL(ret);
a0d0e21e 12524 ptr = ret;
c277df42 12525 FILL_ADVANCE_NODE(ptr, op);
7122b237 12526#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 12527 if (RExC_offsets) { /* MJD */
07be1b83 12528 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 12529 "reg_node", __LINE__,
13d6edb4 12530 PL_reg_name[op],
07be1b83
YO
12531 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12532 ? "Overwriting end of array!\n" : "OK",
12533 (UV)(RExC_emit - RExC_emit_start),
12534 (UV)(RExC_parse - RExC_start),
12535 (UV)RExC_offsets[0]));
ccb2c380 12536 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 12537 }
7122b237 12538#endif
830247a4 12539 RExC_emit = ptr;
a0d0e21e 12540 return(ret);
a687059c
LW
12541}
12542
12543/*
a0d0e21e
LW
12544- reganode - emit a node with an argument
12545*/
76e3520e 12546STATIC regnode * /* Location. */
830247a4 12547S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 12548{
97aff369 12549 dVAR;
c277df42 12550 register regnode *ptr;
504618e9 12551 regnode * const ret = RExC_emit;
07be1b83 12552 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 12553
7918f24d
NC
12554 PERL_ARGS_ASSERT_REGANODE;
12555
c277df42 12556 if (SIZE_ONLY) {
830247a4
IZ
12557 SIZE_ALIGN(RExC_size);
12558 RExC_size += 2;
6bda09f9
YO
12559 /*
12560 We can't do this:
12561
12562 assert(2==regarglen[op]+1);
686b73d4 12563
6bda09f9
YO
12564 Anything larger than this has to allocate the extra amount.
12565 If we changed this to be:
12566
12567 RExC_size += (1 + regarglen[op]);
12568
12569 then it wouldn't matter. Its not clear what side effect
12570 might come from that so its not done so far.
12571 -- dmq
12572 */
a0d0e21e
LW
12573 return(ret);
12574 }
3b57cd43 12575 if (RExC_emit >= RExC_emit_bound)
5637ef5b
NC
12576 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12577 op, RExC_emit, RExC_emit_bound);
3b57cd43 12578
c277df42 12579 NODE_ALIGN_FILL(ret);
a0d0e21e 12580 ptr = ret;
c277df42 12581 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 12582#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 12583 if (RExC_offsets) { /* MJD */
07be1b83 12584 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 12585 "reganode",
ccb2c380 12586 __LINE__,
13d6edb4 12587 PL_reg_name[op],
07be1b83 12588 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 12589 "Overwriting end of array!\n" : "OK",
07be1b83
YO
12590 (UV)(RExC_emit - RExC_emit_start),
12591 (UV)(RExC_parse - RExC_start),
12592 (UV)RExC_offsets[0]));
ccb2c380 12593 Set_Cur_Node_Offset;
fac92740 12594 }
7122b237 12595#endif
830247a4 12596 RExC_emit = ptr;
a0d0e21e 12597 return(ret);
fe14fcc3
LW
12598}
12599
12600/*
cd439c50 12601- reguni - emit (if appropriate) a Unicode character
a0ed51b3 12602*/
71207a34
AL
12603STATIC STRLEN
12604S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 12605{
97aff369 12606 dVAR;
7918f24d
NC
12607
12608 PERL_ARGS_ASSERT_REGUNI;
12609
71207a34 12610 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
12611}
12612
12613/*
a0d0e21e
LW
12614- reginsert - insert an operator in front of already-emitted operand
12615*
12616* Means relocating the operand.
12617*/
76e3520e 12618STATIC void
6bda09f9 12619S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 12620{
97aff369 12621 dVAR;
c277df42
IZ
12622 register regnode *src;
12623 register regnode *dst;
12624 register regnode *place;
504618e9 12625 const int offset = regarglen[(U8)op];
6bda09f9 12626 const int size = NODE_STEP_REGNODE + offset;
07be1b83 12627 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
12628
12629 PERL_ARGS_ASSERT_REGINSERT;
def51078 12630 PERL_UNUSED_ARG(depth);
22c35a8c 12631/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 12632 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 12633 if (SIZE_ONLY) {
6bda09f9 12634 RExC_size += size;
a0d0e21e
LW
12635 return;
12636 }
a687059c 12637
830247a4 12638 src = RExC_emit;
6bda09f9 12639 RExC_emit += size;
830247a4 12640 dst = RExC_emit;
40d049e4 12641 if (RExC_open_parens) {
6bda09f9 12642 int paren;
3b57cd43 12643 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 12644 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 12645 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 12646 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
12647 RExC_open_parens[paren] += size;
12648 } else {
3b57cd43 12649 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
12650 }
12651 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 12652 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
12653 RExC_close_parens[paren] += size;
12654 } else {
3b57cd43 12655 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
12656 }
12657 }
6bda09f9 12658 }
40d049e4 12659
fac92740 12660 while (src > opnd) {
c277df42 12661 StructCopy(--src, --dst, regnode);
7122b237 12662#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 12663 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 12664 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 12665 "reg_insert",
ccb2c380 12666 __LINE__,
13d6edb4 12667 PL_reg_name[op],
07be1b83
YO
12668 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12669 ? "Overwriting end of array!\n" : "OK",
12670 (UV)(src - RExC_emit_start),
12671 (UV)(dst - RExC_emit_start),
12672 (UV)RExC_offsets[0]));
ccb2c380
MP
12673 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12674 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 12675 }
7122b237 12676#endif
fac92740
MJD
12677 }
12678
a0d0e21e
LW
12679
12680 place = opnd; /* Op node, where operand used to be. */
7122b237 12681#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 12682 if (RExC_offsets) { /* MJD */
07be1b83 12683 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 12684 "reginsert",
ccb2c380 12685 __LINE__,
13d6edb4 12686 PL_reg_name[op],
07be1b83 12687 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 12688 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
12689 (UV)(place - RExC_emit_start),
12690 (UV)(RExC_parse - RExC_start),
786e8c11 12691 (UV)RExC_offsets[0]));
ccb2c380 12692 Set_Node_Offset(place, RExC_parse);
45948336 12693 Set_Node_Length(place, 1);
fac92740 12694 }
7122b237 12695#endif
c277df42
IZ
12696 src = NEXTOPER(place);
12697 FILL_ADVANCE_NODE(place, op);
12698 Zero(src, offset, regnode);
a687059c
LW
12699}
12700
12701/*
c277df42 12702- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 12703- SEE ALSO: regtail_study
a0d0e21e 12704*/
097eb12c 12705/* TODO: All three parms should be const */
76e3520e 12706STATIC void
3dab1dad 12707S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 12708{
97aff369 12709 dVAR;
c277df42 12710 register regnode *scan;
72f13be8 12711 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
12712
12713 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
12714#ifndef DEBUGGING
12715 PERL_UNUSED_ARG(depth);
12716#endif
a0d0e21e 12717
c277df42 12718 if (SIZE_ONLY)
a0d0e21e
LW
12719 return;
12720
12721 /* Find last node. */
12722 scan = p;
12723 for (;;) {
504618e9 12724 regnode * const temp = regnext(scan);
3dab1dad
YO
12725 DEBUG_PARSE_r({
12726 SV * const mysv=sv_newmortal();
12727 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12728 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
12729 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12730 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12731 (temp == NULL ? "->" : ""),
13d6edb4 12732 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 12733 );
3dab1dad
YO
12734 });
12735 if (temp == NULL)
12736 break;
12737 scan = temp;
12738 }
12739
12740 if (reg_off_by_arg[OP(scan)]) {
12741 ARG_SET(scan, val - scan);
12742 }
12743 else {
12744 NEXT_OFF(scan) = val - scan;
12745 }
12746}
12747
07be1b83 12748#ifdef DEBUGGING
3dab1dad
YO
12749/*
12750- regtail_study - set the next-pointer at the end of a node chain of p to val.
12751- Look for optimizable sequences at the same time.
12752- currently only looks for EXACT chains.
07be1b83 12753
486ec47a 12754This is experimental code. The idea is to use this routine to perform
07be1b83
YO
12755in place optimizations on branches and groups as they are constructed,
12756with the long term intention of removing optimization from study_chunk so
12757that it is purely analytical.
12758
12759Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12760to control which is which.
12761
3dab1dad
YO
12762*/
12763/* TODO: All four parms should be const */
07be1b83 12764
3dab1dad
YO
12765STATIC U8
12766S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12767{
12768 dVAR;
12769 register regnode *scan;
07be1b83
YO
12770 U8 exact = PSEUDO;
12771#ifdef EXPERIMENTAL_INPLACESCAN
12772 I32 min = 0;
12773#endif
3dab1dad
YO
12774 GET_RE_DEBUG_FLAGS_DECL;
12775
7918f24d
NC
12776 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12777
07be1b83 12778
3dab1dad
YO
12779 if (SIZE_ONLY)
12780 return exact;
12781
12782 /* Find last node. */
12783
12784 scan = p;
12785 for (;;) {
12786 regnode * const temp = regnext(scan);
07be1b83 12787#ifdef EXPERIMENTAL_INPLACESCAN
f758bddf
KW
12788 if (PL_regkind[OP(scan)] == EXACT) {
12789 bool has_exactf_sharp_s; /* Unexamined in this routine */
12790 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
07be1b83 12791 return EXACT;
f758bddf 12792 }
07be1b83 12793#endif
3dab1dad
YO
12794 if ( exact ) {
12795 switch (OP(scan)) {
12796 case EXACT:
12797 case EXACTF:
2f7f8cb1 12798 case EXACTFA:
2c2b7f86 12799 case EXACTFU:
3c760661 12800 case EXACTFU_SS:
fab2782b 12801 case EXACTFU_TRICKYFOLD:
3dab1dad
YO
12802 case EXACTFL:
12803 if( exact == PSEUDO )
12804 exact= OP(scan);
07be1b83
YO
12805 else if ( exact != OP(scan) )
12806 exact= 0;
3dab1dad
YO
12807 case NOTHING:
12808 break;
12809 default:
12810 exact= 0;
12811 }
12812 }
12813 DEBUG_PARSE_r({
12814 SV * const mysv=sv_newmortal();
12815 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12816 regprop(RExC_rx, mysv, scan);
eaf3ca90 12817 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 12818 SvPV_nolen_const(mysv),
eaf3ca90 12819 REG_NODE_NUM(scan),
13d6edb4 12820 PL_reg_name[exact]);
3dab1dad 12821 });
a0d0e21e
LW
12822 if (temp == NULL)
12823 break;
12824 scan = temp;
12825 }
07be1b83
YO
12826 DEBUG_PARSE_r({
12827 SV * const mysv_val=sv_newmortal();
12828 DEBUG_PARSE_MSG("");
12829 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
12830 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12831 SvPV_nolen_const(mysv_val),
12832 (IV)REG_NODE_NUM(val),
12833 (IV)(val - scan)
07be1b83
YO
12834 );
12835 });
c277df42
IZ
12836 if (reg_off_by_arg[OP(scan)]) {
12837 ARG_SET(scan, val - scan);
a0ed51b3
LW
12838 }
12839 else {
c277df42
IZ
12840 NEXT_OFF(scan) = val - scan;
12841 }
3dab1dad
YO
12842
12843 return exact;
a687059c 12844}
07be1b83 12845#endif
a687059c
LW
12846
12847/*
fd181c75 12848 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 12849 */
f7819f85 12850#ifdef DEBUGGING
c33269f7 12851static void
7918f24d
NC
12852S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12853{
f7819f85
A
12854 int bit;
12855 int set=0;
a62b1201 12856 regex_charset cs;
7918f24d 12857
f7819f85
A
12858 for (bit=0; bit<32; bit++) {
12859 if (flags & (1<<bit)) {
a62b1201
KW
12860 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12861 continue;
12862 }
f7819f85
A
12863 if (!set++ && lead)
12864 PerlIO_printf(Perl_debug_log, "%s",lead);
12865 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12866 }
12867 }
a62b1201
KW
12868 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12869 if (!set++ && lead) {
12870 PerlIO_printf(Perl_debug_log, "%s",lead);
12871 }
12872 switch (cs) {
12873 case REGEX_UNICODE_CHARSET:
12874 PerlIO_printf(Perl_debug_log, "UNICODE");
12875 break;
12876 case REGEX_LOCALE_CHARSET:
12877 PerlIO_printf(Perl_debug_log, "LOCALE");
12878 break;
cfaf538b
KW
12879 case REGEX_ASCII_RESTRICTED_CHARSET:
12880 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12881 break;
2f7f8cb1
KW
12882 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12883 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12884 break;
a62b1201
KW
12885 default:
12886 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12887 break;
12888 }
12889 }
f7819f85
A
12890 if (lead) {
12891 if (set)
12892 PerlIO_printf(Perl_debug_log, "\n");
12893 else
12894 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12895 }
12896}
12897#endif
12898
a687059c 12899void
097eb12c 12900Perl_regdump(pTHX_ const regexp *r)
a687059c 12901{
35ff7856 12902#ifdef DEBUGGING
97aff369 12903 dVAR;
c445ea15 12904 SV * const sv = sv_newmortal();
ab3bbdeb 12905 SV *dsv= sv_newmortal();
f8fc2ecf 12906 RXi_GET_DECL(r,ri);
f7819f85 12907 GET_RE_DEBUG_FLAGS_DECL;
a687059c 12908
7918f24d
NC
12909 PERL_ARGS_ASSERT_REGDUMP;
12910
f8fc2ecf 12911 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
12912
12913 /* Header fields of interest. */
ab3bbdeb
YO
12914 if (r->anchored_substr) {
12915 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12916 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 12917 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12918 "anchored %s%s at %"IVdf" ",
12919 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 12920 (IV)r->anchored_offset);
ab3bbdeb
YO
12921 } else if (r->anchored_utf8) {
12922 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12923 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 12924 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12925 "anchored utf8 %s%s at %"IVdf" ",
12926 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 12927 (IV)r->anchored_offset);
ab3bbdeb
YO
12928 }
12929 if (r->float_substr) {
12930 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12931 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 12932 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12933 "floating %s%s at %"IVdf"..%"UVuf" ",
12934 s, RE_SV_TAIL(r->float_substr),
7b0972df 12935 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
12936 } else if (r->float_utf8) {
12937 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12938 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 12939 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12940 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12941 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 12942 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 12943 }
33b8afdf 12944 if (r->check_substr || r->check_utf8)
b81d288d 12945 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
12946 (const char *)
12947 (r->check_substr == r->float_substr
12948 && r->check_utf8 == r->float_utf8
12949 ? "(checking floating" : "(checking anchored"));
bbe252da 12950 if (r->extflags & RXf_NOSCAN)
c277df42 12951 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 12952 if (r->extflags & RXf_CHECK_ALL)
c277df42 12953 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 12954 if (r->check_substr || r->check_utf8)
c277df42
IZ
12955 PerlIO_printf(Perl_debug_log, ") ");
12956
f8fc2ecf
YO
12957 if (ri->regstclass) {
12958 regprop(r, sv, ri->regstclass);
1de06328 12959 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 12960 }
bbe252da 12961 if (r->extflags & RXf_ANCH) {
774d564b 12962 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 12963 if (r->extflags & RXf_ANCH_BOL)
774d564b 12964 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 12965 if (r->extflags & RXf_ANCH_MBOL)
c277df42 12966 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 12967 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 12968 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 12969 if (r->extflags & RXf_ANCH_GPOS)
774d564b 12970 PerlIO_printf(Perl_debug_log, "(GPOS)");
12971 PerlIO_putc(Perl_debug_log, ' ');
12972 }
bbe252da 12973 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 12974 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 12975 if (r->intflags & PREGf_SKIP)
760ac839 12976 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 12977 if (r->intflags & PREGf_IMPLICIT)
760ac839 12978 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 12979 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 12980 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 12981 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 12982 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 12983 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 12984#else
7918f24d 12985 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 12986 PERL_UNUSED_CONTEXT;
65e66c80 12987 PERL_UNUSED_ARG(r);
17c3b450 12988#endif /* DEBUGGING */
a687059c
LW
12989}
12990
12991/*
a0d0e21e
LW
12992- regprop - printable representation of opcode
12993*/
3339dfd8
YO
12994#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12995STMT_START { \
12996 if (do_sep) { \
12997 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12998 if (flags & ANYOF_INVERT) \
12999 /*make sure the invert info is in each */ \
13000 sv_catpvs(sv, "^"); \
13001 do_sep = 0; \
13002 } \
13003} STMT_END
13004
46fc3d4c 13005void
32fc9b6a 13006Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 13007{
35ff7856 13008#ifdef DEBUGGING
97aff369 13009 dVAR;
9b155405 13010 register int k;
f8fc2ecf 13011 RXi_GET_DECL(prog,progi);
1de06328 13012 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 13013
7918f24d 13014 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 13015
76f68e9b 13016 sv_setpvs(sv, "");
8aa23a47 13017
03363afd 13018 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
13019 /* It would be nice to FAIL() here, but this may be called from
13020 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 13021 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 13022 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 13023
3dab1dad 13024 k = PL_regkind[OP(o)];
9b155405 13025
2a782b5b 13026 if (k == EXACT) {
f92a2122 13027 sv_catpvs(sv, " ");
ab3bbdeb
YO
13028 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13029 * is a crude hack but it may be the best for now since
13030 * we have no flag "this EXACTish node was UTF-8"
13031 * --jhi */
f92a2122
NC
13032 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13033 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 13034 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
13035 PERL_PV_PRETTY_ELLIPSES |
13036 PERL_PV_PRETTY_LTGT |
13037 PERL_PV_PRETTY_NOCLEAR
13038 );
bb263b4e 13039 } else if (k == TRIE) {
3dab1dad 13040 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 13041 * progi->data isn't available here */
1de06328 13042 const char op = OP(o);
647f639f 13043 const U32 n = ARG(o);
1de06328 13044 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 13045 (reg_ac_data *)progi->data->data[n] :
1de06328 13046 NULL;
3251b653
NC
13047 const reg_trie_data * const trie
13048 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 13049
13d6edb4 13050 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
13051 DEBUG_TRIE_COMPILE_r(
13052 Perl_sv_catpvf(aTHX_ sv,
13053 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13054 (UV)trie->startstate,
1e2e3d02 13055 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
13056 (UV)trie->wordcount,
13057 (UV)trie->minlen,
13058 (UV)trie->maxlen,
13059 (UV)TRIE_CHARCOUNT(trie),
13060 (UV)trie->uniquecharcount
13061 )
13062 );
13063 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13064 int i;
13065 int rangestart = -1;
f46cb337 13066 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 13067 sv_catpvs(sv, "[");
1de06328
YO
13068 for (i = 0; i <= 256; i++) {
13069 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13070 if (rangestart == -1)
13071 rangestart = i;
13072 } else if (rangestart != -1) {
13073 if (i <= rangestart + 3)
13074 for (; rangestart < i; rangestart++)
13075 put_byte(sv, rangestart);
13076 else {
13077 put_byte(sv, rangestart);
13078 sv_catpvs(sv, "-");
13079 put_byte(sv, i - 1);
13080 }
13081 rangestart = -1;
13082 }
13083 }
f3a2811a 13084 sv_catpvs(sv, "]");
1de06328
YO
13085 }
13086
a3621e74 13087 } else if (k == CURLY) {
cb434fcc 13088 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
13089 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13090 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 13091 }
2c2d71f5
JH
13092 else if (k == WHILEM && o->flags) /* Ordinal/of */
13093 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 13094 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 13095 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 13096 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 13097 if ( k != REF || (OP(o) < NREF)) {
502c6561 13098 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
13099 SV **name= av_fetch(list, ARG(o), 0 );
13100 if (name)
13101 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13102 }
13103 else {
502c6561 13104 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 13105 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
13106 I32 *nums=(I32*)SvPVX(sv_dat);
13107 SV **name= av_fetch(list, nums[0], 0 );
13108 I32 n;
13109 if (name) {
13110 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13111 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13112 (n ? "," : ""), (IV)nums[n]);
13113 }
13114 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 13115 }
1f1031fe 13116 }
ee9b8eae 13117 }
1f1031fe 13118 } else if (k == GOSUB)
6bda09f9 13119 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
13120 else if (k == VERB) {
13121 if (!o->flags)
13122 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 13123 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 13124 } else if (k == LOGICAL)
04ebc1ab 13125 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
13126 else if (k == ANYOF) {
13127 int i, rangestart = -1;
2d03de9c 13128 const U8 flags = ANYOF_FLAGS(o);
24d786f4 13129 int do_sep = 0;
0bd48802
AL
13130
13131 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13132 static const char * const anyofs[] = {
653099ff
GS
13133 "\\w",
13134 "\\W",
13135 "\\s",
13136 "\\S",
13137 "\\d",
13138 "\\D",
13139 "[:alnum:]",
13140 "[:^alnum:]",
13141 "[:alpha:]",
13142 "[:^alpha:]",
13143 "[:ascii:]",
13144 "[:^ascii:]",
24d786f4
YO
13145 "[:cntrl:]",
13146 "[:^cntrl:]",
653099ff
GS
13147 "[:graph:]",
13148 "[:^graph:]",
13149 "[:lower:]",
13150 "[:^lower:]",
13151 "[:print:]",
13152 "[:^print:]",
13153 "[:punct:]",
13154 "[:^punct:]",
13155 "[:upper:]",
aaa51d5e 13156 "[:^upper:]",
653099ff 13157 "[:xdigit:]",
aaa51d5e
JF
13158 "[:^xdigit:]",
13159 "[:space:]",
13160 "[:^space:]",
13161 "[:blank:]",
13162 "[:^blank:]"
653099ff
GS
13163 };
13164
19860706 13165 if (flags & ANYOF_LOCALE)
396482e1 13166 sv_catpvs(sv, "{loc}");
39065660 13167 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 13168 sv_catpvs(sv, "{i}");
653099ff 13169 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 13170 if (flags & ANYOF_INVERT)
396482e1 13171 sv_catpvs(sv, "^");
686b73d4 13172
3339dfd8 13173 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
13174 for (i = 0; i <= 256; i++) {
13175 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13176 if (rangestart == -1)
13177 rangestart = i;
13178 } else if (rangestart != -1) {
13179 if (i <= rangestart + 3)
13180 for (; rangestart < i; rangestart++)
653099ff 13181 put_byte(sv, rangestart);
ffc61ed2
JH
13182 else {
13183 put_byte(sv, rangestart);
396482e1 13184 sv_catpvs(sv, "-");
ffc61ed2 13185 put_byte(sv, i - 1);
653099ff 13186 }
24d786f4 13187 do_sep = 1;
ffc61ed2 13188 rangestart = -1;
653099ff 13189 }
847a199f 13190 }
3339dfd8
YO
13191
13192 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
13193 /* output any special charclass tests (used entirely under use locale) */
13194 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 13195 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 13196 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 13197 sv_catpv(sv, anyofs[i]);
24d786f4
YO
13198 do_sep = 1;
13199 }
13200
3339dfd8
YO
13201 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13202
11454c59
KW
13203 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13204 sv_catpvs(sv, "{non-utf8-latin1-all}");
13205 }
13206
3339dfd8 13207 /* output information about the unicode matching */
ef87b810 13208 if (flags & ANYOF_UNICODE_ALL)
396482e1 13209 sv_catpvs(sv, "{unicode_all}");
137165a6 13210 else if (ANYOF_NONBITMAP(o))
ef87b810 13211 sv_catpvs(sv, "{unicode}");
f5ecd18d 13212 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 13213 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2 13214
1aa9930e 13215 if (ANYOF_NONBITMAP(o)) {
dbe7a391 13216 SV *lv; /* Set if there is something outside the bit map */
32fc9b6a 13217 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
f1114c30
KW
13218 bool byte_output = FALSE; /* If something in the bitmap has been
13219 output */
686b73d4 13220
c16787fd 13221 if (lv && lv != &PL_sv_undef) {
ffc61ed2 13222 if (sw) {
89ebb4a3 13223 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 13224
dbe7a391 13225 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
1df70142 13226 uvchr_to_utf8(s, i);
686b73d4 13227
dcf8909a
KW
13228 if (i < 256
13229 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13230 things already
13231 output as part
13232 of the bitmap */
13233 && swash_fetch(sw, s, TRUE))
13234 {
ffc61ed2
JH
13235 if (rangestart == -1)
13236 rangestart = i;
13237 } else if (rangestart != -1) {
f1114c30 13238 byte_output = TRUE;
ffc61ed2
JH
13239 if (i <= rangestart + 3)
13240 for (; rangestart < i; rangestart++) {
7128c099 13241 put_byte(sv, rangestart);
ffc61ed2
JH
13242 }
13243 else {
7128c099 13244 put_byte(sv, rangestart);
396482e1 13245 sv_catpvs(sv, "-");
7128c099 13246 put_byte(sv, i-1);
ffc61ed2 13247 }
e87973a9 13248 rangestart = -1;
19860706 13249 }
e87973a9 13250 }
19860706 13251 }
fde631ed 13252
ffc61ed2 13253 {
2e0de35c 13254 char *s = savesvpv(lv);
c445ea15 13255 char * const origs = s;
686b73d4 13256
3dab1dad
YO
13257 while (*s && *s != '\n')
13258 s++;
686b73d4 13259
ffc61ed2 13260 if (*s == '\n') {
2d03de9c 13261 const char * const t = ++s;
686b73d4 13262
f1114c30
KW
13263 if (byte_output) {
13264 sv_catpvs(sv, " ");
13265 }
13266
ffc61ed2 13267 while (*s) {
c574ffb9
KW
13268 if (*s == '\n') {
13269
13270 /* Truncate very long output */
13271 if (s - origs > 256) {
13272 Perl_sv_catpvf(aTHX_ sv,
13273 "%.*s...",
13274 (int) (s - origs - 1),
13275 t);
13276 goto out_dump;
13277 }
ffc61ed2 13278 *s = ' ';
1a9c8476
KW
13279 }
13280 else if (*s == '\t') {
13281 *s = '-';
13282 }
ffc61ed2
JH
13283 s++;
13284 }
13285 if (s[-1] == ' ')
13286 s[-1] = 0;
686b73d4 13287
ffc61ed2 13288 sv_catpv(sv, t);
fde631ed 13289 }
686b73d4 13290
c574ffb9
KW
13291 out_dump:
13292
ffc61ed2 13293 Safefree(origs);
fde631ed 13294 }
c16787fd 13295 SvREFCNT_dec(lv);
fde631ed 13296 }
653099ff 13297 }
ffc61ed2 13298
653099ff
GS
13299 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13300 }
9b155405 13301 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 13302 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 13303#else
96a5add6 13304 PERL_UNUSED_CONTEXT;
65e66c80
SP
13305 PERL_UNUSED_ARG(sv);
13306 PERL_UNUSED_ARG(o);
f9049ba1 13307 PERL_UNUSED_ARG(prog);
17c3b450 13308#endif /* DEBUGGING */
35ff7856 13309}
a687059c 13310
cad2e5aa 13311SV *
288b8c02 13312Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 13313{ /* Assume that RE_INTUIT is set */
97aff369 13314 dVAR;
288b8c02 13315 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 13316 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
13317
13318 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
13319 PERL_UNUSED_CONTEXT;
13320
a3621e74 13321 DEBUG_COMPILE_r(
cfd0369c 13322 {
2d03de9c 13323 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 13324 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
13325
13326 if (!PL_colorset) reginitcolors();
13327 PerlIO_printf(Perl_debug_log,
a0288114 13328 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
13329 PL_colors[4],
13330 prog->check_substr ? "" : "utf8 ",
13331 PL_colors[5],PL_colors[0],
cad2e5aa
JH
13332 s,
13333 PL_colors[1],
13334 (strlen(s) > 60 ? "..." : ""));
13335 } );
13336
33b8afdf 13337 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
13338}
13339
84da74a7 13340/*
f8149455 13341 pregfree()
84da74a7 13342
f8149455
YO
13343 handles refcounting and freeing the perl core regexp structure. When
13344 it is necessary to actually free the structure the first thing it
3b753521 13345 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
13346 the regexp, allowing the handling of the void *pprivate; member
13347 first. (This routine is not overridable by extensions, which is why
13348 the extensions free is called first.)
13349
13350 See regdupe and regdupe_internal if you change anything here.
84da74a7 13351*/
f8149455 13352#ifndef PERL_IN_XSUB_RE
2b69d0c2 13353void
84679df5 13354Perl_pregfree(pTHX_ REGEXP *r)
a687059c 13355{
288b8c02
NC
13356 SvREFCNT_dec(r);
13357}
13358
13359void
13360Perl_pregfree2(pTHX_ REGEXP *rx)
13361{
27da23d5 13362 dVAR;
288b8c02 13363 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 13364 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 13365
7918f24d
NC
13366 PERL_ARGS_ASSERT_PREGFREE2;
13367
28d8d7f4
YO
13368 if (r->mother_re) {
13369 ReREFCNT_dec(r->mother_re);
13370 } else {
288b8c02 13371 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 13372 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
13373 }
13374 if (r->substrs) {
ef8d46e8
VP
13375 SvREFCNT_dec(r->anchored_substr);
13376 SvREFCNT_dec(r->anchored_utf8);
13377 SvREFCNT_dec(r->float_substr);
13378 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
13379 Safefree(r->substrs);
13380 }
288b8c02 13381 RX_MATCH_COPY_FREE(rx);
f8c7b90f 13382#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 13383 SvREFCNT_dec(r->saved_copy);
ed252734 13384#endif
f0ab9afb 13385 Safefree(r->offs);
d63c20f2 13386 SvREFCNT_dec(r->qr_anoncv);
f8149455 13387}
28d8d7f4
YO
13388
13389/* reg_temp_copy()
13390
13391 This is a hacky workaround to the structural issue of match results
13392 being stored in the regexp structure which is in turn stored in
13393 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13394 could be PL_curpm in multiple contexts, and could require multiple
13395 result sets being associated with the pattern simultaneously, such
13396 as when doing a recursive match with (??{$qr})
13397
13398 The solution is to make a lightweight copy of the regexp structure
13399 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 13400 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
13401 the starp/end and the actual regexp structure itself.
13402
13403*/
13404
13405
84679df5 13406REGEXP *
f0826785 13407Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 13408{
f0826785 13409 struct regexp *ret;
288b8c02 13410 struct regexp *const r = (struct regexp *)SvANY(rx);
7918f24d
NC
13411
13412 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13413
f0826785
BM
13414 if (!ret_x)
13415 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13416 ret = (struct regexp *)SvANY(ret_x);
13417
288b8c02 13418 (void)ReREFCNT_inc(rx);
f7c278bf
NC
13419 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13420 by pointing directly at the buffer, but flagging that the allocated
13421 space in the copy is zero. As we've just done a struct copy, it's now
13422 a case of zero-ing that, rather than copying the current length. */
13423 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 13424 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
13425 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13426 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 13427 SvLEN_set(ret_x, 0);
b9ad13ac 13428 SvSTASH_set(ret_x, NULL);
703c388d 13429 SvMAGIC_set(ret_x, NULL);
77465632
FP
13430 if (r->offs) {
13431 const I32 npar = r->nparens+1;
13432 Newx(ret->offs, npar, regexp_paren_pair);
13433 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13434 }
28d8d7f4 13435 if (r->substrs) {
28d8d7f4 13436 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
13437 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13438
13439 SvREFCNT_inc_void(ret->anchored_substr);
13440 SvREFCNT_inc_void(ret->anchored_utf8);
13441 SvREFCNT_inc_void(ret->float_substr);
13442 SvREFCNT_inc_void(ret->float_utf8);
13443
13444 /* check_substr and check_utf8, if non-NULL, point to either their
13445 anchored or float namesakes, and don't hold a second reference. */
486913e4 13446 }
288b8c02 13447 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 13448#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 13449 ret->saved_copy = NULL;
28d8d7f4 13450#endif
288b8c02 13451 ret->mother_re = rx;
d63c20f2 13452 SvREFCNT_inc_void(ret->qr_anoncv);
28d8d7f4 13453
288b8c02 13454 return ret_x;
28d8d7f4 13455}
f8149455
YO
13456#endif
13457
13458/* regfree_internal()
13459
13460 Free the private data in a regexp. This is overloadable by
13461 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 13462 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
13463 know about, however of course we have to handle the
13464 regexp_internal structure when no extension is in use.
13465
13466 Note this is called before freeing anything in the regexp
13467 structure.
13468 */
13469
13470void
288b8c02 13471Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
13472{
13473 dVAR;
288b8c02 13474 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
13475 RXi_GET_DECL(r,ri);
13476 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
13477
13478 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13479
f8149455
YO
13480 DEBUG_COMPILE_r({
13481 if (!PL_colorset)
13482 reginitcolors();
13483 {
13484 SV *dsv= sv_newmortal();
3c8556c3 13485 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 13486 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
13487 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13488 PL_colors[4],PL_colors[5],s);
13489 }
13490 });
7122b237
YO
13491#ifdef RE_TRACK_PATTERN_OFFSETS
13492 if (ri->u.offsets)
13493 Safefree(ri->u.offsets); /* 20010421 MJD */
13494#endif
b30fcab9
DM
13495 if (ri->code_blocks) {
13496 int n;
13497 for (n = 0; n < ri->num_code_blocks; n++)
13498 SvREFCNT_dec(ri->code_blocks[n].src_regex);
3d2bd50a 13499 Safefree(ri->code_blocks);
b30fcab9 13500 }
3d2bd50a 13501
f8fc2ecf
YO
13502 if (ri->data) {
13503 int n = ri->data->count;
dfad63ad 13504
c277df42 13505 while (--n >= 0) {
261faec3 13506 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 13507 switch (ri->data->what[n]) {
af534a04 13508 case 'a':
b30fcab9 13509 case 'r':
c277df42 13510 case 's':
81714fb9 13511 case 'S':
55eed653 13512 case 'u':
ad64d0ec 13513 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 13514 break;
653099ff 13515 case 'f':
f8fc2ecf 13516 Safefree(ri->data->data[n]);
653099ff 13517 break;
68e2671b 13518 case 'l':
d63c20f2 13519 case 'L':
9e55ce06 13520 break;
07be1b83 13521 case 'T':
be8e71aa
YO
13522 { /* Aho Corasick add-on structure for a trie node.
13523 Used in stclass optimization only */
07be1b83 13524 U32 refcount;
f8fc2ecf 13525 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
13526 OP_REFCNT_LOCK;
13527 refcount = --aho->refcount;
13528 OP_REFCNT_UNLOCK;
13529 if ( !refcount ) {
446bd890
NC
13530 PerlMemShared_free(aho->states);
13531 PerlMemShared_free(aho->fail);
446bd890
NC
13532 /* do this last!!!! */
13533 PerlMemShared_free(ri->data->data[n]);
13534 PerlMemShared_free(ri->regstclass);
07be1b83
YO
13535 }
13536 }
13537 break;
a3621e74 13538 case 't':
07be1b83 13539 {
be8e71aa 13540 /* trie structure. */
07be1b83 13541 U32 refcount;
f8fc2ecf 13542 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
13543 OP_REFCNT_LOCK;
13544 refcount = --trie->refcount;
13545 OP_REFCNT_UNLOCK;
13546 if ( !refcount ) {
446bd890 13547 PerlMemShared_free(trie->charmap);
446bd890
NC
13548 PerlMemShared_free(trie->states);
13549 PerlMemShared_free(trie->trans);
07be1b83 13550 if (trie->bitmap)
446bd890 13551 PerlMemShared_free(trie->bitmap);
786e8c11 13552 if (trie->jump)
446bd890 13553 PerlMemShared_free(trie->jump);
2e64971a 13554 PerlMemShared_free(trie->wordinfo);
446bd890
NC
13555 /* do this last!!!! */
13556 PerlMemShared_free(ri->data->data[n]);
a3621e74 13557 }
07be1b83
YO
13558 }
13559 break;
c277df42 13560 default:
f8fc2ecf 13561 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
13562 }
13563 }
f8fc2ecf
YO
13564 Safefree(ri->data->what);
13565 Safefree(ri->data);
a0d0e21e 13566 }
28d8d7f4 13567
f8fc2ecf 13568 Safefree(ri);
a687059c 13569}
c277df42 13570
a09252eb
NC
13571#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13572#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
13573#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
13574
13575/*
32cd70f6 13576 re_dup - duplicate a regexp.
84da74a7 13577
8233f606
DM
13578 This routine is expected to clone a given regexp structure. It is only
13579 compiled under USE_ITHREADS.
32cd70f6 13580
f8149455
YO
13581 After all of the core data stored in struct regexp is duplicated
13582 the regexp_engine.dupe method is used to copy any private data
13583 stored in the *pprivate pointer. This allows extensions to handle
13584 any duplication it needs to do.
13585
13586 See pregfree() and regfree_internal() if you change anything here.
84da74a7 13587*/
a3c0e9ca 13588#if defined(USE_ITHREADS)
f8149455 13589#ifndef PERL_IN_XSUB_RE
288b8c02
NC
13590void
13591Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 13592{
84da74a7 13593 dVAR;
a86a1ca7 13594 I32 npar;
288b8c02
NC
13595 const struct regexp *r = (const struct regexp *)SvANY(sstr);
13596 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 13597
7918f24d
NC
13598 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13599
84da74a7 13600 npar = r->nparens+1;
f0ab9afb
NC
13601 Newx(ret->offs, npar, regexp_paren_pair);
13602 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 13603 if(ret->swap) {
28d8d7f4 13604 /* no need to copy these */
f0ab9afb 13605 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 13606 }
84da74a7 13607
6057429f 13608 if (ret->substrs) {
32cd70f6
NC
13609 /* Do it this way to avoid reading from *r after the StructCopy().
13610 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13611 cache, it doesn't matter. */
66b1de87
NC
13612 const bool anchored = r->check_substr
13613 ? r->check_substr == r->anchored_substr
13614 : r->check_utf8 == r->anchored_utf8;
785a26d5 13615 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
13616 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13617
32cd70f6
NC
13618 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13619 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13620 ret->float_substr = sv_dup_inc(ret->float_substr, param);
13621 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 13622
32cd70f6
NC
13623 /* check_substr and check_utf8, if non-NULL, point to either their
13624 anchored or float namesakes, and don't hold a second reference. */
13625
13626 if (ret->check_substr) {
13627 if (anchored) {
13628 assert(r->check_utf8 == r->anchored_utf8);
13629 ret->check_substr = ret->anchored_substr;
13630 ret->check_utf8 = ret->anchored_utf8;
13631 } else {
13632 assert(r->check_substr == r->float_substr);
13633 assert(r->check_utf8 == r->float_utf8);
13634 ret->check_substr = ret->float_substr;
13635 ret->check_utf8 = ret->float_utf8;
13636 }
66b1de87
NC
13637 } else if (ret->check_utf8) {
13638 if (anchored) {
13639 ret->check_utf8 = ret->anchored_utf8;
13640 } else {
13641 ret->check_utf8 = ret->float_utf8;
13642 }
32cd70f6 13643 }
6057429f 13644 }
f8149455 13645
5daac39c 13646 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
d63c20f2 13647 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
bcdf7404 13648
6057429f 13649 if (ret->pprivate)
288b8c02 13650 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 13651
288b8c02 13652 if (RX_MATCH_COPIED(dstr))
6057429f 13653 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
13654 else
13655 ret->subbeg = NULL;
13656#ifdef PERL_OLD_COPY_ON_WRITE
13657 ret->saved_copy = NULL;
13658#endif
6057429f 13659
c2123ae3
NC
13660 if (ret->mother_re) {
13661 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13662 /* Our storage points directly to our mother regexp, but that's
13663 1: a buffer in a different thread
13664 2: something we no longer hold a reference on
13665 so we need to copy it locally. */
d5aafdca
FC
13666 /* Note we need to use SvCUR(), rather than
13667 SvLEN(), on our mother_re, because it, in
c2123ae3
NC
13668 turn, may well be pointing to its own mother_re. */
13669 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13670 SvCUR(ret->mother_re)+1));
13671 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13672 }
13673 ret->mother_re = NULL;
13674 }
6057429f 13675 ret->gofs = 0;
f8149455
YO
13676}
13677#endif /* PERL_IN_XSUB_RE */
13678
13679/*
13680 regdupe_internal()
13681
13682 This is the internal complement to regdupe() which is used to copy
13683 the structure pointed to by the *pprivate pointer in the regexp.
13684 This is the core version of the extension overridable cloning hook.
13685 The regexp structure being duplicated will be copied by perl prior
13686 to this and will be provided as the regexp *r argument, however
13687 with the /old/ structures pprivate pointer value. Thus this routine
13688 may override any copying normally done by perl.
13689
13690 It returns a pointer to the new regexp_internal structure.
13691*/
13692
13693void *
288b8c02 13694Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
13695{
13696 dVAR;
288b8c02 13697 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455 13698 regexp_internal *reti;
0780bc72 13699 int len;
f8149455 13700 RXi_GET_DECL(r,ri);
7918f24d
NC
13701
13702 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455 13703
7122b237 13704 len = ProgLen(ri);
f8149455 13705
45cf4570 13706 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455 13707 Copy(ri->program, reti->program, len+1, regnode);
3d2bd50a
DM
13708
13709 reti->num_code_blocks = ri->num_code_blocks;
13710 if (ri->code_blocks) {
b30fcab9 13711 int n;
3d2bd50a
DM
13712 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13713 struct reg_code_block);
13714 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13715 struct reg_code_block);
b30fcab9
DM
13716 for (n = 0; n < ri->num_code_blocks; n++)
13717 reti->code_blocks[n].src_regex = (REGEXP*)
13718 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
3d2bd50a
DM
13719 }
13720 else
13721 reti->code_blocks = NULL;
f8149455 13722
f8fc2ecf 13723 reti->regstclass = NULL;
bcdf7404 13724
f8fc2ecf 13725 if (ri->data) {
84da74a7 13726 struct reg_data *d;
f8fc2ecf 13727 const int count = ri->data->count;
84da74a7
YO
13728 int i;
13729
13730 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13731 char, struct reg_data);
13732 Newx(d->what, count, U8);
13733
13734 d->count = count;
13735 for (i = 0; i < count; i++) {
f8fc2ecf 13736 d->what[i] = ri->data->what[i];
84da74a7 13737 switch (d->what[i]) {
d24ca0c5 13738 /* see also regcomp.h and regfree_internal() */
af534a04 13739 case 'a': /* actually an AV, but the dup function is identical. */
b30fcab9 13740 case 'r':
84da74a7 13741 case 's':
81714fb9 13742 case 'S':
55eed653 13743 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 13744 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 13745 break;
84da74a7
YO
13746 case 'f':
13747 /* This is cheating. */
13748 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 13749 StructCopy(ri->data->data[i], d->data[i],
84da74a7 13750 struct regnode_charclass_class);
f8fc2ecf 13751 reti->regstclass = (regnode*)d->data[i];
84da74a7 13752 break;
23eab42c
NC
13753 case 'T':
13754 /* Trie stclasses are readonly and can thus be shared
13755 * without duplication. We free the stclass in pregfree
13756 * when the corresponding reg_ac_data struct is freed.
13757 */
13758 reti->regstclass= ri->regstclass;
13759 /* Fall through */
84da74a7 13760 case 't':
84da74a7 13761 OP_REFCNT_LOCK;
0536c0a7 13762 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 13763 OP_REFCNT_UNLOCK;
0536c0a7 13764 /* Fall through */
68e2671b 13765 case 'l':
d63c20f2 13766 case 'L':
0536c0a7 13767 d->data[i] = ri->data->data[i];
84da74a7 13768 break;
84da74a7 13769 default:
f8fc2ecf 13770 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
13771 }
13772 }
13773
f8fc2ecf 13774 reti->data = d;
84da74a7
YO
13775 }
13776 else
f8fc2ecf 13777 reti->data = NULL;
84da74a7 13778
cde0cee5
YO
13779 reti->name_list_idx = ri->name_list_idx;
13780
7122b237
YO
13781#ifdef RE_TRACK_PATTERN_OFFSETS
13782 if (ri->u.offsets) {
13783 Newx(reti->u.offsets, 2*len+1, U32);
13784 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13785 }
13786#else
13787 SetProgLen(reti,len);
13788#endif
13789
f8149455 13790 return (void*)reti;
84da74a7 13791}
f8149455
YO
13792
13793#endif /* USE_ITHREADS */
84da74a7 13794
f8149455 13795#ifndef PERL_IN_XSUB_RE
bcdf7404 13796
c277df42
IZ
13797/*
13798 - regnext - dig the "next" pointer out of a node
c277df42
IZ
13799 */
13800regnode *
864dbfa3 13801Perl_regnext(pTHX_ register regnode *p)
c277df42 13802{
97aff369 13803 dVAR;
c277df42
IZ
13804 register I32 offset;
13805
f8fc2ecf 13806 if (!p)
c277df42
IZ
13807 return(NULL);
13808
35db910f
KW
13809 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13810 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13811 }
13812
c277df42
IZ
13813 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13814 if (offset == 0)
13815 return(NULL);
13816
c277df42 13817 return(p+offset);
c277df42 13818}
76234dfb 13819#endif
c277df42 13820
686b73d4 13821STATIC void
cea2e8a9 13822S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
13823{
13824 va_list args;
13825 STRLEN l1 = strlen(pat1);
13826 STRLEN l2 = strlen(pat2);
13827 char buf[512];
06bf62c7 13828 SV *msv;
73d840c0 13829 const char *message;
c277df42 13830
7918f24d
NC
13831 PERL_ARGS_ASSERT_RE_CROAK2;
13832
c277df42
IZ
13833 if (l1 > 510)
13834 l1 = 510;
13835 if (l1 + l2 > 510)
13836 l2 = 510 - l1;
13837 Copy(pat1, buf, l1 , char);
13838 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
13839 buf[l1 + l2] = '\n';
13840 buf[l1 + l2 + 1] = '\0';
8736538c
AS
13841#ifdef I_STDARG
13842 /* ANSI variant takes additional second argument */
c277df42 13843 va_start(args, pat2);
8736538c
AS
13844#else
13845 va_start(args);
13846#endif
5a844595 13847 msv = vmess(buf, &args);
c277df42 13848 va_end(args);
cfd0369c 13849 message = SvPV_const(msv,l1);
c277df42
IZ
13850 if (l1 > 512)
13851 l1 = 512;
13852 Copy(message, buf, l1 , char);
197cf9b9 13853 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 13854 Perl_croak(aTHX_ "%s", buf);
c277df42 13855}
a0ed51b3
LW
13856
13857/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13858
76234dfb 13859#ifndef PERL_IN_XSUB_RE
a0ed51b3 13860void
864dbfa3 13861Perl_save_re_context(pTHX)
b81d288d 13862{
97aff369 13863 dVAR;
1ade1aa1
NC
13864
13865 struct re_save_state *state;
13866
13867 SAVEVPTR(PL_curcop);
13868 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13869
13870 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13871 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 13872 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 13873
46ab3289 13874 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 13875
c445ea15 13876 PL_reg_oldsaved = NULL;
a5db57d6 13877 PL_reg_oldsavedlen = 0;
a5db57d6 13878 PL_reg_maxiter = 0;
a5db57d6 13879 PL_reg_leftiter = 0;
c445ea15 13880 PL_reg_poscache = NULL;
a5db57d6 13881 PL_reg_poscache_size = 0;
1ade1aa1
NC
13882#ifdef PERL_OLD_COPY_ON_WRITE
13883 PL_nrs = NULL;
13884#endif
ada6e8a9 13885
c445ea15
AL
13886 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13887 if (PL_curpm) {
13888 const REGEXP * const rx = PM_GETRE(PL_curpm);
13889 if (rx) {
1df70142 13890 U32 i;
07bc277f 13891 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 13892 char digits[TYPE_CHARS(long)];
d9fad198 13893 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
13894 GV *const *const gvp
13895 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13896
b37c2d43
AL
13897 if (gvp) {
13898 GV * const gv = *gvp;
13899 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13900 save_scalar(gv);
49f27e4b 13901 }
ada6e8a9
AMS
13902 }
13903 }
13904 }
a0ed51b3 13905}
76234dfb 13906#endif
51371543 13907
51371543 13908static void
acfe0abc 13909clear_re(pTHX_ void *r)
51371543 13910{
97aff369 13911 dVAR;
84679df5 13912 ReREFCNT_dec((REGEXP *)r);
51371543 13913}
ffbc6a93 13914
a28509cc
AL
13915#ifdef DEBUGGING
13916
13917STATIC void
13918S_put_byte(pTHX_ SV *sv, int c)
13919{
7918f24d
NC
13920 PERL_ARGS_ASSERT_PUT_BYTE;
13921
7fddd944
NC
13922 /* Our definition of isPRINT() ignores locales, so only bytes that are
13923 not part of UTF-8 are considered printable. I assume that the same
13924 holds for UTF-EBCDIC.
13925 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13926 which Wikipedia says:
13927
13928 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13929 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13930 identical, to the ASCII delete (DEL) or rubout control character.
13931 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
13932 if (!isPRINT(c)) {
13933 if (c < 256) {
13934 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13935 }
13936 else {
13937 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13938 }
13939 }
5e7aa789 13940 else {
88c9ea1e 13941 const char string = c;
5e7aa789
NC
13942 if (c == '-' || c == ']' || c == '\\' || c == '^')
13943 sv_catpvs(sv, "\\");
13944 sv_catpvn(sv, &string, 1);
13945 }
a28509cc
AL
13946}
13947
786e8c11 13948
3dab1dad
YO
13949#define CLEAR_OPTSTART \
13950 if (optstart) STMT_START { \
70685ca0 13951 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
13952 optstart=NULL; \
13953 } STMT_END
13954
786e8c11 13955#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 13956
b5a2f8d8
NC
13957STATIC const regnode *
13958S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
13959 const regnode *last, const regnode *plast,
13960 SV* sv, I32 indent, U32 depth)
a28509cc 13961{
97aff369 13962 dVAR;
786e8c11 13963 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 13964 register const regnode *next;
3dab1dad 13965 const regnode *optstart= NULL;
1f1031fe 13966
f8fc2ecf 13967 RXi_GET_DECL(r,ri);
3dab1dad 13968 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
13969
13970 PERL_ARGS_ASSERT_DUMPUNTIL;
13971
786e8c11
YO
13972#ifdef DEBUG_DUMPUNTIL
13973 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13974 last ? last-start : 0,plast ? plast-start : 0);
13975#endif
13976
13977 if (plast && plast < last)
13978 last= plast;
13979
13980 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 13981 /* While that wasn't END last time... */
a28509cc
AL
13982 NODE_ALIGN(node);
13983 op = OP(node);
de734bd5 13984 if (op == CLOSE || op == WHILEM)
786e8c11 13985 indent--;
b5a2f8d8 13986 next = regnext((regnode *)node);
1f1031fe 13987
a28509cc 13988 /* Where, what. */
8e11feef 13989 if (OP(node) == OPTIMIZED) {
e68ec53f 13990 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 13991 optstart = node;
3dab1dad 13992 else
8e11feef 13993 goto after_print;
3dab1dad
YO
13994 } else
13995 CLEAR_OPTSTART;
686b73d4 13996
32fc9b6a 13997 regprop(r, sv, node);
a28509cc 13998 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 13999 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
14000
14001 if (OP(node) != OPTIMIZED) {
14002 if (next == NULL) /* Next ptr. */
14003 PerlIO_printf(Perl_debug_log, " (0)");
14004 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14005 PerlIO_printf(Perl_debug_log, " (FAIL)");
14006 else
14007 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14008 (void)PerlIO_putc(Perl_debug_log, '\n');
14009 }
14010
a28509cc
AL
14011 after_print:
14012 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
14013 assert(next);
14014 {
14015 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
14016 ? regnext((regnode *)next)
14017 : next);
be8e71aa
YO
14018 if (last && nnode > last)
14019 nnode = last;
786e8c11 14020 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 14021 }
a28509cc
AL
14022 }
14023 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 14024 assert(next);
786e8c11 14025 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
14026 }
14027 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 14028 const regnode *this_trie = node;
1de06328 14029 const char op = OP(node);
647f639f 14030 const U32 n = ARG(node);
1de06328 14031 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 14032 (reg_ac_data *)ri->data->data[n] :
1de06328 14033 NULL;
3251b653
NC
14034 const reg_trie_data * const trie =
14035 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 14036#ifdef DEBUGGING
502c6561 14037 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 14038#endif
786e8c11 14039 const regnode *nextbranch= NULL;
a28509cc 14040 I32 word_idx;
76f68e9b 14041 sv_setpvs(sv, "");
786e8c11 14042 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 14043 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
686b73d4 14044
786e8c11
YO
14045 PerlIO_printf(Perl_debug_log, "%*s%s ",
14046 (int)(2*(indent+3)), "",
14047 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
14048 PL_colors[0], PL_colors[1],
14049 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 14050 PERL_PV_PRETTY_ELLIPSES |
7f69552c 14051 PERL_PV_PRETTY_LTGT
786e8c11
YO
14052 )
14053 : "???"
14054 );
14055 if (trie->jump) {
40d049e4 14056 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
14057 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14058 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
14059 if (dist) {
14060 if (!nextbranch)
24b23f37 14061 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
14062 DUMPUNTIL(this_trie + dist, nextbranch);
14063 }
786e8c11
YO
14064 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14065 nextbranch= regnext((regnode *)nextbranch);
14066 } else {
14067 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 14068 }
786e8c11
YO
14069 }
14070 if (last && next > last)
14071 node= last;
14072 else
14073 node= next;
a28509cc 14074 }
786e8c11
YO
14075 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14076 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14077 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
14078 }
14079 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 14080 assert(next);
786e8c11 14081 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
14082 }
14083 else if ( op == PLUS || op == STAR) {
786e8c11 14084 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc 14085 }
f56b6394 14086 else if (PL_regkind[(U8)op] == ANYOF) {
a28509cc 14087 /* arglen 1 + class block */
4a3ee7a8 14088 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
14089 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14090 node = NEXTOPER(node);
14091 }
14092 else if (PL_regkind[(U8)op] == EXACT) {
14093 /* Literal string, where present. */
14094 node += NODE_SZ_STR(node) - 1;
14095 node = NEXTOPER(node);
14096 }
14097 else {
14098 node = NEXTOPER(node);
14099 node += regarglen[(U8)op];
14100 }
14101 if (op == CURLYX || op == OPEN)
786e8c11 14102 indent++;
a28509cc 14103 }
3dab1dad 14104 CLEAR_OPTSTART;
786e8c11 14105#ifdef DEBUG_DUMPUNTIL
70685ca0 14106 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 14107#endif
1de06328 14108 return node;
a28509cc
AL
14109}
14110
14111#endif /* DEBUGGING */
14112
241d1a3b
NC
14113/*
14114 * Local variables:
14115 * c-indentation-style: bsd
14116 * c-basic-offset: 4
14d04a33 14117 * indent-tabs-mode: nil
241d1a3b
NC
14118 * End:
14119 *
14d04a33 14120 * ex: set ts=8 sts=4 sw=4 et:
37442d52 14121 */