This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #74856] Fix POD syntax in perlapi
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
d4cce5f1 88#ifdef op
11343788 89#undef op
d4cce5f1 90#endif /* op */
11343788 91
fe14fcc3 92#ifdef MSDOS
7e4e8c89 93# if defined(BUGGY_MSC6)
fe14fcc3 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 95# pragma optimize("a",off)
fe14fcc3 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
97# pragma optimize("w",on )
98# endif /* BUGGY_MSC6 */
fe14fcc3
LW
99#endif /* MSDOS */
100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
830247a4 105typedef struct RExC_state_t {
e2509266 106 U32 flags; /* are we folding, multilining? */
830247a4 107 char *precomp; /* uncompiled string. */
288b8c02 108 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 111 char *start; /* Start of input for compile */
830247a4
IZ
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 115 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 116 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 117 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
120 U32 seen;
121 I32 size; /* Code size. */
c74340f9
YO
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 124 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
40d049e4
YO
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
02daf0ab
YO
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
81714fb9 135 HV *paren_names; /* Paren names */
1f1031fe 136
40d049e4
YO
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
3dab1dad 143#ifdef DEBUGGING
be8e71aa 144 const char *lastparse;
3dab1dad 145 I32 lastnum;
1f1031fe 146 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
147#define RExC_lastparse (pRExC_state->lastparse)
148#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 149#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 150#endif
830247a4
IZ
151} RExC_state_t;
152
e2509266 153#define RExC_flags (pRExC_state->flags)
830247a4 154#define RExC_precomp (pRExC_state->precomp)
288b8c02 155#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 156#define RExC_rx (pRExC_state->rx)
f8fc2ecf 157#define RExC_rxi (pRExC_state->rxi)
fac92740 158#define RExC_start (pRExC_state->start)
830247a4
IZ
159#define RExC_end (pRExC_state->end)
160#define RExC_parse (pRExC_state->parse)
161#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
162#ifdef RE_TRACK_PATTERN_OFFSETS
163#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
164#endif
830247a4 165#define RExC_emit (pRExC_state->emit)
fac92740 166#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 167#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
168#define RExC_naughty (pRExC_state->naughty)
169#define RExC_sawback (pRExC_state->sawback)
170#define RExC_seen (pRExC_state->seen)
171#define RExC_size (pRExC_state->size)
172#define RExC_npar (pRExC_state->npar)
e2e6a0f1 173#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
174#define RExC_extralen (pRExC_state->extralen)
175#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 177#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 178#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
179#define RExC_open_parens (pRExC_state->open_parens)
180#define RExC_close_parens (pRExC_state->close_parens)
181#define RExC_opend (pRExC_state->opend)
81714fb9 182#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
183#define RExC_recurse (pRExC_state->recurse)
184#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 185
cde0cee5 186
a687059c
LW
187#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
a687059c 190
35c8bce7
LW
191#ifdef SPSTART
192#undef SPSTART /* dratted cpp namespace... */
193#endif
a687059c
LW
194/*
195 * Flags to be passed up and down.
196 */
a687059c 197#define WORST 0 /* Worst case. */
a3b492c3
YO
198#define HASWIDTH 0x01 /* Known to match non-null strings. */
199#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200#define SPSTART 0x04 /* Starts with * or +. */
201#define TRYAGAIN 0x08 /* Weeded out a declaration. */
202#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 203
3dab1dad
YO
204#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
205
07be1b83
YO
206/* whether trie related optimizations are enabled */
207#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208#define TRIE_STUDY_OPT
786e8c11 209#define FULL_TRIE_STUDY
07be1b83
YO
210#define TRIE_STCLASS
211#endif
1de06328
YO
212
213
40d049e4
YO
214
215#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216#define PBITVAL(paren) (1 << ((paren) & 7))
217#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
220
221
1de06328
YO
222/* About scan_data_t.
223
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
230 in the pattern:
231
232 /FOO[xX]A.*B[xX]BAR/
233
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
237
238 The strings can be composites, for instance
239
240 /(f)(o)(o)/
241
242 will result in a composite fixed substring 'foo'.
243
244 For each string some basic information is maintained:
245
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
252 found.
253
254 - max_offset
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
258
259 - minlenp
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
263 involved. Consider
264
265 /(?=FOO).*F/
266
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
277
278 - lookbehind
279
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
290 associated string.
291
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
294
295*/
2c2d71f5
JH
296
297typedef struct scan_data_t {
1de06328
YO
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
2c2d71f5
JH
300 I32 pos_min;
301 I32 pos_delta;
302 SV *last_found;
1de06328 303 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
304 I32 last_start_min;
305 I32 last_start_max;
1de06328
YO
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
316 I32 flags;
317 I32 whilem_c;
cb434fcc 318 I32 *last_closep;
653099ff 319 struct regnode_charclass_class *start_class;
2c2d71f5
JH
320} scan_data_t;
321
a687059c 322/*
e50aee73 323 * Forward declarations for pregcomp()'s friends.
a687059c 324 */
a0d0e21e 325
27da23d5 326static const scan_data_t zero_scan_data =
1de06328 327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
328
329#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
330#define SF_BEFORE_SEOL 0x0001
331#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
332#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
334
09b7f37c
CB
335#ifdef NO_UNARY_PLUS
336# define SF_FIX_SHIFT_EOL (0+2)
337# define SF_FL_SHIFT_EOL (0+4)
338#else
339# define SF_FIX_SHIFT_EOL (+2)
340# define SF_FL_SHIFT_EOL (+4)
341#endif
c277df42
IZ
342
343#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
345
346#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
348#define SF_IS_INF 0x0040
349#define SF_HAS_PAR 0x0080
350#define SF_IN_PAR 0x0100
351#define SF_HAS_EVAL 0x0200
352#define SCF_DO_SUBSTR 0x0400
653099ff
GS
353#define SCF_DO_STCLASS_AND 0x0800
354#define SCF_DO_STCLASS_OR 0x1000
355#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 356#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 357
786e8c11 358#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 359#define SCF_SEEN_ACCEPT 0x8000
07be1b83 360
eb160463 361#define UTF (RExC_utf8 != 0)
bbe252da
YO
362#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 364
ffc61ed2 365#define OOB_UNICODE 12345678
93733859 366#define OOB_NAMEDCLASS -1
b8c5462f 367
a0ed51b3
LW
368#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
370
8615cb43 371
b45f050a
JF
372/* length of regex to show in messages that don't mark a position within */
373#define RegexLengthToShowInErrorMessages 127
374
375/*
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
379 */
7253e4e3
RK
380#define MARKER1 "<-- HERE" /* marker as it appears in the description */
381#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 382
7253e4e3 383#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
384
385/*
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
388 * "...".
389 */
58e23c8d 390#define _FAIL(code) STMT_START { \
bfed75c6 391 const char *ellipses = ""; \
ccb2c380
MP
392 IV len = RExC_end - RExC_precomp; \
393 \
394 if (!SIZE_ONLY) \
288b8c02 395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
399 ellipses = "..."; \
400 } \
58e23c8d 401 code; \
ccb2c380 402} STMT_END
8615cb43 403
58e23c8d
YO
404#define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
407
408#define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
411
b45f050a 412/*
b45f050a
JF
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
414 */
ccb2c380 415#define Simple_vFAIL(m) STMT_START { \
a28509cc 416 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
423 */
ccb2c380
MP
424#define vFAIL(m) STMT_START { \
425 if (!SIZE_ONLY) \
288b8c02 426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
427 Simple_vFAIL(m); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Like Simple_vFAIL(), but accepts two arguments.
432 */
ccb2c380 433#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 434 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
441 */
ccb2c380
MP
442#define vFAIL2(m,a1) STMT_START { \
443 if (!SIZE_ONLY) \
288b8c02 444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
445 Simple_vFAIL2(m, a1); \
446} STMT_END
b45f050a
JF
447
448
449/*
450 * Like Simple_vFAIL(), but accepts three arguments.
451 */
ccb2c380 452#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 453 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
b45f050a
JF
457
458/*
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
460 */
ccb2c380
MP
461#define vFAIL3(m,a1,a2) STMT_START { \
462 if (!SIZE_ONLY) \
288b8c02 463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
464 Simple_vFAIL3(m, a1, a2); \
465} STMT_END
b45f050a
JF
466
467/*
468 * Like Simple_vFAIL(), but accepts four arguments.
469 */
ccb2c380 470#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 471 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
b45f050a 475
668c081a 476#define ckWARNreg(loc,m) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
480} STMT_END
481
668c081a 482#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
d1d15184 484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
485 m REPORT_LOCATION, \
486 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
487} STMT_END
488
668c081a 489#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 490 const IV offset = loc - RExC_precomp; \
668c081a 491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493} STMT_END
494
495#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 496 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
500
668c081a
NC
501#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
505} STMT_END
506
ccb2c380 507#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
511} STMT_END
512
668c081a
NC
513#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
517} STMT_END
518
ccb2c380 519#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 520 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
523} STMT_END
9d1d55b5 524
8615cb43 525
cd439c50 526/* Allow for side effects in s */
ccb2c380
MP
527#define REGC(c,s) STMT_START { \
528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
529} STMT_END
cd439c50 530
fac92740
MJD
531/* Macros for recording node offsets. 20001227 mjd@plover.com
532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
533 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
534 * Element 0 holds the number n.
07be1b83 535 * Position is 1 indexed.
fac92740 536 */
7122b237
YO
537#ifndef RE_TRACK_PATTERN_OFFSETS
538#define Set_Node_Offset_To_R(node,byte)
539#define Set_Node_Offset(node,byte)
540#define Set_Cur_Node_Offset
541#define Set_Node_Length_To_R(node,len)
542#define Set_Node_Length(node,len)
543#define Set_Node_Cur_Length(node)
544#define Node_Offset(n)
545#define Node_Length(n)
546#define Set_Node_Offset_Length(node,offset,len)
547#define ProgLen(ri) ri->u.proglen
548#define SetProgLen(ri,x) ri->u.proglen = x
549#else
550#define ProgLen(ri) ri->u.offsets[0]
551#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
552#define Set_Node_Offset_To_R(node,byte) STMT_START { \
553 if (! SIZE_ONLY) { \
554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 555 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 556 if((node) < 0) { \
551405c4 557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
558 } else { \
559 RExC_offsets[2*(node)-1] = (byte); \
560 } \
561 } \
562} STMT_END
563
564#define Set_Node_Offset(node,byte) \
565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
567
568#define Set_Node_Length_To_R(node,len) STMT_START { \
569 if (! SIZE_ONLY) { \
570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 571 __LINE__, (int)(node), (int)(len))); \
ccb2c380 572 if((node) < 0) { \
551405c4 573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
574 } else { \
575 RExC_offsets[2*(node)] = (len); \
576 } \
577 } \
578} STMT_END
579
580#define Set_Node_Length(node,len) \
581 Set_Node_Length_To_R((node)-RExC_emit_start, len)
582#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583#define Set_Node_Cur_Length(node) \
584 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
585
586/* Get offsets and lengths */
587#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
589
07be1b83
YO
590#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
593} STMT_END
7122b237 594#endif
07be1b83
YO
595
596#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597#define EXPERIMENTAL_INPLACESCAN
7122b237 598#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 599
304ee84b
YO
600#define DEBUG_STUDYDATA(str,data,depth) \
601DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 602 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
603 "%*s" str "Pos:%"IVdf"/%"IVdf \
604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
605 (int)(depth)*2, "", \
606 (IV)((data)->pos_min), \
607 (IV)((data)->pos_delta), \
304ee84b 608 (UV)((data)->flags), \
1de06328 609 (IV)((data)->whilem_c), \
304ee84b
YO
610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
611 is_inf ? "INF " : "" \
1de06328
YO
612 ); \
613 if ((data)->last_found) \
614 PerlIO_printf(Perl_debug_log, \
615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
617 SvPVX_const((data)->last_found), \
618 (IV)((data)->last_end), \
619 (IV)((data)->last_start_min), \
620 (IV)((data)->last_start_max), \
621 ((data)->longest && \
622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
623 SvPVX_const((data)->longest_fixed), \
624 (IV)((data)->offset_fixed), \
625 ((data)->longest && \
626 (data)->longest==&((data)->longest_float)) ? "*" : "", \
627 SvPVX_const((data)->longest_float), \
628 (IV)((data)->offset_float_min), \
629 (IV)((data)->offset_float_max) \
630 ); \
631 PerlIO_printf(Perl_debug_log,"\n"); \
632});
633
acfe0abc 634static void clear_re(pTHX_ void *r);
4327152a 635
653099ff 636/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 637 Update the longest found anchored substring and the longest found
653099ff
GS
638 floating substrings if needed. */
639
4327152a 640STATIC void
304ee84b 641S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 642{
e1ec3a88
AL
643 const STRLEN l = CHR_SVLEN(data->last_found);
644 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 645 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 646
7918f24d
NC
647 PERL_ARGS_ASSERT_SCAN_COMMIT;
648
c277df42 649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 650 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
651 if (*data->longest == data->longest_fixed) {
652 data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 if (data->flags & SF_BEFORE_EOL)
b81d288d 654 data->flags
c277df42
IZ
655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
656 else
657 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
658 data->minlen_fixed=minlenp;
659 data->lookbehind_fixed=0;
a0ed51b3 660 }
304ee84b 661 else { /* *data->longest == data->longest_float */
c277df42 662 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
663 data->offset_float_max = (l
664 ? data->last_start_max
c277df42 665 : data->pos_min + data->pos_delta);
304ee84b 666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 667 data->offset_float_max = I32_MAX;
c277df42 668 if (data->flags & SF_BEFORE_EOL)
b81d288d 669 data->flags
c277df42
IZ
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
671 else
672 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
673 data->minlen_float=minlenp;
674 data->lookbehind_float=0;
c277df42
IZ
675 }
676 }
677 SvCUR_set(data->last_found, 0);
0eda9292 678 {
a28509cc 679 SV * const sv = data->last_found;
097eb12c
AL
680 if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
682 if (mg)
683 mg->mg_len = 0;
684 }
0eda9292 685 }
c277df42
IZ
686 data->last_end = -1;
687 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 688 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
689}
690
653099ff
GS
691/* Can match anything (initialization) */
692STATIC void
097eb12c 693S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 694{
7918f24d
NC
695 PERL_ARGS_ASSERT_CL_ANYTHING;
696
653099ff 697 ANYOF_CLASS_ZERO(cl);
f8bef550 698 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
700 if (LOC)
701 cl->flags |= ANYOF_LOCALE;
702}
703
704/* Can match anything (initialization) */
705STATIC int
5f66b61c 706S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
707{
708 int value;
709
7918f24d
NC
710 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
711
aaa51d5e 712 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
714 return 1;
1aa99e6b
IH
715 if (!(cl->flags & ANYOF_UNICODE_ALL))
716 return 0;
10edeb5d 717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 718 return 0;
653099ff
GS
719 return 1;
720}
721
722/* Can match anything (initialization) */
723STATIC void
097eb12c 724S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 725{
7918f24d
NC
726 PERL_ARGS_ASSERT_CL_INIT;
727
8ecf7187 728 Zero(cl, 1, struct regnode_charclass_class);
653099ff 729 cl->type = ANYOF;
830247a4 730 cl_anything(pRExC_state, cl);
653099ff
GS
731}
732
733STATIC void
097eb12c 734S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 735{
7918f24d
NC
736 PERL_ARGS_ASSERT_CL_INIT_ZERO;
737
8ecf7187 738 Zero(cl, 1, struct regnode_charclass_class);
653099ff 739 cl->type = ANYOF;
830247a4 740 cl_anything(pRExC_state, cl);
653099ff
GS
741 if (LOC)
742 cl->flags |= ANYOF_LOCALE;
743}
744
745/* 'And' a given class with another one. Can create false positives */
746/* We assume that cl is not inverted */
747STATIC void
5f66b61c 748S_cl_and(struct regnode_charclass_class *cl,
a28509cc 749 const struct regnode_charclass_class *and_with)
653099ff 750{
7918f24d 751 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
752
753 assert(and_with->type == ANYOF);
653099ff
GS
754 if (!(and_with->flags & ANYOF_CLASS)
755 && !(cl->flags & ANYOF_CLASS)
756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 && !(and_with->flags & ANYOF_FOLD)
758 && !(cl->flags & ANYOF_FOLD)) {
759 int i;
760
761 if (and_with->flags & ANYOF_INVERT)
762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 cl->bitmap[i] &= ~and_with->bitmap[i];
764 else
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= and_with->bitmap[i];
767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768 if (!(and_with->flags & ANYOF_EOS))
769 cl->flags &= ~ANYOF_EOS;
1aa99e6b 770
14ebb1a2
JH
771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
773 cl->flags &= ~ANYOF_UNICODE_ALL;
774 cl->flags |= ANYOF_UNICODE;
775 ARG_SET(cl, ARG(and_with));
776 }
14ebb1a2
JH
777 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 779 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 782 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
783}
784
785/* 'OR' a given class with another one. Can create false positives */
786/* We assume that cl is not inverted */
787STATIC void
097eb12c 788S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 789{
7918f24d
NC
790 PERL_ARGS_ASSERT_CL_OR;
791
653099ff
GS
792 if (or_with->flags & ANYOF_INVERT) {
793 /* We do not use
794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 * <= (B1 | !B2) | (CL1 | !CL2)
796 * which is wasteful if CL2 is small, but we ignore CL2:
797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 * XXXX Can we handle case-fold? Unclear:
799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
801 */
802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 && !(or_with->flags & ANYOF_FOLD)
804 && !(cl->flags & ANYOF_FOLD) ) {
805 int i;
806
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] |= ~or_with->bitmap[i];
809 } /* XXXX: logic is complicated otherwise */
810 else {
830247a4 811 cl_anything(pRExC_state, cl);
653099ff
GS
812 }
813 } else {
814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 816 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
817 || (cl->flags & ANYOF_FOLD)) ) {
818 int i;
819
820 /* OR char bitmap and class bitmap separately */
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] |= or_with->bitmap[i];
823 if (or_with->flags & ANYOF_CLASS) {
824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 cl->classflags[i] |= or_with->classflags[i];
826 cl->flags |= ANYOF_CLASS;
827 }
828 }
829 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 830 cl_anything(pRExC_state, cl);
653099ff
GS
831 }
832 }
833 if (or_with->flags & ANYOF_EOS)
834 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
835
836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 ARG(cl) != ARG(or_with)) {
838 cl->flags |= ANYOF_UNICODE_ALL;
839 cl->flags &= ~ANYOF_UNICODE;
840 }
841 if (or_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags |= ANYOF_UNICODE_ALL;
843 cl->flags &= ~ANYOF_UNICODE;
844 }
653099ff
GS
845}
846
a3621e74
YO
847#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
849#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
851
3dab1dad
YO
852
853#ifdef DEBUGGING
07be1b83 854/*
2b8b4781
NC
855 dump_trie(trie,widecharmap,revcharmap)
856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
858
859 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
860 The _interim_ variants are used for debugging the interim
861 tables that are used to generate the final compressed
862 representation which is what dump_trie expects.
863
3dab1dad
YO
864 Part of the reason for their existance is to provide a form
865 of documentation as to how the different representations function.
07be1b83
YO
866
867*/
3dab1dad
YO
868
869/*
3dab1dad
YO
870 Dumps the final compressed table form of the trie to Perl_debug_log.
871 Used for debugging make_trie().
872*/
873
874STATIC void
2b8b4781
NC
875S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 AV *revcharmap, U32 depth)
3dab1dad
YO
877{
878 U32 state;
ab3bbdeb 879 SV *sv=sv_newmortal();
55eed653 880 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
881 GET_RE_DEBUG_FLAGS_DECL;
882
7918f24d 883 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 884
3dab1dad
YO
885 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
886 (int)depth * 2 + 2,"",
887 "Match","Base","Ofs" );
888
889 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 890 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 891 if ( tmp ) {
ab3bbdeb
YO
892 PerlIO_printf( Perl_debug_log, "%*s",
893 colwidth,
ddc5bc0f 894 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
895 PL_colors[0], PL_colors[1],
896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
897 PERL_PV_ESCAPE_FIRSTCHAR
898 )
899 );
3dab1dad
YO
900 }
901 }
902 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
903 (int)depth * 2 + 2,"");
904
905 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 906 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
907 PerlIO_printf( Perl_debug_log, "\n");
908
1e2e3d02 909 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 910 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
911
912 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
913
914 if ( trie->states[ state ].wordnum ) {
915 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
916 } else {
917 PerlIO_printf( Perl_debug_log, "%6s", "" );
918 }
919
920 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
921
922 if ( base ) {
923 U32 ofs = 0;
924
925 while( ( base + ofs < trie->uniquecharcount ) ||
926 ( base + ofs - trie->uniquecharcount < trie->lasttrans
927 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
928 ofs++;
929
930 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
931
932 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
933 if ( ( base + ofs >= trie->uniquecharcount ) &&
934 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
935 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
936 {
ab3bbdeb
YO
937 PerlIO_printf( Perl_debug_log, "%*"UVXf,
938 colwidth,
3dab1dad
YO
939 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
940 } else {
ab3bbdeb 941 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
942 }
943 }
944
945 PerlIO_printf( Perl_debug_log, "]");
946
947 }
948 PerlIO_printf( Perl_debug_log, "\n" );
949 }
950}
951/*
3dab1dad
YO
952 Dumps a fully constructed but uncompressed trie in list form.
953 List tries normally only are used for construction when the number of
954 possible chars (trie->uniquecharcount) is very high.
955 Used for debugging make_trie().
956*/
957STATIC void
55eed653 958S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
959 HV *widecharmap, AV *revcharmap, U32 next_alloc,
960 U32 depth)
3dab1dad
YO
961{
962 U32 state;
ab3bbdeb 963 SV *sv=sv_newmortal();
55eed653 964 int colwidth= widecharmap ? 6 : 4;
3dab1dad 965 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
966
967 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
968
3dab1dad 969 /* print out the table precompression. */
ab3bbdeb
YO
970 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
971 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
972 "------:-----+-----------------\n" );
3dab1dad
YO
973
974 for( state=1 ; state < next_alloc ; state ++ ) {
975 U16 charid;
976
ab3bbdeb 977 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
978 (int)depth * 2 + 2,"", (UV)state );
979 if ( ! trie->states[ state ].wordnum ) {
980 PerlIO_printf( Perl_debug_log, "%5s| ","");
981 } else {
982 PerlIO_printf( Perl_debug_log, "W%4x| ",
983 trie->states[ state ].wordnum
984 );
985 }
986 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 987 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
988 if ( tmp ) {
989 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
990 colwidth,
ddc5bc0f 991 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
992 PL_colors[0], PL_colors[1],
993 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
994 PERL_PV_ESCAPE_FIRSTCHAR
995 ) ,
1e2e3d02
YO
996 TRIE_LIST_ITEM(state,charid).forid,
997 (UV)TRIE_LIST_ITEM(state,charid).newstate
998 );
999 if (!(charid % 10))
664e119d
RGS
1000 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1001 (int)((depth * 2) + 14), "");
1e2e3d02 1002 }
ab3bbdeb
YO
1003 }
1004 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1005 }
1006}
1007
1008/*
3dab1dad
YO
1009 Dumps a fully constructed but uncompressed trie in table form.
1010 This is the normal DFA style state transition table, with a few
1011 twists to facilitate compression later.
1012 Used for debugging make_trie().
1013*/
1014STATIC void
55eed653 1015S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1016 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1017 U32 depth)
3dab1dad
YO
1018{
1019 U32 state;
1020 U16 charid;
ab3bbdeb 1021 SV *sv=sv_newmortal();
55eed653 1022 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1023 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1024
1025 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1026
1027 /*
1028 print out the table precompression so that we can do a visual check
1029 that they are identical.
1030 */
1031
1032 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1033
1034 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1035 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1036 if ( tmp ) {
ab3bbdeb
YO
1037 PerlIO_printf( Perl_debug_log, "%*s",
1038 colwidth,
ddc5bc0f 1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1043 )
1044 );
3dab1dad
YO
1045 }
1046 }
1047
1048 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1049
1050 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1051 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1052 }
1053
1054 PerlIO_printf( Perl_debug_log, "\n" );
1055
1056 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1057
1058 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1059 (int)depth * 2 + 2,"",
1060 (UV)TRIE_NODENUM( state ) );
1061
1062 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1063 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1064 if (v)
1065 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1066 else
1067 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1068 }
1069 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1070 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1071 } else {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1073 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1074 }
1075 }
07be1b83 1076}
3dab1dad
YO
1077
1078#endif
1079
786e8c11
YO
1080/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1081 startbranch: the first branch in the whole branch sequence
1082 first : start branch of sequence of branch-exact nodes.
1083 May be the same as startbranch
1084 last : Thing following the last branch.
1085 May be the same as tail.
1086 tail : item following the branch sequence
1087 count : words in the sequence
1088 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1089 depth : indent depth
3dab1dad 1090
786e8c11 1091Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1092
786e8c11
YO
1093A trie is an N'ary tree where the branches are determined by digital
1094decomposition of the key. IE, at the root node you look up the 1st character and
1095follow that branch repeat until you find the end of the branches. Nodes can be
1096marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1097
786e8c11 1098 /he|she|his|hers/
72f13be8 1099
786e8c11
YO
1100would convert into the following structure. Numbers represent states, letters
1101following numbers represent valid transitions on the letter from that state, if
1102the number is in square brackets it represents an accepting state, otherwise it
1103will be in parenthesis.
07be1b83 1104
786e8c11
YO
1105 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1106 | |
1107 | (2)
1108 | |
1109 (1) +-i->(6)-+-s->[7]
1110 |
1111 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1112
786e8c11
YO
1113 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1114
1115This shows that when matching against the string 'hers' we will begin at state 1
1116read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1117then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1118is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1119single traverse. We store a mapping from accepting to state to which word was
1120matched, and then when we have multiple possibilities we try to complete the
1121rest of the regex in the order in which they occured in the alternation.
1122
1123The only prior NFA like behaviour that would be changed by the TRIE support is
1124the silent ignoring of duplicate alternations which are of the form:
1125
1126 / (DUPE|DUPE) X? (?{ ... }) Y /x
1127
1128Thus EVAL blocks follwing a trie may be called a different number of times with
1129and without the optimisation. With the optimisations dupes will be silently
1130ignored. This inconsistant behaviour of EVAL type nodes is well established as
1131the following demonstrates:
1132
1133 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1134
1135which prints out 'word' three times, but
1136
1137 'words'=~/(word|word|word)(?{ print $1 })S/
1138
1139which doesnt print it out at all. This is due to other optimisations kicking in.
1140
1141Example of what happens on a structural level:
1142
1143The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1144
1145 1: CURLYM[1] {1,32767}(18)
1146 5: BRANCH(8)
1147 6: EXACT <ac>(16)
1148 8: BRANCH(11)
1149 9: EXACT <ad>(16)
1150 11: BRANCH(14)
1151 12: EXACT <ab>(16)
1152 16: SUCCEED(0)
1153 17: NOTHING(18)
1154 18: END(0)
1155
1156This would be optimizable with startbranch=5, first=5, last=16, tail=16
1157and should turn into:
1158
1159 1: CURLYM[1] {1,32767}(18)
1160 5: TRIE(16)
1161 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1162 <ac>
1163 <ad>
1164 <ab>
1165 16: SUCCEED(0)
1166 17: NOTHING(18)
1167 18: END(0)
1168
1169Cases where tail != last would be like /(?foo|bar)baz/:
1170
1171 1: BRANCH(4)
1172 2: EXACT <foo>(8)
1173 4: BRANCH(7)
1174 5: EXACT <bar>(8)
1175 7: TAIL(8)
1176 8: EXACT <baz>(10)
1177 10: END(0)
1178
1179which would be optimizable with startbranch=1, first=1, last=7, tail=8
1180and would end up looking like:
1181
1182 1: TRIE(8)
1183 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1184 <foo>
1185 <bar>
1186 7: TAIL(8)
1187 8: EXACT <baz>(10)
1188 10: END(0)
1189
1190 d = uvuni_to_utf8_flags(d, uv, 0);
1191
1192is the recommended Unicode-aware way of saying
1193
1194 *(d++) = uv;
1195*/
1196
1e2e3d02 1197#define TRIE_STORE_REVCHAR \
786e8c11 1198 STMT_START { \
73031816
NC
1199 if (UTF) { \
1200 SV *zlopp = newSV(2); \
88c9ea1e
CB
1201 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1202 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1203 SvCUR_set(zlopp, kapow - flrbbbbb); \
1204 SvPOK_on(zlopp); \
1205 SvUTF8_on(zlopp); \
1206 av_push(revcharmap, zlopp); \
1207 } else { \
6bdeddd2 1208 char ooooff = (char)uvc; \
73031816
NC
1209 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1210 } \
1211 } STMT_END
786e8c11
YO
1212
1213#define TRIE_READ_CHAR STMT_START { \
1214 wordlen++; \
1215 if ( UTF ) { \
1216 if ( folder ) { \
1217 if ( foldlen > 0 ) { \
1218 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1219 foldlen -= len; \
1220 scan += len; \
1221 len = 0; \
1222 } else { \
1223 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1224 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1225 foldlen -= UNISKIP( uvc ); \
1226 scan = foldbuf + UNISKIP( uvc ); \
1227 } \
1228 } else { \
1229 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1230 } \
1231 } else { \
1232 uvc = (U32)*uc; \
1233 len = 1; \
1234 } \
1235} STMT_END
1236
1237
1238
1239#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1240 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1241 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1242 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1243 } \
1244 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1245 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1246 TRIE_LIST_CUR( state )++; \
1247} STMT_END
07be1b83 1248
786e8c11
YO
1249#define TRIE_LIST_NEW(state) STMT_START { \
1250 Newxz( trie->states[ state ].trans.list, \
1251 4, reg_trie_trans_le ); \
1252 TRIE_LIST_CUR( state ) = 1; \
1253 TRIE_LIST_LEN( state ) = 4; \
1254} STMT_END
07be1b83 1255
786e8c11
YO
1256#define TRIE_HANDLE_WORD(state) STMT_START { \
1257 U16 dupe= trie->states[ state ].wordnum; \
1258 regnode * const noper_next = regnext( noper ); \
1259 \
1260 if (trie->wordlen) \
1261 trie->wordlen[ curword ] = wordlen; \
1262 DEBUG_r({ \
1263 /* store the word for dumping */ \
1264 SV* tmp; \
1265 if (OP(noper) != NOTHING) \
740cce10 1266 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1267 else \
740cce10 1268 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1269 av_push( trie_words, tmp ); \
786e8c11
YO
1270 }); \
1271 \
1272 curword++; \
1273 \
1274 if ( noper_next < tail ) { \
1275 if (!trie->jump) \
c944940b 1276 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1277 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1278 if (!jumper) \
1279 jumper = noper_next; \
1280 if (!nextbranch) \
1281 nextbranch= regnext(cur); \
1282 } \
1283 \
1284 if ( dupe ) { \
1285 /* So it's a dupe. This means we need to maintain a */\
1286 /* linked-list from the first to the next. */\
1287 /* we only allocate the nextword buffer when there */\
1288 /* a dupe, so first time we have to do the allocation */\
1289 if (!trie->nextword) \
c944940b 1290 trie->nextword = (U16 *) \
446bd890 1291 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1292 while ( trie->nextword[dupe] ) \
1293 dupe= trie->nextword[dupe]; \
1294 trie->nextword[dupe]= curword; \
1295 } else { \
1296 /* we haven't inserted this word yet. */ \
1297 trie->states[ state ].wordnum = curword; \
1298 } \
1299} STMT_END
07be1b83 1300
3dab1dad 1301
786e8c11
YO
1302#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1303 ( ( base + charid >= ucharcount \
1304 && base + charid < ubound \
1305 && state == trie->trans[ base - ucharcount + charid ].check \
1306 && trie->trans[ base - ucharcount + charid ].next ) \
1307 ? trie->trans[ base - ucharcount + charid ].next \
1308 : ( state==1 ? special : 0 ) \
1309 )
3dab1dad 1310
786e8c11
YO
1311#define MADE_TRIE 1
1312#define MADE_JUMP_TRIE 2
1313#define MADE_EXACT_TRIE 4
3dab1dad 1314
a3621e74 1315STATIC I32
786e8c11 1316S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1317{
27da23d5 1318 dVAR;
a3621e74
YO
1319 /* first pass, loop through and scan words */
1320 reg_trie_data *trie;
55eed653 1321 HV *widecharmap = NULL;
2b8b4781 1322 AV *revcharmap = newAV();
a3621e74 1323 regnode *cur;
9f7f3913 1324 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1325 STRLEN len = 0;
1326 UV uvc = 0;
1327 U16 curword = 0;
1328 U32 next_alloc = 0;
786e8c11
YO
1329 regnode *jumper = NULL;
1330 regnode *nextbranch = NULL;
7f69552c 1331 regnode *convert = NULL;
a3621e74 1332 /* we just use folder as a flag in utf8 */
e1ec3a88 1333 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1334 ? PL_fold
1335 : ( flags == EXACTFL
1336 ? PL_fold_locale
1337 : NULL
1338 )
1339 );
1340
2b8b4781
NC
1341#ifdef DEBUGGING
1342 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1343 AV *trie_words = NULL;
1344 /* along with revcharmap, this only used during construction but both are
1345 * useful during debugging so we store them in the struct when debugging.
8e11feef 1346 */
2b8b4781
NC
1347#else
1348 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1349 STRLEN trie_charcount=0;
3dab1dad 1350#endif
2b8b4781 1351 SV *re_trie_maxbuff;
a3621e74 1352 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1353
1354 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1355#ifndef DEBUGGING
1356 PERL_UNUSED_ARG(depth);
1357#endif
a3621e74 1358
c944940b 1359 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1360 trie->refcount = 1;
3dab1dad 1361 trie->startstate = 1;
786e8c11 1362 trie->wordcount = word_count;
f8fc2ecf 1363 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1364 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1365 if (!(UTF && folder))
c944940b 1366 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1367 DEBUG_r({
2b8b4781 1368 trie_words = newAV();
a3621e74 1369 });
a3621e74 1370
0111c4fd 1371 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1372 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1373 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1374 }
3dab1dad
YO
1375 DEBUG_OPTIMISE_r({
1376 PerlIO_printf( Perl_debug_log,
786e8c11 1377 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1378 (int)depth * 2 + 2, "",
1379 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1380 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1381 (int)depth);
3dab1dad 1382 });
7f69552c
YO
1383
1384 /* Find the node we are going to overwrite */
1385 if ( first == startbranch && OP( last ) != BRANCH ) {
1386 /* whole branch chain */
1387 convert = first;
1388 } else {
1389 /* branch sub-chain */
1390 convert = NEXTOPER( first );
1391 }
1392
a3621e74
YO
1393 /* -- First loop and Setup --
1394
1395 We first traverse the branches and scan each word to determine if it
1396 contains widechars, and how many unique chars there are, this is
1397 important as we have to build a table with at least as many columns as we
1398 have unique chars.
1399
1400 We use an array of integers to represent the character codes 0..255
38a44b82 1401 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1402 native representation of the character value as the key and IV's for the
1403 coded index.
1404
1405 *TODO* If we keep track of how many times each character is used we can
1406 remap the columns so that the table compression later on is more
1407 efficient in terms of memory by ensuring most common value is in the
1408 middle and the least common are on the outside. IMO this would be better
1409 than a most to least common mapping as theres a decent chance the most
1410 common letter will share a node with the least common, meaning the node
1411 will not be compressable. With a middle is most common approach the worst
1412 case is when we have the least common nodes twice.
1413
1414 */
1415
a3621e74 1416 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1417 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1418 const U8 *uc = (U8*)STRING( noper );
a28509cc 1419 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1420 STRLEN foldlen = 0;
1421 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1422 const U8 *scan = (U8*)NULL;
07be1b83 1423 U32 wordlen = 0; /* required init */
02daf0ab
YO
1424 STRLEN chars = 0;
1425 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1426
3dab1dad
YO
1427 if (OP(noper) == NOTHING) {
1428 trie->minlen= 0;
1429 continue;
1430 }
02daf0ab
YO
1431 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1432 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1433 regardless of encoding */
1434
a3621e74 1435 for ( ; uc < e ; uc += len ) {
3dab1dad 1436 TRIE_CHARCOUNT(trie)++;
a3621e74 1437 TRIE_READ_CHAR;
3dab1dad 1438 chars++;
a3621e74
YO
1439 if ( uvc < 256 ) {
1440 if ( !trie->charmap[ uvc ] ) {
1441 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1442 if ( folder )
1443 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1444 TRIE_STORE_REVCHAR;
a3621e74 1445 }
02daf0ab
YO
1446 if ( set_bit ) {
1447 /* store the codepoint in the bitmap, and if its ascii
1448 also store its folded equivelent. */
1449 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1450
1451 /* store the folded codepoint */
1452 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1453
1454 if ( !UTF ) {
1455 /* store first byte of utf8 representation of
1456 codepoints in the 127 < uvc < 256 range */
1457 if (127 < uvc && uvc < 192) {
1458 TRIE_BITMAP_SET(trie,194);
1459 } else if (191 < uvc ) {
1460 TRIE_BITMAP_SET(trie,195);
1461 /* && uvc < 256 -- we know uvc is < 256 already */
1462 }
1463 }
02daf0ab
YO
1464 set_bit = 0; /* We've done our bit :-) */
1465 }
a3621e74
YO
1466 } else {
1467 SV** svpp;
55eed653
NC
1468 if ( !widecharmap )
1469 widecharmap = newHV();
a3621e74 1470
55eed653 1471 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1472
1473 if ( !svpp )
e4584336 1474 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1475
1476 if ( !SvTRUE( *svpp ) ) {
1477 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1478 TRIE_STORE_REVCHAR;
a3621e74
YO
1479 }
1480 }
1481 }
3dab1dad
YO
1482 if( cur == first ) {
1483 trie->minlen=chars;
1484 trie->maxlen=chars;
1485 } else if (chars < trie->minlen) {
1486 trie->minlen=chars;
1487 } else if (chars > trie->maxlen) {
1488 trie->maxlen=chars;
1489 }
1490
a3621e74
YO
1491 } /* end first pass */
1492 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1493 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1494 (int)depth * 2 + 2,"",
55eed653 1495 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1496 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1497 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1498 );
c944940b 1499 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1500
1501 /*
1502 We now know what we are dealing with in terms of unique chars and
1503 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1504 representation using a flat table will take. If it's over a reasonable
1505 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1506 conservative but potentially much slower representation using an array
1507 of lists.
1508
1509 At the end we convert both representations into the same compressed
1510 form that will be used in regexec.c for matching with. The latter
1511 is a form that cannot be used to construct with but has memory
1512 properties similar to the list form and access properties similar
1513 to the table form making it both suitable for fast searches and
1514 small enough that its feasable to store for the duration of a program.
1515
1516 See the comment in the code where the compressed table is produced
1517 inplace from the flat tabe representation for an explanation of how
1518 the compression works.
1519
1520 */
1521
1522
3dab1dad 1523 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1524 /*
1525 Second Pass -- Array Of Lists Representation
1526
1527 Each state will be represented by a list of charid:state records
1528 (reg_trie_trans_le) the first such element holds the CUR and LEN
1529 points of the allocated array. (See defines above).
1530
1531 We build the initial structure using the lists, and then convert
1532 it into the compressed table form which allows faster lookups
1533 (but cant be modified once converted).
a3621e74
YO
1534 */
1535
a3621e74
YO
1536 STRLEN transcount = 1;
1537
1e2e3d02
YO
1538 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1539 "%*sCompiling trie using list compiler\n",
1540 (int)depth * 2 + 2, ""));
446bd890 1541
c944940b
JH
1542 trie->states = (reg_trie_state *)
1543 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1544 sizeof(reg_trie_state) );
a3621e74
YO
1545 TRIE_LIST_NEW(1);
1546 next_alloc = 2;
1547
1548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1549
c445ea15
AL
1550 regnode * const noper = NEXTOPER( cur );
1551 U8 *uc = (U8*)STRING( noper );
1552 const U8 * const e = uc + STR_LEN( noper );
1553 U32 state = 1; /* required init */
1554 U16 charid = 0; /* sanity init */
1555 U8 *scan = (U8*)NULL; /* sanity init */
1556 STRLEN foldlen = 0; /* required init */
07be1b83 1557 U32 wordlen = 0; /* required init */
c445ea15
AL
1558 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1559
3dab1dad 1560 if (OP(noper) != NOTHING) {
786e8c11 1561 for ( ; uc < e ; uc += len ) {
c445ea15 1562
786e8c11 1563 TRIE_READ_CHAR;
c445ea15 1564
786e8c11
YO
1565 if ( uvc < 256 ) {
1566 charid = trie->charmap[ uvc ];
c445ea15 1567 } else {
55eed653 1568 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1569 if ( !svpp ) {
1570 charid = 0;
1571 } else {
1572 charid=(U16)SvIV( *svpp );
1573 }
c445ea15 1574 }
786e8c11
YO
1575 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1576 if ( charid ) {
a3621e74 1577
786e8c11
YO
1578 U16 check;
1579 U32 newstate = 0;
a3621e74 1580
786e8c11
YO
1581 charid--;
1582 if ( !trie->states[ state ].trans.list ) {
1583 TRIE_LIST_NEW( state );
c445ea15 1584 }
786e8c11
YO
1585 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1586 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1587 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1588 break;
1589 }
1590 }
1591 if ( ! newstate ) {
1592 newstate = next_alloc++;
1593 TRIE_LIST_PUSH( state, charid, newstate );
1594 transcount++;
1595 }
1596 state = newstate;
1597 } else {
1598 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1599 }
a28509cc 1600 }
c445ea15 1601 }
3dab1dad 1602 TRIE_HANDLE_WORD(state);
a3621e74
YO
1603
1604 } /* end second pass */
1605
1e2e3d02
YO
1606 /* next alloc is the NEXT state to be allocated */
1607 trie->statecount = next_alloc;
c944940b
JH
1608 trie->states = (reg_trie_state *)
1609 PerlMemShared_realloc( trie->states,
1610 next_alloc
1611 * sizeof(reg_trie_state) );
a3621e74 1612
3dab1dad 1613 /* and now dump it out before we compress it */
2b8b4781
NC
1614 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1615 revcharmap, next_alloc,
1616 depth+1)
1e2e3d02 1617 );
a3621e74 1618
c944940b
JH
1619 trie->trans = (reg_trie_trans *)
1620 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1621 {
1622 U32 state;
a3621e74
YO
1623 U32 tp = 0;
1624 U32 zp = 0;
1625
1626
1627 for( state=1 ; state < next_alloc ; state ++ ) {
1628 U32 base=0;
1629
1630 /*
1631 DEBUG_TRIE_COMPILE_MORE_r(
1632 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1633 );
1634 */
1635
1636 if (trie->states[state].trans.list) {
1637 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1638 U16 maxid=minid;
a28509cc 1639 U16 idx;
a3621e74
YO
1640
1641 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1642 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1643 if ( forid < minid ) {
1644 minid=forid;
1645 } else if ( forid > maxid ) {
1646 maxid=forid;
1647 }
a3621e74
YO
1648 }
1649 if ( transcount < tp + maxid - minid + 1) {
1650 transcount *= 2;
c944940b
JH
1651 trie->trans = (reg_trie_trans *)
1652 PerlMemShared_realloc( trie->trans,
446bd890
NC
1653 transcount
1654 * sizeof(reg_trie_trans) );
a3621e74
YO
1655 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1656 }
1657 base = trie->uniquecharcount + tp - minid;
1658 if ( maxid == minid ) {
1659 U32 set = 0;
1660 for ( ; zp < tp ; zp++ ) {
1661 if ( ! trie->trans[ zp ].next ) {
1662 base = trie->uniquecharcount + zp - minid;
1663 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1664 trie->trans[ zp ].check = state;
1665 set = 1;
1666 break;
1667 }
1668 }
1669 if ( !set ) {
1670 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1671 trie->trans[ tp ].check = state;
1672 tp++;
1673 zp = tp;
1674 }
1675 } else {
1676 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1677 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1678 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1679 trie->trans[ tid ].check = state;
1680 }
1681 tp += ( maxid - minid + 1 );
1682 }
1683 Safefree(trie->states[ state ].trans.list);
1684 }
1685 /*
1686 DEBUG_TRIE_COMPILE_MORE_r(
1687 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1688 );
1689 */
1690 trie->states[ state ].trans.base=base;
1691 }
cc601c31 1692 trie->lasttrans = tp + 1;
a3621e74
YO
1693 }
1694 } else {
1695 /*
1696 Second Pass -- Flat Table Representation.
1697
1698 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1699 We know that we will need Charcount+1 trans at most to store the data
1700 (one row per char at worst case) So we preallocate both structures
1701 assuming worst case.
1702
1703 We then construct the trie using only the .next slots of the entry
1704 structs.
1705
1706 We use the .check field of the first entry of the node temporarily to
1707 make compression both faster and easier by keeping track of how many non
1708 zero fields are in the node.
1709
1710 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1711 transition.
1712
1713 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1714 number representing the first entry of the node, and state as a
1715 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1716 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1717 are 2 entrys per node. eg:
1718
1719 A B A B
1720 1. 2 4 1. 3 7
1721 2. 0 3 3. 0 5
1722 3. 0 0 5. 0 0
1723 4. 0 0 7. 0 0
1724
1725 The table is internally in the right hand, idx form. However as we also
1726 have to deal with the states array which is indexed by nodenum we have to
1727 use TRIE_NODENUM() to convert.
1728
1729 */
1e2e3d02
YO
1730 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1731 "%*sCompiling trie using table compiler\n",
1732 (int)depth * 2 + 2, ""));
3dab1dad 1733
c944940b
JH
1734 trie->trans = (reg_trie_trans *)
1735 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1736 * trie->uniquecharcount + 1,
1737 sizeof(reg_trie_trans) );
1738 trie->states = (reg_trie_state *)
1739 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1740 sizeof(reg_trie_state) );
a3621e74
YO
1741 next_alloc = trie->uniquecharcount + 1;
1742
3dab1dad 1743
a3621e74
YO
1744 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1745
c445ea15 1746 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1747 const U8 *uc = (U8*)STRING( noper );
1748 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1749
1750 U32 state = 1; /* required init */
1751
1752 U16 charid = 0; /* sanity init */
1753 U32 accept_state = 0; /* sanity init */
1754 U8 *scan = (U8*)NULL; /* sanity init */
1755
1756 STRLEN foldlen = 0; /* required init */
07be1b83 1757 U32 wordlen = 0; /* required init */
a3621e74
YO
1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1759
3dab1dad 1760 if ( OP(noper) != NOTHING ) {
786e8c11 1761 for ( ; uc < e ; uc += len ) {
a3621e74 1762
786e8c11 1763 TRIE_READ_CHAR;
a3621e74 1764
786e8c11
YO
1765 if ( uvc < 256 ) {
1766 charid = trie->charmap[ uvc ];
1767 } else {
55eed653 1768 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1769 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1770 }
786e8c11
YO
1771 if ( charid ) {
1772 charid--;
1773 if ( !trie->trans[ state + charid ].next ) {
1774 trie->trans[ state + charid ].next = next_alloc;
1775 trie->trans[ state ].check++;
1776 next_alloc += trie->uniquecharcount;
1777 }
1778 state = trie->trans[ state + charid ].next;
1779 } else {
1780 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1781 }
1782 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1783 }
a3621e74 1784 }
3dab1dad
YO
1785 accept_state = TRIE_NODENUM( state );
1786 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1787
1788 } /* end second pass */
1789
3dab1dad 1790 /* and now dump it out before we compress it */
2b8b4781
NC
1791 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1792 revcharmap,
1793 next_alloc, depth+1));
a3621e74 1794
a3621e74
YO
1795 {
1796 /*
1797 * Inplace compress the table.*
1798
1799 For sparse data sets the table constructed by the trie algorithm will
1800 be mostly 0/FAIL transitions or to put it another way mostly empty.
1801 (Note that leaf nodes will not contain any transitions.)
1802
1803 This algorithm compresses the tables by eliminating most such
1804 transitions, at the cost of a modest bit of extra work during lookup:
1805
1806 - Each states[] entry contains a .base field which indicates the
1807 index in the state[] array wheres its transition data is stored.
1808
1809 - If .base is 0 there are no valid transitions from that node.
1810
1811 - If .base is nonzero then charid is added to it to find an entry in
1812 the trans array.
1813
1814 -If trans[states[state].base+charid].check!=state then the
1815 transition is taken to be a 0/Fail transition. Thus if there are fail
1816 transitions at the front of the node then the .base offset will point
1817 somewhere inside the previous nodes data (or maybe even into a node
1818 even earlier), but the .check field determines if the transition is
1819 valid.
1820
786e8c11 1821 XXX - wrong maybe?
a3621e74
YO
1822 The following process inplace converts the table to the compressed
1823 table: We first do not compress the root node 1,and mark its all its
1824 .check pointers as 1 and set its .base pointer as 1 as well. This
1825 allows to do a DFA construction from the compressed table later, and
1826 ensures that any .base pointers we calculate later are greater than
1827 0.
1828
1829 - We set 'pos' to indicate the first entry of the second node.
1830
1831 - We then iterate over the columns of the node, finding the first and
1832 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1833 and set the .check pointers accordingly, and advance pos
1834 appropriately and repreat for the next node. Note that when we copy
1835 the next pointers we have to convert them from the original
1836 NODEIDX form to NODENUM form as the former is not valid post
1837 compression.
1838
1839 - If a node has no transitions used we mark its base as 0 and do not
1840 advance the pos pointer.
1841
1842 - If a node only has one transition we use a second pointer into the
1843 structure to fill in allocated fail transitions from other states.
1844 This pointer is independent of the main pointer and scans forward
1845 looking for null transitions that are allocated to a state. When it
1846 finds one it writes the single transition into the "hole". If the
786e8c11 1847 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1848
1849 - Once compressed we can Renew/realloc the structures to release the
1850 excess space.
1851
1852 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1853 specifically Fig 3.47 and the associated pseudocode.
1854
1855 demq
1856 */
a3b680e6 1857 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1858 U32 state, charid;
a3621e74 1859 U32 pos = 0, zp=0;
1e2e3d02 1860 trie->statecount = laststate;
a3621e74
YO
1861
1862 for ( state = 1 ; state < laststate ; state++ ) {
1863 U8 flag = 0;
a28509cc
AL
1864 const U32 stateidx = TRIE_NODEIDX( state );
1865 const U32 o_used = trie->trans[ stateidx ].check;
1866 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1867 trie->trans[ stateidx ].check = 0;
1868
1869 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1870 if ( flag || trie->trans[ stateidx + charid ].next ) {
1871 if ( trie->trans[ stateidx + charid ].next ) {
1872 if (o_used == 1) {
1873 for ( ; zp < pos ; zp++ ) {
1874 if ( ! trie->trans[ zp ].next ) {
1875 break;
1876 }
1877 }
1878 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1879 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ zp ].check = state;
1881 if ( ++zp > pos ) pos = zp;
1882 break;
1883 }
1884 used--;
1885 }
1886 if ( !flag ) {
1887 flag = 1;
1888 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1889 }
1890 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1891 trie->trans[ pos ].check = state;
1892 pos++;
1893 }
1894 }
1895 }
cc601c31 1896 trie->lasttrans = pos + 1;
c944940b
JH
1897 trie->states = (reg_trie_state *)
1898 PerlMemShared_realloc( trie->states, laststate
1899 * sizeof(reg_trie_state) );
a3621e74 1900 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1901 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1902 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1903 (int)depth * 2 + 2,"",
1904 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1905 (IV)next_alloc,
1906 (IV)pos,
a3621e74
YO
1907 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1908 );
1909
1910 } /* end table compress */
1911 }
1e2e3d02
YO
1912 DEBUG_TRIE_COMPILE_MORE_r(
1913 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1914 (int)depth * 2 + 2, "",
1915 (UV)trie->statecount,
1916 (UV)trie->lasttrans)
1917 );
cc601c31 1918 /* resize the trans array to remove unused space */
c944940b
JH
1919 trie->trans = (reg_trie_trans *)
1920 PerlMemShared_realloc( trie->trans, trie->lasttrans
1921 * sizeof(reg_trie_trans) );
a3621e74 1922
3dab1dad 1923 /* and now dump out the compressed format */
2b8b4781 1924 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1925
3dab1dad 1926 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1927 U8 nodetype =(U8)(flags & 0xFF);
1928 char *str=NULL;
786e8c11 1929
07be1b83 1930#ifdef DEBUGGING
e62cc96a 1931 regnode *optimize = NULL;
7122b237
YO
1932#ifdef RE_TRACK_PATTERN_OFFSETS
1933
b57a0404
JH
1934 U32 mjd_offset = 0;
1935 U32 mjd_nodelen = 0;
7122b237
YO
1936#endif /* RE_TRACK_PATTERN_OFFSETS */
1937#endif /* DEBUGGING */
a3621e74 1938 /*
3dab1dad
YO
1939 This means we convert either the first branch or the first Exact,
1940 depending on whether the thing following (in 'last') is a branch
1941 or not and whther first is the startbranch (ie is it a sub part of
1942 the alternation or is it the whole thing.)
1943 Assuming its a sub part we conver the EXACT otherwise we convert
1944 the whole branch sequence, including the first.
a3621e74 1945 */
3dab1dad 1946 /* Find the node we are going to overwrite */
7f69552c 1947 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1948 /* branch sub-chain */
3dab1dad 1949 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1950#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1951 DEBUG_r({
1952 mjd_offset= Node_Offset((convert));
1953 mjd_nodelen= Node_Length((convert));
1954 });
7122b237 1955#endif
7f69552c 1956 /* whole branch chain */
7122b237
YO
1957 }
1958#ifdef RE_TRACK_PATTERN_OFFSETS
1959 else {
7f69552c
YO
1960 DEBUG_r({
1961 const regnode *nop = NEXTOPER( convert );
1962 mjd_offset= Node_Offset((nop));
1963 mjd_nodelen= Node_Length((nop));
1964 });
07be1b83
YO
1965 }
1966 DEBUG_OPTIMISE_r(
1967 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1968 (int)depth * 2 + 2, "",
786e8c11 1969 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1970 );
7122b237 1971#endif
3dab1dad
YO
1972 /* But first we check to see if there is a common prefix we can
1973 split out as an EXACT and put in front of the TRIE node. */
1974 trie->startstate= 1;
55eed653 1975 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1976 U32 state;
1e2e3d02 1977 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1978 U32 ofs = 0;
8e11feef
RGS
1979 I32 idx = -1;
1980 U32 count = 0;
1981 const U32 base = trie->states[ state ].trans.base;
a3621e74 1982
3dab1dad 1983 if ( trie->states[state].wordnum )
8e11feef 1984 count = 1;
a3621e74 1985
8e11feef 1986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1987 if ( ( base + ofs >= trie->uniquecharcount ) &&
1988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1990 {
3dab1dad 1991 if ( ++count > 1 ) {
2b8b4781 1992 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1993 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1994 if ( state == 1 ) break;
3dab1dad
YO
1995 if ( count == 2 ) {
1996 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1997 DEBUG_OPTIMISE_r(
8e11feef
RGS
1998 PerlIO_printf(Perl_debug_log,
1999 "%*sNew Start State=%"UVuf" Class: [",
2000 (int)depth * 2 + 2, "",
786e8c11 2001 (UV)state));
be8e71aa 2002 if (idx >= 0) {
2b8b4781 2003 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2004 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2005
3dab1dad 2006 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2007 if ( folder )
2008 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2009 DEBUG_OPTIMISE_r(
f1f66076 2010 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2011 );
8e11feef
RGS
2012 }
2013 }
2014 TRIE_BITMAP_SET(trie,*ch);
2015 if ( folder )
2016 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2018 }
2019 idx = ofs;
2020 }
3dab1dad
YO
2021 }
2022 if ( count == 1 ) {
2b8b4781 2023 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2024 STRLEN len;
2025 char *ch = SvPV( *tmp, len );
de734bd5
A
2026 DEBUG_OPTIMISE_r({
2027 SV *sv=sv_newmortal();
8e11feef
RGS
2028 PerlIO_printf( Perl_debug_log,
2029 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2030 (int)depth * 2 + 2, "",
de734bd5
A
2031 (UV)state, (UV)idx,
2032 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2033 PL_colors[0], PL_colors[1],
2034 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2035 PERL_PV_ESCAPE_FIRSTCHAR
2036 )
2037 );
2038 });
3dab1dad
YO
2039 if ( state==1 ) {
2040 OP( convert ) = nodetype;
2041 str=STRING(convert);
2042 STR_LEN(convert)=0;
2043 }
c490c714
YO
2044 STR_LEN(convert) += len;
2045 while (len--)
de734bd5 2046 *str++ = *ch++;
8e11feef 2047 } else {
f9049ba1 2048#ifdef DEBUGGING
8e11feef
RGS
2049 if (state>1)
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2051#endif
8e11feef
RGS
2052 break;
2053 }
2054 }
3dab1dad 2055 if (str) {
8e11feef 2056 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2057 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2058 trie->startstate = state;
07be1b83
YO
2059 trie->minlen -= (state - 1);
2060 trie->maxlen -= (state - 1);
33809eae
JH
2061#ifdef DEBUGGING
2062 /* At least the UNICOS C compiler choked on this
2063 * being argument to DEBUG_r(), so let's just have
2064 * it right here. */
2065 if (
2066#ifdef PERL_EXT_RE_BUILD
2067 1
2068#else
2069 DEBUG_r_TEST
2070#endif
2071 ) {
2072 regnode *fix = convert;
2073 U32 word = trie->wordcount;
2074 mjd_nodelen++;
2075 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2076 while( ++fix < n ) {
2077 Set_Node_Offset_Length(fix, 0, 0);
2078 }
2079 while (word--) {
2080 SV ** const tmp = av_fetch( trie_words, word, 0 );
2081 if (tmp) {
2082 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2083 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2084 else
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2086 }
2087 }
2088 }
2089#endif
8e11feef
RGS
2090 if (trie->maxlen) {
2091 convert = n;
2092 } else {
3dab1dad 2093 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2094 DEBUG_r(optimize= n);
3dab1dad
YO
2095 }
2096 }
2097 }
a5ca303d
YO
2098 if (!jumper)
2099 jumper = last;
3dab1dad 2100 if ( trie->maxlen ) {
8e11feef
RGS
2101 NEXT_OFF( convert ) = (U16)(tail - convert);
2102 ARG_SET( convert, data_slot );
786e8c11
YO
2103 /* Store the offset to the first unabsorbed branch in
2104 jump[0], which is otherwise unused by the jump logic.
2105 We use this when dumping a trie and during optimisation. */
2106 if (trie->jump)
7f69552c 2107 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2108
786e8c11
YO
2109 /* XXXX */
2110 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2111 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2112 {
2113 OP( convert ) = TRIEC;
2114 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2115 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2116 trie->bitmap= NULL;
2117 } else
2118 OP( convert ) = TRIE;
a3621e74 2119
3dab1dad
YO
2120 /* store the type in the flags */
2121 convert->flags = nodetype;
a5ca303d
YO
2122 DEBUG_r({
2123 optimize = convert
2124 + NODE_STEP_REGNODE
2125 + regarglen[ OP( convert ) ];
2126 });
2127 /* XXX We really should free up the resource in trie now,
2128 as we won't use them - (which resources?) dmq */
3dab1dad 2129 }
a3621e74 2130 /* needed for dumping*/
e62cc96a 2131 DEBUG_r(if (optimize) {
07be1b83 2132 regnode *opt = convert;
bcdf7404 2133
e62cc96a 2134 while ( ++opt < optimize) {
07be1b83
YO
2135 Set_Node_Offset_Length(opt,0,0);
2136 }
786e8c11
YO
2137 /*
2138 Try to clean up some of the debris left after the
2139 optimisation.
a3621e74 2140 */
786e8c11 2141 while( optimize < jumper ) {
07be1b83 2142 mjd_nodelen += Node_Length((optimize));
a3621e74 2143 OP( optimize ) = OPTIMIZED;
07be1b83 2144 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2145 optimize++;
2146 }
07be1b83 2147 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2148 });
2149 } /* end node insert */
55eed653 2150 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2151#ifdef DEBUGGING
2152 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2153 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2154#else
2155 SvREFCNT_dec(revcharmap);
07be1b83 2156#endif
786e8c11
YO
2157 return trie->jump
2158 ? MADE_JUMP_TRIE
2159 : trie->startstate>1
2160 ? MADE_EXACT_TRIE
2161 : MADE_TRIE;
2162}
2163
2164STATIC void
2165S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2166{
2167/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2168
2169 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2170 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2171 ISBN 0-201-10088-6
2172
2173 We find the fail state for each state in the trie, this state is the longest proper
2174 suffix of the current states 'word' that is also a proper prefix of another word in our
2175 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2176 the DFA not to have to restart after its tried and failed a word at a given point, it
2177 simply continues as though it had been matching the other word in the first place.
2178 Consider
2179 'abcdgu'=~/abcdefg|cdgu/
2180 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2181 fail, which would bring use to the state representing 'd' in the second word where we would
2182 try 'g' and succeed, prodceding to match 'cdgu'.
2183 */
2184 /* add a fail transition */
3251b653
NC
2185 const U32 trie_offset = ARG(source);
2186 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2187 U32 *q;
2188 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2189 const U32 numstates = trie->statecount;
786e8c11
YO
2190 const U32 ubound = trie->lasttrans + ucharcount;
2191 U32 q_read = 0;
2192 U32 q_write = 0;
2193 U32 charid;
2194 U32 base = trie->states[ 1 ].trans.base;
2195 U32 *fail;
2196 reg_ac_data *aho;
2197 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2198 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2199
2200 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2201#ifndef DEBUGGING
2202 PERL_UNUSED_ARG(depth);
2203#endif
2204
2205
2206 ARG_SET( stclass, data_slot );
c944940b 2207 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2208 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2209 aho->trie=trie_offset;
446bd890
NC
2210 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2211 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2212 Newxz( q, numstates, U32);
c944940b 2213 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2214 aho->refcount = 1;
2215 fail = aho->fail;
2216 /* initialize fail[0..1] to be 1 so that we always have
2217 a valid final fail state */
2218 fail[ 0 ] = fail[ 1 ] = 1;
2219
2220 for ( charid = 0; charid < ucharcount ; charid++ ) {
2221 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2222 if ( newstate ) {
2223 q[ q_write ] = newstate;
2224 /* set to point at the root */
2225 fail[ q[ q_write++ ] ]=1;
2226 }
2227 }
2228 while ( q_read < q_write) {
2229 const U32 cur = q[ q_read++ % numstates ];
2230 base = trie->states[ cur ].trans.base;
2231
2232 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2233 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2234 if (ch_state) {
2235 U32 fail_state = cur;
2236 U32 fail_base;
2237 do {
2238 fail_state = fail[ fail_state ];
2239 fail_base = aho->states[ fail_state ].trans.base;
2240 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2241
2242 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2243 fail[ ch_state ] = fail_state;
2244 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2245 {
2246 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2247 }
2248 q[ q_write++ % numstates] = ch_state;
2249 }
2250 }
2251 }
2252 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2253 when we fail in state 1, this allows us to use the
2254 charclass scan to find a valid start char. This is based on the principle
2255 that theres a good chance the string being searched contains lots of stuff
2256 that cant be a start char.
2257 */
2258 fail[ 0 ] = fail[ 1 ] = 0;
2259 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2260 PerlIO_printf(Perl_debug_log,
2261 "%*sStclass Failtable (%"UVuf" states): 0",
2262 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2263 );
786e8c11
YO
2264 for( q_read=1; q_read<numstates; q_read++ ) {
2265 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2266 }
2267 PerlIO_printf(Perl_debug_log, "\n");
2268 });
2269 Safefree(q);
2270 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2271}
2272
786e8c11 2273
a3621e74 2274/*
5d1c421c
JH
2275 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2276 * These need to be revisited when a newer toolchain becomes available.
2277 */
2278#if defined(__sparc64__) && defined(__GNUC__)
2279# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2280# undef SPARC64_GCC_WORKAROUND
2281# define SPARC64_GCC_WORKAROUND 1
2282# endif
2283#endif
2284
07be1b83 2285#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2286 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2287 SV * const mysv=sv_newmortal(); \
2288 regnode *Next = regnext(scan); \
2289 regprop(RExC_rx, mysv, scan); \
7f69552c 2290 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2291 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2292 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2293 }});
07be1b83 2294
1de06328
YO
2295
2296
2297
2298
07be1b83
YO
2299#define JOIN_EXACT(scan,min,flags) \
2300 if (PL_regkind[OP(scan)] == EXACT) \
2301 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2302
be8e71aa 2303STATIC U32
07be1b83
YO
2304S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2305 /* Merge several consecutive EXACTish nodes into one. */
2306 regnode *n = regnext(scan);
2307 U32 stringok = 1;
2308 regnode *next = scan + NODE_SZ_STR(scan);
2309 U32 merged = 0;
2310 U32 stopnow = 0;
2311#ifdef DEBUGGING
2312 regnode *stop = scan;
72f13be8 2313 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2314#else
d47053eb
RGS
2315 PERL_UNUSED_ARG(depth);
2316#endif
7918f24d
NC
2317
2318 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2319#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2320 PERL_UNUSED_ARG(flags);
2321 PERL_UNUSED_ARG(val);
07be1b83 2322#endif
07be1b83
YO
2323 DEBUG_PEEP("join",scan,depth);
2324
2325 /* Skip NOTHING, merge EXACT*. */
2326 while (n &&
2327 ( PL_regkind[OP(n)] == NOTHING ||
2328 (stringok && (OP(n) == OP(scan))))
2329 && NEXT_OFF(n)
2330 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2331
2332 if (OP(n) == TAIL || n > next)
2333 stringok = 0;
2334 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2335 DEBUG_PEEP("skip:",n,depth);
2336 NEXT_OFF(scan) += NEXT_OFF(n);
2337 next = n + NODE_STEP_REGNODE;
2338#ifdef DEBUGGING
2339 if (stringok)
2340 stop = n;
2341#endif
2342 n = regnext(n);
2343 }
2344 else if (stringok) {
786e8c11 2345 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2346 regnode * const nnext = regnext(n);
2347
2348 DEBUG_PEEP("merg",n,depth);
2349
2350 merged++;
2351 if (oldl + STR_LEN(n) > U8_MAX)
2352 break;
2353 NEXT_OFF(scan) += NEXT_OFF(n);
2354 STR_LEN(scan) += STR_LEN(n);
2355 next = n + NODE_SZ_STR(n);
2356 /* Now we can overwrite *n : */
2357 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2358#ifdef DEBUGGING
2359 stop = next - 1;
2360#endif
2361 n = nnext;
2362 if (stopnow) break;
2363 }
2364
d47053eb
RGS
2365#ifdef EXPERIMENTAL_INPLACESCAN
2366 if (flags && !NEXT_OFF(n)) {
2367 DEBUG_PEEP("atch", val, depth);
2368 if (reg_off_by_arg[OP(n)]) {
2369 ARG_SET(n, val - n);
2370 }
2371 else {
2372 NEXT_OFF(n) = val - n;
2373 }
2374 stopnow = 1;
2375 }
07be1b83
YO
2376#endif
2377 }
2378
2379 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2380 /*
2381 Two problematic code points in Unicode casefolding of EXACT nodes:
2382
2383 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2384 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2385
2386 which casefold to
2387
2388 Unicode UTF-8
2389
2390 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2391 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2392
2393 This means that in case-insensitive matching (or "loose matching",
2394 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2395 length of the above casefolded versions) can match a target string
2396 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2397 This would rather mess up the minimum length computation.
2398
2399 What we'll do is to look for the tail four bytes, and then peek
2400 at the preceding two bytes to see whether we need to decrease
2401 the minimum length by four (six minus two).
2402
2403 Thanks to the design of UTF-8, there cannot be false matches:
2404 A sequence of valid UTF-8 bytes cannot be a subsequence of
2405 another valid sequence of UTF-8 bytes.
2406
2407 */
2408 char * const s0 = STRING(scan), *s, *t;
2409 char * const s1 = s0 + STR_LEN(scan) - 1;
2410 char * const s2 = s1 - 4;
e294cc5d
JH
2411#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2412 const char t0[] = "\xaf\x49\xaf\x42";
2413#else
07be1b83 2414 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2415#endif
07be1b83
YO
2416 const char * const t1 = t0 + 3;
2417
2418 for (s = s0 + 2;
2419 s < s2 && (t = ninstr(s, s1, t0, t1));
2420 s = t + 4) {
e294cc5d
JH
2421#ifdef EBCDIC
2422 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2423 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2424#else
07be1b83
YO
2425 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2426 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2427#endif
07be1b83
YO
2428 *min -= 4;
2429 }
2430 }
2431
2432#ifdef DEBUGGING
2433 /* Allow dumping */
2434 n = scan + NODE_SZ_STR(scan);
2435 while (n <= stop) {
2436 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2437 OP(n) = OPTIMIZED;
2438 NEXT_OFF(n) = 0;
2439 }
2440 n++;
2441 }
2442#endif
2443 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2444 return stopnow;
2445}
2446
653099ff
GS
2447/* REx optimizer. Converts nodes into quickier variants "in place".
2448 Finds fixed substrings. */
2449
a0288114 2450/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2451 to the position after last scanned or to NULL. */
2452
40d049e4
YO
2453#define INIT_AND_WITHP \
2454 assert(!and_withp); \
2455 Newx(and_withp,1,struct regnode_charclass_class); \
2456 SAVEFREEPV(and_withp)
07be1b83 2457
b515a41d
YO
2458/* this is a chain of data about sub patterns we are processing that
2459 need to be handled seperately/specially in study_chunk. Its so
2460 we can simulate recursion without losing state. */
2461struct scan_frame;
2462typedef struct scan_frame {
2463 regnode *last; /* last node to process in this frame */
2464 regnode *next; /* next node to process when last is reached */
2465 struct scan_frame *prev; /*previous frame*/
2466 I32 stop; /* what stopparen do we use */
2467} scan_frame;
2468
304ee84b
YO
2469
2470#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2471
e1d1eefb
YO
2472#define CASE_SYNST_FNC(nAmE) \
2473case nAmE: \
2474 if (flags & SCF_DO_STCLASS_AND) { \
2475 for (value = 0; value < 256; value++) \
2476 if (!is_ ## nAmE ## _cp(value)) \
2477 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2478 } \
2479 else { \
2480 for (value = 0; value < 256; value++) \
2481 if (is_ ## nAmE ## _cp(value)) \
2482 ANYOF_BITMAP_SET(data->start_class, value); \
2483 } \
2484 break; \
2485case N ## nAmE: \
2486 if (flags & SCF_DO_STCLASS_AND) { \
2487 for (value = 0; value < 256; value++) \
2488 if (is_ ## nAmE ## _cp(value)) \
2489 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2490 } \
2491 else { \
2492 for (value = 0; value < 256; value++) \
2493 if (!is_ ## nAmE ## _cp(value)) \
2494 ANYOF_BITMAP_SET(data->start_class, value); \
2495 } \
2496 break
2497
2498
2499
76e3520e 2500STATIC I32
40d049e4 2501S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2502 I32 *minlenp, I32 *deltap,
40d049e4
YO
2503 regnode *last,
2504 scan_data_t *data,
2505 I32 stopparen,
2506 U8* recursed,
2507 struct regnode_charclass_class *and_withp,
2508 U32 flags, U32 depth)
c277df42
IZ
2509 /* scanp: Start here (read-write). */
2510 /* deltap: Write maxlen-minlen here. */
2511 /* last: Stop before this one. */
40d049e4
YO
2512 /* data: string data about the pattern */
2513 /* stopparen: treat close N as END */
2514 /* recursed: which subroutines have we recursed into */
2515 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2516{
97aff369 2517 dVAR;
c277df42
IZ
2518 I32 min = 0, pars = 0, code;
2519 regnode *scan = *scanp, *next;
2520 I32 delta = 0;
2521 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2522 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2523 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2524 scan_data_t data_fake;
a3621e74 2525 SV *re_trie_maxbuff = NULL;
786e8c11 2526 regnode *first_non_open = scan;
e2e6a0f1 2527 I32 stopmin = I32_MAX;
8aa23a47 2528 scan_frame *frame = NULL;
a3621e74 2529 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2530
7918f24d
NC
2531 PERL_ARGS_ASSERT_STUDY_CHUNK;
2532
13a24bad 2533#ifdef DEBUGGING
40d049e4 2534 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2535#endif
40d049e4 2536
786e8c11 2537 if ( depth == 0 ) {
40d049e4 2538 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2539 first_non_open=regnext(first_non_open);
2540 }
2541
b81d288d 2542
8aa23a47
YO
2543 fake_study_recurse:
2544 while ( scan && OP(scan) != END && scan < last ){
2545 /* Peephole optimizer: */
304ee84b 2546 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2547 DEBUG_PEEP("Peep",scan,depth);
2548 JOIN_EXACT(scan,&min,0);
2549
2550 /* Follow the next-chain of the current node and optimize
2551 away all the NOTHINGs from it. */
2552 if (OP(scan) != CURLYX) {
2553 const int max = (reg_off_by_arg[OP(scan)]
2554 ? I32_MAX
2555 /* I32 may be smaller than U16 on CRAYs! */
2556 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2557 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2558 int noff;
2559 regnode *n = scan;
2560
2561 /* Skip NOTHING and LONGJMP. */
2562 while ((n = regnext(n))
2563 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2564 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2565 && off + noff < max)
2566 off += noff;
2567 if (reg_off_by_arg[OP(scan)])
2568 ARG(scan) = off;
2569 else
2570 NEXT_OFF(scan) = off;
2571 }
a3621e74 2572
c277df42 2573
8aa23a47
YO
2574
2575 /* The principal pseudo-switch. Cannot be a switch, since we
2576 look into several different things. */
2577 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2578 || OP(scan) == IFTHEN) {
2579 next = regnext(scan);
2580 code = OP(scan);
2581 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2582
2583 if (OP(next) == code || code == IFTHEN) {
2584 /* NOTE - There is similar code to this block below for handling
2585 TRIE nodes on a re-study. If you change stuff here check there
2586 too. */
2587 I32 max1 = 0, min1 = I32_MAX, num = 0;
2588 struct regnode_charclass_class accum;
2589 regnode * const startbranch=scan;
2590
2591 if (flags & SCF_DO_SUBSTR)
304ee84b 2592 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2593 if (flags & SCF_DO_STCLASS)
2594 cl_init_zero(pRExC_state, &accum);
2595
2596 while (OP(scan) == code) {
2597 I32 deltanext, minnext, f = 0, fake;
2598 struct regnode_charclass_class this_class;
2599
2600 num++;
2601 data_fake.flags = 0;
2602 if (data) {
2603 data_fake.whilem_c = data->whilem_c;
2604 data_fake.last_closep = data->last_closep;
2605 }
2606 else
2607 data_fake.last_closep = &fake;
58e23c8d
YO
2608
2609 data_fake.pos_delta = delta;
8aa23a47
YO
2610 next = regnext(scan);
2611 scan = NEXTOPER(scan);
2612 if (code != BRANCH)
c277df42 2613 scan = NEXTOPER(scan);
8aa23a47
YO
2614 if (flags & SCF_DO_STCLASS) {
2615 cl_init(pRExC_state, &this_class);
2616 data_fake.start_class = &this_class;
2617 f = SCF_DO_STCLASS_AND;
58e23c8d 2618 }
8aa23a47
YO
2619 if (flags & SCF_WHILEM_VISITED_POS)
2620 f |= SCF_WHILEM_VISITED_POS;
2621
2622 /* we suppose the run is continuous, last=next...*/
2623 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2624 next, &data_fake,
2625 stopparen, recursed, NULL, f,depth+1);
2626 if (min1 > minnext)
2627 min1 = minnext;
2628 if (max1 < minnext + deltanext)
2629 max1 = minnext + deltanext;
2630 if (deltanext == I32_MAX)
2631 is_inf = is_inf_internal = 1;
2632 scan = next;
2633 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2634 pars++;
2635 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2636 if ( stopmin > minnext)
2637 stopmin = min + min1;
2638 flags &= ~SCF_DO_SUBSTR;
2639 if (data)
2640 data->flags |= SCF_SEEN_ACCEPT;
2641 }
2642 if (data) {
2643 if (data_fake.flags & SF_HAS_EVAL)
2644 data->flags |= SF_HAS_EVAL;
2645 data->whilem_c = data_fake.whilem_c;
3dab1dad 2646 }
8aa23a47
YO
2647 if (flags & SCF_DO_STCLASS)
2648 cl_or(pRExC_state, &accum, &this_class);
2649 }
2650 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2651 min1 = 0;
2652 if (flags & SCF_DO_SUBSTR) {
2653 data->pos_min += min1;
2654 data->pos_delta += max1 - min1;
2655 if (max1 != min1 || is_inf)
2656 data->longest = &(data->longest_float);
2657 }
2658 min += min1;
2659 delta += max1 - min1;
2660 if (flags & SCF_DO_STCLASS_OR) {
2661 cl_or(pRExC_state, data->start_class, &accum);
2662 if (min1) {
2663 cl_and(data->start_class, and_withp);
2664 flags &= ~SCF_DO_STCLASS;
653099ff 2665 }
8aa23a47
YO
2666 }
2667 else if (flags & SCF_DO_STCLASS_AND) {
2668 if (min1) {
2669 cl_and(data->start_class, &accum);
2670 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2671 }
8aa23a47
YO
2672 else {
2673 /* Switch to OR mode: cache the old value of
2674 * data->start_class */
2675 INIT_AND_WITHP;
2676 StructCopy(data->start_class, and_withp,
2677 struct regnode_charclass_class);
2678 flags &= ~SCF_DO_STCLASS_AND;
2679 StructCopy(&accum, data->start_class,
2680 struct regnode_charclass_class);
2681 flags |= SCF_DO_STCLASS_OR;
2682 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2683 }
8aa23a47 2684 }
a3621e74 2685
8aa23a47
YO
2686 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2687 /* demq.
a3621e74 2688
8aa23a47
YO
2689 Assuming this was/is a branch we are dealing with: 'scan' now
2690 points at the item that follows the branch sequence, whatever
2691 it is. We now start at the beginning of the sequence and look
2692 for subsequences of
a3621e74 2693
8aa23a47
YO
2694 BRANCH->EXACT=>x1
2695 BRANCH->EXACT=>x2
2696 tail
a3621e74 2697
8aa23a47 2698 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2699
8aa23a47
YO
2700 If we can find such a subseqence we need to turn the first
2701 element into a trie and then add the subsequent branch exact
2702 strings to the trie.
a3621e74 2703
8aa23a47 2704 We have two cases
a3621e74 2705
8aa23a47 2706 1. patterns where the whole set of branch can be converted.
a3621e74 2707
8aa23a47 2708 2. patterns where only a subset can be converted.
a3621e74 2709
8aa23a47
YO
2710 In case 1 we can replace the whole set with a single regop
2711 for the trie. In case 2 we need to keep the start and end
2712 branchs so
a3621e74 2713
8aa23a47
YO
2714 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2715 becomes BRANCH TRIE; BRANCH X;
786e8c11 2716
8aa23a47
YO
2717 There is an additional case, that being where there is a
2718 common prefix, which gets split out into an EXACT like node
2719 preceding the TRIE node.
a3621e74 2720
8aa23a47
YO
2721 If x(1..n)==tail then we can do a simple trie, if not we make
2722 a "jump" trie, such that when we match the appropriate word
2723 we "jump" to the appopriate tail node. Essentailly we turn
2724 a nested if into a case structure of sorts.
b515a41d 2725
8aa23a47
YO
2726 */
2727
2728 int made=0;
2729 if (!re_trie_maxbuff) {
2730 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2731 if (!SvIOK(re_trie_maxbuff))
2732 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2733 }
2734 if ( SvIV(re_trie_maxbuff)>=0 ) {
2735 regnode *cur;
2736 regnode *first = (regnode *)NULL;
2737 regnode *last = (regnode *)NULL;
2738 regnode *tail = scan;
2739 U8 optype = 0;
2740 U32 count=0;
a3621e74
YO
2741
2742#ifdef DEBUGGING
8aa23a47 2743 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2744#endif
8aa23a47
YO
2745 /* var tail is used because there may be a TAIL
2746 regop in the way. Ie, the exacts will point to the
2747 thing following the TAIL, but the last branch will
2748 point at the TAIL. So we advance tail. If we
2749 have nested (?:) we may have to move through several
2750 tails.
2751 */
2752
2753 while ( OP( tail ) == TAIL ) {
2754 /* this is the TAIL generated by (?:) */
2755 tail = regnext( tail );
2756 }
a3621e74 2757
8aa23a47
YO
2758
2759 DEBUG_OPTIMISE_r({
2760 regprop(RExC_rx, mysv, tail );
2761 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2762 (int)depth * 2 + 2, "",
2763 "Looking for TRIE'able sequences. Tail node is: ",
2764 SvPV_nolen_const( mysv )
2765 );
2766 });
2767
2768 /*
2769
2770 step through the branches, cur represents each
2771 branch, noper is the first thing to be matched
2772 as part of that branch and noper_next is the
2773 regnext() of that node. if noper is an EXACT
2774 and noper_next is the same as scan (our current
2775 position in the regex) then the EXACT branch is
2776 a possible optimization target. Once we have
2777 two or more consequetive such branches we can
2778 create a trie of the EXACT's contents and stich
2779 it in place. If the sequence represents all of
2780 the branches we eliminate the whole thing and
2781 replace it with a single TRIE. If it is a
2782 subsequence then we need to stitch it in. This
2783 means the first branch has to remain, and needs
2784 to be repointed at the item on the branch chain
2785 following the last branch optimized. This could
2786 be either a BRANCH, in which case the
2787 subsequence is internal, or it could be the
2788 item following the branch sequence in which
2789 case the subsequence is at the end.
2790
2791 */
2792
2793 /* dont use tail as the end marker for this traverse */
2794 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2795 regnode * const noper = NEXTOPER( cur );
b515a41d 2796#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2797 regnode * const noper_next = regnext( noper );
b515a41d
YO
2798#endif
2799
8aa23a47
YO
2800 DEBUG_OPTIMISE_r({
2801 regprop(RExC_rx, mysv, cur);
2802 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2803 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2804
2805 regprop(RExC_rx, mysv, noper);
2806 PerlIO_printf( Perl_debug_log, " -> %s",
2807 SvPV_nolen_const(mysv));
2808
2809 if ( noper_next ) {
2810 regprop(RExC_rx, mysv, noper_next );
2811 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2812 SvPV_nolen_const(mysv));
2813 }
2814 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2815 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2816 });
2817 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2818 : PL_regkind[ OP( noper ) ] == EXACT )
2819 || OP(noper) == NOTHING )
786e8c11 2820#ifdef NOJUMPTRIE
8aa23a47 2821 && noper_next == tail
786e8c11 2822#endif
8aa23a47
YO
2823 && count < U16_MAX)
2824 {
2825 count++;
2826 if ( !first || optype == NOTHING ) {
2827 if (!first) first = cur;
2828 optype = OP( noper );
2829 } else {
2830 last = cur;
2831 }
2832 } else {
a0a388a1 2833/*
0abd0d78
YO
2834 Currently we do not believe that the trie logic can
2835 handle case insensitive matching properly when the
2836 pattern is not unicode (thus forcing unicode semantics).
2837
2838 If/when this is fixed the following define can be swapped
2839 in below to fully enable trie logic.
2840
a0a388a1 2841#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2842
2843*/
2844#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2845
a0a388a1 2846 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2847 make_trie( pRExC_state,
2848 startbranch, first, cur, tail, count,
2849 optype, depth+1 );
2850 }
2851 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2852#ifdef NOJUMPTRIE
8aa23a47 2853 && noper_next == tail
786e8c11 2854#endif
8aa23a47
YO
2855 ){
2856 count = 1;
2857 first = cur;
2858 optype = OP( noper );
2859 } else {
2860 count = 0;
2861 first = NULL;
2862 optype = 0;
2863 }
2864 last = NULL;
2865 }
2866 }
2867 DEBUG_OPTIMISE_r({
2868 regprop(RExC_rx, mysv, cur);
2869 PerlIO_printf( Perl_debug_log,
2870 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2871 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2872
2873 });
a0a388a1
YO
2874
2875 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2876 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2877#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2878 if ( ((made == MADE_EXACT_TRIE &&
2879 startbranch == first)
2880 || ( first_non_open == first )) &&
2881 depth==0 ) {
2882 flags |= SCF_TRIE_RESTUDY;
2883 if ( startbranch == first
2884 && scan == tail )
2885 {
2886 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2887 }
2888 }
3dab1dad 2889#endif
8aa23a47
YO
2890 }
2891 }
2892
2893 } /* do trie */
2894
653099ff 2895 }
8aa23a47
YO
2896 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2897 scan = NEXTOPER(NEXTOPER(scan));
2898 } else /* single branch is optimized. */
2899 scan = NEXTOPER(scan);
2900 continue;
2901 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2902 scan_frame *newframe = NULL;
2903 I32 paren;
2904 regnode *start;
2905 regnode *end;
2906
2907 if (OP(scan) != SUSPEND) {
2908 /* set the pointer */
2909 if (OP(scan) == GOSUB) {
2910 paren = ARG(scan);
2911 RExC_recurse[ARG2L(scan)] = scan;
2912 start = RExC_open_parens[paren-1];
2913 end = RExC_close_parens[paren-1];
2914 } else {
2915 paren = 0;
f8fc2ecf 2916 start = RExC_rxi->program + 1;
8aa23a47
YO
2917 end = RExC_opend;
2918 }
2919 if (!recursed) {
2920 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2921 SAVEFREEPV(recursed);
2922 }
2923 if (!PAREN_TEST(recursed,paren+1)) {
2924 PAREN_SET(recursed,paren+1);
2925 Newx(newframe,1,scan_frame);
2926 } else {
2927 if (flags & SCF_DO_SUBSTR) {
304ee84b 2928 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2929 data->longest = &(data->longest_float);
2930 }
2931 is_inf = is_inf_internal = 1;
2932 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2933 cl_anything(pRExC_state, data->start_class);
2934 flags &= ~SCF_DO_STCLASS;
2935 }
2936 } else {
2937 Newx(newframe,1,scan_frame);
2938 paren = stopparen;
2939 start = scan+2;
2940 end = regnext(scan);
2941 }
2942 if (newframe) {
2943 assert(start);
2944 assert(end);
2945 SAVEFREEPV(newframe);
2946 newframe->next = regnext(scan);
2947 newframe->last = last;
2948 newframe->stop = stopparen;
2949 newframe->prev = frame;
2950
2951 frame = newframe;
2952 scan = start;
2953 stopparen = paren;
2954 last = end;
2955
2956 continue;
2957 }
2958 }
2959 else if (OP(scan) == EXACT) {
2960 I32 l = STR_LEN(scan);
2961 UV uc;
2962 if (UTF) {
2963 const U8 * const s = (U8*)STRING(scan);
2964 l = utf8_length(s, s + l);
2965 uc = utf8_to_uvchr(s, NULL);
2966 } else {
2967 uc = *((U8*)STRING(scan));
2968 }
2969 min += l;
2970 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2971 /* The code below prefers earlier match for fixed
2972 offset, later match for variable offset. */
2973 if (data->last_end == -1) { /* Update the start info. */
2974 data->last_start_min = data->pos_min;
2975 data->last_start_max = is_inf
2976 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2977 }
8aa23a47
YO
2978 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2979 if (UTF)
2980 SvUTF8_on(data->last_found);
2981 {
2982 SV * const sv = data->last_found;
2983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2985 if (mg && mg->mg_len >= 0)
2986 mg->mg_len += utf8_length((U8*)STRING(scan),
2987 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2988 }
8aa23a47
YO
2989 data->last_end = data->pos_min + l;
2990 data->pos_min += l; /* As in the first entry. */
2991 data->flags &= ~SF_BEFORE_EOL;
2992 }
2993 if (flags & SCF_DO_STCLASS_AND) {
2994 /* Check whether it is compatible with what we know already! */
2995 int compat = 1;
2996
2997 if (uc >= 0x100 ||
2998 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2999 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3000 && (!(data->start_class->flags & ANYOF_FOLD)
3001 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3002 )
3003 compat = 0;
3004 ANYOF_CLASS_ZERO(data->start_class);
3005 ANYOF_BITMAP_ZERO(data->start_class);
3006 if (compat)
3007 ANYOF_BITMAP_SET(data->start_class, uc);
3008 data->start_class->flags &= ~ANYOF_EOS;
3009 if (uc < 0x100)
3010 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3011 }
3012 else if (flags & SCF_DO_STCLASS_OR) {
3013 /* false positive possible if the class is case-folded */
3014 if (uc < 0x100)
3015 ANYOF_BITMAP_SET(data->start_class, uc);
3016 else
3017 data->start_class->flags |= ANYOF_UNICODE_ALL;
3018 data->start_class->flags &= ~ANYOF_EOS;
3019 cl_and(data->start_class, and_withp);
3020 }
3021 flags &= ~SCF_DO_STCLASS;
3022 }
3023 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3024 I32 l = STR_LEN(scan);
3025 UV uc = *((U8*)STRING(scan));
3026
3027 /* Search for fixed substrings supports EXACT only. */
3028 if (flags & SCF_DO_SUBSTR) {
3029 assert(data);
304ee84b 3030 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3031 }
3032 if (UTF) {
3033 const U8 * const s = (U8 *)STRING(scan);
3034 l = utf8_length(s, s + l);
3035 uc = utf8_to_uvchr(s, NULL);
3036 }
3037 min += l;
3038 if (flags & SCF_DO_SUBSTR)
3039 data->pos_min += l;
3040 if (flags & SCF_DO_STCLASS_AND) {
3041 /* Check whether it is compatible with what we know already! */
3042 int compat = 1;
3043
3044 if (uc >= 0x100 ||
3045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3048 compat = 0;
3049 ANYOF_CLASS_ZERO(data->start_class);
3050 ANYOF_BITMAP_ZERO(data->start_class);
3051 if (compat) {
3052 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3053 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3054 data->start_class->flags |= ANYOF_FOLD;
3055 if (OP(scan) == EXACTFL)
3056 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3057 }
8aa23a47
YO
3058 }
3059 else if (flags & SCF_DO_STCLASS_OR) {
3060 if (data->start_class->flags & ANYOF_FOLD) {
3061 /* false positive possible if the class is case-folded.
3062 Assume that the locale settings are the same... */
1aa99e6b
IH
3063 if (uc < 0x100)
3064 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3065 data->start_class->flags &= ~ANYOF_EOS;
3066 }
8aa23a47 3067 cl_and(data->start_class, and_withp);
653099ff 3068 }
8aa23a47
YO
3069 flags &= ~SCF_DO_STCLASS;
3070 }
3071 else if (strchr((const char*)PL_varies,OP(scan))) {
3072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3073 I32 f = flags, pos_before = 0;
3074 regnode * const oscan = scan;
3075 struct regnode_charclass_class this_class;
3076 struct regnode_charclass_class *oclass = NULL;
3077 I32 next_is_eval = 0;
3078
3079 switch (PL_regkind[OP(scan)]) {
3080 case WHILEM: /* End of (?:...)* . */
3081 scan = NEXTOPER(scan);
3082 goto finish;
3083 case PLUS:
3084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3085 next = NEXTOPER(scan);
3086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3087 mincount = 1;
3088 maxcount = REG_INFTY;
3089 next = regnext(scan);
3090 scan = NEXTOPER(scan);
3091 goto do_curly;
3092 }
3093 }
3094 if (flags & SCF_DO_SUBSTR)
3095 data->pos_min++;
3096 min++;
3097 /* Fall through. */
3098 case STAR:
3099 if (flags & SCF_DO_STCLASS) {
3100 mincount = 0;
3101 maxcount = REG_INFTY;
3102 next = regnext(scan);
3103 scan = NEXTOPER(scan);
3104 goto do_curly;
3105 }
3106 is_inf = is_inf_internal = 1;
3107 scan = regnext(scan);
c277df42 3108 if (flags & SCF_DO_SUBSTR) {
304ee84b 3109 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3110 data->longest = &(data->longest_float);
c277df42 3111 }
8aa23a47
YO
3112 goto optimize_curly_tail;
3113 case CURLY:
3114 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3115 && (scan->flags == stopparen))
3116 {
3117 mincount = 1;
3118 maxcount = 1;
3119 } else {
3120 mincount = ARG1(scan);
3121 maxcount = ARG2(scan);
653099ff 3122 }
8aa23a47
YO
3123 next = regnext(scan);
3124 if (OP(scan) == CURLYX) {
3125 I32 lp = (data ? *(data->last_closep) : 0);
3126 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3127 }
8aa23a47
YO
3128 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3129 next_is_eval = (OP(scan) == EVAL);
3130 do_curly:
3131 if (flags & SCF_DO_SUBSTR) {
304ee84b 3132 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3133 pos_before = data->pos_min;
b45f050a 3134 }
8aa23a47
YO
3135 if (data) {
3136 fl = data->flags;
3137 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3138 if (is_inf)
3139 data->flags |= SF_IS_INF;
3140 }
3141 if (flags & SCF_DO_STCLASS) {
3142 cl_init(pRExC_state, &this_class);
3143 oclass = data->start_class;
3144 data->start_class = &this_class;
3145 f |= SCF_DO_STCLASS_AND;
3146 f &= ~SCF_DO_STCLASS_OR;
3147 }
3148 /* These are the cases when once a subexpression
3149 fails at a particular position, it cannot succeed
3150 even after backtracking at the enclosing scope.
3151
3152 XXXX what if minimal match and we are at the
3153 initial run of {n,m}? */
3154 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3155 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3156
8aa23a47
YO
3157 /* This will finish on WHILEM, setting scan, or on NULL: */
3158 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3159 last, data, stopparen, recursed, NULL,
3160 (mincount == 0
3161 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3162
8aa23a47
YO
3163 if (flags & SCF_DO_STCLASS)
3164 data->start_class = oclass;
3165 if (mincount == 0 || minnext == 0) {
3166 if (flags & SCF_DO_STCLASS_OR) {
3167 cl_or(pRExC_state, data->start_class, &this_class);
3168 }
3169 else if (flags & SCF_DO_STCLASS_AND) {
3170 /* Switch to OR mode: cache the old value of
3171 * data->start_class */
3172 INIT_AND_WITHP;
3173 StructCopy(data->start_class, and_withp,
3174 struct regnode_charclass_class);
3175 flags &= ~SCF_DO_STCLASS_AND;
3176 StructCopy(&this_class, data->start_class,
3177 struct regnode_charclass_class);
3178 flags |= SCF_DO_STCLASS_OR;
3179 data->start_class->flags |= ANYOF_EOS;
3180 }
3181 } else { /* Non-zero len */
3182 if (flags & SCF_DO_STCLASS_OR) {
3183 cl_or(pRExC_state, data->start_class, &this_class);
3184 cl_and(data->start_class, and_withp);
3185 }
3186 else if (flags & SCF_DO_STCLASS_AND)
3187 cl_and(data->start_class, &this_class);
3188 flags &= ~SCF_DO_STCLASS;
3189 }
3190 if (!scan) /* It was not CURLYX, but CURLY. */
3191 scan = next;
3192 if ( /* ? quantifier ok, except for (?{ ... }) */
3193 (next_is_eval || !(mincount == 0 && maxcount == 1))
3194 && (minnext == 0) && (deltanext == 0)
3195 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3196 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3197 {
668c081a
NC
3198 ckWARNreg(RExC_parse,
3199 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3200 }
3201
3202 min += minnext * mincount;
3203 is_inf_internal |= ((maxcount == REG_INFTY
3204 && (minnext + deltanext) > 0)
3205 || deltanext == I32_MAX);
3206 is_inf |= is_inf_internal;
3207 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3208
3209 /* Try powerful optimization CURLYX => CURLYN. */
3210 if ( OP(oscan) == CURLYX && data
3211 && data->flags & SF_IN_PAR
3212 && !(data->flags & SF_HAS_EVAL)
3213 && !deltanext && minnext == 1 ) {
3214 /* Try to optimize to CURLYN. */
3215 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3216 regnode * const nxt1 = nxt;
497b47a8 3217#ifdef DEBUGGING
8aa23a47 3218 regnode *nxt2;
497b47a8 3219#endif
c277df42 3220
8aa23a47
YO
3221 /* Skip open. */
3222 nxt = regnext(nxt);
3223 if (!strchr((const char*)PL_simple,OP(nxt))
3224 && !(PL_regkind[OP(nxt)] == EXACT
3225 && STR_LEN(nxt) == 1))
3226 goto nogo;
497b47a8 3227#ifdef DEBUGGING
8aa23a47 3228 nxt2 = nxt;
497b47a8 3229#endif
8aa23a47
YO
3230 nxt = regnext(nxt);
3231 if (OP(nxt) != CLOSE)
3232 goto nogo;
3233 if (RExC_open_parens) {
3234 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3235 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3236 }
3237 /* Now we know that nxt2 is the only contents: */
3238 oscan->flags = (U8)ARG(nxt);
3239 OP(oscan) = CURLYN;
3240 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3241
c277df42 3242#ifdef DEBUGGING
8aa23a47
YO
3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3245 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3246 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3247 OP(nxt + 1) = OPTIMIZED; /* was count. */
3248 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3249#endif
8aa23a47
YO
3250 }
3251 nogo:
3252
3253 /* Try optimization CURLYX => CURLYM. */
3254 if ( OP(oscan) == CURLYX && data
3255 && !(data->flags & SF_HAS_PAR)
3256 && !(data->flags & SF_HAS_EVAL)
3257 && !deltanext /* atom is fixed width */
3258 && minnext != 0 /* CURLYM can't handle zero width */
3259 ) {
3260 /* XXXX How to optimize if data == 0? */
3261 /* Optimize to a simpler form. */
3262 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3263 regnode *nxt2;
3264
3265 OP(oscan) = CURLYM;
3266 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3267 && (OP(nxt2) != WHILEM))
3268 nxt = nxt2;
3269 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3270 /* Need to optimize away parenths. */
3271 if (data->flags & SF_IN_PAR) {
3272 /* Set the parenth number. */
3273 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3274
3275 if (OP(nxt) != CLOSE)
3276 FAIL("Panic opt close");
3277 oscan->flags = (U8)ARG(nxt);
3278 if (RExC_open_parens) {
3279 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3280 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3281 }
8aa23a47
YO
3282 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3283 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3284
c277df42 3285#ifdef DEBUGGING
8aa23a47
YO
3286 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3287 OP(nxt + 1) = OPTIMIZED; /* was count. */
3288 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3289 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3290#endif
c277df42 3291#if 0
8aa23a47
YO
3292 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3293 regnode *nnxt = regnext(nxt1);
3294
3295 if (nnxt == nxt) {
3296 if (reg_off_by_arg[OP(nxt1)])
3297 ARG_SET(nxt1, nxt2 - nxt1);
3298 else if (nxt2 - nxt1 < U16_MAX)
3299 NEXT_OFF(nxt1) = nxt2 - nxt1;
3300 else
3301 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3302 }
8aa23a47 3303 nxt1 = nnxt;
c277df42 3304 }
5d1c421c 3305#endif
8aa23a47
YO
3306 /* Optimize again: */
3307 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3308 NULL, stopparen, recursed, NULL, 0,depth+1);
3309 }
3310 else
3311 oscan->flags = 0;
3312 }
3313 else if ((OP(oscan) == CURLYX)
3314 && (flags & SCF_WHILEM_VISITED_POS)
3315 /* See the comment on a similar expression above.
3316 However, this time it not a subexpression
3317 we care about, but the expression itself. */
3318 && (maxcount == REG_INFTY)
3319 && data && ++data->whilem_c < 16) {
3320 /* This stays as CURLYX, we can put the count/of pair. */
3321 /* Find WHILEM (as in regexec.c) */
3322 regnode *nxt = oscan + NEXT_OFF(oscan);
3323
3324 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3325 nxt += ARG(nxt);
3326 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3327 | (RExC_whilem_seen << 4)); /* On WHILEM */
3328 }
3329 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3330 pars++;
3331 if (flags & SCF_DO_SUBSTR) {
3332 SV *last_str = NULL;
3333 int counted = mincount != 0;
a0ed51b3 3334
8aa23a47
YO
3335 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3336#if defined(SPARC64_GCC_WORKAROUND)
3337 I32 b = 0;
3338 STRLEN l = 0;
3339 const char *s = NULL;
3340 I32 old = 0;
b515a41d 3341
8aa23a47
YO
3342 if (pos_before >= data->last_start_min)
3343 b = pos_before;
3344 else
3345 b = data->last_start_min;
b515a41d 3346
8aa23a47
YO
3347 l = 0;
3348 s = SvPV_const(data->last_found, l);
3349 old = b - data->last_start_min;
3350
3351#else
3352 I32 b = pos_before >= data->last_start_min
3353 ? pos_before : data->last_start_min;
3354 STRLEN l;
3355 const char * const s = SvPV_const(data->last_found, l);
3356 I32 old = b - data->last_start_min;
3357#endif
3358
3359 if (UTF)
3360 old = utf8_hop((U8*)s, old) - (U8*)s;
3361
3362 l -= old;
3363 /* Get the added string: */
740cce10 3364 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3365 if (deltanext == 0 && pos_before == b) {
3366 /* What was added is a constant string */
3367 if (mincount > 1) {
3368 SvGROW(last_str, (mincount * l) + 1);
3369 repeatcpy(SvPVX(last_str) + l,
3370 SvPVX_const(last_str), l, mincount - 1);
3371 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3372 /* Add additional parts. */
3373 SvCUR_set(data->last_found,
3374 SvCUR(data->last_found) - l);
3375 sv_catsv(data->last_found, last_str);
3376 {
3377 SV * sv = data->last_found;
3378 MAGIC *mg =
3379 SvUTF8(sv) && SvMAGICAL(sv) ?
3380 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3381 if (mg && mg->mg_len >= 0)
bd94e887 3382 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3383 }
8aa23a47 3384 data->last_end += l * (mincount - 1);
b515a41d 3385 }
8aa23a47
YO
3386 } else {
3387 /* start offset must point into the last copy */
3388 data->last_start_min += minnext * (mincount - 1);
3389 data->last_start_max += is_inf ? I32_MAX
3390 : (maxcount - 1) * (minnext + data->pos_delta);
3391 }
c277df42 3392 }
8aa23a47
YO
3393 /* It is counted once already... */
3394 data->pos_min += minnext * (mincount - counted);
3395 data->pos_delta += - counted * deltanext +
3396 (minnext + deltanext) * maxcount - minnext * mincount;
3397 if (mincount != maxcount) {
3398 /* Cannot extend fixed substrings found inside
3399 the group. */
304ee84b 3400 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3401 if (mincount && last_str) {
3402 SV * const sv = data->last_found;
3403 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3404 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3405
3406 if (mg)
3407 mg->mg_len = -1;
3408 sv_setsv(sv, last_str);
3409 data->last_end = data->pos_min;
3410 data->last_start_min =
3411 data->pos_min - CHR_SVLEN(last_str);
3412 data->last_start_max = is_inf
3413 ? I32_MAX
3414 : data->pos_min + data->pos_delta
3415 - CHR_SVLEN(last_str);
3416 }
3417 data->longest = &(data->longest_float);
3418 }
3419 SvREFCNT_dec(last_str);
c277df42 3420 }
8aa23a47
YO
3421 if (data && (fl & SF_HAS_EVAL))
3422 data->flags |= SF_HAS_EVAL;
3423 optimize_curly_tail:
3424 if (OP(oscan) != CURLYX) {
3425 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3426 && NEXT_OFF(next))
3427 NEXT_OFF(oscan) += NEXT_OFF(next);
3428 }
3429 continue;
3430 default: /* REF and CLUMP only? */
3431 if (flags & SCF_DO_SUBSTR) {
304ee84b 3432 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3433 data->longest = &(data->longest_float);
3434 }
3435 is_inf = is_inf_internal = 1;
3436 if (flags & SCF_DO_STCLASS_OR)
3437 cl_anything(pRExC_state, data->start_class);
3438 flags &= ~SCF_DO_STCLASS;
3439 break;
c277df42 3440 }
8aa23a47 3441 }
e1d1eefb
YO
3442 else if (OP(scan) == LNBREAK) {
3443 if (flags & SCF_DO_STCLASS) {
3444 int value = 0;
3445 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3446 if (flags & SCF_DO_STCLASS_AND) {
3447 for (value = 0; value < 256; value++)
e64b1bd1 3448 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3449 ANYOF_BITMAP_CLEAR(data->start_class, value);
3450 }
3451 else {
3452 for (value = 0; value < 256; value++)
e64b1bd1 3453 if (is_VERTWS_cp(value))
e1d1eefb
YO
3454 ANYOF_BITMAP_SET(data->start_class, value);
3455 }
3456 if (flags & SCF_DO_STCLASS_OR)
3457 cl_and(data->start_class, and_withp);
3458 flags &= ~SCF_DO_STCLASS;
3459 }
3460 min += 1;
f9a79580 3461 delta += 1;
e1d1eefb
YO
3462 if (flags & SCF_DO_SUBSTR) {
3463 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3464 data->pos_min += 1;
f9a79580 3465 data->pos_delta += 1;
e1d1eefb
YO
3466 data->longest = &(data->longest_float);
3467 }
3468
3469 }
f9a79580
RGS
3470 else if (OP(scan) == FOLDCHAR) {
3471 int d = ARG(scan)==0xDF ? 1 : 2;
3472 flags &= ~SCF_DO_STCLASS;
3473 min += 1;
3474 delta += d;
3475 if (flags & SCF_DO_SUBSTR) {
3476 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3477 data->pos_min += 1;
3478 data->pos_delta += d;
3479 data->longest = &(data->longest_float);
3480 }
3481 }
8aa23a47
YO
3482 else if (strchr((const char*)PL_simple,OP(scan))) {
3483 int value = 0;
653099ff 3484
8aa23a47 3485 if (flags & SCF_DO_SUBSTR) {
304ee84b 3486 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3487 data->pos_min++;
3488 }
3489 min++;
3490 if (flags & SCF_DO_STCLASS) {
3491 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3492
8aa23a47
YO
3493 /* Some of the logic below assumes that switching
3494 locale on will only add false positives. */
3495 switch (PL_regkind[OP(scan)]) {
3496 case SANY:
3497 default:
3498 do_default:
3499 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3500 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3501 cl_anything(pRExC_state, data->start_class);
3502 break;
3503 case REG_ANY:
3504 if (OP(scan) == SANY)
3505 goto do_default;
3506 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3507 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3508 || (data->start_class->flags & ANYOF_CLASS));
3509 cl_anything(pRExC_state, data->start_class);
653099ff 3510 }
8aa23a47
YO
3511 if (flags & SCF_DO_STCLASS_AND || !value)
3512 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3513 break;
3514 case ANYOF:
3515 if (flags & SCF_DO_STCLASS_AND)
3516 cl_and(data->start_class,
3517 (struct regnode_charclass_class*)scan);
653099ff 3518 else
8aa23a47
YO
3519 cl_or(pRExC_state, data->start_class,
3520 (struct regnode_charclass_class*)scan);
3521 break;
3522 case ALNUM:
3523 if (flags & SCF_DO_STCLASS_AND) {
3524 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3525 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3526 for (value = 0; value < 256; value++)
3527 if (!isALNUM(value))
3528 ANYOF_BITMAP_CLEAR(data->start_class, value);
3529 }
653099ff 3530 }
8aa23a47
YO
3531 else {
3532 if (data->start_class->flags & ANYOF_LOCALE)
3533 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3534 else {
3535 for (value = 0; value < 256; value++)
3536 if (isALNUM(value))
3537 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3538 }
8aa23a47
YO
3539 }
3540 break;
3541 case ALNUML:
3542 if (flags & SCF_DO_STCLASS_AND) {
3543 if (data->start_class->flags & ANYOF_LOCALE)
3544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3545 }
3546 else {
3547 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3548 data->start_class->flags |= ANYOF_LOCALE;
3549 }
3550 break;
3551 case NALNUM:
3552 if (flags & SCF_DO_STCLASS_AND) {
3553 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3554 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3555 for (value = 0; value < 256; value++)
3556 if (isALNUM(value))
3557 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3558 }
3559 }
8aa23a47
YO
3560 else {
3561 if (data->start_class->flags & ANYOF_LOCALE)
3562 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3563 else {
3564 for (value = 0; value < 256; value++)
3565 if (!isALNUM(value))
3566 ANYOF_BITMAP_SET(data->start_class, value);
3567 }
653099ff 3568 }
8aa23a47
YO
3569 break;
3570 case NALNUML:
3571 if (flags & SCF_DO_STCLASS_AND) {
3572 if (data->start_class->flags & ANYOF_LOCALE)
3573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3574 }
8aa23a47
YO
3575 else {
3576 data->start_class->flags |= ANYOF_LOCALE;
3577 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3578 }
3579 break;
3580 case SPACE:
3581 if (flags & SCF_DO_STCLASS_AND) {
3582 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3583 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3584 for (value = 0; value < 256; value++)
3585 if (!isSPACE(value))
3586 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3587 }
3588 }
8aa23a47
YO
3589 else {
3590 if (data->start_class->flags & ANYOF_LOCALE)
3591 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3592 else {
3593 for (value = 0; value < 256; value++)
3594 if (isSPACE(value))
3595 ANYOF_BITMAP_SET(data->start_class, value);
3596 }
653099ff 3597 }
8aa23a47
YO
3598 break;
3599 case SPACEL:
3600 if (flags & SCF_DO_STCLASS_AND) {
3601 if (data->start_class->flags & ANYOF_LOCALE)
3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3603 }
3604 else {
3605 data->start_class->flags |= ANYOF_LOCALE;
3606 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3607 }
3608 break;
3609 case NSPACE:
3610 if (flags & SCF_DO_STCLASS_AND) {
3611 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3612 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3613 for (value = 0; value < 256; value++)
3614 if (isSPACE(value))
3615 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3616 }
8aa23a47
YO
3617 }
3618 else {
3619 if (data->start_class->flags & ANYOF_LOCALE)
3620 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3621 else {
3622 for (value = 0; value < 256; value++)
3623 if (!isSPACE(value))
3624 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3625 }
3626 }
8aa23a47
YO
3627 break;
3628 case NSPACEL:
3629 if (flags & SCF_DO_STCLASS_AND) {
3630 if (data->start_class->flags & ANYOF_LOCALE) {
3631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3632 for (value = 0; value < 256; value++)
3633 if (!isSPACE(value))
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 }
653099ff 3636 }
8aa23a47
YO
3637 else {
3638 data->start_class->flags |= ANYOF_LOCALE;
3639 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3640 }
3641 break;
3642 case DIGIT:
3643 if (flags & SCF_DO_STCLASS_AND) {
3644 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3645 for (value = 0; value < 256; value++)
3646 if (!isDIGIT(value))
3647 ANYOF_BITMAP_CLEAR(data->start_class, value);
3648 }
3649 else {
3650 if (data->start_class->flags & ANYOF_LOCALE)
3651 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3652 else {
3653 for (value = 0; value < 256; value++)
3654 if (isDIGIT(value))
3655 ANYOF_BITMAP_SET(data->start_class, value);
3656 }
3657 }
3658 break;
3659 case NDIGIT:
3660 if (flags & SCF_DO_STCLASS_AND) {
3661 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3662 for (value = 0; value < 256; value++)
3663 if (isDIGIT(value))
3664 ANYOF_BITMAP_CLEAR(data->start_class, value);
3665 }
3666 else {
3667 if (data->start_class->flags & ANYOF_LOCALE)
3668 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3669 else {
3670 for (value = 0; value < 256; value++)
3671 if (!isDIGIT(value))
3672 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3673 }
3674 }
8aa23a47 3675 break;
e1d1eefb
YO
3676 CASE_SYNST_FNC(VERTWS);
3677 CASE_SYNST_FNC(HORIZWS);
3678
8aa23a47
YO
3679 }
3680 if (flags & SCF_DO_STCLASS_OR)
3681 cl_and(data->start_class, and_withp);
3682 flags &= ~SCF_DO_STCLASS;
3683 }
3684 }
3685 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3686 data->flags |= (OP(scan) == MEOL
3687 ? SF_BEFORE_MEOL
3688 : SF_BEFORE_SEOL);
3689 }
3690 else if ( PL_regkind[OP(scan)] == BRANCHJ
3691 /* Lookbehind, or need to calculate parens/evals/stclass: */
3692 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3693 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3694 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3695 || OP(scan) == UNLESSM )
3696 {
3697 /* Negative Lookahead/lookbehind
3698 In this case we can't do fixed string optimisation.
3699 */
1de06328 3700
8aa23a47
YO
3701 I32 deltanext, minnext, fake = 0;
3702 regnode *nscan;
3703 struct regnode_charclass_class intrnl;
3704 int f = 0;
1de06328 3705
8aa23a47
YO
3706 data_fake.flags = 0;
3707 if (data) {
3708 data_fake.whilem_c = data->whilem_c;
3709 data_fake.last_closep = data->last_closep;
c277df42 3710 }
8aa23a47
YO
3711 else
3712 data_fake.last_closep = &fake;
58e23c8d 3713 data_fake.pos_delta = delta;
8aa23a47
YO
3714 if ( flags & SCF_DO_STCLASS && !scan->flags
3715 && OP(scan) == IFMATCH ) { /* Lookahead */
3716 cl_init(pRExC_state, &intrnl);
3717 data_fake.start_class = &intrnl;
3718 f |= SCF_DO_STCLASS_AND;
3719 }
3720 if (flags & SCF_WHILEM_VISITED_POS)
3721 f |= SCF_WHILEM_VISITED_POS;
3722 next = regnext(scan);
3723 nscan = NEXTOPER(NEXTOPER(scan));
3724 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3725 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3726 if (scan->flags) {
3727 if (deltanext) {
58e23c8d 3728 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3729 }
3730 else if (minnext > (I32)U8_MAX) {
58e23c8d 3731 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3732 }
3733 scan->flags = (U8)minnext;
3734 }
3735 if (data) {
3736 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737 pars++;
3738 if (data_fake.flags & SF_HAS_EVAL)
3739 data->flags |= SF_HAS_EVAL;
3740 data->whilem_c = data_fake.whilem_c;
3741 }
3742 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3743 if (flags & SCF_DO_STCLASS_OR) {
3744 /* OR before, AND after: ideally we would recurse with
3745 * data_fake to get the AND applied by study of the
3746 * remainder of the pattern, and then derecurse;
3747 * *** HACK *** for now just treat as "no information".
3748 * See [perl #56690].
3749 */
3750 cl_init(pRExC_state, data->start_class);
3751 } else {
3752 /* AND before and after: combine and continue */
3753 const int was = (data->start_class->flags & ANYOF_EOS);
3754
3755 cl_and(data->start_class, &intrnl);
3756 if (was)
3757 data->start_class->flags |= ANYOF_EOS;
3758 }
8aa23a47 3759 }
cb434fcc 3760 }
8aa23a47
YO
3761#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3762 else {
3763 /* Positive Lookahead/lookbehind
3764 In this case we can do fixed string optimisation,
3765 but we must be careful about it. Note in the case of
3766 lookbehind the positions will be offset by the minimum
3767 length of the pattern, something we won't know about
3768 until after the recurse.
3769 */
3770 I32 deltanext, fake = 0;
3771 regnode *nscan;
3772 struct regnode_charclass_class intrnl;
3773 int f = 0;
3774 /* We use SAVEFREEPV so that when the full compile
3775 is finished perl will clean up the allocated
3776 minlens when its all done. This was we don't
3777 have to worry about freeing them when we know
3778 they wont be used, which would be a pain.
3779 */
3780 I32 *minnextp;
3781 Newx( minnextp, 1, I32 );
3782 SAVEFREEPV(minnextp);
3783
3784 if (data) {
3785 StructCopy(data, &data_fake, scan_data_t);
3786 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3787 f |= SCF_DO_SUBSTR;
3788 if (scan->flags)