This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Avoid a declaration after statement, accidentally added by 6079961fee8cf49c.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
f427392e 609#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
713}
714
715/* Can match anything (initialization) */
716STATIC int
5f66b61c 717S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
718{
719 int value;
720
7918f24d
NC
721 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
722
aaa51d5e 723 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
724 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
725 return 1;
1aa99e6b
IH
726 if (!(cl->flags & ANYOF_UNICODE_ALL))
727 return 0;
10edeb5d 728 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 729 return 0;
653099ff
GS
730 return 1;
731}
732
733/* Can match anything (initialization) */
734STATIC void
097eb12c 735S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 736{
7918f24d
NC
737 PERL_ARGS_ASSERT_CL_INIT;
738
8ecf7187 739 Zero(cl, 1, struct regnode_charclass_class);
653099ff 740 cl->type = ANYOF;
830247a4 741 cl_anything(pRExC_state, cl);
653099ff
GS
742}
743
744STATIC void
097eb12c 745S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 746{
7918f24d
NC
747 PERL_ARGS_ASSERT_CL_INIT_ZERO;
748
8ecf7187 749 Zero(cl, 1, struct regnode_charclass_class);
653099ff 750 cl->type = ANYOF;
830247a4 751 cl_anything(pRExC_state, cl);
653099ff
GS
752 if (LOC)
753 cl->flags |= ANYOF_LOCALE;
754}
755
756/* 'And' a given class with another one. Can create false positives */
757/* We assume that cl is not inverted */
758STATIC void
5f66b61c 759S_cl_and(struct regnode_charclass_class *cl,
a28509cc 760 const struct regnode_charclass_class *and_with)
653099ff 761{
7918f24d 762 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
763
764 assert(and_with->type == ANYOF);
653099ff
GS
765 if (!(and_with->flags & ANYOF_CLASS)
766 && !(cl->flags & ANYOF_CLASS)
767 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
768 && !(and_with->flags & ANYOF_FOLD)
769 && !(cl->flags & ANYOF_FOLD)) {
770 int i;
771
772 if (and_with->flags & ANYOF_INVERT)
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= ~and_with->bitmap[i];
775 else
776 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
777 cl->bitmap[i] &= and_with->bitmap[i];
778 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
779 if (!(and_with->flags & ANYOF_EOS))
780 cl->flags &= ~ANYOF_EOS;
1aa99e6b 781
14ebb1a2
JH
782 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
783 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
784 cl->flags &= ~ANYOF_UNICODE_ALL;
785 cl->flags |= ANYOF_UNICODE;
786 ARG_SET(cl, ARG(and_with));
787 }
14ebb1a2
JH
788 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
789 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 790 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
791 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
792 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 793 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
794}
795
796/* 'OR' a given class with another one. Can create false positives */
797/* We assume that cl is not inverted */
798STATIC void
097eb12c 799S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 800{
7918f24d
NC
801 PERL_ARGS_ASSERT_CL_OR;
802
653099ff
GS
803 if (or_with->flags & ANYOF_INVERT) {
804 /* We do not use
805 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
806 * <= (B1 | !B2) | (CL1 | !CL2)
807 * which is wasteful if CL2 is small, but we ignore CL2:
808 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
809 * XXXX Can we handle case-fold? Unclear:
810 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
811 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
812 */
813 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(or_with->flags & ANYOF_FOLD)
815 && !(cl->flags & ANYOF_FOLD) ) {
816 int i;
817
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] |= ~or_with->bitmap[i];
820 } /* XXXX: logic is complicated otherwise */
821 else {
830247a4 822 cl_anything(pRExC_state, cl);
653099ff
GS
823 }
824 } else {
825 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
826 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 827 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
828 || (cl->flags & ANYOF_FOLD)) ) {
829 int i;
830
831 /* OR char bitmap and class bitmap separately */
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] |= or_with->bitmap[i];
834 if (or_with->flags & ANYOF_CLASS) {
835 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
836 cl->classflags[i] |= or_with->classflags[i];
837 cl->flags |= ANYOF_CLASS;
838 }
839 }
840 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 841 cl_anything(pRExC_state, cl);
653099ff
GS
842 }
843 }
844 if (or_with->flags & ANYOF_EOS)
845 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
846
847 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
848 ARG(cl) != ARG(or_with)) {
849 cl->flags |= ANYOF_UNICODE_ALL;
850 cl->flags &= ~ANYOF_UNICODE;
851 }
852 if (or_with->flags & ANYOF_UNICODE_ALL) {
853 cl->flags |= ANYOF_UNICODE_ALL;
854 cl->flags &= ~ANYOF_UNICODE;
855 }
653099ff
GS
856}
857
a3621e74
YO
858#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
859#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
860#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
861#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
862
3dab1dad
YO
863
864#ifdef DEBUGGING
07be1b83 865/*
2b8b4781
NC
866 dump_trie(trie,widecharmap,revcharmap)
867 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
868 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
869
870 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
871 The _interim_ variants are used for debugging the interim
872 tables that are used to generate the final compressed
873 representation which is what dump_trie expects.
874
3dab1dad
YO
875 Part of the reason for their existance is to provide a form
876 of documentation as to how the different representations function.
07be1b83
YO
877
878*/
3dab1dad
YO
879
880/*
3dab1dad
YO
881 Dumps the final compressed table form of the trie to Perl_debug_log.
882 Used for debugging make_trie().
883*/
b9a59e08 884
3dab1dad 885STATIC void
2b8b4781
NC
886S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
887 AV *revcharmap, U32 depth)
3dab1dad
YO
888{
889 U32 state;
ab3bbdeb 890 SV *sv=sv_newmortal();
55eed653 891 int colwidth= widecharmap ? 6 : 4;
2e64971a 892 U16 word;
3dab1dad
YO
893 GET_RE_DEBUG_FLAGS_DECL;
894
7918f24d 895 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 896
3dab1dad
YO
897 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
898 (int)depth * 2 + 2,"",
899 "Match","Base","Ofs" );
900
901 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 902 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 903 if ( tmp ) {
ab3bbdeb
YO
904 PerlIO_printf( Perl_debug_log, "%*s",
905 colwidth,
ddc5bc0f 906 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
907 PL_colors[0], PL_colors[1],
908 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
909 PERL_PV_ESCAPE_FIRSTCHAR
910 )
911 );
3dab1dad
YO
912 }
913 }
914 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
915 (int)depth * 2 + 2,"");
916
917 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
919 PerlIO_printf( Perl_debug_log, "\n");
920
1e2e3d02 921 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 922 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
923
924 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
925
926 if ( trie->states[ state ].wordnum ) {
927 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
928 } else {
929 PerlIO_printf( Perl_debug_log, "%6s", "" );
930 }
931
932 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
933
934 if ( base ) {
935 U32 ofs = 0;
936
937 while( ( base + ofs < trie->uniquecharcount ) ||
938 ( base + ofs - trie->uniquecharcount < trie->lasttrans
939 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
940 ofs++;
941
942 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
943
944 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
945 if ( ( base + ofs >= trie->uniquecharcount ) &&
946 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
947 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
948 {
ab3bbdeb
YO
949 PerlIO_printf( Perl_debug_log, "%*"UVXf,
950 colwidth,
3dab1dad
YO
951 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
952 } else {
ab3bbdeb 953 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
954 }
955 }
956
957 PerlIO_printf( Perl_debug_log, "]");
958
959 }
960 PerlIO_printf( Perl_debug_log, "\n" );
961 }
2e64971a
DM
962 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
963 for (word=1; word <= trie->wordcount; word++) {
964 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
965 (int)word, (int)(trie->wordinfo[word].prev),
966 (int)(trie->wordinfo[word].len));
967 }
968 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
969}
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in list form.
972 List tries normally only are used for construction when the number of
973 possible chars (trie->uniquecharcount) is very high.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
ab3bbdeb 982 SV *sv=sv_newmortal();
55eed653 983 int colwidth= widecharmap ? 6 : 4;
3dab1dad 984 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
985
986 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
987
3dab1dad 988 /* print out the table precompression. */
ab3bbdeb
YO
989 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
990 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
991 "------:-----+-----------------\n" );
3dab1dad
YO
992
993 for( state=1 ; state < next_alloc ; state ++ ) {
994 U16 charid;
995
ab3bbdeb 996 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
997 (int)depth * 2 + 2,"", (UV)state );
998 if ( ! trie->states[ state ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, "%5s| ","");
1000 } else {
1001 PerlIO_printf( Perl_debug_log, "W%4x| ",
1002 trie->states[ state ].wordnum
1003 );
1004 }
1005 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1006 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1007 if ( tmp ) {
1008 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1009 colwidth,
ddc5bc0f 1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1014 ) ,
1e2e3d02
YO
1015 TRIE_LIST_ITEM(state,charid).forid,
1016 (UV)TRIE_LIST_ITEM(state,charid).newstate
1017 );
1018 if (!(charid % 10))
664e119d
RGS
1019 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1020 (int)((depth * 2) + 14), "");
1e2e3d02 1021 }
ab3bbdeb
YO
1022 }
1023 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1024 }
1025}
1026
1027/*
3dab1dad
YO
1028 Dumps a fully constructed but uncompressed trie in table form.
1029 This is the normal DFA style state transition table, with a few
1030 twists to facilitate compression later.
1031 Used for debugging make_trie().
1032*/
1033STATIC void
55eed653 1034S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1035 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1036 U32 depth)
3dab1dad
YO
1037{
1038 U32 state;
1039 U16 charid;
ab3bbdeb 1040 SV *sv=sv_newmortal();
55eed653 1041 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1042 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1043
1044 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1045
1046 /*
1047 print out the table precompression so that we can do a visual check
1048 that they are identical.
1049 */
1050
1051 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1052
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1054 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1055 if ( tmp ) {
ab3bbdeb
YO
1056 PerlIO_printf( Perl_debug_log, "%*s",
1057 colwidth,
ddc5bc0f 1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1062 )
1063 );
3dab1dad
YO
1064 }
1065 }
1066
1067 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1068
1069 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1071 }
1072
1073 PerlIO_printf( Perl_debug_log, "\n" );
1074
1075 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1076
1077 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1078 (int)depth * 2 + 2,"",
1079 (UV)TRIE_NODENUM( state ) );
1080
1081 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1082 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1083 if (v)
1084 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1085 else
1086 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1087 }
1088 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1090 } else {
1091 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1092 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1093 }
1094 }
07be1b83 1095}
3dab1dad
YO
1096
1097#endif
1098
2e64971a 1099
786e8c11
YO
1100/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1101 startbranch: the first branch in the whole branch sequence
1102 first : start branch of sequence of branch-exact nodes.
1103 May be the same as startbranch
1104 last : Thing following the last branch.
1105 May be the same as tail.
1106 tail : item following the branch sequence
1107 count : words in the sequence
1108 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1109 depth : indent depth
3dab1dad 1110
786e8c11 1111Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1112
786e8c11
YO
1113A trie is an N'ary tree where the branches are determined by digital
1114decomposition of the key. IE, at the root node you look up the 1st character and
1115follow that branch repeat until you find the end of the branches. Nodes can be
1116marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1117
786e8c11 1118 /he|she|his|hers/
72f13be8 1119
786e8c11
YO
1120would convert into the following structure. Numbers represent states, letters
1121following numbers represent valid transitions on the letter from that state, if
1122the number is in square brackets it represents an accepting state, otherwise it
1123will be in parenthesis.
07be1b83 1124
786e8c11
YO
1125 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1126 | |
1127 | (2)
1128 | |
1129 (1) +-i->(6)-+-s->[7]
1130 |
1131 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1132
786e8c11
YO
1133 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1134
1135This shows that when matching against the string 'hers' we will begin at state 1
1136read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1137then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1138is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1139single traverse. We store a mapping from accepting to state to which word was
1140matched, and then when we have multiple possibilities we try to complete the
1141rest of the regex in the order in which they occured in the alternation.
1142
1143The only prior NFA like behaviour that would be changed by the TRIE support is
1144the silent ignoring of duplicate alternations which are of the form:
1145
1146 / (DUPE|DUPE) X? (?{ ... }) Y /x
1147
1148Thus EVAL blocks follwing a trie may be called a different number of times with
1149and without the optimisation. With the optimisations dupes will be silently
1150ignored. This inconsistant behaviour of EVAL type nodes is well established as
1151the following demonstrates:
1152
1153 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1154
1155which prints out 'word' three times, but
1156
1157 'words'=~/(word|word|word)(?{ print $1 })S/
1158
1159which doesnt print it out at all. This is due to other optimisations kicking in.
1160
1161Example of what happens on a structural level:
1162
1163The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1164
1165 1: CURLYM[1] {1,32767}(18)
1166 5: BRANCH(8)
1167 6: EXACT <ac>(16)
1168 8: BRANCH(11)
1169 9: EXACT <ad>(16)
1170 11: BRANCH(14)
1171 12: EXACT <ab>(16)
1172 16: SUCCEED(0)
1173 17: NOTHING(18)
1174 18: END(0)
1175
1176This would be optimizable with startbranch=5, first=5, last=16, tail=16
1177and should turn into:
1178
1179 1: CURLYM[1] {1,32767}(18)
1180 5: TRIE(16)
1181 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1182 <ac>
1183 <ad>
1184 <ab>
1185 16: SUCCEED(0)
1186 17: NOTHING(18)
1187 18: END(0)
1188
1189Cases where tail != last would be like /(?foo|bar)baz/:
1190
1191 1: BRANCH(4)
1192 2: EXACT <foo>(8)
1193 4: BRANCH(7)
1194 5: EXACT <bar>(8)
1195 7: TAIL(8)
1196 8: EXACT <baz>(10)
1197 10: END(0)
1198
1199which would be optimizable with startbranch=1, first=1, last=7, tail=8
1200and would end up looking like:
1201
1202 1: TRIE(8)
1203 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1204 <foo>
1205 <bar>
1206 7: TAIL(8)
1207 8: EXACT <baz>(10)
1208 10: END(0)
1209
1210 d = uvuni_to_utf8_flags(d, uv, 0);
1211
1212is the recommended Unicode-aware way of saying
1213
1214 *(d++) = uv;
1215*/
1216
1e2e3d02 1217#define TRIE_STORE_REVCHAR \
786e8c11 1218 STMT_START { \
73031816
NC
1219 if (UTF) { \
1220 SV *zlopp = newSV(2); \
88c9ea1e
CB
1221 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1222 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1223 SvCUR_set(zlopp, kapow - flrbbbbb); \
1224 SvPOK_on(zlopp); \
1225 SvUTF8_on(zlopp); \
1226 av_push(revcharmap, zlopp); \
1227 } else { \
6bdeddd2 1228 char ooooff = (char)uvc; \
73031816
NC
1229 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1230 } \
1231 } STMT_END
786e8c11
YO
1232
1233#define TRIE_READ_CHAR STMT_START { \
1234 wordlen++; \
1235 if ( UTF ) { \
1236 if ( folder ) { \
1237 if ( foldlen > 0 ) { \
1238 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1239 foldlen -= len; \
1240 scan += len; \
1241 len = 0; \
1242 } else { \
1243 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1244 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1245 foldlen -= UNISKIP( uvc ); \
1246 scan = foldbuf + UNISKIP( uvc ); \
1247 } \
1248 } else { \
1249 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1250 } \
1251 } else { \
1252 uvc = (U32)*uc; \
1253 len = 1; \
1254 } \
1255} STMT_END
1256
1257
1258
1259#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1260 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1261 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1262 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1263 } \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1265 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1266 TRIE_LIST_CUR( state )++; \
1267} STMT_END
07be1b83 1268
786e8c11
YO
1269#define TRIE_LIST_NEW(state) STMT_START { \
1270 Newxz( trie->states[ state ].trans.list, \
1271 4, reg_trie_trans_le ); \
1272 TRIE_LIST_CUR( state ) = 1; \
1273 TRIE_LIST_LEN( state ) = 4; \
1274} STMT_END
07be1b83 1275
786e8c11
YO
1276#define TRIE_HANDLE_WORD(state) STMT_START { \
1277 U16 dupe= trie->states[ state ].wordnum; \
1278 regnode * const noper_next = regnext( noper ); \
1279 \
786e8c11
YO
1280 DEBUG_r({ \
1281 /* store the word for dumping */ \
1282 SV* tmp; \
1283 if (OP(noper) != NOTHING) \
740cce10 1284 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1285 else \
740cce10 1286 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1287 av_push( trie_words, tmp ); \
786e8c11
YO
1288 }); \
1289 \
1290 curword++; \
2e64971a
DM
1291 trie->wordinfo[curword].prev = 0; \
1292 trie->wordinfo[curword].len = wordlen; \
1293 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1294 \
1295 if ( noper_next < tail ) { \
1296 if (!trie->jump) \
c944940b 1297 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1298 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1299 if (!jumper) \
1300 jumper = noper_next; \
1301 if (!nextbranch) \
1302 nextbranch= regnext(cur); \
1303 } \
1304 \
1305 if ( dupe ) { \
2e64971a
DM
1306 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1307 /* chain, so that when the bits of chain are later */\
1308 /* linked together, the dups appear in the chain */\
1309 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1310 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1311 } else { \
1312 /* we haven't inserted this word yet. */ \
1313 trie->states[ state ].wordnum = curword; \
1314 } \
1315} STMT_END
07be1b83 1316
3dab1dad 1317
786e8c11
YO
1318#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1319 ( ( base + charid >= ucharcount \
1320 && base + charid < ubound \
1321 && state == trie->trans[ base - ucharcount + charid ].check \
1322 && trie->trans[ base - ucharcount + charid ].next ) \
1323 ? trie->trans[ base - ucharcount + charid ].next \
1324 : ( state==1 ? special : 0 ) \
1325 )
3dab1dad 1326
786e8c11
YO
1327#define MADE_TRIE 1
1328#define MADE_JUMP_TRIE 2
1329#define MADE_EXACT_TRIE 4
3dab1dad 1330
a3621e74 1331STATIC I32
786e8c11 1332S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1333{
27da23d5 1334 dVAR;
a3621e74
YO
1335 /* first pass, loop through and scan words */
1336 reg_trie_data *trie;
55eed653 1337 HV *widecharmap = NULL;
2b8b4781 1338 AV *revcharmap = newAV();
a3621e74 1339 regnode *cur;
9f7f3913 1340 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1341 STRLEN len = 0;
1342 UV uvc = 0;
1343 U16 curword = 0;
1344 U32 next_alloc = 0;
786e8c11
YO
1345 regnode *jumper = NULL;
1346 regnode *nextbranch = NULL;
7f69552c 1347 regnode *convert = NULL;
2e64971a 1348 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1349 /* we just use folder as a flag in utf8 */
e1ec3a88 1350 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1351 ? PL_fold
1352 : ( flags == EXACTFL
1353 ? PL_fold_locale
1354 : NULL
1355 )
1356 );
1357
2b8b4781
NC
1358#ifdef DEBUGGING
1359 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1360 AV *trie_words = NULL;
1361 /* along with revcharmap, this only used during construction but both are
1362 * useful during debugging so we store them in the struct when debugging.
8e11feef 1363 */
2b8b4781
NC
1364#else
1365 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1366 STRLEN trie_charcount=0;
3dab1dad 1367#endif
2b8b4781 1368 SV *re_trie_maxbuff;
a3621e74 1369 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1370
1371 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1372#ifndef DEBUGGING
1373 PERL_UNUSED_ARG(depth);
1374#endif
a3621e74 1375
c944940b 1376 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1377 trie->refcount = 1;
3dab1dad 1378 trie->startstate = 1;
786e8c11 1379 trie->wordcount = word_count;
f8fc2ecf 1380 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1381 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1382 if (!(UTF && folder))
c944940b 1383 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1384 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1385 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1386
a3621e74 1387 DEBUG_r({
2b8b4781 1388 trie_words = newAV();
a3621e74 1389 });
a3621e74 1390
0111c4fd 1391 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1392 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1393 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1394 }
3dab1dad
YO
1395 DEBUG_OPTIMISE_r({
1396 PerlIO_printf( Perl_debug_log,
786e8c11 1397 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1398 (int)depth * 2 + 2, "",
1399 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1400 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1401 (int)depth);
3dab1dad 1402 });
7f69552c
YO
1403
1404 /* Find the node we are going to overwrite */
1405 if ( first == startbranch && OP( last ) != BRANCH ) {
1406 /* whole branch chain */
1407 convert = first;
1408 } else {
1409 /* branch sub-chain */
1410 convert = NEXTOPER( first );
1411 }
1412
a3621e74
YO
1413 /* -- First loop and Setup --
1414
1415 We first traverse the branches and scan each word to determine if it
1416 contains widechars, and how many unique chars there are, this is
1417 important as we have to build a table with at least as many columns as we
1418 have unique chars.
1419
1420 We use an array of integers to represent the character codes 0..255
38a44b82 1421 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1422 native representation of the character value as the key and IV's for the
1423 coded index.
1424
1425 *TODO* If we keep track of how many times each character is used we can
1426 remap the columns so that the table compression later on is more
1427 efficient in terms of memory by ensuring most common value is in the
1428 middle and the least common are on the outside. IMO this would be better
1429 than a most to least common mapping as theres a decent chance the most
1430 common letter will share a node with the least common, meaning the node
1431 will not be compressable. With a middle is most common approach the worst
1432 case is when we have the least common nodes twice.
1433
1434 */
1435
a3621e74 1436 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1437 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1438 const U8 *uc = (U8*)STRING( noper );
a28509cc 1439 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1440 STRLEN foldlen = 0;
1441 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1442 const U8 *scan = (U8*)NULL;
07be1b83 1443 U32 wordlen = 0; /* required init */
02daf0ab
YO
1444 STRLEN chars = 0;
1445 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1446
3dab1dad
YO
1447 if (OP(noper) == NOTHING) {
1448 trie->minlen= 0;
1449 continue;
1450 }
02daf0ab
YO
1451 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1452 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1453 regardless of encoding */
1454
a3621e74 1455 for ( ; uc < e ; uc += len ) {
3dab1dad 1456 TRIE_CHARCOUNT(trie)++;
a3621e74 1457 TRIE_READ_CHAR;
3dab1dad 1458 chars++;
a3621e74
YO
1459 if ( uvc < 256 ) {
1460 if ( !trie->charmap[ uvc ] ) {
1461 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1462 if ( folder )
1463 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1464 TRIE_STORE_REVCHAR;
a3621e74 1465 }
02daf0ab
YO
1466 if ( set_bit ) {
1467 /* store the codepoint in the bitmap, and if its ascii
1468 also store its folded equivelent. */
1469 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1470
1471 /* store the folded codepoint */
1472 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1473
1474 if ( !UTF ) {
1475 /* store first byte of utf8 representation of
1476 codepoints in the 127 < uvc < 256 range */
1477 if (127 < uvc && uvc < 192) {
1478 TRIE_BITMAP_SET(trie,194);
1479 } else if (191 < uvc ) {
1480 TRIE_BITMAP_SET(trie,195);
1481 /* && uvc < 256 -- we know uvc is < 256 already */
1482 }
1483 }
02daf0ab
YO
1484 set_bit = 0; /* We've done our bit :-) */
1485 }
a3621e74
YO
1486 } else {
1487 SV** svpp;
55eed653
NC
1488 if ( !widecharmap )
1489 widecharmap = newHV();
a3621e74 1490
55eed653 1491 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1492
1493 if ( !svpp )
e4584336 1494 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1495
1496 if ( !SvTRUE( *svpp ) ) {
1497 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1498 TRIE_STORE_REVCHAR;
a3621e74
YO
1499 }
1500 }
1501 }
3dab1dad
YO
1502 if( cur == first ) {
1503 trie->minlen=chars;
1504 trie->maxlen=chars;
1505 } else if (chars < trie->minlen) {
1506 trie->minlen=chars;
1507 } else if (chars > trie->maxlen) {
1508 trie->maxlen=chars;
1509 }
1510
a3621e74
YO
1511 } /* end first pass */
1512 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1513 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1514 (int)depth * 2 + 2,"",
55eed653 1515 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1516 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1517 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1518 );
a3621e74
YO
1519
1520 /*
1521 We now know what we are dealing with in terms of unique chars and
1522 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1523 representation using a flat table will take. If it's over a reasonable
1524 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1525 conservative but potentially much slower representation using an array
1526 of lists.
1527
1528 At the end we convert both representations into the same compressed
1529 form that will be used in regexec.c for matching with. The latter
1530 is a form that cannot be used to construct with but has memory
1531 properties similar to the list form and access properties similar
1532 to the table form making it both suitable for fast searches and
1533 small enough that its feasable to store for the duration of a program.
1534
1535 See the comment in the code where the compressed table is produced
1536 inplace from the flat tabe representation for an explanation of how
1537 the compression works.
1538
1539 */
1540
1541
2e64971a
DM
1542 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1543 prev_states[1] = 0;
1544
3dab1dad 1545 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1546 /*
1547 Second Pass -- Array Of Lists Representation
1548
1549 Each state will be represented by a list of charid:state records
1550 (reg_trie_trans_le) the first such element holds the CUR and LEN
1551 points of the allocated array. (See defines above).
1552
1553 We build the initial structure using the lists, and then convert
1554 it into the compressed table form which allows faster lookups
1555 (but cant be modified once converted).
a3621e74
YO
1556 */
1557
a3621e74
YO
1558 STRLEN transcount = 1;
1559
1e2e3d02
YO
1560 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1561 "%*sCompiling trie using list compiler\n",
1562 (int)depth * 2 + 2, ""));
446bd890 1563
c944940b
JH
1564 trie->states = (reg_trie_state *)
1565 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1566 sizeof(reg_trie_state) );
a3621e74
YO
1567 TRIE_LIST_NEW(1);
1568 next_alloc = 2;
1569
1570 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1571
c445ea15
AL
1572 regnode * const noper = NEXTOPER( cur );
1573 U8 *uc = (U8*)STRING( noper );
1574 const U8 * const e = uc + STR_LEN( noper );
1575 U32 state = 1; /* required init */
1576 U16 charid = 0; /* sanity init */
1577 U8 *scan = (U8*)NULL; /* sanity init */
1578 STRLEN foldlen = 0; /* required init */
07be1b83 1579 U32 wordlen = 0; /* required init */
c445ea15
AL
1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1581
3dab1dad 1582 if (OP(noper) != NOTHING) {
786e8c11 1583 for ( ; uc < e ; uc += len ) {
c445ea15 1584
786e8c11 1585 TRIE_READ_CHAR;
c445ea15 1586
786e8c11
YO
1587 if ( uvc < 256 ) {
1588 charid = trie->charmap[ uvc ];
c445ea15 1589 } else {
55eed653 1590 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1591 if ( !svpp ) {
1592 charid = 0;
1593 } else {
1594 charid=(U16)SvIV( *svpp );
1595 }
c445ea15 1596 }
786e8c11
YO
1597 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1598 if ( charid ) {
a3621e74 1599
786e8c11
YO
1600 U16 check;
1601 U32 newstate = 0;
a3621e74 1602
786e8c11
YO
1603 charid--;
1604 if ( !trie->states[ state ].trans.list ) {
1605 TRIE_LIST_NEW( state );
c445ea15 1606 }
786e8c11
YO
1607 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1608 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1609 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1610 break;
1611 }
1612 }
1613 if ( ! newstate ) {
1614 newstate = next_alloc++;
2e64971a 1615 prev_states[newstate] = state;
786e8c11
YO
1616 TRIE_LIST_PUSH( state, charid, newstate );
1617 transcount++;
1618 }
1619 state = newstate;
1620 } else {
1621 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1622 }
a28509cc 1623 }
c445ea15 1624 }
3dab1dad 1625 TRIE_HANDLE_WORD(state);
a3621e74
YO
1626
1627 } /* end second pass */
1628
1e2e3d02
YO
1629 /* next alloc is the NEXT state to be allocated */
1630 trie->statecount = next_alloc;
c944940b
JH
1631 trie->states = (reg_trie_state *)
1632 PerlMemShared_realloc( trie->states,
1633 next_alloc
1634 * sizeof(reg_trie_state) );
a3621e74 1635
3dab1dad 1636 /* and now dump it out before we compress it */
2b8b4781
NC
1637 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1638 revcharmap, next_alloc,
1639 depth+1)
1e2e3d02 1640 );
a3621e74 1641
c944940b
JH
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1644 {
1645 U32 state;
a3621e74
YO
1646 U32 tp = 0;
1647 U32 zp = 0;
1648
1649
1650 for( state=1 ; state < next_alloc ; state ++ ) {
1651 U32 base=0;
1652
1653 /*
1654 DEBUG_TRIE_COMPILE_MORE_r(
1655 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1656 );
1657 */
1658
1659 if (trie->states[state].trans.list) {
1660 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1661 U16 maxid=minid;
a28509cc 1662 U16 idx;
a3621e74
YO
1663
1664 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1665 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1666 if ( forid < minid ) {
1667 minid=forid;
1668 } else if ( forid > maxid ) {
1669 maxid=forid;
1670 }
a3621e74
YO
1671 }
1672 if ( transcount < tp + maxid - minid + 1) {
1673 transcount *= 2;
c944940b
JH
1674 trie->trans = (reg_trie_trans *)
1675 PerlMemShared_realloc( trie->trans,
446bd890
NC
1676 transcount
1677 * sizeof(reg_trie_trans) );
a3621e74
YO
1678 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1679 }
1680 base = trie->uniquecharcount + tp - minid;
1681 if ( maxid == minid ) {
1682 U32 set = 0;
1683 for ( ; zp < tp ; zp++ ) {
1684 if ( ! trie->trans[ zp ].next ) {
1685 base = trie->uniquecharcount + zp - minid;
1686 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1687 trie->trans[ zp ].check = state;
1688 set = 1;
1689 break;
1690 }
1691 }
1692 if ( !set ) {
1693 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ tp ].check = state;
1695 tp++;
1696 zp = tp;
1697 }
1698 } else {
1699 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1700 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1701 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1702 trie->trans[ tid ].check = state;
1703 }
1704 tp += ( maxid - minid + 1 );
1705 }
1706 Safefree(trie->states[ state ].trans.list);
1707 }
1708 /*
1709 DEBUG_TRIE_COMPILE_MORE_r(
1710 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1711 );
1712 */
1713 trie->states[ state ].trans.base=base;
1714 }
cc601c31 1715 trie->lasttrans = tp + 1;
a3621e74
YO
1716 }
1717 } else {
1718 /*
1719 Second Pass -- Flat Table Representation.
1720
1721 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1722 We know that we will need Charcount+1 trans at most to store the data
1723 (one row per char at worst case) So we preallocate both structures
1724 assuming worst case.
1725
1726 We then construct the trie using only the .next slots of the entry
1727 structs.
1728
1729 We use the .check field of the first entry of the node temporarily to
1730 make compression both faster and easier by keeping track of how many non
1731 zero fields are in the node.
1732
1733 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1734 transition.
1735
1736 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1737 number representing the first entry of the node, and state as a
1738 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1739 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1740 are 2 entrys per node. eg:
1741
1742 A B A B
1743 1. 2 4 1. 3 7
1744 2. 0 3 3. 0 5
1745 3. 0 0 5. 0 0
1746 4. 0 0 7. 0 0
1747
1748 The table is internally in the right hand, idx form. However as we also
1749 have to deal with the states array which is indexed by nodenum we have to
1750 use TRIE_NODENUM() to convert.
1751
1752 */
1e2e3d02
YO
1753 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1754 "%*sCompiling trie using table compiler\n",
1755 (int)depth * 2 + 2, ""));
3dab1dad 1756
c944940b
JH
1757 trie->trans = (reg_trie_trans *)
1758 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1759 * trie->uniquecharcount + 1,
1760 sizeof(reg_trie_trans) );
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
a3621e74
YO
1764 next_alloc = trie->uniquecharcount + 1;
1765
3dab1dad 1766
a3621e74
YO
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768
c445ea15 1769 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1770 const U8 *uc = (U8*)STRING( noper );
1771 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1772
1773 U32 state = 1; /* required init */
1774
1775 U16 charid = 0; /* sanity init */
1776 U32 accept_state = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1778
1779 STRLEN foldlen = 0; /* required init */
07be1b83 1780 U32 wordlen = 0; /* required init */
a3621e74
YO
1781 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1782
3dab1dad 1783 if ( OP(noper) != NOTHING ) {
786e8c11 1784 for ( ; uc < e ; uc += len ) {
a3621e74 1785
786e8c11 1786 TRIE_READ_CHAR;
a3621e74 1787
786e8c11
YO
1788 if ( uvc < 256 ) {
1789 charid = trie->charmap[ uvc ];
1790 } else {
55eed653 1791 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1792 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1793 }
786e8c11
YO
1794 if ( charid ) {
1795 charid--;
1796 if ( !trie->trans[ state + charid ].next ) {
1797 trie->trans[ state + charid ].next = next_alloc;
1798 trie->trans[ state ].check++;
2e64971a
DM
1799 prev_states[TRIE_NODENUM(next_alloc)]
1800 = TRIE_NODENUM(state);
786e8c11
YO
1801 next_alloc += trie->uniquecharcount;
1802 }
1803 state = trie->trans[ state + charid ].next;
1804 } else {
1805 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1806 }
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1808 }
a3621e74 1809 }
3dab1dad
YO
1810 accept_state = TRIE_NODENUM( state );
1811 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1812
1813 } /* end second pass */
1814
3dab1dad 1815 /* and now dump it out before we compress it */
2b8b4781
NC
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1817 revcharmap,
1818 next_alloc, depth+1));
a3621e74 1819
a3621e74
YO
1820 {
1821 /*
1822 * Inplace compress the table.*
1823
1824 For sparse data sets the table constructed by the trie algorithm will
1825 be mostly 0/FAIL transitions or to put it another way mostly empty.
1826 (Note that leaf nodes will not contain any transitions.)
1827
1828 This algorithm compresses the tables by eliminating most such
1829 transitions, at the cost of a modest bit of extra work during lookup:
1830
1831 - Each states[] entry contains a .base field which indicates the
1832 index in the state[] array wheres its transition data is stored.
1833
1834 - If .base is 0 there are no valid transitions from that node.
1835
1836 - If .base is nonzero then charid is added to it to find an entry in
1837 the trans array.
1838
1839 -If trans[states[state].base+charid].check!=state then the
1840 transition is taken to be a 0/Fail transition. Thus if there are fail
1841 transitions at the front of the node then the .base offset will point
1842 somewhere inside the previous nodes data (or maybe even into a node
1843 even earlier), but the .check field determines if the transition is
1844 valid.
1845
786e8c11 1846 XXX - wrong maybe?
a3621e74
YO
1847 The following process inplace converts the table to the compressed
1848 table: We first do not compress the root node 1,and mark its all its
1849 .check pointers as 1 and set its .base pointer as 1 as well. This
1850 allows to do a DFA construction from the compressed table later, and
1851 ensures that any .base pointers we calculate later are greater than
1852 0.
1853
1854 - We set 'pos' to indicate the first entry of the second node.
1855
1856 - We then iterate over the columns of the node, finding the first and
1857 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1858 and set the .check pointers accordingly, and advance pos
1859 appropriately and repreat for the next node. Note that when we copy
1860 the next pointers we have to convert them from the original
1861 NODEIDX form to NODENUM form as the former is not valid post
1862 compression.
1863
1864 - If a node has no transitions used we mark its base as 0 and do not
1865 advance the pos pointer.
1866
1867 - If a node only has one transition we use a second pointer into the
1868 structure to fill in allocated fail transitions from other states.
1869 This pointer is independent of the main pointer and scans forward
1870 looking for null transitions that are allocated to a state. When it
1871 finds one it writes the single transition into the "hole". If the
786e8c11 1872 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1873
1874 - Once compressed we can Renew/realloc the structures to release the
1875 excess space.
1876
1877 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1878 specifically Fig 3.47 and the associated pseudocode.
1879
1880 demq
1881 */
a3b680e6 1882 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1883 U32 state, charid;
a3621e74 1884 U32 pos = 0, zp=0;
1e2e3d02 1885 trie->statecount = laststate;
a3621e74
YO
1886
1887 for ( state = 1 ; state < laststate ; state++ ) {
1888 U8 flag = 0;
a28509cc
AL
1889 const U32 stateidx = TRIE_NODEIDX( state );
1890 const U32 o_used = trie->trans[ stateidx ].check;
1891 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1892 trie->trans[ stateidx ].check = 0;
1893
1894 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1895 if ( flag || trie->trans[ stateidx + charid ].next ) {
1896 if ( trie->trans[ stateidx + charid ].next ) {
1897 if (o_used == 1) {
1898 for ( ; zp < pos ; zp++ ) {
1899 if ( ! trie->trans[ zp ].next ) {
1900 break;
1901 }
1902 }
1903 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1904 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ zp ].check = state;
1906 if ( ++zp > pos ) pos = zp;
1907 break;
1908 }
1909 used--;
1910 }
1911 if ( !flag ) {
1912 flag = 1;
1913 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1914 }
1915 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ pos ].check = state;
1917 pos++;
1918 }
1919 }
1920 }
cc601c31 1921 trie->lasttrans = pos + 1;
c944940b
JH
1922 trie->states = (reg_trie_state *)
1923 PerlMemShared_realloc( trie->states, laststate
1924 * sizeof(reg_trie_state) );
a3621e74 1925 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1926 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1927 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1928 (int)depth * 2 + 2,"",
1929 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1930 (IV)next_alloc,
1931 (IV)pos,
a3621e74
YO
1932 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1933 );
1934
1935 } /* end table compress */
1936 }
1e2e3d02
YO
1937 DEBUG_TRIE_COMPILE_MORE_r(
1938 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1939 (int)depth * 2 + 2, "",
1940 (UV)trie->statecount,
1941 (UV)trie->lasttrans)
1942 );
cc601c31 1943 /* resize the trans array to remove unused space */
c944940b
JH
1944 trie->trans = (reg_trie_trans *)
1945 PerlMemShared_realloc( trie->trans, trie->lasttrans
1946 * sizeof(reg_trie_trans) );
a3621e74 1947
3dab1dad 1948 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1949 U8 nodetype =(U8)(flags & 0xFF);
1950 char *str=NULL;
786e8c11 1951
07be1b83 1952#ifdef DEBUGGING
e62cc96a 1953 regnode *optimize = NULL;
7122b237
YO
1954#ifdef RE_TRACK_PATTERN_OFFSETS
1955
b57a0404
JH
1956 U32 mjd_offset = 0;
1957 U32 mjd_nodelen = 0;
7122b237
YO
1958#endif /* RE_TRACK_PATTERN_OFFSETS */
1959#endif /* DEBUGGING */
a3621e74 1960 /*
3dab1dad
YO
1961 This means we convert either the first branch or the first Exact,
1962 depending on whether the thing following (in 'last') is a branch
1963 or not and whther first is the startbranch (ie is it a sub part of
1964 the alternation or is it the whole thing.)
1965 Assuming its a sub part we conver the EXACT otherwise we convert
1966 the whole branch sequence, including the first.
a3621e74 1967 */
3dab1dad 1968 /* Find the node we are going to overwrite */
7f69552c 1969 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1970 /* branch sub-chain */
3dab1dad 1971 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1972#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1973 DEBUG_r({
1974 mjd_offset= Node_Offset((convert));
1975 mjd_nodelen= Node_Length((convert));
1976 });
7122b237 1977#endif
7f69552c 1978 /* whole branch chain */
7122b237
YO
1979 }
1980#ifdef RE_TRACK_PATTERN_OFFSETS
1981 else {
7f69552c
YO
1982 DEBUG_r({
1983 const regnode *nop = NEXTOPER( convert );
1984 mjd_offset= Node_Offset((nop));
1985 mjd_nodelen= Node_Length((nop));
1986 });
07be1b83
YO
1987 }
1988 DEBUG_OPTIMISE_r(
1989 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1990 (int)depth * 2 + 2, "",
786e8c11 1991 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1992 );
7122b237 1993#endif
3dab1dad
YO
1994 /* But first we check to see if there is a common prefix we can
1995 split out as an EXACT and put in front of the TRIE node. */
1996 trie->startstate= 1;
55eed653 1997 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1998 U32 state;
1e2e3d02 1999 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2000 U32 ofs = 0;
8e11feef
RGS
2001 I32 idx = -1;
2002 U32 count = 0;
2003 const U32 base = trie->states[ state ].trans.base;
a3621e74 2004
3dab1dad 2005 if ( trie->states[state].wordnum )
8e11feef 2006 count = 1;
a3621e74 2007
8e11feef 2008 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2009 if ( ( base + ofs >= trie->uniquecharcount ) &&
2010 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2011 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2012 {
3dab1dad 2013 if ( ++count > 1 ) {
2b8b4781 2014 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2015 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2016 if ( state == 1 ) break;
3dab1dad
YO
2017 if ( count == 2 ) {
2018 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2019 DEBUG_OPTIMISE_r(
8e11feef
RGS
2020 PerlIO_printf(Perl_debug_log,
2021 "%*sNew Start State=%"UVuf" Class: [",
2022 (int)depth * 2 + 2, "",
786e8c11 2023 (UV)state));
be8e71aa 2024 if (idx >= 0) {
2b8b4781 2025 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2026 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2027
3dab1dad 2028 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2029 if ( folder )
2030 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2031 DEBUG_OPTIMISE_r(
f1f66076 2032 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2033 );
8e11feef
RGS
2034 }
2035 }
2036 TRIE_BITMAP_SET(trie,*ch);
2037 if ( folder )
2038 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2040 }
2041 idx = ofs;
2042 }
3dab1dad
YO
2043 }
2044 if ( count == 1 ) {
2b8b4781 2045 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2046 STRLEN len;
2047 char *ch = SvPV( *tmp, len );
de734bd5
A
2048 DEBUG_OPTIMISE_r({
2049 SV *sv=sv_newmortal();
8e11feef
RGS
2050 PerlIO_printf( Perl_debug_log,
2051 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2052 (int)depth * 2 + 2, "",
de734bd5
A
2053 (UV)state, (UV)idx,
2054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2055 PL_colors[0], PL_colors[1],
2056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2057 PERL_PV_ESCAPE_FIRSTCHAR
2058 )
2059 );
2060 });
3dab1dad
YO
2061 if ( state==1 ) {
2062 OP( convert ) = nodetype;
2063 str=STRING(convert);
2064 STR_LEN(convert)=0;
2065 }
c490c714
YO
2066 STR_LEN(convert) += len;
2067 while (len--)
de734bd5 2068 *str++ = *ch++;
8e11feef 2069 } else {
f9049ba1 2070#ifdef DEBUGGING
8e11feef
RGS
2071 if (state>1)
2072 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2073#endif
8e11feef
RGS
2074 break;
2075 }
2076 }
2e64971a 2077 trie->prefixlen = (state-1);
3dab1dad 2078 if (str) {
8e11feef 2079 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2080 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2081 trie->startstate = state;
07be1b83
YO
2082 trie->minlen -= (state - 1);
2083 trie->maxlen -= (state - 1);
33809eae
JH
2084#ifdef DEBUGGING
2085 /* At least the UNICOS C compiler choked on this
2086 * being argument to DEBUG_r(), so let's just have
2087 * it right here. */
2088 if (
2089#ifdef PERL_EXT_RE_BUILD
2090 1
2091#else
2092 DEBUG_r_TEST
2093#endif
2094 ) {
2095 regnode *fix = convert;
2096 U32 word = trie->wordcount;
2097 mjd_nodelen++;
2098 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2099 while( ++fix < n ) {
2100 Set_Node_Offset_Length(fix, 0, 0);
2101 }
2102 while (word--) {
2103 SV ** const tmp = av_fetch( trie_words, word, 0 );
2104 if (tmp) {
2105 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2106 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2107 else
2108 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2109 }
2110 }
2111 }
2112#endif
8e11feef
RGS
2113 if (trie->maxlen) {
2114 convert = n;
2115 } else {
3dab1dad 2116 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2117 DEBUG_r(optimize= n);
3dab1dad
YO
2118 }
2119 }
2120 }
a5ca303d
YO
2121 if (!jumper)
2122 jumper = last;
3dab1dad 2123 if ( trie->maxlen ) {
8e11feef
RGS
2124 NEXT_OFF( convert ) = (U16)(tail - convert);
2125 ARG_SET( convert, data_slot );
786e8c11
YO
2126 /* Store the offset to the first unabsorbed branch in
2127 jump[0], which is otherwise unused by the jump logic.
2128 We use this when dumping a trie and during optimisation. */
2129 if (trie->jump)
7f69552c 2130 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2131
6c48061a
YO
2132 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2133 * and there is a bitmap
2134 * and the first "jump target" node we found leaves enough room
2135 * then convert the TRIE node into a TRIEC node, with the bitmap
2136 * embedded inline in the opcode - this is hypothetically faster.
2137 */
2138 if ( !trie->states[trie->startstate].wordnum
2139 && trie->bitmap
2140 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2141 {
2142 OP( convert ) = TRIEC;
2143 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2144 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2145 trie->bitmap= NULL;
2146 } else
2147 OP( convert ) = TRIE;
a3621e74 2148
3dab1dad
YO
2149 /* store the type in the flags */
2150 convert->flags = nodetype;
a5ca303d
YO
2151 DEBUG_r({
2152 optimize = convert
2153 + NODE_STEP_REGNODE
2154 + regarglen[ OP( convert ) ];
2155 });
2156 /* XXX We really should free up the resource in trie now,
2157 as we won't use them - (which resources?) dmq */
3dab1dad 2158 }
a3621e74 2159 /* needed for dumping*/
e62cc96a 2160 DEBUG_r(if (optimize) {
07be1b83 2161 regnode *opt = convert;
bcdf7404 2162
e62cc96a 2163 while ( ++opt < optimize) {
07be1b83
YO
2164 Set_Node_Offset_Length(opt,0,0);
2165 }
786e8c11
YO
2166 /*
2167 Try to clean up some of the debris left after the
2168 optimisation.
a3621e74 2169 */
786e8c11 2170 while( optimize < jumper ) {
07be1b83 2171 mjd_nodelen += Node_Length((optimize));
a3621e74 2172 OP( optimize ) = OPTIMIZED;
07be1b83 2173 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2174 optimize++;
2175 }
07be1b83 2176 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2177 });
2178 } /* end node insert */
2e64971a
DM
2179
2180 /* Finish populating the prev field of the wordinfo array. Walk back
2181 * from each accept state until we find another accept state, and if
2182 * so, point the first word's .prev field at the second word. If the
2183 * second already has a .prev field set, stop now. This will be the
2184 * case either if we've already processed that word's accept state,
2185 * or that that state had multiple words, and the overspill words
2186 * were already linked up earlier.
2187 */
2188 {
2189 U16 word;
2190 U32 state;
2191 U16 prev;
2192
2193 for (word=1; word <= trie->wordcount; word++) {
2194 prev = 0;
2195 if (trie->wordinfo[word].prev)
2196 continue;
2197 state = trie->wordinfo[word].accept;
2198 while (state) {
2199 state = prev_states[state];
2200 if (!state)
2201 break;
2202 prev = trie->states[state].wordnum;
2203 if (prev)
2204 break;
2205 }
2206 trie->wordinfo[word].prev = prev;
2207 }
2208 Safefree(prev_states);
2209 }
2210
2211
2212 /* and now dump out the compressed format */
2213 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2214
55eed653 2215 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2216#ifdef DEBUGGING
2217 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2218 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2219#else
2220 SvREFCNT_dec(revcharmap);
07be1b83 2221#endif
786e8c11
YO
2222 return trie->jump
2223 ? MADE_JUMP_TRIE
2224 : trie->startstate>1
2225 ? MADE_EXACT_TRIE
2226 : MADE_TRIE;
2227}
2228
2229STATIC void
2230S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2231{
2232/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2233
2234 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2235 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2236 ISBN 0-201-10088-6
2237
2238 We find the fail state for each state in the trie, this state is the longest proper
2239 suffix of the current states 'word' that is also a proper prefix of another word in our
2240 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2241 the DFA not to have to restart after its tried and failed a word at a given point, it
2242 simply continues as though it had been matching the other word in the first place.
2243 Consider
2244 'abcdgu'=~/abcdefg|cdgu/
2245 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2246 fail, which would bring use to the state representing 'd' in the second word where we would
2247 try 'g' and succeed, prodceding to match 'cdgu'.
2248 */
2249 /* add a fail transition */
3251b653
NC
2250 const U32 trie_offset = ARG(source);
2251 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2252 U32 *q;
2253 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2254 const U32 numstates = trie->statecount;
786e8c11
YO
2255 const U32 ubound = trie->lasttrans + ucharcount;
2256 U32 q_read = 0;
2257 U32 q_write = 0;
2258 U32 charid;
2259 U32 base = trie->states[ 1 ].trans.base;
2260 U32 *fail;
2261 reg_ac_data *aho;
2262 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2263 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2264
2265 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2266#ifndef DEBUGGING
2267 PERL_UNUSED_ARG(depth);
2268#endif
2269
2270
2271 ARG_SET( stclass, data_slot );
c944940b 2272 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2273 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2274 aho->trie=trie_offset;
446bd890
NC
2275 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2276 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2277 Newxz( q, numstates, U32);
c944940b 2278 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2279 aho->refcount = 1;
2280 fail = aho->fail;
2281 /* initialize fail[0..1] to be 1 so that we always have
2282 a valid final fail state */
2283 fail[ 0 ] = fail[ 1 ] = 1;
2284
2285 for ( charid = 0; charid < ucharcount ; charid++ ) {
2286 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2287 if ( newstate ) {
2288 q[ q_write ] = newstate;
2289 /* set to point at the root */
2290 fail[ q[ q_write++ ] ]=1;
2291 }
2292 }
2293 while ( q_read < q_write) {
2294 const U32 cur = q[ q_read++ % numstates ];
2295 base = trie->states[ cur ].trans.base;
2296
2297 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2298 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2299 if (ch_state) {
2300 U32 fail_state = cur;
2301 U32 fail_base;
2302 do {
2303 fail_state = fail[ fail_state ];
2304 fail_base = aho->states[ fail_state ].trans.base;
2305 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2306
2307 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2308 fail[ ch_state ] = fail_state;
2309 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2310 {
2311 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2312 }
2313 q[ q_write++ % numstates] = ch_state;
2314 }
2315 }
2316 }
2317 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2318 when we fail in state 1, this allows us to use the
2319 charclass scan to find a valid start char. This is based on the principle
2320 that theres a good chance the string being searched contains lots of stuff
2321 that cant be a start char.
2322 */
2323 fail[ 0 ] = fail[ 1 ] = 0;
2324 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2325 PerlIO_printf(Perl_debug_log,
2326 "%*sStclass Failtable (%"UVuf" states): 0",
2327 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2328 );
786e8c11
YO
2329 for( q_read=1; q_read<numstates; q_read++ ) {
2330 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2331 }
2332 PerlIO_printf(Perl_debug_log, "\n");
2333 });
2334 Safefree(q);
2335 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2336}
2337
786e8c11 2338
a3621e74 2339/*
5d1c421c
JH
2340 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2341 * These need to be revisited when a newer toolchain becomes available.
2342 */
2343#if defined(__sparc64__) && defined(__GNUC__)
2344# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2345# undef SPARC64_GCC_WORKAROUND
2346# define SPARC64_GCC_WORKAROUND 1
2347# endif
2348#endif
2349
07be1b83 2350#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2351 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2352 SV * const mysv=sv_newmortal(); \
2353 regnode *Next = regnext(scan); \
2354 regprop(RExC_rx, mysv, scan); \
7f69552c 2355 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2356 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2357 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2358 }});
07be1b83 2359
1de06328
YO
2360
2361
2362
2363
07be1b83
YO
2364#define JOIN_EXACT(scan,min,flags) \
2365 if (PL_regkind[OP(scan)] == EXACT) \
2366 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2367
be8e71aa 2368STATIC U32
07be1b83
YO
2369S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2370 /* Merge several consecutive EXACTish nodes into one. */
2371 regnode *n = regnext(scan);
2372 U32 stringok = 1;
2373 regnode *next = scan + NODE_SZ_STR(scan);
2374 U32 merged = 0;
2375 U32 stopnow = 0;
2376#ifdef DEBUGGING
2377 regnode *stop = scan;
72f13be8 2378 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2379#else
d47053eb
RGS
2380 PERL_UNUSED_ARG(depth);
2381#endif
7918f24d
NC
2382
2383 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2384#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2385 PERL_UNUSED_ARG(flags);
2386 PERL_UNUSED_ARG(val);
07be1b83 2387#endif
07be1b83
YO
2388 DEBUG_PEEP("join",scan,depth);
2389
2390 /* Skip NOTHING, merge EXACT*. */
2391 while (n &&
2392 ( PL_regkind[OP(n)] == NOTHING ||
2393 (stringok && (OP(n) == OP(scan))))
2394 && NEXT_OFF(n)
2395 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2396
2397 if (OP(n) == TAIL || n > next)
2398 stringok = 0;
2399 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2400 DEBUG_PEEP("skip:",n,depth);
2401 NEXT_OFF(scan) += NEXT_OFF(n);
2402 next = n + NODE_STEP_REGNODE;
2403#ifdef DEBUGGING
2404 if (stringok)
2405 stop = n;
2406#endif
2407 n = regnext(n);
2408 }
2409 else if (stringok) {
786e8c11 2410 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2411 regnode * const nnext = regnext(n);
2412
2413 DEBUG_PEEP("merg",n,depth);
2414
2415 merged++;
2416 if (oldl + STR_LEN(n) > U8_MAX)
2417 break;
2418 NEXT_OFF(scan) += NEXT_OFF(n);
2419 STR_LEN(scan) += STR_LEN(n);
2420 next = n + NODE_SZ_STR(n);
2421 /* Now we can overwrite *n : */
2422 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2423#ifdef DEBUGGING
2424 stop = next - 1;
2425#endif
2426 n = nnext;
2427 if (stopnow) break;
2428 }
2429
d47053eb
RGS
2430#ifdef EXPERIMENTAL_INPLACESCAN
2431 if (flags && !NEXT_OFF(n)) {
2432 DEBUG_PEEP("atch", val, depth);
2433 if (reg_off_by_arg[OP(n)]) {
2434 ARG_SET(n, val - n);
2435 }
2436 else {
2437 NEXT_OFF(n) = val - n;
2438 }
2439 stopnow = 1;
2440 }
07be1b83
YO
2441#endif
2442 }
2443
2444 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2445 /*
2446 Two problematic code points in Unicode casefolding of EXACT nodes:
2447
2448 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2449 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2450
2451 which casefold to
2452
2453 Unicode UTF-8
2454
2455 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2456 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2457
2458 This means that in case-insensitive matching (or "loose matching",
2459 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2460 length of the above casefolded versions) can match a target string
2461 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2462 This would rather mess up the minimum length computation.
2463
2464 What we'll do is to look for the tail four bytes, and then peek
2465 at the preceding two bytes to see whether we need to decrease
2466 the minimum length by four (six minus two).
2467
2468 Thanks to the design of UTF-8, there cannot be false matches:
2469 A sequence of valid UTF-8 bytes cannot be a subsequence of
2470 another valid sequence of UTF-8 bytes.
2471
2472 */
2473 char * const s0 = STRING(scan), *s, *t;
2474 char * const s1 = s0 + STR_LEN(scan) - 1;
2475 char * const s2 = s1 - 4;
e294cc5d
JH
2476#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2477 const char t0[] = "\xaf\x49\xaf\x42";
2478#else
07be1b83 2479 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2480#endif
07be1b83
YO
2481 const char * const t1 = t0 + 3;
2482
2483 for (s = s0 + 2;
2484 s < s2 && (t = ninstr(s, s1, t0, t1));
2485 s = t + 4) {
e294cc5d
JH
2486#ifdef EBCDIC
2487 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2488 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2489#else
07be1b83
YO
2490 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2491 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2492#endif
07be1b83
YO
2493 *min -= 4;
2494 }
2495 }
2496
2497#ifdef DEBUGGING
2498 /* Allow dumping */
2499 n = scan + NODE_SZ_STR(scan);
2500 while (n <= stop) {
2501 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2502 OP(n) = OPTIMIZED;
2503 NEXT_OFF(n) = 0;
2504 }
2505 n++;
2506 }
2507#endif
2508 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2509 return stopnow;
2510}
2511
653099ff
GS
2512/* REx optimizer. Converts nodes into quickier variants "in place".
2513 Finds fixed substrings. */
2514
a0288114 2515/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2516 to the position after last scanned or to NULL. */
2517
40d049e4
YO
2518#define INIT_AND_WITHP \
2519 assert(!and_withp); \
2520 Newx(and_withp,1,struct regnode_charclass_class); \
2521 SAVEFREEPV(and_withp)
07be1b83 2522
b515a41d
YO
2523/* this is a chain of data about sub patterns we are processing that
2524 need to be handled seperately/specially in study_chunk. Its so
2525 we can simulate recursion without losing state. */
2526struct scan_frame;
2527typedef struct scan_frame {
2528 regnode *last; /* last node to process in this frame */
2529 regnode *next; /* next node to process when last is reached */
2530 struct scan_frame *prev; /*previous frame*/
2531 I32 stop; /* what stopparen do we use */
2532} scan_frame;
2533
304ee84b
YO
2534
2535#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2536
e1d1eefb
YO
2537#define CASE_SYNST_FNC(nAmE) \
2538case nAmE: \
2539 if (flags & SCF_DO_STCLASS_AND) { \
2540 for (value = 0; value < 256; value++) \
2541 if (!is_ ## nAmE ## _cp(value)) \
2542 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2543 } \
2544 else { \
2545 for (value = 0; value < 256; value++) \
2546 if (is_ ## nAmE ## _cp(value)) \
2547 ANYOF_BITMAP_SET(data->start_class, value); \
2548 } \
2549 break; \
2550case N ## nAmE: \
2551 if (flags & SCF_DO_STCLASS_AND) { \
2552 for (value = 0; value < 256; value++) \
2553 if (is_ ## nAmE ## _cp(value)) \
2554 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2555 } \
2556 else { \
2557 for (value = 0; value < 256; value++) \
2558 if (!is_ ## nAmE ## _cp(value)) \
2559 ANYOF_BITMAP_SET(data->start_class, value); \
2560 } \
2561 break
2562
2563
2564
76e3520e 2565STATIC I32
40d049e4 2566S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2567 I32 *minlenp, I32 *deltap,
40d049e4
YO
2568 regnode *last,
2569 scan_data_t *data,
2570 I32 stopparen,
2571 U8* recursed,
2572 struct regnode_charclass_class *and_withp,
2573 U32 flags, U32 depth)
c277df42
IZ
2574 /* scanp: Start here (read-write). */
2575 /* deltap: Write maxlen-minlen here. */
2576 /* last: Stop before this one. */
40d049e4
YO
2577 /* data: string data about the pattern */
2578 /* stopparen: treat close N as END */
2579 /* recursed: which subroutines have we recursed into */
2580 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2581{
97aff369 2582 dVAR;
c277df42
IZ
2583 I32 min = 0, pars = 0, code;
2584 regnode *scan = *scanp, *next;
2585 I32 delta = 0;
2586 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2587 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2588 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2589 scan_data_t data_fake;
a3621e74 2590 SV *re_trie_maxbuff = NULL;
786e8c11 2591 regnode *first_non_open = scan;
e2e6a0f1 2592 I32 stopmin = I32_MAX;
8aa23a47 2593 scan_frame *frame = NULL;
a3621e74 2594 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2595
7918f24d
NC
2596 PERL_ARGS_ASSERT_STUDY_CHUNK;
2597
13a24bad 2598#ifdef DEBUGGING
40d049e4 2599 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2600#endif
40d049e4 2601
786e8c11 2602 if ( depth == 0 ) {
40d049e4 2603 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2604 first_non_open=regnext(first_non_open);
2605 }
2606
b81d288d 2607
8aa23a47
YO
2608 fake_study_recurse:
2609 while ( scan && OP(scan) != END && scan < last ){
2610 /* Peephole optimizer: */
304ee84b 2611 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2612 DEBUG_PEEP("Peep",scan,depth);
2613 JOIN_EXACT(scan,&min,0);
2614
2615 /* Follow the next-chain of the current node and optimize
2616 away all the NOTHINGs from it. */
2617 if (OP(scan) != CURLYX) {
2618 const int max = (reg_off_by_arg[OP(scan)]
2619 ? I32_MAX
2620 /* I32 may be smaller than U16 on CRAYs! */
2621 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2622 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2623 int noff;
2624 regnode *n = scan;
2625
2626 /* Skip NOTHING and LONGJMP. */
2627 while ((n = regnext(n))
2628 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2629 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2630 && off + noff < max)
2631 off += noff;
2632 if (reg_off_by_arg[OP(scan)])
2633 ARG(scan) = off;
2634 else
2635 NEXT_OFF(scan) = off;
2636 }
a3621e74 2637
c277df42 2638
8aa23a47
YO
2639
2640 /* The principal pseudo-switch. Cannot be a switch, since we
2641 look into several different things. */
2642 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2643 || OP(scan) == IFTHEN) {
2644 next = regnext(scan);
2645 code = OP(scan);
2646 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2647
2648 if (OP(next) == code || code == IFTHEN) {
2649 /* NOTE - There is similar code to this block below for handling
2650 TRIE nodes on a re-study. If you change stuff here check there
2651 too. */
2652 I32 max1 = 0, min1 = I32_MAX, num = 0;
2653 struct regnode_charclass_class accum;
2654 regnode * const startbranch=scan;
2655
2656 if (flags & SCF_DO_SUBSTR)
304ee84b 2657 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2658 if (flags & SCF_DO_STCLASS)
2659 cl_init_zero(pRExC_state, &accum);
2660
2661 while (OP(scan) == code) {
2662 I32 deltanext, minnext, f = 0, fake;
2663 struct regnode_charclass_class this_class;
2664
2665 num++;
2666 data_fake.flags = 0;
2667 if (data) {
2668 data_fake.whilem_c = data->whilem_c;
2669 data_fake.last_closep = data->last_closep;
2670 }
2671 else
2672 data_fake.last_closep = &fake;
58e23c8d
YO
2673
2674 data_fake.pos_delta = delta;
8aa23a47
YO
2675 next = regnext(scan);
2676 scan = NEXTOPER(scan);
2677 if (code != BRANCH)
c277df42 2678 scan = NEXTOPER(scan);
8aa23a47
YO
2679 if (flags & SCF_DO_STCLASS) {
2680 cl_init(pRExC_state, &this_class);
2681 data_fake.start_class = &this_class;
2682 f = SCF_DO_STCLASS_AND;
58e23c8d 2683 }
8aa23a47
YO
2684 if (flags & SCF_WHILEM_VISITED_POS)
2685 f |= SCF_WHILEM_VISITED_POS;
2686
2687 /* we suppose the run is continuous, last=next...*/
2688 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2689 next, &data_fake,
2690 stopparen, recursed, NULL, f,depth+1);
2691 if (min1 > minnext)
2692 min1 = minnext;
2693 if (max1 < minnext + deltanext)
2694 max1 = minnext + deltanext;
2695 if (deltanext == I32_MAX)
2696 is_inf = is_inf_internal = 1;
2697 scan = next;
2698 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2699 pars++;
2700 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2701 if ( stopmin > minnext)
2702 stopmin = min + min1;
2703 flags &= ~SCF_DO_SUBSTR;
2704 if (data)
2705 data->flags |= SCF_SEEN_ACCEPT;
2706 }
2707 if (data) {
2708 if (data_fake.flags & SF_HAS_EVAL)
2709 data->flags |= SF_HAS_EVAL;
2710 data->whilem_c = data_fake.whilem_c;
3dab1dad 2711 }
8aa23a47
YO
2712 if (flags & SCF_DO_STCLASS)
2713 cl_or(pRExC_state, &accum, &this_class);
2714 }
2715 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2716 min1 = 0;
2717 if (flags & SCF_DO_SUBSTR) {
2718 data->pos_min += min1;
2719 data->pos_delta += max1 - min1;
2720 if (max1 != min1 || is_inf)
2721 data->longest = &(data->longest_float);
2722 }
2723 min += min1;
2724 delta += max1 - min1;
2725 if (flags & SCF_DO_STCLASS_OR) {
2726 cl_or(pRExC_state, data->start_class, &accum);
2727 if (min1) {
2728 cl_and(data->start_class, and_withp);
2729 flags &= ~SCF_DO_STCLASS;
653099ff 2730 }
8aa23a47
YO
2731 }
2732 else if (flags & SCF_DO_STCLASS_AND) {
2733 if (min1) {
2734 cl_and(data->start_class, &accum);
2735 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2736 }
8aa23a47
YO
2737 else {
2738 /* Switch to OR mode: cache the old value of
2739 * data->start_class */
2740 INIT_AND_WITHP;
2741 StructCopy(data->start_class, and_withp,
2742 struct regnode_charclass_class);
2743 flags &= ~SCF_DO_STCLASS_AND;
2744 StructCopy(&accum, data->start_class,
2745 struct regnode_charclass_class);
2746 flags |= SCF_DO_STCLASS_OR;
2747 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2748 }
8aa23a47 2749 }
a3621e74 2750
8aa23a47
YO
2751 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2752 /* demq.
a3621e74 2753
8aa23a47
YO
2754 Assuming this was/is a branch we are dealing with: 'scan' now
2755 points at the item that follows the branch sequence, whatever
2756 it is. We now start at the beginning of the sequence and look
2757 for subsequences of
a3621e74 2758
8aa23a47
YO
2759 BRANCH->EXACT=>x1
2760 BRANCH->EXACT=>x2
2761 tail
a3621e74 2762
8aa23a47 2763 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2764
8aa23a47
YO
2765 If we can find such a subseqence we need to turn the first
2766 element into a trie and then add the subsequent branch exact
2767 strings to the trie.
a3621e74 2768
8aa23a47 2769 We have two cases
a3621e74 2770
8aa23a47 2771 1. patterns where the whole set of branch can be converted.
a3621e74 2772
8aa23a47 2773 2. patterns where only a subset can be converted.
a3621e74 2774
8aa23a47
YO
2775 In case 1 we can replace the whole set with a single regop
2776 for the trie. In case 2 we need to keep the start and end
2777 branchs so
a3621e74 2778
8aa23a47
YO
2779 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2780 becomes BRANCH TRIE; BRANCH X;
786e8c11 2781
8aa23a47
YO
2782 There is an additional case, that being where there is a
2783 common prefix, which gets split out into an EXACT like node
2784 preceding the TRIE node.
a3621e74 2785
8aa23a47
YO
2786 If x(1..n)==tail then we can do a simple trie, if not we make
2787 a "jump" trie, such that when we match the appropriate word
2788 we "jump" to the appopriate tail node. Essentailly we turn
2789 a nested if into a case structure of sorts.
b515a41d 2790
8aa23a47
YO
2791 */
2792
2793 int made=0;
2794 if (!re_trie_maxbuff) {
2795 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2796 if (!SvIOK(re_trie_maxbuff))
2797 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2798 }
2799 if ( SvIV(re_trie_maxbuff)>=0 ) {
2800 regnode *cur;
2801 regnode *first = (regnode *)NULL;
2802 regnode *last = (regnode *)NULL;
2803 regnode *tail = scan;
2804 U8 optype = 0;
2805 U32 count=0;
a3621e74
YO
2806
2807#ifdef DEBUGGING
8aa23a47 2808 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2809#endif
8aa23a47
YO
2810 /* var tail is used because there may be a TAIL
2811 regop in the way. Ie, the exacts will point to the
2812 thing following the TAIL, but the last branch will
2813 point at the TAIL. So we advance tail. If we
2814 have nested (?:) we may have to move through several
2815 tails.
2816 */
2817
2818 while ( OP( tail ) == TAIL ) {
2819 /* this is the TAIL generated by (?:) */
2820 tail = regnext( tail );
2821 }
a3621e74 2822
8aa23a47
YO
2823
2824 DEBUG_OPTIMISE_r({
2825 regprop(RExC_rx, mysv, tail );
2826 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2827 (int)depth * 2 + 2, "",
2828 "Looking for TRIE'able sequences. Tail node is: ",
2829 SvPV_nolen_const( mysv )
2830 );
2831 });
2832
2833 /*
2834
2835 step through the branches, cur represents each
2836 branch, noper is the first thing to be matched
2837 as part of that branch and noper_next is the
2838 regnext() of that node. if noper is an EXACT
2839 and noper_next is the same as scan (our current
2840 position in the regex) then the EXACT branch is
2841 a possible optimization target. Once we have
2842 two or more consequetive such branches we can
2843 create a trie of the EXACT's contents and stich
2844 it in place. If the sequence represents all of
2845 the branches we eliminate the whole thing and
2846 replace it with a single TRIE. If it is a
2847 subsequence then we need to stitch it in. This
2848 means the first branch has to remain, and needs
2849 to be repointed at the item on the branch chain
2850 following the last branch optimized. This could
2851 be either a BRANCH, in which case the
2852 subsequence is internal, or it could be the
2853 item following the branch sequence in which
2854 case the subsequence is at the end.
2855
2856 */
2857
2858 /* dont use tail as the end marker for this traverse */
2859 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2860 regnode * const noper = NEXTOPER( cur );
b515a41d 2861#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2862 regnode * const noper_next = regnext( noper );
b515a41d
YO
2863#endif
2864
8aa23a47
YO
2865 DEBUG_OPTIMISE_r({
2866 regprop(RExC_rx, mysv, cur);
2867 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2868 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2869
2870 regprop(RExC_rx, mysv, noper);
2871 PerlIO_printf( Perl_debug_log, " -> %s",
2872 SvPV_nolen_const(mysv));
2873
2874 if ( noper_next ) {
2875 regprop(RExC_rx, mysv, noper_next );
2876 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2877 SvPV_nolen_const(mysv));
2878 }
2879 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2880 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2881 });
2882 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2883 : PL_regkind[ OP( noper ) ] == EXACT )
2884 || OP(noper) == NOTHING )
786e8c11 2885#ifdef NOJUMPTRIE
8aa23a47 2886 && noper_next == tail
786e8c11 2887#endif
8aa23a47
YO
2888 && count < U16_MAX)
2889 {
2890 count++;
2891 if ( !first || optype == NOTHING ) {
2892 if (!first) first = cur;
2893 optype = OP( noper );
2894 } else {
2895 last = cur;
2896 }
2897 } else {
a0a388a1 2898/*
0abd0d78
YO
2899 Currently we do not believe that the trie logic can
2900 handle case insensitive matching properly when the
2901 pattern is not unicode (thus forcing unicode semantics).
2902
2903 If/when this is fixed the following define can be swapped
2904 in below to fully enable trie logic.
2905
a0a388a1 2906#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2907
2908*/
2909#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2910
a0a388a1 2911 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2912 make_trie( pRExC_state,
2913 startbranch, first, cur, tail, count,
2914 optype, depth+1 );
2915 }
2916 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2917#ifdef NOJUMPTRIE
8aa23a47 2918 && noper_next == tail
786e8c11 2919#endif
8aa23a47
YO
2920 ){
2921 count = 1;
2922 first = cur;
2923 optype = OP( noper );
2924 } else {
2925 count = 0;
2926 first = NULL;
2927 optype = 0;
2928 }
2929 last = NULL;
2930 }
2931 }
2932 DEBUG_OPTIMISE_r({
2933 regprop(RExC_rx, mysv, cur);
2934 PerlIO_printf( Perl_debug_log,
2935 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2936 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2937
2938 });
a0a388a1
YO
2939
2940 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2941 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2942#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2943 if ( ((made == MADE_EXACT_TRIE &&
2944 startbranch == first)
2945 || ( first_non_open == first )) &&
2946 depth==0 ) {
2947 flags |= SCF_TRIE_RESTUDY;
2948 if ( startbranch == first
2949 && scan == tail )
2950 {
2951 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2952 }
2953 }
3dab1dad 2954#endif
8aa23a47
YO
2955 }
2956 }
2957
2958 } /* do trie */
2959
653099ff 2960 }
8aa23a47
YO
2961 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2962 scan = NEXTOPER(NEXTOPER(scan));
2963 } else /* single branch is optimized. */
2964 scan = NEXTOPER(scan);
2965 continue;
2966 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2967 scan_frame *newframe = NULL;
2968 I32 paren;
2969 regnode *start;
2970 regnode *end;
2971
2972 if (OP(scan) != SUSPEND) {
2973 /* set the pointer */
2974 if (OP(scan) == GOSUB) {
2975 paren = ARG(scan);
2976 RExC_recurse[ARG2L(scan)] = scan;
2977 start = RExC_open_parens[paren-1];
2978 end = RExC_close_parens[paren-1];
2979 } else {
2980 paren = 0;
f8fc2ecf 2981 start = RExC_rxi->program + 1;
8aa23a47
YO
2982 end = RExC_opend;
2983 }
2984 if (!recursed) {
2985 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2986 SAVEFREEPV(recursed);
2987 }
2988 if (!PAREN_TEST(recursed,paren+1)) {
2989 PAREN_SET(recursed,paren+1);
2990 Newx(newframe,1,scan_frame);
2991 } else {
2992 if (flags & SCF_DO_SUBSTR) {
304ee84b 2993 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2994 data->longest = &(data->longest_float);
2995 }
2996 is_inf = is_inf_internal = 1;
2997 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2998 cl_anything(pRExC_state, data->start_class);
2999 flags &= ~SCF_DO_STCLASS;
3000 }
3001 } else {
3002 Newx(newframe,1,scan_frame);
3003 paren = stopparen;
3004 start = scan+2;
3005 end = regnext(scan);
3006 }
3007 if (newframe) {
3008 assert(start);
3009 assert(end);
3010 SAVEFREEPV(newframe);
3011 newframe->next = regnext(scan);
3012 newframe->last = last;
3013 newframe->stop = stopparen;
3014 newframe->prev = frame;
3015
3016 frame = newframe;
3017 scan = start;
3018 stopparen = paren;
3019 last = end;
3020
3021 continue;
3022 }
3023 }
3024 else if (OP(scan) == EXACT) {
3025 I32 l = STR_LEN(scan);
3026 UV uc;
3027 if (UTF) {
3028 const U8 * const s = (U8*)STRING(scan);
3029 l = utf8_length(s, s + l);
3030 uc = utf8_to_uvchr(s, NULL);
3031 } else {
3032 uc = *((U8*)STRING(scan));
3033 }
3034 min += l;
3035 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3036 /* The code below prefers earlier match for fixed
3037 offset, later match for variable offset. */
3038 if (data->last_end == -1) { /* Update the start info. */
3039 data->last_start_min = data->pos_min;
3040 data->last_start_max = is_inf
3041 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3042 }
8aa23a47
YO
3043 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3044 if (UTF)
3045 SvUTF8_on(data->last_found);
3046 {
3047 SV * const sv = data->last_found;
3048 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3049 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3050 if (mg && mg->mg_len >= 0)
3051 mg->mg_len += utf8_length((U8*)STRING(scan),
3052 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3053 }
8aa23a47
YO
3054 data->last_end = data->pos_min + l;
3055 data->pos_min += l; /* As in the first entry. */
3056 data->flags &= ~SF_BEFORE_EOL;
3057 }
3058 if (flags & SCF_DO_STCLASS_AND) {
3059 /* Check whether it is compatible with what we know already! */
3060 int compat = 1;
3061
3062 if (uc >= 0x100 ||
3063 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3064 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3065 && (!(data->start_class->flags & ANYOF_FOLD)
3066 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3067 )
3068 compat = 0;
3069 ANYOF_CLASS_ZERO(data->start_class);
3070 ANYOF_BITMAP_ZERO(data->start_class);
3071 if (compat)
3072 ANYOF_BITMAP_SET(data->start_class, uc);
3073 data->start_class->flags &= ~ANYOF_EOS;
3074 if (uc < 0x100)
3075 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3076 }
3077 else if (flags & SCF_DO_STCLASS_OR) {
3078 /* false positive possible if the class is case-folded */
3079 if (uc < 0x100)
3080 ANYOF_BITMAP_SET(data->start_class, uc);
3081 else
3082 data->start_class->flags |= ANYOF_UNICODE_ALL;
3083 data->start_class->flags &= ~ANYOF_EOS;
3084 cl_and(data->start_class, and_withp);
3085 }
3086 flags &= ~SCF_DO_STCLASS;
3087 }
3088 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3089 I32 l = STR_LEN(scan);
3090 UV uc = *((U8*)STRING(scan));
3091
3092 /* Search for fixed substrings supports EXACT only. */
3093 if (flags & SCF_DO_SUBSTR) {
3094 assert(data);
304ee84b 3095 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3096 }
3097 if (UTF) {
3098 const U8 * const s = (U8 *)STRING(scan);
3099 l = utf8_length(s, s + l);
3100 uc = utf8_to_uvchr(s, NULL);
3101 }
3102 min += l;
3103 if (flags & SCF_DO_SUBSTR)
3104 data->pos_min += l;
3105 if (flags & SCF_DO_STCLASS_AND) {
3106 /* Check whether it is compatible with what we know already! */
3107 int compat = 1;
3108
3109 if (uc >= 0x100 ||
3110 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3111 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3112 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3113 compat = 0;
3114 ANYOF_CLASS_ZERO(data->start_class);
3115 ANYOF_BITMAP_ZERO(data->start_class);
3116 if (compat) {
3117 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3118 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3119 data->start_class->flags |= ANYOF_FOLD;
3120 if (OP(scan) == EXACTFL)
3121 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3122 }
8aa23a47
YO
3123 }
3124 else if (flags & SCF_DO_STCLASS_OR) {
3125 if (data->start_class->flags & ANYOF_FOLD) {
3126 /* false positive possible if the class is case-folded.
3127 Assume that the locale settings are the same... */
1aa99e6b
IH
3128 if (uc < 0x100)
3129 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3130 data->start_class->flags &= ~ANYOF_EOS;
3131 }
8aa23a47 3132 cl_and(data->start_class, and_withp);
653099ff 3133 }
8aa23a47
YO
3134 flags &= ~SCF_DO_STCLASS;
3135 }
e52fc539 3136 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3137 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3138 I32 f = flags, pos_before = 0;
3139 regnode * const oscan = scan;
3140 struct regnode_charclass_class this_class;
3141 struct regnode_charclass_class *oclass = NULL;
3142 I32 next_is_eval = 0;
3143
3144 switch (PL_regkind[OP(scan)]) {
3145 case WHILEM: /* End of (?:...)* . */
3146 scan = NEXTOPER(scan);
3147 goto finish;
3148 case PLUS:
3149 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3150 next = NEXTOPER(scan);
3151 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3152 mincount = 1;
3153 maxcount = REG_INFTY;
3154 next = regnext(scan);
3155 scan = NEXTOPER(scan);
3156 goto do_curly;
3157 }
3158 }
3159 if (flags & SCF_DO_SUBSTR)
3160 data->pos_min++;
3161 min++;
3162 /* Fall through. */
3163 case STAR:
3164 if (flags & SCF_DO_STCLASS) {
3165 mincount = 0;
3166 maxcount = REG_INFTY;
3167 next = regnext(scan);
3168 scan = NEXTOPER(scan);
3169 goto do_curly;
3170 }
3171 is_inf = is_inf_internal = 1;
3172 scan = regnext(scan);
c277df42 3173 if (flags & SCF_DO_SUBSTR) {
304ee84b 3174 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3175 data->longest = &(data->longest_float);
c277df42 3176 }
8aa23a47
YO
3177 goto optimize_curly_tail;
3178 case CURLY:
3179 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3180 && (scan->flags == stopparen))
3181 {
3182 mincount = 1;
3183 maxcount = 1;
3184 } else {
3185 mincount = ARG1(scan);
3186 maxcount = ARG2(scan);
653099ff 3187 }
8aa23a47
YO
3188 next = regnext(scan);
3189 if (OP(scan) == CURLYX) {
3190 I32 lp = (data ? *(data->last_closep) : 0);
3191 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3192 }
8aa23a47
YO
3193 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3194 next_is_eval = (OP(scan) == EVAL);
3195 do_curly:
3196 if (flags & SCF_DO_SUBSTR) {
304ee84b 3197 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3198 pos_before = data->pos_min;
b45f050a 3199 }
8aa23a47
YO
3200 if (data) {
3201 fl = data->flags;
3202 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3203 if (is_inf)
3204 data->flags |= SF_IS_INF;
3205 }
3206 if (flags & SCF_DO_STCLASS) {
3207 cl_init(pRExC_state, &this_class);
3208 oclass = data->start_class;
3209 data->start_class = &this_class;
3210 f |= SCF_DO_STCLASS_AND;
3211 f &= ~SCF_DO_STCLASS_OR;
3212 }
3213 /* These are the cases when once a subexpression
3214 fails at a particular position, it cannot succeed
3215 even after backtracking at the enclosing scope.
b9a59e08 3216
8aa23a47
YO
3217 XXXX what if minimal match and we are at the
3218 initial run of {n,m}? */
3219 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3220 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3221
8aa23a47
YO
3222 /* This will finish on WHILEM, setting scan, or on NULL: */
3223 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3224 last, data, stopparen, recursed, NULL,
3225 (mincount == 0
3226 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3227
8aa23a47
YO
3228 if (flags & SCF_DO_STCLASS)
3229 data->start_class = oclass;
3230 if (mincount == 0 || minnext == 0) {
3231 if (flags & SCF_DO_STCLASS_OR) {
3232 cl_or(pRExC_state, data->start_class, &this_class);
3233 }
3234 else if (flags & SCF_DO_STCLASS_AND) {
3235 /* Switch to OR mode: cache the old value of
3236 * data->start_class */
3237 INIT_AND_WITHP;
3238 StructCopy(data->start_class, and_withp,
3239 struct regnode_charclass_class);
3240 flags &= ~SCF_DO_STCLASS_AND;
3241 StructCopy(&this_class, data->start_class,
3242 struct regnode_charclass_class);
3243 flags |= SCF_DO_STCLASS_OR;
3244 data->start_class->flags |= ANYOF_EOS;
3245 }
3246 } else { /* Non-zero len */
3247 if (flags & SCF_DO_STCLASS_OR) {
3248 cl_or(pRExC_state, data->start_class, &this_class);
3249 cl_and(data->start_class, and_withp);
3250 }
3251 else if (flags & SCF_DO_STCLASS_AND)
3252 cl_and(data->start_class, &this_class);
3253 flags &= ~SCF_DO_STCLASS;
3254 }
3255 if (!scan) /* It was not CURLYX, but CURLY. */
3256 scan = next;
3257 if ( /* ? quantifier ok, except for (?{ ... }) */
3258 (next_is_eval || !(mincount == 0 && maxcount == 1))
3259 && (minnext == 0) && (deltanext == 0)
3260 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3261 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3262 {
668c081a
NC
3263 ckWARNreg(RExC_parse,
3264 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3265 }
3266
3267 min += minnext * mincount;
3268 is_inf_internal |= ((maxcount == REG_INFTY
3269 && (minnext + deltanext) > 0)
3270 || deltanext == I32_MAX);
3271 is_inf |= is_inf_internal;
3272 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3273
3274 /* Try powerful optimization CURLYX => CURLYN. */
3275 if ( OP(oscan) == CURLYX && data
3276 && data->flags & SF_IN_PAR
3277 && !(data->flags & SF_HAS_EVAL)
3278 && !deltanext && minnext == 1 ) {
3279 /* Try to optimize to CURLYN. */
3280 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3281 regnode * const nxt1 = nxt;
497b47a8 3282#ifdef DEBUGGING
8aa23a47 3283 regnode *nxt2;
497b47a8 3284#endif
c277df42 3285
8aa23a47
YO
3286 /* Skip open. */
3287 nxt = regnext(nxt);
e52fc539 3288 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3289 && !(PL_regkind[OP(nxt)] == EXACT
3290 && STR_LEN(nxt) == 1))
3291 goto nogo;
497b47a8 3292#ifdef DEBUGGING
8aa23a47 3293 nxt2 = nxt;
497b47a8 3294#endif
8aa23a47
YO
3295 nxt = regnext(nxt);
3296 if (OP(nxt) != CLOSE)
3297 goto nogo;
3298 if (RExC_open_parens) {
3299 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3300 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3301 }
3302 /* Now we know that nxt2 is the only contents: */
3303 oscan->flags = (U8)ARG(nxt);
3304 OP(oscan) = CURLYN;
3305 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3306
c277df42 3307#ifdef DEBUGGING
8aa23a47 3308 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3309 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3310 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3311 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3312 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3313 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3314#endif
8aa23a47
YO
3315 }
3316 nogo:
3317
3318 /* Try optimization CURLYX => CURLYM. */
3319 if ( OP(oscan) == CURLYX && data
3320 && !(data->flags & SF_HAS_PAR)
3321 && !(data->flags & SF_HAS_EVAL)
3322 && !deltanext /* atom is fixed width */
3323 && minnext != 0 /* CURLYM can't handle zero width */
3324 ) {
3325 /* XXXX How to optimize if data == 0? */
3326 /* Optimize to a simpler form. */
3327 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3328 regnode *nxt2;
3329
3330 OP(oscan) = CURLYM;
3331 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3332 && (OP(nxt2) != WHILEM))
3333 nxt = nxt2;
3334 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3335 /* Need to optimize away parenths. */
b3c0965f 3336 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3337 /* Set the parenth number. */
3338 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3339
8aa23a47
YO
3340 oscan->flags = (U8)ARG(nxt);
3341 if (RExC_open_parens) {
3342 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3343 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3344 }
8aa23a47
YO
3345 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3346 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3347
c277df42 3348#ifdef DEBUGGING
8aa23a47
YO
3349 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3350 OP(nxt + 1) = OPTIMIZED; /* was count. */
3351 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3352 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3353#endif
c277df42 3354#if 0
8aa23a47
YO
3355 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3356 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3357 if (nnxt == nxt) {
3358 if (reg_off_by_arg[OP(nxt1)])
3359 ARG_SET(nxt1, nxt2 - nxt1);
3360 else if (nxt2 - nxt1 < U16_MAX)
3361 NEXT_OFF(nxt1) = nxt2 - nxt1;
3362 else
3363 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3364 }
8aa23a47 3365 nxt1 = nnxt;
c277df42 3366 }
5d1c421c 3367#endif
8aa23a47
YO
3368 /* Optimize again: */
3369 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3370 NULL, stopparen, recursed, NULL, 0,depth+1);
3371 }
3372 else
3373 oscan->flags = 0;
3374 }
3375 else if ((OP(oscan) == CURLYX)
3376 && (flags & SCF_WHILEM_VISITED_POS)
3377 /* See the comment on a similar expression above.
3378 However, this time it not a subexpression
3379 we care about, but the expression itself. */
3380 && (maxcount == REG_INFTY)
3381 && data && ++data->whilem_c < 16) {
3382 /* This stays as CURLYX, we can put the count/of pair. */
3383 /* Find WHILEM (as in regexec.c) */
3384 regnode *nxt = oscan + NEXT_OFF(oscan);
3385
3386 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3387 nxt += ARG(nxt);
3388 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3389 | (RExC_whilem_seen << 4)); /* On WHILEM */
3390 }
3391 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3392 pars++;
3393 if (flags & SCF_DO_SUBSTR) {
3394 SV *last_str = NULL;
3395 int counted = mincount != 0;
a0ed51b3 3396
8aa23a47
YO
3397 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3398#if defined(SPARC64_GCC_WORKAROUND)
3399 I32 b = 0;
3400 STRLEN l = 0;
3401 const char *s = NULL;
3402 I32 old = 0;
b515a41d 3403
8aa23a47
YO
3404 if (pos_before >= data->last_start_min)
3405 b = pos_before;
3406 else
3407 b = data->last_start_min;
b515a41d 3408
8aa23a47
YO
3409 l = 0;
3410 s = SvPV_const(data->last_found, l);
3411 old = b - data->last_start_min;
3412
3413#else
3414 I32 b = pos_before >= data->last_start_min
3415 ? pos_before : data->last_start_min;
3416 STRLEN l;
3417 const char * const s = SvPV_const(data->last_found, l);
3418 I32 old = b - data->last_start_min;
3419#endif
3420
3421 if (UTF)
3422 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3423 l -= old;
3424 /* Get the added string: */
740cce10 3425 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3426 if (deltanext == 0 && pos_before == b) {
3427 /* What was added is a constant string */
3428 if (mincount > 1) {
3429 SvGROW(last_str, (mincount * l) + 1);
3430 repeatcpy(SvPVX(last_str) + l,
3431 SvPVX_const(last_str), l, mincount - 1);
3432 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3433 /* Add additional parts. */
3434 SvCUR_set(data->last_found,
3435 SvCUR(data->last_found) - l);
3436 sv_catsv(data->last_found, last_str);
3437 {
3438 SV * sv = data->last_found;
3439 MAGIC *mg =
3440 SvUTF8(sv) && SvMAGICAL(sv) ?
3441 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3442 if (mg && mg->mg_len >= 0)
bd94e887 3443 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3444 }
8aa23a47 3445 data->last_end += l * (mincount - 1);
b515a41d 3446 }
8aa23a47
YO
3447 } else {
3448 /* start offset must point into the last copy */
3449 data->last_start_min += minnext * (mincount - 1);
3450 data->last_start_max += is_inf ? I32_MAX
3451 : (maxcount - 1) * (minnext + data->pos_delta);
3452 }
c277df42 3453 }
8aa23a47
YO
3454 /* It is counted once already... */
3455 data->pos_min += minnext * (mincount - counted);
3456 data->pos_delta += - counted * deltanext +
3457 (minnext + deltanext) * maxcount - minnext * mincount;
3458 if (mincount != maxcount) {
3459 /* Cannot extend fixed substrings found inside
3460 the group. */
304ee84b 3461 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3462 if (mincount && last_str) {
3463 SV * const sv = data->last_found;
3464 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3465 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3466
3467 if (mg)
3468 mg->mg_len = -1;
3469 sv_setsv(sv, last_str);
3470 data->last_end = data->pos_min;
3471 data->last_start_min =
3472 data->pos_min - CHR_SVLEN(last_str);
3473 data->last_start_max = is_inf
3474 ? I32_MAX
3475 : data->pos_min + data->pos_delta
3476 - CHR_SVLEN(last_str);
3477 }
3478 data->longest = &(data->longest_float);
3479 }
3480 SvREFCNT_dec(last_str);
c277df42 3481 }
8aa23a47
YO
3482 if (data && (fl & SF_HAS_EVAL))
3483 data->flags |= SF_HAS_EVAL;
3484 optimize_curly_tail:
3485 if (OP(oscan) != CURLYX) {
3486 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3487 && NEXT_OFF(next))
3488 NEXT_OFF(oscan) += NEXT_OFF(next);
3489 }
3490 continue;
3491 default: /* REF and CLUMP only? */
3492 if (flags & SCF_DO_SUBSTR) {
304ee84b 3493 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3494 data->longest = &(data->longest_float);
3495 }
3496 is_inf = is_inf_internal = 1;
3497 if (flags & SCF_DO_STCLASS_OR)
3498 cl_anything(pRExC_state, data->start_class);
3499 flags &= ~SCF_DO_STCLASS;
3500 break;
c277df42 3501 }
8aa23a47 3502 }
e1d1eefb
YO
3503 else if (OP(scan) == LNBREAK) {
3504 if (flags & SCF_DO_STCLASS) {
3505 int value = 0;
3506 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3507 if (flags & SCF_DO_STCLASS_AND) {
3508 for (value = 0; value < 256; value++)
e64b1bd1 3509 if (!is_VERTWS_cp(value))
b9a59e08
KW
3510 ANYOF_BITMAP_CLEAR(data->start_class, value);
3511 }
3512 else {
e1d1eefb 3513 for (value = 0; value < 256; value++)
e64b1bd1 3514 if (is_VERTWS_cp(value))
b9a59e08
KW
3515 ANYOF_BITMAP_SET(data->start_class, value);
3516 }
e1d1eefb
YO
3517 if (flags & SCF_DO_STCLASS_OR)
3518 cl_and(data->start_class, and_withp);
3519 flags &= ~SCF_DO_STCLASS;
3520 }
3521 min += 1;
f9a79580 3522 delta += 1;
e1d1eefb
YO
3523 if (flags & SCF_DO_SUBSTR) {
3524 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3525 data->pos_min += 1;
f9a79580 3526 data->pos_delta += 1;
e1d1eefb
YO
3527 data->longest = &(data->longest_float);
3528 }
e1d1eefb 3529 }
f9a79580
RGS
3530 else if (OP(scan) == FOLDCHAR) {
3531 int d = ARG(scan)==0xDF ? 1 : 2;
3532 flags &= ~SCF_DO_STCLASS;
3533 min += 1;
3534 delta += d;
3535 if (flags & SCF_DO_SUBSTR) {
3536 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3537 data->pos_min += 1;
3538 data->pos_delta += d;
3539 data->longest = &(data->longest_float);
3540 }
3541 }
e52fc539 3542 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3543 int value = 0;
653099ff 3544
8aa23a47 3545 if (flags & SCF_DO_SUBSTR) {
304ee84b 3546 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3547 data->pos_min++;
3548 }
3549 min++;
3550 if (flags & SCF_DO_STCLASS) {
3551 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3552
8aa23a47
YO
3553 /* Some of the logic below assumes that switching
3554 locale on will only add false positives. */
3555 switch (PL_regkind[OP(scan)]) {
3556 case SANY:
3557 default:
3558 do_default:
3559 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3560 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3561 cl_anything(pRExC_state, data->start_class);
3562 break;
3563 case REG_ANY:
3564 if (OP(scan) == SANY)
3565 goto do_default;
3566 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3567 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3568 || (data->start_class->flags & ANYOF_CLASS));
3569 cl_anything(pRExC_state, data->start_class);
653099ff 3570 }
8aa23a47
YO
3571 if (flags & SCF_DO_STCLASS_AND || !value)
3572 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3573 break;
3574 case ANYOF:
3575 if (flags & SCF_DO_STCLASS_AND)
3576 cl_and(data->start_class,
3577 (struct regnode_charclass_class*)scan);
653099ff 3578 else
8aa23a47
YO
3579 cl_or(pRExC_state, data->start_class,
3580 (struct regnode_charclass_class*)scan);
3581 break;
3582 case ALNUM:
3583 if (flags & SCF_DO_STCLASS_AND) {
3584 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3585 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3586 if (FLAGS(scan) & USE_UNI) {
3587 for (value = 0; value < 256; value++) {
3588 if (!isWORDCHAR_L1(value)) {
3589 ANYOF_BITMAP_CLEAR(data->start_class, value);
3590 }
3591 }
3592 } else {
3593 for (value = 0; value < 256; value++) {
3594 if (!isALNUM(value)) {
3595 ANYOF_BITMAP_CLEAR(data->start_class, value);
3596 }
3597 }
3598 }
8aa23a47 3599 }
653099ff 3600 }
8aa23a47
YO
3601 else {
3602 if (data->start_class->flags & ANYOF_LOCALE)
3603 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3604 else if (FLAGS(scan) & USE_UNI) {
3605 for (value = 0; value < 256; value++) {
3606 if (isWORDCHAR_L1(value)) {
3607 ANYOF_BITMAP_SET(data->start_class, value);
3608 }
3609 }
3610 } else {
3611 for (value = 0; value < 256; value++) {
3612 if (isALNUM(value)) {
3613 ANYOF_BITMAP_SET(data->start_class, value);
3614 }
3615 }
3616 }
8aa23a47
YO
3617 }
3618 break;
3619 case ALNUML:
3620 if (flags & SCF_DO_STCLASS_AND) {
3621 if (data->start_class->flags & ANYOF_LOCALE)
3622 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3623 }
3624 else {
3625 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3626 data->start_class->flags |= ANYOF_LOCALE;
3627 }
3628 break;
3629 case NALNUM:
3630 if (flags & SCF_DO_STCLASS_AND) {
3631 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3632 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3633 if (FLAGS(scan) & USE_UNI) {
3634 for (value = 0; value < 256; value++) {
3635 if (isWORDCHAR_L1(value)) {
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3637 }
3638 }
3639 } else {
3640 for (value = 0; value < 256; value++) {
3641 if (isALNUM(value)) {
3642 ANYOF_BITMAP_CLEAR(data->start_class, value);
3643 }
3644 }
3645 }
653099ff
GS
3646 }
3647 }
8aa23a47
YO
3648 else {
3649 if (data->start_class->flags & ANYOF_LOCALE)
3650 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3651 else {
3652 for (value = 0; value < 256; value++)
3653 if (!isALNUM(value))
b9a59e08 3654 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3655 }
653099ff 3656 }
8aa23a47
YO
3657 break;
3658 case NALNUML:
3659 if (flags & SCF_DO_STCLASS_AND) {
3660 if (data->start_class->flags & ANYOF_LOCALE)
3661 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3662 }
8aa23a47
YO
3663 else {
3664 data->start_class->flags |= ANYOF_LOCALE;
3665 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3666 }
3667 break;
3668 case SPACE:
3669 if (flags & SCF_DO_STCLASS_AND) {
3670 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3671 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3672 if (FLAGS(scan) & USE_UNI) {
3673 for (value = 0; value < 256; value++) {
3674 if (!isSPACE_L1(value)) {
3675 ANYOF_BITMAP_CLEAR(data->start_class, value);
3676 }
3677 }
3678 } else {
3679 for (value = 0; value < 256; value++) {
3680 if (!isSPACE(value)) {
3681 ANYOF_BITMAP_CLEAR(data->start_class, value);
3682 }
3683 }
3684 }
653099ff
GS
3685 }
3686 }
8aa23a47 3687 else {
a12cf05f 3688 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3689 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3690 }
3691 else if (FLAGS(scan) & USE_UNI) {
3692 for (value = 0; value < 256; value++) {
3693 if (isSPACE_L1(value)) {
3694 ANYOF_BITMAP_SET(data->start_class, value);
3695 }
3696 }
3697 } else {
3698 for (value = 0; value < 256; value++) {
3699 if (isSPACE(value)) {
3700 ANYOF_BITMAP_SET(data->start_class, value);
3701 }
3702 }
8aa23a47 3703 }
653099ff 3704 }
8aa23a47
YO
3705 break;
3706 case SPACEL:
3707 if (flags & SCF_DO_STCLASS_AND) {
3708 if (data->start_class->flags & ANYOF_LOCALE)
3709 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3710 }
3711 else {
3712 data->start_class->flags |= ANYOF_LOCALE;
3713 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3714 }
3715 break;
3716 case NSPACE:
3717 if (flags & SCF_DO_STCLASS_AND) {
3718 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3719 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3720 if (FLAGS(scan) & USE_UNI) {
3721 for (value = 0; value < 256; value++) {
3722 if (isSPACE_L1(value)) {
3723 ANYOF_BITMAP_CLEAR(data->start_class, value);
3724 }
3725 }
3726 } else {
3727 for (value = 0; value < 256; value++) {
3728 if (isSPACE(value)) {
3729 ANYOF_BITMAP_CLEAR(data->start_class, value);
3730 }
3731 }
3732 }
653099ff 3733 }
8aa23a47
YO
3734 }
3735 else {
3736 if (data->start_class->flags & ANYOF_LOCALE)
3737 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3738 else if (FLAGS(scan) & USE_UNI) {
3739 for (value = 0; value < 256; value++) {
3740 if (!isSPACE_L1(value)) {
3741 ANYOF_BITMAP_SET(data->start_class, value);
3742 }
3743 }
3744 }
3745 else {
3746 for (value = 0; value < 256; value++) {
3747 if (!isSPACE(value)) {
3748 ANYOF_BITMAP_SET(data->start_class, value);
3749 }
3750 }
3751 }
653099ff 3752 }
8aa23a47
YO
3753 break;
3754 case NSPACEL:
3755 if (flags & SCF_DO_STCLASS_AND) {
3756 if (data->start_class->flags & ANYOF_LOCALE) {
3757 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3758 for (value = 0; value < 256; value++)
3759 if (!isSPACE(value))
3760 ANYOF_BITMAP_CLEAR(data->start_class, value);
3761 }
653099ff 3762 }
8aa23a47
YO
3763 else {
3764 data->start_class->flags |= ANYOF_LOCALE;
3765