This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
fix incorrect comment
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
f427392e 609#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
713}
714
715/* Can match anything (initialization) */
716STATIC int
5f66b61c 717S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
718{
719 int value;
720
7918f24d
NC
721 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
722
aaa51d5e 723 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
724 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
725 return 1;
1aa99e6b
IH
726 if (!(cl->flags & ANYOF_UNICODE_ALL))
727 return 0;
10edeb5d 728 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 729 return 0;
653099ff
GS
730 return 1;
731}
732
733/* Can match anything (initialization) */
734STATIC void
097eb12c 735S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 736{
7918f24d
NC
737 PERL_ARGS_ASSERT_CL_INIT;
738
8ecf7187 739 Zero(cl, 1, struct regnode_charclass_class);
653099ff 740 cl->type = ANYOF;
830247a4 741 cl_anything(pRExC_state, cl);
653099ff
GS
742}
743
744STATIC void
097eb12c 745S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 746{
7918f24d
NC
747 PERL_ARGS_ASSERT_CL_INIT_ZERO;
748
8ecf7187 749 Zero(cl, 1, struct regnode_charclass_class);
653099ff 750 cl->type = ANYOF;
830247a4 751 cl_anything(pRExC_state, cl);
653099ff
GS
752 if (LOC)
753 cl->flags |= ANYOF_LOCALE;
754}
755
756/* 'And' a given class with another one. Can create false positives */
757/* We assume that cl is not inverted */
758STATIC void
5f66b61c 759S_cl_and(struct regnode_charclass_class *cl,
a28509cc 760 const struct regnode_charclass_class *and_with)
653099ff 761{
7918f24d 762 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
763
764 assert(and_with->type == ANYOF);
653099ff
GS
765 if (!(and_with->flags & ANYOF_CLASS)
766 && !(cl->flags & ANYOF_CLASS)
767 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
768 && !(and_with->flags & ANYOF_FOLD)
769 && !(cl->flags & ANYOF_FOLD)) {
770 int i;
771
772 if (and_with->flags & ANYOF_INVERT)
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= ~and_with->bitmap[i];
775 else
776 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
777 cl->bitmap[i] &= and_with->bitmap[i];
778 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
779 if (!(and_with->flags & ANYOF_EOS))
780 cl->flags &= ~ANYOF_EOS;
1aa99e6b 781
14ebb1a2
JH
782 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
783 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
784 cl->flags &= ~ANYOF_UNICODE_ALL;
785 cl->flags |= ANYOF_UNICODE;
786 ARG_SET(cl, ARG(and_with));
787 }
14ebb1a2
JH
788 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
789 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 790 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
791 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
792 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 793 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
794}
795
796/* 'OR' a given class with another one. Can create false positives */
797/* We assume that cl is not inverted */
798STATIC void
097eb12c 799S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 800{
7918f24d
NC
801 PERL_ARGS_ASSERT_CL_OR;
802
653099ff
GS
803 if (or_with->flags & ANYOF_INVERT) {
804 /* We do not use
805 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
806 * <= (B1 | !B2) | (CL1 | !CL2)
807 * which is wasteful if CL2 is small, but we ignore CL2:
808 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
809 * XXXX Can we handle case-fold? Unclear:
810 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
811 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
812 */
813 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(or_with->flags & ANYOF_FOLD)
815 && !(cl->flags & ANYOF_FOLD) ) {
816 int i;
817
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] |= ~or_with->bitmap[i];
820 } /* XXXX: logic is complicated otherwise */
821 else {
830247a4 822 cl_anything(pRExC_state, cl);
653099ff
GS
823 }
824 } else {
825 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
826 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 827 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
828 || (cl->flags & ANYOF_FOLD)) ) {
829 int i;
830
831 /* OR char bitmap and class bitmap separately */
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] |= or_with->bitmap[i];
834 if (or_with->flags & ANYOF_CLASS) {
835 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
836 cl->classflags[i] |= or_with->classflags[i];
837 cl->flags |= ANYOF_CLASS;
838 }
839 }
840 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 841 cl_anything(pRExC_state, cl);
653099ff
GS
842 }
843 }
844 if (or_with->flags & ANYOF_EOS)
845 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
846
847 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
848 ARG(cl) != ARG(or_with)) {
849 cl->flags |= ANYOF_UNICODE_ALL;
850 cl->flags &= ~ANYOF_UNICODE;
851 }
852 if (or_with->flags & ANYOF_UNICODE_ALL) {
853 cl->flags |= ANYOF_UNICODE_ALL;
854 cl->flags &= ~ANYOF_UNICODE;
855 }
653099ff
GS
856}
857
a3621e74
YO
858#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
859#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
860#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
861#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
862
3dab1dad
YO
863
864#ifdef DEBUGGING
07be1b83 865/*
2b8b4781
NC
866 dump_trie(trie,widecharmap,revcharmap)
867 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
868 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
869
870 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
871 The _interim_ variants are used for debugging the interim
872 tables that are used to generate the final compressed
873 representation which is what dump_trie expects.
874
3dab1dad
YO
875 Part of the reason for their existance is to provide a form
876 of documentation as to how the different representations function.
07be1b83
YO
877
878*/
3dab1dad
YO
879
880/*
3dab1dad
YO
881 Dumps the final compressed table form of the trie to Perl_debug_log.
882 Used for debugging make_trie().
883*/
b9a59e08 884
3dab1dad 885STATIC void
2b8b4781
NC
886S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
887 AV *revcharmap, U32 depth)
3dab1dad
YO
888{
889 U32 state;
ab3bbdeb 890 SV *sv=sv_newmortal();
55eed653 891 int colwidth= widecharmap ? 6 : 4;
2e64971a 892 U16 word;
3dab1dad
YO
893 GET_RE_DEBUG_FLAGS_DECL;
894
7918f24d 895 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 896
3dab1dad
YO
897 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
898 (int)depth * 2 + 2,"",
899 "Match","Base","Ofs" );
900
901 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 902 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 903 if ( tmp ) {
ab3bbdeb
YO
904 PerlIO_printf( Perl_debug_log, "%*s",
905 colwidth,
ddc5bc0f 906 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
907 PL_colors[0], PL_colors[1],
908 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
909 PERL_PV_ESCAPE_FIRSTCHAR
910 )
911 );
3dab1dad
YO
912 }
913 }
914 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
915 (int)depth * 2 + 2,"");
916
917 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
919 PerlIO_printf( Perl_debug_log, "\n");
920
1e2e3d02 921 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 922 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
923
924 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
925
926 if ( trie->states[ state ].wordnum ) {
927 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
928 } else {
929 PerlIO_printf( Perl_debug_log, "%6s", "" );
930 }
931
932 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
933
934 if ( base ) {
935 U32 ofs = 0;
936
937 while( ( base + ofs < trie->uniquecharcount ) ||
938 ( base + ofs - trie->uniquecharcount < trie->lasttrans
939 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
940 ofs++;
941
942 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
943
944 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
945 if ( ( base + ofs >= trie->uniquecharcount ) &&
946 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
947 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
948 {
ab3bbdeb
YO
949 PerlIO_printf( Perl_debug_log, "%*"UVXf,
950 colwidth,
3dab1dad
YO
951 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
952 } else {
ab3bbdeb 953 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
954 }
955 }
956
957 PerlIO_printf( Perl_debug_log, "]");
958
959 }
960 PerlIO_printf( Perl_debug_log, "\n" );
961 }
2e64971a
DM
962 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
963 for (word=1; word <= trie->wordcount; word++) {
964 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
965 (int)word, (int)(trie->wordinfo[word].prev),
966 (int)(trie->wordinfo[word].len));
967 }
968 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
969}
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in list form.
972 List tries normally only are used for construction when the number of
973 possible chars (trie->uniquecharcount) is very high.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
ab3bbdeb 982 SV *sv=sv_newmortal();
55eed653 983 int colwidth= widecharmap ? 6 : 4;
3dab1dad 984 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
985
986 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
987
3dab1dad 988 /* print out the table precompression. */
ab3bbdeb
YO
989 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
990 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
991 "------:-----+-----------------\n" );
3dab1dad
YO
992
993 for( state=1 ; state < next_alloc ; state ++ ) {
994 U16 charid;
995
ab3bbdeb 996 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
997 (int)depth * 2 + 2,"", (UV)state );
998 if ( ! trie->states[ state ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, "%5s| ","");
1000 } else {
1001 PerlIO_printf( Perl_debug_log, "W%4x| ",
1002 trie->states[ state ].wordnum
1003 );
1004 }
1005 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1006 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1007 if ( tmp ) {
1008 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1009 colwidth,
ddc5bc0f 1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1014 ) ,
1e2e3d02
YO
1015 TRIE_LIST_ITEM(state,charid).forid,
1016 (UV)TRIE_LIST_ITEM(state,charid).newstate
1017 );
1018 if (!(charid % 10))
664e119d
RGS
1019 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1020 (int)((depth * 2) + 14), "");
1e2e3d02 1021 }
ab3bbdeb
YO
1022 }
1023 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1024 }
1025}
1026
1027/*
3dab1dad
YO
1028 Dumps a fully constructed but uncompressed trie in table form.
1029 This is the normal DFA style state transition table, with a few
1030 twists to facilitate compression later.
1031 Used for debugging make_trie().
1032*/
1033STATIC void
55eed653 1034S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1035 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1036 U32 depth)
3dab1dad
YO
1037{
1038 U32 state;
1039 U16 charid;
ab3bbdeb 1040 SV *sv=sv_newmortal();
55eed653 1041 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1042 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1043
1044 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1045
1046 /*
1047 print out the table precompression so that we can do a visual check
1048 that they are identical.
1049 */
1050
1051 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1052
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1054 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1055 if ( tmp ) {
ab3bbdeb
YO
1056 PerlIO_printf( Perl_debug_log, "%*s",
1057 colwidth,
ddc5bc0f 1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1062 )
1063 );
3dab1dad
YO
1064 }
1065 }
1066
1067 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1068
1069 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1071 }
1072
1073 PerlIO_printf( Perl_debug_log, "\n" );
1074
1075 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1076
1077 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1078 (int)depth * 2 + 2,"",
1079 (UV)TRIE_NODENUM( state ) );
1080
1081 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1082 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1083 if (v)
1084 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1085 else
1086 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1087 }
1088 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1090 } else {
1091 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1092 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1093 }
1094 }
07be1b83 1095}
3dab1dad
YO
1096
1097#endif
1098
2e64971a 1099
786e8c11
YO
1100/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1101 startbranch: the first branch in the whole branch sequence
1102 first : start branch of sequence of branch-exact nodes.
1103 May be the same as startbranch
1104 last : Thing following the last branch.
1105 May be the same as tail.
1106 tail : item following the branch sequence
1107 count : words in the sequence
1108 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1109 depth : indent depth
3dab1dad 1110
786e8c11 1111Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1112
786e8c11
YO
1113A trie is an N'ary tree where the branches are determined by digital
1114decomposition of the key. IE, at the root node you look up the 1st character and
1115follow that branch repeat until you find the end of the branches. Nodes can be
1116marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1117
786e8c11 1118 /he|she|his|hers/
72f13be8 1119
786e8c11
YO
1120would convert into the following structure. Numbers represent states, letters
1121following numbers represent valid transitions on the letter from that state, if
1122the number is in square brackets it represents an accepting state, otherwise it
1123will be in parenthesis.
07be1b83 1124
786e8c11
YO
1125 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1126 | |
1127 | (2)
1128 | |
1129 (1) +-i->(6)-+-s->[7]
1130 |
1131 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1132
786e8c11
YO
1133 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1134
1135This shows that when matching against the string 'hers' we will begin at state 1
1136read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1137then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1138is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1139single traverse. We store a mapping from accepting to state to which word was
1140matched, and then when we have multiple possibilities we try to complete the
1141rest of the regex in the order in which they occured in the alternation.
1142
1143The only prior NFA like behaviour that would be changed by the TRIE support is
1144the silent ignoring of duplicate alternations which are of the form:
1145
1146 / (DUPE|DUPE) X? (?{ ... }) Y /x
1147
1148Thus EVAL blocks follwing a trie may be called a different number of times with
1149and without the optimisation. With the optimisations dupes will be silently
1150ignored. This inconsistant behaviour of EVAL type nodes is well established as
1151the following demonstrates:
1152
1153 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1154
1155which prints out 'word' three times, but
1156
1157 'words'=~/(word|word|word)(?{ print $1 })S/
1158
1159which doesnt print it out at all. This is due to other optimisations kicking in.
1160
1161Example of what happens on a structural level:
1162
1163The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1164
1165 1: CURLYM[1] {1,32767}(18)
1166 5: BRANCH(8)
1167 6: EXACT <ac>(16)
1168 8: BRANCH(11)
1169 9: EXACT <ad>(16)
1170 11: BRANCH(14)
1171 12: EXACT <ab>(16)
1172 16: SUCCEED(0)
1173 17: NOTHING(18)
1174 18: END(0)
1175
1176This would be optimizable with startbranch=5, first=5, last=16, tail=16
1177and should turn into:
1178
1179 1: CURLYM[1] {1,32767}(18)
1180 5: TRIE(16)
1181 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1182 <ac>
1183 <ad>
1184 <ab>
1185 16: SUCCEED(0)
1186 17: NOTHING(18)
1187 18: END(0)
1188
1189Cases where tail != last would be like /(?foo|bar)baz/:
1190
1191 1: BRANCH(4)
1192 2: EXACT <foo>(8)
1193 4: BRANCH(7)
1194 5: EXACT <bar>(8)
1195 7: TAIL(8)
1196 8: EXACT <baz>(10)
1197 10: END(0)
1198
1199which would be optimizable with startbranch=1, first=1, last=7, tail=8
1200and would end up looking like:
1201
1202 1: TRIE(8)
1203 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1204 <foo>
1205 <bar>
1206 7: TAIL(8)
1207 8: EXACT <baz>(10)
1208 10: END(0)
1209
1210 d = uvuni_to_utf8_flags(d, uv, 0);
1211
1212is the recommended Unicode-aware way of saying
1213
1214 *(d++) = uv;
1215*/
1216
1e2e3d02 1217#define TRIE_STORE_REVCHAR \
786e8c11 1218 STMT_START { \
73031816
NC
1219 if (UTF) { \
1220 SV *zlopp = newSV(2); \
88c9ea1e
CB
1221 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1222 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1223 SvCUR_set(zlopp, kapow - flrbbbbb); \
1224 SvPOK_on(zlopp); \
1225 SvUTF8_on(zlopp); \
1226 av_push(revcharmap, zlopp); \
1227 } else { \
6bdeddd2 1228 char ooooff = (char)uvc; \
73031816
NC
1229 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1230 } \
1231 } STMT_END
786e8c11
YO
1232
1233#define TRIE_READ_CHAR STMT_START { \
1234 wordlen++; \
1235 if ( UTF ) { \
1236 if ( folder ) { \
1237 if ( foldlen > 0 ) { \
1238 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1239 foldlen -= len; \
1240 scan += len; \
1241 len = 0; \
1242 } else { \
1243 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1244 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1245 foldlen -= UNISKIP( uvc ); \
1246 scan = foldbuf + UNISKIP( uvc ); \
1247 } \
1248 } else { \
1249 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1250 } \
1251 } else { \
1252 uvc = (U32)*uc; \
1253 len = 1; \
1254 } \
1255} STMT_END
1256
1257
1258
1259#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1260 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1261 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1262 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1263 } \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1265 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1266 TRIE_LIST_CUR( state )++; \
1267} STMT_END
07be1b83 1268
786e8c11
YO
1269#define TRIE_LIST_NEW(state) STMT_START { \
1270 Newxz( trie->states[ state ].trans.list, \
1271 4, reg_trie_trans_le ); \
1272 TRIE_LIST_CUR( state ) = 1; \
1273 TRIE_LIST_LEN( state ) = 4; \
1274} STMT_END
07be1b83 1275
786e8c11
YO
1276#define TRIE_HANDLE_WORD(state) STMT_START { \
1277 U16 dupe= trie->states[ state ].wordnum; \
1278 regnode * const noper_next = regnext( noper ); \
1279 \
786e8c11
YO
1280 DEBUG_r({ \
1281 /* store the word for dumping */ \
1282 SV* tmp; \
1283 if (OP(noper) != NOTHING) \
740cce10 1284 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1285 else \
740cce10 1286 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1287 av_push( trie_words, tmp ); \
786e8c11
YO
1288 }); \
1289 \
1290 curword++; \
2e64971a
DM
1291 trie->wordinfo[curword].prev = 0; \
1292 trie->wordinfo[curword].len = wordlen; \
1293 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1294 \
1295 if ( noper_next < tail ) { \
1296 if (!trie->jump) \
c944940b 1297 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1298 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1299 if (!jumper) \
1300 jumper = noper_next; \
1301 if (!nextbranch) \
1302 nextbranch= regnext(cur); \
1303 } \
1304 \
1305 if ( dupe ) { \
2e64971a
DM
1306 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1307 /* chain, so that when the bits of chain are later */\
1308 /* linked together, the dups appear in the chain */\
1309 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1310 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1311 } else { \
1312 /* we haven't inserted this word yet. */ \
1313 trie->states[ state ].wordnum = curword; \
1314 } \
1315} STMT_END
07be1b83 1316
3dab1dad 1317
786e8c11
YO
1318#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1319 ( ( base + charid >= ucharcount \
1320 && base + charid < ubound \
1321 && state == trie->trans[ base - ucharcount + charid ].check \
1322 && trie->trans[ base - ucharcount + charid ].next ) \
1323 ? trie->trans[ base - ucharcount + charid ].next \
1324 : ( state==1 ? special : 0 ) \
1325 )
3dab1dad 1326
786e8c11
YO
1327#define MADE_TRIE 1
1328#define MADE_JUMP_TRIE 2
1329#define MADE_EXACT_TRIE 4
3dab1dad 1330
a3621e74 1331STATIC I32
786e8c11 1332S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1333{
27da23d5 1334 dVAR;
a3621e74
YO
1335 /* first pass, loop through and scan words */
1336 reg_trie_data *trie;
55eed653 1337 HV *widecharmap = NULL;
2b8b4781 1338 AV *revcharmap = newAV();
a3621e74 1339 regnode *cur;
9f7f3913 1340 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1341 STRLEN len = 0;
1342 UV uvc = 0;
1343 U16 curword = 0;
1344 U32 next_alloc = 0;
786e8c11
YO
1345 regnode *jumper = NULL;
1346 regnode *nextbranch = NULL;
7f69552c 1347 regnode *convert = NULL;
2e64971a 1348 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1349 /* we just use folder as a flag in utf8 */
e1ec3a88 1350 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1351 ? PL_fold
1352 : ( flags == EXACTFL
1353 ? PL_fold_locale
1354 : NULL
1355 )
1356 );
1357
2b8b4781
NC
1358#ifdef DEBUGGING
1359 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1360 AV *trie_words = NULL;
1361 /* along with revcharmap, this only used during construction but both are
1362 * useful during debugging so we store them in the struct when debugging.
8e11feef 1363 */
2b8b4781
NC
1364#else
1365 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1366 STRLEN trie_charcount=0;
3dab1dad 1367#endif
2b8b4781 1368 SV *re_trie_maxbuff;
a3621e74 1369 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1370
1371 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1372#ifndef DEBUGGING
1373 PERL_UNUSED_ARG(depth);
1374#endif
a3621e74 1375
c944940b 1376 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1377 trie->refcount = 1;
3dab1dad 1378 trie->startstate = 1;
786e8c11 1379 trie->wordcount = word_count;
f8fc2ecf 1380 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1381 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1382 if (!(UTF && folder))
c944940b 1383 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1384 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1385 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1386
a3621e74 1387 DEBUG_r({
2b8b4781 1388 trie_words = newAV();
a3621e74 1389 });
a3621e74 1390
0111c4fd 1391 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1392 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1393 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1394 }
3dab1dad
YO
1395 DEBUG_OPTIMISE_r({
1396 PerlIO_printf( Perl_debug_log,
786e8c11 1397 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1398 (int)depth * 2 + 2, "",
1399 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1400 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1401 (int)depth);
3dab1dad 1402 });
7f69552c
YO
1403
1404 /* Find the node we are going to overwrite */
1405 if ( first == startbranch && OP( last ) != BRANCH ) {
1406 /* whole branch chain */
1407 convert = first;
1408 } else {
1409 /* branch sub-chain */
1410 convert = NEXTOPER( first );
1411 }
1412
a3621e74
YO
1413 /* -- First loop and Setup --
1414
1415 We first traverse the branches and scan each word to determine if it
1416 contains widechars, and how many unique chars there are, this is
1417 important as we have to build a table with at least as many columns as we
1418 have unique chars.
1419
1420 We use an array of integers to represent the character codes 0..255
38a44b82 1421 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1422 native representation of the character value as the key and IV's for the
1423 coded index.
1424
1425 *TODO* If we keep track of how many times each character is used we can
1426 remap the columns so that the table compression later on is more
1427 efficient in terms of memory by ensuring most common value is in the
1428 middle and the least common are on the outside. IMO this would be better
1429 than a most to least common mapping as theres a decent chance the most
1430 common letter will share a node with the least common, meaning the node
1431 will not be compressable. With a middle is most common approach the worst
1432 case is when we have the least common nodes twice.
1433
1434 */
1435
a3621e74 1436 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1437 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1438 const U8 *uc = (U8*)STRING( noper );
a28509cc 1439 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1440 STRLEN foldlen = 0;
1441 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1442 const U8 *scan = (U8*)NULL;
07be1b83 1443 U32 wordlen = 0; /* required init */
02daf0ab
YO
1444 STRLEN chars = 0;
1445 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1446
3dab1dad
YO
1447 if (OP(noper) == NOTHING) {
1448 trie->minlen= 0;
1449 continue;
1450 }
02daf0ab
YO
1451 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1452 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1453 regardless of encoding */
1454
a3621e74 1455 for ( ; uc < e ; uc += len ) {
3dab1dad 1456 TRIE_CHARCOUNT(trie)++;
a3621e74 1457 TRIE_READ_CHAR;
3dab1dad 1458 chars++;
a3621e74
YO
1459 if ( uvc < 256 ) {
1460 if ( !trie->charmap[ uvc ] ) {
1461 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1462 if ( folder )
1463 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1464 TRIE_STORE_REVCHAR;
a3621e74 1465 }
02daf0ab
YO
1466 if ( set_bit ) {
1467 /* store the codepoint in the bitmap, and if its ascii
1468 also store its folded equivelent. */
1469 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1470
1471 /* store the folded codepoint */
1472 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1473
1474 if ( !UTF ) {
1475 /* store first byte of utf8 representation of
1476 codepoints in the 127 < uvc < 256 range */
1477 if (127 < uvc && uvc < 192) {
1478 TRIE_BITMAP_SET(trie,194);
1479 } else if (191 < uvc ) {
1480 TRIE_BITMAP_SET(trie,195);
1481 /* && uvc < 256 -- we know uvc is < 256 already */
1482 }
1483 }
02daf0ab
YO
1484 set_bit = 0; /* We've done our bit :-) */
1485 }
a3621e74
YO
1486 } else {
1487 SV** svpp;
55eed653
NC
1488 if ( !widecharmap )
1489 widecharmap = newHV();
a3621e74 1490
55eed653 1491 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1492
1493 if ( !svpp )
e4584336 1494 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1495
1496 if ( !SvTRUE( *svpp ) ) {
1497 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1498 TRIE_STORE_REVCHAR;
a3621e74
YO
1499 }
1500 }
1501 }
3dab1dad
YO
1502 if( cur == first ) {
1503 trie->minlen=chars;
1504 trie->maxlen=chars;
1505 } else if (chars < trie->minlen) {
1506 trie->minlen=chars;
1507 } else if (chars > trie->maxlen) {
1508 trie->maxlen=chars;
1509 }
1510
a3621e74
YO
1511 } /* end first pass */
1512 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1513 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1514 (int)depth * 2 + 2,"",
55eed653 1515 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1516 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1517 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1518 );
a3621e74
YO
1519
1520 /*
1521 We now know what we are dealing with in terms of unique chars and
1522 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1523 representation using a flat table will take. If it's over a reasonable
1524 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1525 conservative but potentially much slower representation using an array
1526 of lists.
1527
1528 At the end we convert both representations into the same compressed
1529 form that will be used in regexec.c for matching with. The latter
1530 is a form that cannot be used to construct with but has memory
1531 properties similar to the list form and access properties similar
1532 to the table form making it both suitable for fast searches and
1533 small enough that its feasable to store for the duration of a program.
1534
1535 See the comment in the code where the compressed table is produced
1536 inplace from the flat tabe representation for an explanation of how
1537 the compression works.
1538
1539 */
1540
1541
2e64971a
DM
1542 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1543 prev_states[1] = 0;
1544
3dab1dad 1545 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1546 /*
1547 Second Pass -- Array Of Lists Representation
1548
1549 Each state will be represented by a list of charid:state records
1550 (reg_trie_trans_le) the first such element holds the CUR and LEN
1551 points of the allocated array. (See defines above).
1552
1553 We build the initial structure using the lists, and then convert
1554 it into the compressed table form which allows faster lookups
1555 (but cant be modified once converted).
a3621e74
YO
1556 */
1557
a3621e74
YO
1558 STRLEN transcount = 1;
1559
1e2e3d02
YO
1560 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1561 "%*sCompiling trie using list compiler\n",
1562 (int)depth * 2 + 2, ""));
446bd890 1563
c944940b
JH
1564 trie->states = (reg_trie_state *)
1565 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1566 sizeof(reg_trie_state) );
a3621e74
YO
1567 TRIE_LIST_NEW(1);
1568 next_alloc = 2;
1569
1570 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1571
c445ea15
AL
1572 regnode * const noper = NEXTOPER( cur );
1573 U8 *uc = (U8*)STRING( noper );
1574 const U8 * const e = uc + STR_LEN( noper );
1575 U32 state = 1; /* required init */
1576 U16 charid = 0; /* sanity init */
1577 U8 *scan = (U8*)NULL; /* sanity init */
1578 STRLEN foldlen = 0; /* required init */
07be1b83 1579 U32 wordlen = 0; /* required init */
c445ea15
AL
1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1581
3dab1dad 1582 if (OP(noper) != NOTHING) {
786e8c11 1583 for ( ; uc < e ; uc += len ) {
c445ea15 1584
786e8c11 1585 TRIE_READ_CHAR;
c445ea15 1586
786e8c11
YO
1587 if ( uvc < 256 ) {
1588 charid = trie->charmap[ uvc ];
c445ea15 1589 } else {
55eed653 1590 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1591 if ( !svpp ) {
1592 charid = 0;
1593 } else {
1594 charid=(U16)SvIV( *svpp );
1595 }
c445ea15 1596 }
786e8c11
YO
1597 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1598 if ( charid ) {
a3621e74 1599
786e8c11
YO
1600 U16 check;
1601 U32 newstate = 0;
a3621e74 1602
786e8c11
YO
1603 charid--;
1604 if ( !trie->states[ state ].trans.list ) {
1605 TRIE_LIST_NEW( state );
c445ea15 1606 }
786e8c11
YO
1607 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1608 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1609 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1610 break;
1611 }
1612 }
1613 if ( ! newstate ) {
1614 newstate = next_alloc++;
2e64971a 1615 prev_states[newstate] = state;
786e8c11
YO
1616 TRIE_LIST_PUSH( state, charid, newstate );
1617 transcount++;
1618 }
1619 state = newstate;
1620 } else {
1621 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1622 }
a28509cc 1623 }
c445ea15 1624 }
3dab1dad 1625 TRIE_HANDLE_WORD(state);
a3621e74
YO
1626
1627 } /* end second pass */
1628
1e2e3d02
YO
1629 /* next alloc is the NEXT state to be allocated */
1630 trie->statecount = next_alloc;
c944940b
JH
1631 trie->states = (reg_trie_state *)
1632 PerlMemShared_realloc( trie->states,
1633 next_alloc
1634 * sizeof(reg_trie_state) );
a3621e74 1635
3dab1dad 1636 /* and now dump it out before we compress it */
2b8b4781
NC
1637 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1638 revcharmap, next_alloc,
1639 depth+1)
1e2e3d02 1640 );
a3621e74 1641
c944940b
JH
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1644 {
1645 U32 state;
a3621e74
YO
1646 U32 tp = 0;
1647 U32 zp = 0;
1648
1649
1650 for( state=1 ; state < next_alloc ; state ++ ) {
1651 U32 base=0;
1652
1653 /*
1654 DEBUG_TRIE_COMPILE_MORE_r(
1655 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1656 );
1657 */
1658
1659 if (trie->states[state].trans.list) {
1660 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1661 U16 maxid=minid;
a28509cc 1662 U16 idx;
a3621e74
YO
1663
1664 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1665 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1666 if ( forid < minid ) {
1667 minid=forid;
1668 } else if ( forid > maxid ) {
1669 maxid=forid;
1670 }
a3621e74
YO
1671 }
1672 if ( transcount < tp + maxid - minid + 1) {
1673 transcount *= 2;
c944940b
JH
1674 trie->trans = (reg_trie_trans *)
1675 PerlMemShared_realloc( trie->trans,
446bd890
NC
1676 transcount
1677 * sizeof(reg_trie_trans) );
a3621e74
YO
1678 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1679 }
1680 base = trie->uniquecharcount + tp - minid;
1681 if ( maxid == minid ) {
1682 U32 set = 0;
1683 for ( ; zp < tp ; zp++ ) {
1684 if ( ! trie->trans[ zp ].next ) {
1685 base = trie->uniquecharcount + zp - minid;
1686 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1687 trie->trans[ zp ].check = state;
1688 set = 1;
1689 break;
1690 }
1691 }
1692 if ( !set ) {
1693 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ tp ].check = state;
1695 tp++;
1696 zp = tp;
1697 }
1698 } else {
1699 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1700 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1701 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1702 trie->trans[ tid ].check = state;
1703 }
1704 tp += ( maxid - minid + 1 );
1705 }
1706 Safefree(trie->states[ state ].trans.list);
1707 }
1708 /*
1709 DEBUG_TRIE_COMPILE_MORE_r(
1710 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1711 );
1712 */
1713 trie->states[ state ].trans.base=base;
1714 }
cc601c31 1715 trie->lasttrans = tp + 1;
a3621e74
YO
1716 }
1717 } else {
1718 /*
1719 Second Pass -- Flat Table Representation.
1720
1721 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1722 We know that we will need Charcount+1 trans at most to store the data
1723 (one row per char at worst case) So we preallocate both structures
1724 assuming worst case.
1725
1726 We then construct the trie using only the .next slots of the entry
1727 structs.
1728
1729 We use the .check field of the first entry of the node temporarily to
1730 make compression both faster and easier by keeping track of how many non
1731 zero fields are in the node.
1732
1733 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1734 transition.
1735
1736 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1737 number representing the first entry of the node, and state as a
1738 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1739 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1740 are 2 entrys per node. eg:
1741
1742 A B A B
1743 1. 2 4 1. 3 7
1744 2. 0 3 3. 0 5
1745 3. 0 0 5. 0 0
1746 4. 0 0 7. 0 0
1747
1748 The table is internally in the right hand, idx form. However as we also
1749 have to deal with the states array which is indexed by nodenum we have to
1750 use TRIE_NODENUM() to convert.
1751
1752 */
1e2e3d02
YO
1753 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1754 "%*sCompiling trie using table compiler\n",
1755 (int)depth * 2 + 2, ""));
3dab1dad 1756
c944940b
JH
1757 trie->trans = (reg_trie_trans *)
1758 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1759 * trie->uniquecharcount + 1,
1760 sizeof(reg_trie_trans) );
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
a3621e74
YO
1764 next_alloc = trie->uniquecharcount + 1;
1765
3dab1dad 1766
a3621e74
YO
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768
c445ea15 1769 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1770 const U8 *uc = (U8*)STRING( noper );
1771 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1772
1773 U32 state = 1; /* required init */
1774
1775 U16 charid = 0; /* sanity init */
1776 U32 accept_state = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1778
1779 STRLEN foldlen = 0; /* required init */
07be1b83 1780 U32 wordlen = 0; /* required init */
a3621e74
YO
1781 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1782
3dab1dad 1783 if ( OP(noper) != NOTHING ) {
786e8c11 1784 for ( ; uc < e ; uc += len ) {
a3621e74 1785
786e8c11 1786 TRIE_READ_CHAR;
a3621e74 1787
786e8c11
YO
1788 if ( uvc < 256 ) {
1789 charid = trie->charmap[ uvc ];
1790 } else {
55eed653 1791 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1792 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1793 }
786e8c11
YO
1794 if ( charid ) {
1795 charid--;
1796 if ( !trie->trans[ state + charid ].next ) {
1797 trie->trans[ state + charid ].next = next_alloc;
1798 trie->trans[ state ].check++;
2e64971a
DM
1799 prev_states[TRIE_NODENUM(next_alloc)]
1800 = TRIE_NODENUM(state);
786e8c11
YO
1801 next_alloc += trie->uniquecharcount;
1802 }
1803 state = trie->trans[ state + charid ].next;
1804 } else {
1805 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1806 }
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1808 }
a3621e74 1809 }
3dab1dad
YO
1810 accept_state = TRIE_NODENUM( state );
1811 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1812
1813 } /* end second pass */
1814
3dab1dad 1815 /* and now dump it out before we compress it */
2b8b4781
NC
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1817 revcharmap,
1818 next_alloc, depth+1));
a3621e74 1819
a3621e74
YO
1820 {
1821 /*
1822 * Inplace compress the table.*
1823
1824 For sparse data sets the table constructed by the trie algorithm will
1825 be mostly 0/FAIL transitions or to put it another way mostly empty.
1826 (Note that leaf nodes will not contain any transitions.)
1827
1828 This algorithm compresses the tables by eliminating most such
1829 transitions, at the cost of a modest bit of extra work during lookup:
1830
1831 - Each states[] entry contains a .base field which indicates the
1832 index in the state[] array wheres its transition data is stored.
1833
1834 - If .base is 0 there are no valid transitions from that node.
1835
1836 - If .base is nonzero then charid is added to it to find an entry in
1837 the trans array.
1838
1839 -If trans[states[state].base+charid].check!=state then the
1840 transition is taken to be a 0/Fail transition. Thus if there are fail
1841 transitions at the front of the node then the .base offset will point
1842 somewhere inside the previous nodes data (or maybe even into a node
1843 even earlier), but the .check field determines if the transition is
1844 valid.
1845
786e8c11 1846 XXX - wrong maybe?
a3621e74
YO
1847 The following process inplace converts the table to the compressed
1848 table: We first do not compress the root node 1,and mark its all its
1849 .check pointers as 1 and set its .base pointer as 1 as well. This
1850 allows to do a DFA construction from the compressed table later, and
1851 ensures that any .base pointers we calculate later are greater than
1852 0.
1853
1854 - We set 'pos' to indicate the first entry of the second node.
1855
1856 - We then iterate over the columns of the node, finding the first and
1857 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1858 and set the .check pointers accordingly, and advance pos
1859 appropriately and repreat for the next node. Note that when we copy
1860 the next pointers we have to convert them from the original
1861 NODEIDX form to NODENUM form as the former is not valid post
1862 compression.
1863
1864 - If a node has no transitions used we mark its base as 0 and do not
1865 advance the pos pointer.
1866
1867 - If a node only has one transition we use a second pointer into the
1868 structure to fill in allocated fail transitions from other states.
1869 This pointer is independent of the main pointer and scans forward
1870 looking for null transitions that are allocated to a state. When it
1871 finds one it writes the single transition into the "hole". If the
786e8c11 1872 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1873
1874 - Once compressed we can Renew/realloc the structures to release the
1875 excess space.
1876
1877 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1878 specifically Fig 3.47 and the associated pseudocode.
1879
1880 demq
1881 */
a3b680e6 1882 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1883 U32 state, charid;
a3621e74 1884 U32 pos = 0, zp=0;
1e2e3d02 1885 trie->statecount = laststate;
a3621e74
YO
1886
1887 for ( state = 1 ; state < laststate ; state++ ) {
1888 U8 flag = 0;
a28509cc
AL
1889 const U32 stateidx = TRIE_NODEIDX( state );
1890 const U32 o_used = trie->trans[ stateidx ].check;
1891 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1892 trie->trans[ stateidx ].check = 0;
1893
1894 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1895 if ( flag || trie->trans[ stateidx + charid ].next ) {
1896 if ( trie->trans[ stateidx + charid ].next ) {
1897 if (o_used == 1) {
1898 for ( ; zp < pos ; zp++ ) {
1899 if ( ! trie->trans[ zp ].next ) {
1900 break;
1901 }
1902 }
1903 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1904 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ zp ].check = state;
1906 if ( ++zp > pos ) pos = zp;
1907 break;
1908 }
1909 used--;
1910 }
1911 if ( !flag ) {
1912 flag = 1;
1913 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1914 }
1915 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ pos ].check = state;
1917 pos++;
1918 }
1919 }
1920 }
cc601c31 1921 trie->lasttrans = pos + 1;
c944940b
JH
1922 trie->states = (reg_trie_state *)
1923 PerlMemShared_realloc( trie->states, laststate
1924 * sizeof(reg_trie_state) );
a3621e74 1925 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1926 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1927 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1928 (int)depth * 2 + 2,"",
1929 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1930 (IV)next_alloc,
1931 (IV)pos,
a3621e74
YO
1932 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1933 );
1934
1935 } /* end table compress */
1936 }
1e2e3d02
YO
1937 DEBUG_TRIE_COMPILE_MORE_r(
1938 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1939 (int)depth * 2 + 2, "",
1940 (UV)trie->statecount,
1941 (UV)trie->lasttrans)
1942 );
cc601c31 1943 /* resize the trans array to remove unused space */
c944940b
JH
1944 trie->trans = (reg_trie_trans *)
1945 PerlMemShared_realloc( trie->trans, trie->lasttrans
1946 * sizeof(reg_trie_trans) );
a3621e74 1947
3dab1dad 1948 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1949 U8 nodetype =(U8)(flags & 0xFF);
1950 char *str=NULL;
786e8c11 1951
07be1b83 1952#ifdef DEBUGGING
e62cc96a 1953 regnode *optimize = NULL;
7122b237
YO
1954#ifdef RE_TRACK_PATTERN_OFFSETS
1955
b57a0404
JH
1956 U32 mjd_offset = 0;
1957 U32 mjd_nodelen = 0;
7122b237
YO
1958#endif /* RE_TRACK_PATTERN_OFFSETS */
1959#endif /* DEBUGGING */
a3621e74 1960 /*
3dab1dad
YO
1961 This means we convert either the first branch or the first Exact,
1962 depending on whether the thing following (in 'last') is a branch
1963 or not and whther first is the startbranch (ie is it a sub part of
1964 the alternation or is it the whole thing.)
1965 Assuming its a sub part we conver the EXACT otherwise we convert
1966 the whole branch sequence, including the first.
a3621e74 1967 */
3dab1dad 1968 /* Find the node we are going to overwrite */
7f69552c 1969 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1970 /* branch sub-chain */
3dab1dad 1971 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1972#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1973 DEBUG_r({
1974 mjd_offset= Node_Offset((convert));
1975 mjd_nodelen= Node_Length((convert));
1976 });
7122b237 1977#endif
7f69552c 1978 /* whole branch chain */
7122b237
YO
1979 }
1980#ifdef RE_TRACK_PATTERN_OFFSETS
1981 else {
7f69552c
YO
1982 DEBUG_r({
1983 const regnode *nop = NEXTOPER( convert );
1984 mjd_offset= Node_Offset((nop));
1985 mjd_nodelen= Node_Length((nop));
1986 });
07be1b83
YO
1987 }
1988 DEBUG_OPTIMISE_r(
1989 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1990 (int)depth * 2 + 2, "",
786e8c11 1991 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1992 );
7122b237 1993#endif
3dab1dad
YO
1994 /* But first we check to see if there is a common prefix we can
1995 split out as an EXACT and put in front of the TRIE node. */
1996 trie->startstate= 1;
55eed653 1997 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1998 U32 state;
1e2e3d02 1999 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2000 U32 ofs = 0;
8e11feef
RGS
2001 I32 idx = -1;
2002 U32 count = 0;
2003 const U32 base = trie->states[ state ].trans.base;
a3621e74 2004
3dab1dad 2005 if ( trie->states[state].wordnum )
8e11feef 2006 count = 1;
a3621e74 2007
8e11feef 2008 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2009 if ( ( base + ofs >= trie->uniquecharcount ) &&
2010 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2011 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2012 {
3dab1dad 2013 if ( ++count > 1 ) {
2b8b4781 2014 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2015 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2016 if ( state == 1 ) break;
3dab1dad
YO
2017 if ( count == 2 ) {
2018 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2019 DEBUG_OPTIMISE_r(
8e11feef
RGS
2020 PerlIO_printf(Perl_debug_log,
2021 "%*sNew Start State=%"UVuf" Class: [",
2022 (int)depth * 2 + 2, "",
786e8c11 2023 (UV)state));
be8e71aa 2024 if (idx >= 0) {
2b8b4781 2025 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2026 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2027
3dab1dad 2028 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2029 if ( folder )
2030 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2031 DEBUG_OPTIMISE_r(
f1f66076 2032 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2033 );
8e11feef
RGS
2034 }
2035 }
2036 TRIE_BITMAP_SET(trie,*ch);
2037 if ( folder )
2038 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2040 }
2041 idx = ofs;
2042 }
3dab1dad
YO
2043 }
2044 if ( count == 1 ) {
2b8b4781 2045 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2046 STRLEN len;
2047 char *ch = SvPV( *tmp, len );
de734bd5
A
2048 DEBUG_OPTIMISE_r({
2049 SV *sv=sv_newmortal();
8e11feef
RGS
2050 PerlIO_printf( Perl_debug_log,
2051 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2052 (int)depth * 2 + 2, "",
de734bd5
A
2053 (UV)state, (UV)idx,
2054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2055 PL_colors[0], PL_colors[1],
2056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2057 PERL_PV_ESCAPE_FIRSTCHAR
2058 )
2059 );
2060 });
3dab1dad
YO
2061 if ( state==1 ) {
2062 OP( convert ) = nodetype;
2063 str=STRING(convert);
2064 STR_LEN(convert)=0;
2065 }
c490c714
YO
2066 STR_LEN(convert) += len;
2067 while (len--)
de734bd5 2068 *str++ = *ch++;
8e11feef 2069 } else {
f9049ba1 2070#ifdef DEBUGGING
8e11feef
RGS
2071 if (state>1)
2072 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2073#endif
8e11feef
RGS
2074 break;
2075 }
2076 }
2e64971a 2077 trie->prefixlen = (state-1);
3dab1dad 2078 if (str) {
8e11feef 2079 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2080 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2081 trie->startstate = state;
07be1b83
YO
2082 trie->minlen -= (state - 1);
2083 trie->maxlen -= (state - 1);
33809eae
JH
2084#ifdef DEBUGGING
2085 /* At least the UNICOS C compiler choked on this
2086 * being argument to DEBUG_r(), so let's just have
2087 * it right here. */
2088 if (
2089#ifdef PERL_EXT_RE_BUILD
2090 1
2091#else
2092 DEBUG_r_TEST
2093#endif
2094 ) {
2095 regnode *fix = convert;
2096 U32 word = trie->wordcount;
2097 mjd_nodelen++;
2098 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2099 while( ++fix < n ) {
2100 Set_Node_Offset_Length(fix, 0, 0);
2101 }
2102 while (word--) {
2103 SV ** const tmp = av_fetch( trie_words, word, 0 );
2104 if (tmp) {
2105 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2106 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2107 else
2108 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2109 }
2110 }
2111 }
2112#endif
8e11feef
RGS
2113 if (trie->maxlen) {
2114 convert = n;
2115 } else {
3dab1dad 2116 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2117 DEBUG_r(optimize= n);
3dab1dad
YO
2118 }
2119 }
2120 }
a5ca303d
YO
2121 if (!jumper)
2122 jumper = last;
3dab1dad 2123 if ( trie->maxlen ) {
8e11feef
RGS
2124 NEXT_OFF( convert ) = (U16)(tail - convert);
2125 ARG_SET( convert, data_slot );
786e8c11
YO
2126 /* Store the offset to the first unabsorbed branch in
2127 jump[0], which is otherwise unused by the jump logic.
2128 We use this when dumping a trie and during optimisation. */
2129 if (trie->jump)
7f69552c 2130 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2131
786e8c11
YO
2132 /* XXXX */
2133 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2134 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2135 {
2136 OP( convert ) = TRIEC;
2137 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2138 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2139 trie->bitmap= NULL;
2140 } else
2141 OP( convert ) = TRIE;
a3621e74 2142
3dab1dad
YO
2143 /* store the type in the flags */
2144 convert->flags = nodetype;
a5ca303d
YO
2145 DEBUG_r({
2146 optimize = convert
2147 + NODE_STEP_REGNODE
2148 + regarglen[ OP( convert ) ];
2149 });
2150 /* XXX We really should free up the resource in trie now,
2151 as we won't use them - (which resources?) dmq */
3dab1dad 2152 }
a3621e74 2153 /* needed for dumping*/
e62cc96a 2154 DEBUG_r(if (optimize) {
07be1b83 2155 regnode *opt = convert;
bcdf7404 2156
e62cc96a 2157 while ( ++opt < optimize) {
07be1b83
YO
2158 Set_Node_Offset_Length(opt,0,0);
2159 }
786e8c11
YO
2160 /*
2161 Try to clean up some of the debris left after the
2162 optimisation.
a3621e74 2163 */
786e8c11 2164 while( optimize < jumper ) {
07be1b83 2165 mjd_nodelen += Node_Length((optimize));
a3621e74 2166 OP( optimize ) = OPTIMIZED;
07be1b83 2167 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2168 optimize++;
2169 }
07be1b83 2170 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2171 });
2172 } /* end node insert */
2e64971a
DM
2173
2174 /* Finish populating the prev field of the wordinfo array. Walk back
2175 * from each accept state until we find another accept state, and if
2176 * so, point the first word's .prev field at the second word. If the
2177 * second already has a .prev field set, stop now. This will be the
2178 * case either if we've already processed that word's accept state,
2179 * or that that state had multiple words, and the overspill words
2180 * were already linked up earlier.
2181 */
2182 {
2183 U16 word;
2184 U32 state;
2185 U16 prev;
2186
2187 for (word=1; word <= trie->wordcount; word++) {
2188 prev = 0;
2189 if (trie->wordinfo[word].prev)
2190 continue;
2191 state = trie->wordinfo[word].accept;
2192 while (state) {
2193 state = prev_states[state];
2194 if (!state)
2195 break;
2196 prev = trie->states[state].wordnum;
2197 if (prev)
2198 break;
2199 }
2200 trie->wordinfo[word].prev = prev;
2201 }
2202 Safefree(prev_states);
2203 }
2204
2205
2206 /* and now dump out the compressed format */
2207 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2208
55eed653 2209 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2210#ifdef DEBUGGING
2211 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2212 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2213#else
2214 SvREFCNT_dec(revcharmap);
07be1b83 2215#endif
786e8c11
YO
2216 return trie->jump
2217 ? MADE_JUMP_TRIE
2218 : trie->startstate>1
2219 ? MADE_EXACT_TRIE
2220 : MADE_TRIE;
2221}
2222
2223STATIC void
2224S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2225{
2226/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2227
2228 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2229 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2230 ISBN 0-201-10088-6
2231
2232 We find the fail state for each state in the trie, this state is the longest proper
2233 suffix of the current states 'word' that is also a proper prefix of another word in our
2234 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2235 the DFA not to have to restart after its tried and failed a word at a given point, it
2236 simply continues as though it had been matching the other word in the first place.
2237 Consider
2238 'abcdgu'=~/abcdefg|cdgu/
2239 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2240 fail, which would bring use to the state representing 'd' in the second word where we would
2241 try 'g' and succeed, prodceding to match 'cdgu'.
2242 */
2243 /* add a fail transition */
3251b653
NC
2244 const U32 trie_offset = ARG(source);
2245 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2246 U32 *q;
2247 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2248 const U32 numstates = trie->statecount;
786e8c11
YO
2249 const U32 ubound = trie->lasttrans + ucharcount;
2250 U32 q_read = 0;
2251 U32 q_write = 0;
2252 U32 charid;
2253 U32 base = trie->states[ 1 ].trans.base;
2254 U32 *fail;
2255 reg_ac_data *aho;
2256 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2257 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2258
2259 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2260#ifndef DEBUGGING
2261 PERL_UNUSED_ARG(depth);
2262#endif
2263
2264
2265 ARG_SET( stclass, data_slot );
c944940b 2266 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2267 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2268 aho->trie=trie_offset;
446bd890
NC
2269 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2270 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2271 Newxz( q, numstates, U32);
c944940b 2272 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2273 aho->refcount = 1;
2274 fail = aho->fail;
2275 /* initialize fail[0..1] to be 1 so that we always have
2276 a valid final fail state */
2277 fail[ 0 ] = fail[ 1 ] = 1;
2278
2279 for ( charid = 0; charid < ucharcount ; charid++ ) {
2280 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2281 if ( newstate ) {
2282 q[ q_write ] = newstate;
2283 /* set to point at the root */
2284 fail[ q[ q_write++ ] ]=1;
2285 }
2286 }
2287 while ( q_read < q_write) {
2288 const U32 cur = q[ q_read++ % numstates ];
2289 base = trie->states[ cur ].trans.base;
2290
2291 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2292 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2293 if (ch_state) {
2294 U32 fail_state = cur;
2295 U32 fail_base;
2296 do {
2297 fail_state = fail[ fail_state ];
2298 fail_base = aho->states[ fail_state ].trans.base;
2299 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2300
2301 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2302 fail[ ch_state ] = fail_state;
2303 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2304 {
2305 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2306 }
2307 q[ q_write++ % numstates] = ch_state;
2308 }
2309 }
2310 }
2311 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2312 when we fail in state 1, this allows us to use the
2313 charclass scan to find a valid start char. This is based on the principle
2314 that theres a good chance the string being searched contains lots of stuff
2315 that cant be a start char.
2316 */
2317 fail[ 0 ] = fail[ 1 ] = 0;
2318 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2319 PerlIO_printf(Perl_debug_log,
2320 "%*sStclass Failtable (%"UVuf" states): 0",
2321 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2322 );
786e8c11
YO
2323 for( q_read=1; q_read<numstates; q_read++ ) {
2324 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2325 }
2326 PerlIO_printf(Perl_debug_log, "\n");
2327 });
2328 Safefree(q);
2329 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2330}
2331
786e8c11 2332
a3621e74 2333/*
5d1c421c
JH
2334 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2335 * These need to be revisited when a newer toolchain becomes available.
2336 */
2337#if defined(__sparc64__) && defined(__GNUC__)
2338# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2339# undef SPARC64_GCC_WORKAROUND
2340# define SPARC64_GCC_WORKAROUND 1
2341# endif
2342#endif
2343
07be1b83 2344#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2345 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2346 SV * const mysv=sv_newmortal(); \
2347 regnode *Next = regnext(scan); \
2348 regprop(RExC_rx, mysv, scan); \
7f69552c 2349 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2350 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2351 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2352 }});
07be1b83 2353
1de06328
YO
2354
2355
2356
2357
07be1b83
YO
2358#define JOIN_EXACT(scan,min,flags) \
2359 if (PL_regkind[OP(scan)] == EXACT) \
2360 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2361
be8e71aa 2362STATIC U32
07be1b83
YO
2363S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2364 /* Merge several consecutive EXACTish nodes into one. */
2365 regnode *n = regnext(scan);
2366 U32 stringok = 1;
2367 regnode *next = scan + NODE_SZ_STR(scan);
2368 U32 merged = 0;
2369 U32 stopnow = 0;
2370#ifdef DEBUGGING
2371 regnode *stop = scan;
72f13be8 2372 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2373#else
d47053eb
RGS
2374 PERL_UNUSED_ARG(depth);
2375#endif
7918f24d
NC
2376
2377 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2378#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2379 PERL_UNUSED_ARG(flags);
2380 PERL_UNUSED_ARG(val);
07be1b83 2381#endif
07be1b83
YO
2382 DEBUG_PEEP("join",scan,depth);
2383
2384 /* Skip NOTHING, merge EXACT*. */
2385 while (n &&
2386 ( PL_regkind[OP(n)] == NOTHING ||
2387 (stringok && (OP(n) == OP(scan))))
2388 && NEXT_OFF(n)
2389 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2390
2391 if (OP(n) == TAIL || n > next)
2392 stringok = 0;
2393 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2394 DEBUG_PEEP("skip:",n,depth);
2395 NEXT_OFF(scan) += NEXT_OFF(n);
2396 next = n + NODE_STEP_REGNODE;
2397#ifdef DEBUGGING
2398 if (stringok)
2399 stop = n;
2400#endif
2401 n = regnext(n);
2402 }
2403 else if (stringok) {
786e8c11 2404 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2405 regnode * const nnext = regnext(n);
2406
2407 DEBUG_PEEP("merg",n,depth);
2408
2409 merged++;
2410 if (oldl + STR_LEN(n) > U8_MAX)
2411 break;
2412 NEXT_OFF(scan) += NEXT_OFF(n);
2413 STR_LEN(scan) += STR_LEN(n);
2414 next = n + NODE_SZ_STR(n);
2415 /* Now we can overwrite *n : */
2416 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2417#ifdef DEBUGGING
2418 stop = next - 1;
2419#endif
2420 n = nnext;
2421 if (stopnow) break;
2422 }
2423
d47053eb
RGS
2424#ifdef EXPERIMENTAL_INPLACESCAN
2425 if (flags && !NEXT_OFF(n)) {
2426 DEBUG_PEEP("atch", val, depth);
2427 if (reg_off_by_arg[OP(n)]) {
2428 ARG_SET(n, val - n);
2429 }
2430 else {
2431 NEXT_OFF(n) = val - n;
2432 }
2433 stopnow = 1;
2434 }
07be1b83
YO
2435#endif
2436 }
2437
2438 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2439 /*
2440 Two problematic code points in Unicode casefolding of EXACT nodes:
2441
2442 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2443 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2444
2445 which casefold to
2446
2447 Unicode UTF-8
2448
2449 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2450 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2451
2452 This means that in case-insensitive matching (or "loose matching",
2453 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2454 length of the above casefolded versions) can match a target string
2455 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2456 This would rather mess up the minimum length computation.
2457
2458 What we'll do is to look for the tail four bytes, and then peek
2459 at the preceding two bytes to see whether we need to decrease
2460 the minimum length by four (six minus two).
2461
2462 Thanks to the design of UTF-8, there cannot be false matches:
2463 A sequence of valid UTF-8 bytes cannot be a subsequence of
2464 another valid sequence of UTF-8 bytes.
2465
2466 */
2467 char * const s0 = STRING(scan), *s, *t;
2468 char * const s1 = s0 + STR_LEN(scan) - 1;
2469 char * const s2 = s1 - 4;
e294cc5d
JH
2470#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2471 const char t0[] = "\xaf\x49\xaf\x42";
2472#else
07be1b83 2473 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2474#endif
07be1b83
YO
2475 const char * const t1 = t0 + 3;
2476
2477 for (s = s0 + 2;
2478 s < s2 && (t = ninstr(s, s1, t0, t1));
2479 s = t + 4) {
e294cc5d
JH
2480#ifdef EBCDIC
2481 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2482 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2483#else
07be1b83
YO
2484 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2485 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2486#endif
07be1b83
YO
2487 *min -= 4;
2488 }
2489 }
2490
2491#ifdef DEBUGGING
2492 /* Allow dumping */
2493 n = scan + NODE_SZ_STR(scan);
2494 while (n <= stop) {
2495 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2496 OP(n) = OPTIMIZED;
2497 NEXT_OFF(n) = 0;
2498 }
2499 n++;
2500 }
2501#endif
2502 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2503 return stopnow;
2504}
2505
653099ff
GS
2506/* REx optimizer. Converts nodes into quickier variants "in place".
2507 Finds fixed substrings. */
2508
a0288114 2509/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2510 to the position after last scanned or to NULL. */
2511
40d049e4
YO
2512#define INIT_AND_WITHP \
2513 assert(!and_withp); \
2514 Newx(and_withp,1,struct regnode_charclass_class); \
2515 SAVEFREEPV(and_withp)
07be1b83 2516
b515a41d
YO
2517/* this is a chain of data about sub patterns we are processing that
2518 need to be handled seperately/specially in study_chunk. Its so
2519 we can simulate recursion without losing state. */
2520struct scan_frame;
2521typedef struct scan_frame {
2522 regnode *last; /* last node to process in this frame */
2523 regnode *next; /* next node to process when last is reached */
2524 struct scan_frame *prev; /*previous frame*/
2525 I32 stop; /* what stopparen do we use */
2526} scan_frame;
2527
304ee84b
YO
2528
2529#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2530
e1d1eefb
YO
2531#define CASE_SYNST_FNC(nAmE) \
2532case nAmE: \
2533 if (flags & SCF_DO_STCLASS_AND) { \
2534 for (value = 0; value < 256; value++) \
2535 if (!is_ ## nAmE ## _cp(value)) \
2536 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2537 } \
2538 else { \
2539 for (value = 0; value < 256; value++) \
2540 if (is_ ## nAmE ## _cp(value)) \
2541 ANYOF_BITMAP_SET(data->start_class, value); \
2542 } \
2543 break; \
2544case N ## nAmE: \
2545 if (flags & SCF_DO_STCLASS_AND) { \
2546 for (value = 0; value < 256; value++) \
2547 if (is_ ## nAmE ## _cp(value)) \
2548 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2549 } \
2550 else { \
2551 for (value = 0; value < 256; value++) \
2552 if (!is_ ## nAmE ## _cp(value)) \
2553 ANYOF_BITMAP_SET(data->start_class, value); \
2554 } \
2555 break
2556
2557
2558
76e3520e 2559STATIC I32
40d049e4 2560S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2561 I32 *minlenp, I32 *deltap,
40d049e4
YO
2562 regnode *last,
2563 scan_data_t *data,
2564 I32 stopparen,
2565 U8* recursed,
2566 struct regnode_charclass_class *and_withp,
2567 U32 flags, U32 depth)
c277df42
IZ
2568 /* scanp: Start here (read-write). */
2569 /* deltap: Write maxlen-minlen here. */
2570 /* last: Stop before this one. */
40d049e4
YO
2571 /* data: string data about the pattern */
2572 /* stopparen: treat close N as END */
2573 /* recursed: which subroutines have we recursed into */
2574 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2575{
97aff369 2576 dVAR;
c277df42
IZ
2577 I32 min = 0, pars = 0, code;
2578 regnode *scan = *scanp, *next;
2579 I32 delta = 0;
2580 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2581 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2582 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2583 scan_data_t data_fake;
a3621e74 2584 SV *re_trie_maxbuff = NULL;
786e8c11 2585 regnode *first_non_open = scan;
e2e6a0f1 2586 I32 stopmin = I32_MAX;
8aa23a47 2587 scan_frame *frame = NULL;
a3621e74 2588 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2589
7918f24d
NC
2590 PERL_ARGS_ASSERT_STUDY_CHUNK;
2591
13a24bad 2592#ifdef DEBUGGING
40d049e4 2593 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2594#endif
40d049e4 2595
786e8c11 2596 if ( depth == 0 ) {
40d049e4 2597 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2598 first_non_open=regnext(first_non_open);
2599 }
2600
b81d288d 2601
8aa23a47
YO
2602 fake_study_recurse:
2603 while ( scan && OP(scan) != END && scan < last ){
2604 /* Peephole optimizer: */
304ee84b 2605 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2606 DEBUG_PEEP("Peep",scan,depth);
2607 JOIN_EXACT(scan,&min,0);
2608
2609 /* Follow the next-chain of the current node and optimize
2610 away all the NOTHINGs from it. */
2611 if (OP(scan) != CURLYX) {
2612 const int max = (reg_off_by_arg[OP(scan)]
2613 ? I32_MAX
2614 /* I32 may be smaller than U16 on CRAYs! */
2615 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2616 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2617 int noff;
2618 regnode *n = scan;
2619
2620 /* Skip NOTHING and LONGJMP. */
2621 while ((n = regnext(n))
2622 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2623 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2624 && off + noff < max)
2625 off += noff;
2626 if (reg_off_by_arg[OP(scan)])
2627 ARG(scan) = off;
2628 else
2629 NEXT_OFF(scan) = off;
2630 }
a3621e74 2631
c277df42 2632
8aa23a47
YO
2633
2634 /* The principal pseudo-switch. Cannot be a switch, since we
2635 look into several different things. */
2636 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2637 || OP(scan) == IFTHEN) {
2638 next = regnext(scan);
2639 code = OP(scan);
2640 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2641
2642 if (OP(next) == code || code == IFTHEN) {
2643 /* NOTE - There is similar code to this block below for handling
2644 TRIE nodes on a re-study. If you change stuff here check there
2645 too. */
2646 I32 max1 = 0, min1 = I32_MAX, num = 0;
2647 struct regnode_charclass_class accum;
2648 regnode * const startbranch=scan;
2649
2650 if (flags & SCF_DO_SUBSTR)
304ee84b 2651 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2652 if (flags & SCF_DO_STCLASS)
2653 cl_init_zero(pRExC_state, &accum);
2654
2655 while (OP(scan) == code) {
2656 I32 deltanext, minnext, f = 0, fake;
2657 struct regnode_charclass_class this_class;
2658
2659 num++;
2660 data_fake.flags = 0;
2661 if (data) {
2662 data_fake.whilem_c = data->whilem_c;
2663 data_fake.last_closep = data->last_closep;
2664 }
2665 else
2666 data_fake.last_closep = &fake;
58e23c8d
YO
2667
2668 data_fake.pos_delta = delta;
8aa23a47
YO
2669 next = regnext(scan);
2670 scan = NEXTOPER(scan);
2671 if (code != BRANCH)
c277df42 2672 scan = NEXTOPER(scan);
8aa23a47
YO
2673 if (flags & SCF_DO_STCLASS) {
2674 cl_init(pRExC_state, &this_class);
2675 data_fake.start_class = &this_class;
2676 f = SCF_DO_STCLASS_AND;
58e23c8d 2677 }
8aa23a47
YO
2678 if (flags & SCF_WHILEM_VISITED_POS)
2679 f |= SCF_WHILEM_VISITED_POS;
2680
2681 /* we suppose the run is continuous, last=next...*/
2682 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2683 next, &data_fake,
2684 stopparen, recursed, NULL, f,depth+1);
2685 if (min1 > minnext)
2686 min1 = minnext;
2687 if (max1 < minnext + deltanext)
2688 max1 = minnext + deltanext;
2689 if (deltanext == I32_MAX)
2690 is_inf = is_inf_internal = 1;
2691 scan = next;
2692 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2693 pars++;
2694 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2695 if ( stopmin > minnext)
2696 stopmin = min + min1;
2697 flags &= ~SCF_DO_SUBSTR;
2698 if (data)
2699 data->flags |= SCF_SEEN_ACCEPT;
2700 }
2701 if (data) {
2702 if (data_fake.flags & SF_HAS_EVAL)
2703 data->flags |= SF_HAS_EVAL;
2704 data->whilem_c = data_fake.whilem_c;
3dab1dad 2705 }
8aa23a47
YO
2706 if (flags & SCF_DO_STCLASS)
2707 cl_or(pRExC_state, &accum, &this_class);
2708 }
2709 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2710 min1 = 0;
2711 if (flags & SCF_DO_SUBSTR) {
2712 data->pos_min += min1;
2713 data->pos_delta += max1 - min1;
2714 if (max1 != min1 || is_inf)
2715 data->longest = &(data->longest_float);
2716 }
2717 min += min1;
2718 delta += max1 - min1;
2719 if (flags & SCF_DO_STCLASS_OR) {
2720 cl_or(pRExC_state, data->start_class, &accum);
2721 if (min1) {
2722 cl_and(data->start_class, and_withp);
2723 flags &= ~SCF_DO_STCLASS;
653099ff 2724 }
8aa23a47
YO
2725 }
2726 else if (flags & SCF_DO_STCLASS_AND) {
2727 if (min1) {
2728 cl_and(data->start_class, &accum);
2729 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2730 }
8aa23a47
YO
2731 else {
2732 /* Switch to OR mode: cache the old value of
2733 * data->start_class */
2734 INIT_AND_WITHP;
2735 StructCopy(data->start_class, and_withp,
2736 struct regnode_charclass_class);
2737 flags &= ~SCF_DO_STCLASS_AND;
2738 StructCopy(&accum, data->start_class,
2739 struct regnode_charclass_class);
2740 flags |= SCF_DO_STCLASS_OR;
2741 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2742 }
8aa23a47 2743 }
a3621e74 2744
8aa23a47
YO
2745 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2746 /* demq.
a3621e74 2747
8aa23a47
YO
2748 Assuming this was/is a branch we are dealing with: 'scan' now
2749 points at the item that follows the branch sequence, whatever
2750 it is. We now start at the beginning of the sequence and look
2751 for subsequences of
a3621e74 2752
8aa23a47
YO
2753 BRANCH->EXACT=>x1
2754 BRANCH->EXACT=>x2
2755 tail
a3621e74 2756
8aa23a47 2757 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2758
8aa23a47
YO
2759 If we can find such a subseqence we need to turn the first
2760 element into a trie and then add the subsequent branch exact
2761 strings to the trie.
a3621e74 2762
8aa23a47 2763 We have two cases
a3621e74 2764
8aa23a47 2765 1. patterns where the whole set of branch can be converted.
a3621e74 2766
8aa23a47 2767 2. patterns where only a subset can be converted.
a3621e74 2768
8aa23a47
YO
2769 In case 1 we can replace the whole set with a single regop
2770 for the trie. In case 2 we need to keep the start and end
2771 branchs so
a3621e74 2772
8aa23a47
YO
2773 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2774 becomes BRANCH TRIE; BRANCH X;
786e8c11 2775
8aa23a47
YO
2776 There is an additional case, that being where there is a
2777 common prefix, which gets split out into an EXACT like node
2778 preceding the TRIE node.
a3621e74 2779
8aa23a47
YO
2780 If x(1..n)==tail then we can do a simple trie, if not we make
2781 a "jump" trie, such that when we match the appropriate word
2782 we "jump" to the appopriate tail node. Essentailly we turn
2783 a nested if into a case structure of sorts.
b515a41d 2784
8aa23a47
YO
2785 */
2786
2787 int made=0;
2788 if (!re_trie_maxbuff) {
2789 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2790 if (!SvIOK(re_trie_maxbuff))
2791 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2792 }
2793 if ( SvIV(re_trie_maxbuff)>=0 ) {
2794 regnode *cur;
2795 regnode *first = (regnode *)NULL;
2796 regnode *last = (regnode *)NULL;
2797 regnode *tail = scan;
2798 U8 optype = 0;
2799 U32 count=0;
a3621e74
YO
2800
2801#ifdef DEBUGGING
8aa23a47 2802 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2803#endif
8aa23a47
YO
2804 /* var tail is used because there may be a TAIL
2805 regop in the way. Ie, the exacts will point to the
2806 thing following the TAIL, but the last branch will
2807 point at the TAIL. So we advance tail. If we
2808 have nested (?:) we may have to move through several
2809 tails.
2810 */
2811
2812 while ( OP( tail ) == TAIL ) {
2813 /* this is the TAIL generated by (?:) */
2814 tail = regnext( tail );
2815 }
a3621e74 2816
8aa23a47
YO
2817
2818 DEBUG_OPTIMISE_r({
2819 regprop(RExC_rx, mysv, tail );
2820 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2821 (int)depth * 2 + 2, "",
2822 "Looking for TRIE'able sequences. Tail node is: ",
2823 SvPV_nolen_const( mysv )
2824 );
2825 });
2826
2827 /*
2828
2829 step through the branches, cur represents each
2830 branch, noper is the first thing to be matched
2831 as part of that branch and noper_next is the
2832 regnext() of that node. if noper is an EXACT
2833 and noper_next is the same as scan (our current
2834 position in the regex) then the EXACT branch is
2835 a possible optimization target. Once we have
2836 two or more consequetive such branches we can
2837 create a trie of the EXACT's contents and stich
2838 it in place. If the sequence represents all of
2839 the branches we eliminate the whole thing and
2840 replace it with a single TRIE. If it is a
2841 subsequence then we need to stitch it in. This
2842 means the first branch has to remain, and needs
2843 to be repointed at the item on the branch chain
2844 following the last branch optimized. This could
2845 be either a BRANCH, in which case the
2846 subsequence is internal, or it could be the
2847 item following the branch sequence in which
2848 case the subsequence is at the end.
2849
2850 */
2851
2852 /* dont use tail as the end marker for this traverse */
2853 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2854 regnode * const noper = NEXTOPER( cur );
b515a41d 2855#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2856 regnode * const noper_next = regnext( noper );
b515a41d
YO
2857#endif
2858
8aa23a47
YO
2859 DEBUG_OPTIMISE_r({
2860 regprop(RExC_rx, mysv, cur);
2861 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2862 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2863
2864 regprop(RExC_rx, mysv, noper);
2865 PerlIO_printf( Perl_debug_log, " -> %s",
2866 SvPV_nolen_const(mysv));
2867
2868 if ( noper_next ) {
2869 regprop(RExC_rx, mysv, noper_next );
2870 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2871 SvPV_nolen_const(mysv));
2872 }
2873 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2874 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2875 });
2876 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2877 : PL_regkind[ OP( noper ) ] == EXACT )
2878 || OP(noper) == NOTHING )
786e8c11 2879#ifdef NOJUMPTRIE
8aa23a47 2880 && noper_next == tail
786e8c11 2881#endif
8aa23a47
YO
2882 && count < U16_MAX)
2883 {
2884 count++;
2885 if ( !first || optype == NOTHING ) {
2886 if (!first) first = cur;
2887 optype = OP( noper );
2888 } else {
2889 last = cur;
2890 }
2891 } else {
a0a388a1 2892/*
0abd0d78
YO
2893 Currently we do not believe that the trie logic can
2894 handle case insensitive matching properly when the
2895 pattern is not unicode (thus forcing unicode semantics).
2896
2897 If/when this is fixed the following define can be swapped
2898 in below to fully enable trie logic.
2899
a0a388a1 2900#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2901
2902*/
2903#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2904
a0a388a1 2905 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2906 make_trie( pRExC_state,
2907 startbranch, first, cur, tail, count,
2908 optype, depth+1 );
2909 }
2910 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2911#ifdef NOJUMPTRIE
8aa23a47 2912 && noper_next == tail
786e8c11 2913#endif
8aa23a47
YO
2914 ){
2915 count = 1;
2916 first = cur;
2917 optype = OP( noper );
2918 } else {
2919 count = 0;
2920 first = NULL;
2921 optype = 0;
2922 }
2923 last = NULL;
2924 }
2925 }
2926 DEBUG_OPTIMISE_r({
2927 regprop(RExC_rx, mysv, cur);
2928 PerlIO_printf( Perl_debug_log,
2929 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2930 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2931
2932 });
a0a388a1
YO
2933
2934 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2935 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2936#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2937 if ( ((made == MADE_EXACT_TRIE &&
2938 startbranch == first)
2939 || ( first_non_open == first )) &&
2940 depth==0 ) {
2941 flags |= SCF_TRIE_RESTUDY;
2942 if ( startbranch == first
2943 && scan == tail )
2944 {
2945 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2946 }
2947 }
3dab1dad 2948#endif
8aa23a47
YO
2949 }
2950 }
2951
2952 } /* do trie */
2953
653099ff 2954 }
8aa23a47
YO
2955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2956 scan = NEXTOPER(NEXTOPER(scan));
2957 } else /* single branch is optimized. */
2958 scan = NEXTOPER(scan);
2959 continue;
2960 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2961 scan_frame *newframe = NULL;
2962 I32 paren;
2963 regnode *start;
2964 regnode *end;
2965
2966 if (OP(scan) != SUSPEND) {
2967 /* set the pointer */
2968 if (OP(scan) == GOSUB) {
2969 paren = ARG(scan);
2970 RExC_recurse[ARG2L(scan)] = scan;
2971 start = RExC_open_parens[paren-1];
2972 end = RExC_close_parens[paren-1];
2973 } else {
2974 paren = 0;
f8fc2ecf 2975 start = RExC_rxi->program + 1;
8aa23a47
YO
2976 end = RExC_opend;
2977 }
2978 if (!recursed) {
2979 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2980 SAVEFREEPV(recursed);
2981 }
2982 if (!PAREN_TEST(recursed,paren+1)) {
2983 PAREN_SET(recursed,paren+1);
2984 Newx(newframe,1,scan_frame);
2985 } else {
2986 if (flags & SCF_DO_SUBSTR) {
304ee84b 2987 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2988 data->longest = &(data->longest_float);
2989 }
2990 is_inf = is_inf_internal = 1;
2991 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2992 cl_anything(pRExC_state, data->start_class);
2993 flags &= ~SCF_DO_STCLASS;
2994 }
2995 } else {
2996 Newx(newframe,1,scan_frame);
2997 paren = stopparen;
2998 start = scan+2;
2999 end = regnext(scan);
3000 }
3001 if (newframe) {
3002 assert(start);
3003 assert(end);
3004 SAVEFREEPV(newframe);
3005 newframe->next = regnext(scan);
3006 newframe->last = last;
3007 newframe->stop = stopparen;
3008 newframe->prev = frame;
3009
3010 frame = newframe;
3011 scan = start;
3012 stopparen = paren;
3013 last = end;
3014
3015 continue;
3016 }
3017 }
3018 else if (OP(scan) == EXACT) {
3019 I32 l = STR_LEN(scan);
3020 UV uc;
3021 if (UTF) {
3022 const U8 * const s = (U8*)STRING(scan);
3023 l = utf8_length(s, s + l);
3024 uc = utf8_to_uvchr(s, NULL);
3025 } else {
3026 uc = *((U8*)STRING(scan));
3027 }
3028 min += l;
3029 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3030 /* The code below prefers earlier match for fixed
3031 offset, later match for variable offset. */
3032 if (data->last_end == -1) { /* Update the start info. */
3033 data->last_start_min = data->pos_min;
3034 data->last_start_max = is_inf
3035 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3036 }
8aa23a47
YO
3037 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3038 if (UTF)
3039 SvUTF8_on(data->last_found);
3040 {
3041 SV * const sv = data->last_found;
3042 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3043 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3044 if (mg && mg->mg_len >= 0)
3045 mg->mg_len += utf8_length((U8*)STRING(scan),
3046 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3047 }
8aa23a47
YO
3048 data->last_end = data->pos_min + l;
3049 data->pos_min += l; /* As in the first entry. */
3050 data->flags &= ~SF_BEFORE_EOL;
3051 }
3052 if (flags & SCF_DO_STCLASS_AND) {
3053 /* Check whether it is compatible with what we know already! */
3054 int compat = 1;
3055
3056 if (uc >= 0x100 ||
3057 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3058 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3059 && (!(data->start_class->flags & ANYOF_FOLD)
3060 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3061 )
3062 compat = 0;
3063 ANYOF_CLASS_ZERO(data->start_class);
3064 ANYOF_BITMAP_ZERO(data->start_class);
3065 if (compat)
3066 ANYOF_BITMAP_SET(data->start_class, uc);
3067 data->start_class->flags &= ~ANYOF_EOS;
3068 if (uc < 0x100)
3069 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3070 }
3071 else if (flags & SCF_DO_STCLASS_OR) {
3072 /* false positive possible if the class is case-folded */
3073 if (uc < 0x100)
3074 ANYOF_BITMAP_SET(data->start_class, uc);
3075 else
3076 data->start_class->flags |= ANYOF_UNICODE_ALL;
3077 data->start_class->flags &= ~ANYOF_EOS;
3078 cl_and(data->start_class, and_withp);
3079 }
3080 flags &= ~SCF_DO_STCLASS;
3081 }
3082 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3083 I32 l = STR_LEN(scan);
3084 UV uc = *((U8*)STRING(scan));
3085
3086 /* Search for fixed substrings supports EXACT only. */
3087 if (flags & SCF_DO_SUBSTR) {
3088 assert(data);
304ee84b 3089 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3090 }
3091 if (UTF) {
3092 const U8 * const s = (U8 *)STRING(scan);
3093 l = utf8_length(s, s + l);
3094 uc = utf8_to_uvchr(s, NULL);
3095 }
3096 min += l;
3097 if (flags & SCF_DO_SUBSTR)
3098 data->pos_min += l;
3099 if (flags & SCF_DO_STCLASS_AND) {
3100 /* Check whether it is compatible with what we know already! */
3101 int compat = 1;
3102
3103 if (uc >= 0x100 ||
3104 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3105 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3106 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3107 compat = 0;
3108 ANYOF_CLASS_ZERO(data->start_class);
3109 ANYOF_BITMAP_ZERO(data->start_class);
3110 if (compat) {
3111 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3112 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3113 data->start_class->flags |= ANYOF_FOLD;
3114 if (OP(scan) == EXACTFL)
3115 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3116 }
8aa23a47
YO
3117 }
3118 else if (flags & SCF_DO_STCLASS_OR) {
3119 if (data->start_class->flags & ANYOF_FOLD) {
3120 /* false positive possible if the class is case-folded.
3121 Assume that the locale settings are the same... */
1aa99e6b
IH
3122 if (uc < 0x100)
3123 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3124 data->start_class->flags &= ~ANYOF_EOS;
3125 }
8aa23a47 3126 cl_and(data->start_class, and_withp);
653099ff 3127 }
8aa23a47
YO
3128 flags &= ~SCF_DO_STCLASS;
3129 }
e52fc539 3130 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3131 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3132 I32 f = flags, pos_before = 0;
3133 regnode * const oscan = scan;
3134 struct regnode_charclass_class this_class;
3135 struct regnode_charclass_class *oclass = NULL;
3136 I32 next_is_eval = 0;
3137
3138 switch (PL_regkind[OP(scan)]) {
3139 case WHILEM: /* End of (?:...)* . */
3140 scan = NEXTOPER(scan);
3141 goto finish;
3142 case PLUS:
3143 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3144 next = NEXTOPER(scan);
3145 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3146 mincount = 1;
3147 maxcount = REG_INFTY;
3148 next = regnext(scan);
3149 scan = NEXTOPER(scan);
3150 goto do_curly;
3151 }
3152 }
3153 if (flags & SCF_DO_SUBSTR)
3154 data->pos_min++;
3155 min++;
3156 /* Fall through. */
3157 case STAR:
3158 if (flags & SCF_DO_STCLASS) {
3159 mincount = 0;
3160 maxcount = REG_INFTY;
3161 next = regnext(scan);
3162 scan = NEXTOPER(scan);
3163 goto do_curly;
3164 }
3165 is_inf = is_inf_internal = 1;
3166 scan = regnext(scan);
c277df42 3167 if (flags & SCF_DO_SUBSTR) {
304ee84b 3168 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3169 data->longest = &(data->longest_float);
c277df42 3170 }
8aa23a47
YO
3171 goto optimize_curly_tail;
3172 case CURLY:
3173 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3174 && (scan->flags == stopparen))
3175 {
3176 mincount = 1;
3177 maxcount = 1;
3178 } else {
3179 mincount = ARG1(scan);
3180 maxcount = ARG2(scan);
653099ff 3181 }
8aa23a47
YO
3182 next = regnext(scan);
3183 if (OP(scan) == CURLYX) {
3184 I32 lp = (data ? *(data->last_closep) : 0);
3185 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3186 }
8aa23a47
YO
3187 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3188 next_is_eval = (OP(scan) == EVAL);
3189 do_curly:
3190 if (flags & SCF_DO_SUBSTR) {
304ee84b 3191 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3192 pos_before = data->pos_min;
b45f050a 3193 }
8aa23a47
YO
3194 if (data) {
3195 fl = data->flags;
3196 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3197 if (is_inf)
3198 data->flags |= SF_IS_INF;
3199 }
3200 if (flags & SCF_DO_STCLASS) {
3201 cl_init(pRExC_state, &this_class);
3202 oclass = data->start_class;
3203 data->start_class = &this_class;
3204 f |= SCF_DO_STCLASS_AND;
3205 f &= ~SCF_DO_STCLASS_OR;
3206 }
3207 /* These are the cases when once a subexpression
3208 fails at a particular position, it cannot succeed
3209 even after backtracking at the enclosing scope.
b9a59e08 3210
8aa23a47
YO
3211 XXXX what if minimal match and we are at the
3212 initial run of {n,m}? */
3213 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3214 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3215
8aa23a47
YO
3216 /* This will finish on WHILEM, setting scan, or on NULL: */
3217 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3218 last, data, stopparen, recursed, NULL,
3219 (mincount == 0
3220 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3221
8aa23a47
YO
3222 if (flags & SCF_DO_STCLASS)
3223 data->start_class = oclass;
3224 if (mincount == 0 || minnext == 0) {
3225 if (flags & SCF_DO_STCLASS_OR) {
3226 cl_or(pRExC_state, data->start_class, &this_class);
3227 }
3228 else if (flags & SCF_DO_STCLASS_AND) {
3229 /* Switch to OR mode: cache the old value of
3230 * data->start_class */
3231 INIT_AND_WITHP;
3232 StructCopy(data->start_class, and_withp,
3233 struct regnode_charclass_class);
3234 flags &= ~SCF_DO_STCLASS_AND;
3235 StructCopy(&this_class, data->start_class,
3236 struct regnode_charclass_class);
3237 flags |= SCF_DO_STCLASS_OR;
3238 data->start_class->flags |= ANYOF_EOS;
3239 }
3240 } else { /* Non-zero len */
3241 if (flags & SCF_DO_STCLASS_OR) {
3242 cl_or(pRExC_state, data->start_class, &this_class);
3243 cl_and(data->start_class, and_withp);
3244 }
3245 else if (flags & SCF_DO_STCLASS_AND)
3246 cl_and(data->start_class, &this_class);
3247 flags &= ~SCF_DO_STCLASS;
3248 }
3249 if (!scan) /* It was not CURLYX, but CURLY. */
3250 scan = next;
3251 if ( /* ? quantifier ok, except for (?{ ... }) */
3252 (next_is_eval || !(mincount == 0 && maxcount == 1))
3253 && (minnext == 0) && (deltanext == 0)
3254 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3255 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3256 {
668c081a
NC
3257 ckWARNreg(RExC_parse,
3258 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3259 }
3260
3261 min += minnext * mincount;
3262 is_inf_internal |= ((maxcount == REG_INFTY
3263 && (minnext + deltanext) > 0)
3264 || deltanext == I32_MAX);
3265 is_inf |= is_inf_internal;
3266 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3267
3268 /* Try powerful optimization CURLYX => CURLYN. */
3269 if ( OP(oscan) == CURLYX && data
3270 && data->flags & SF_IN_PAR
3271 && !(data->flags & SF_HAS_EVAL)
3272 && !deltanext && minnext == 1 ) {
3273 /* Try to optimize to CURLYN. */
3274 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3275 regnode * const nxt1 = nxt;
497b47a8 3276#ifdef DEBUGGING
8aa23a47 3277 regnode *nxt2;
497b47a8 3278#endif
c277df42 3279
8aa23a47
YO
3280 /* Skip open. */
3281 nxt = regnext(nxt);
e52fc539 3282 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3283 && !(PL_regkind[OP(nxt)] == EXACT
3284 && STR_LEN(nxt) == 1))
3285 goto nogo;
497b47a8 3286#ifdef DEBUGGING
8aa23a47 3287 nxt2 = nxt;
497b47a8 3288#endif
8aa23a47
YO
3289 nxt = regnext(nxt);
3290 if (OP(nxt) != CLOSE)
3291 goto nogo;
3292 if (RExC_open_parens) {
3293 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3294 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3295 }
3296 /* Now we know that nxt2 is the only contents: */
3297 oscan->flags = (U8)ARG(nxt);
3298 OP(oscan) = CURLYN;
3299 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3300
c277df42 3301#ifdef DEBUGGING
8aa23a47 3302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3303 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3304 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3305 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3306 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3307 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3308#endif
8aa23a47
YO
3309 }
3310 nogo:
3311
3312 /* Try optimization CURLYX => CURLYM. */
3313 if ( OP(oscan) == CURLYX && data
3314 && !(data->flags & SF_HAS_PAR)
3315 && !(data->flags & SF_HAS_EVAL)
3316 && !deltanext /* atom is fixed width */
3317 && minnext != 0 /* CURLYM can't handle zero width */
3318 ) {
3319 /* XXXX How to optimize if data == 0? */
3320 /* Optimize to a simpler form. */
3321 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3322 regnode *nxt2;
3323
3324 OP(oscan) = CURLYM;
3325 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3326 && (OP(nxt2) != WHILEM))
3327 nxt = nxt2;
3328 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3329 /* Need to optimize away parenths. */
b3c0965f 3330 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3331 /* Set the parenth number. */
3332 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3333
8aa23a47
YO
3334 oscan->flags = (U8)ARG(nxt);
3335 if (RExC_open_parens) {
3336 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3337 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3338 }
8aa23a47
YO
3339 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3340 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3341
c277df42 3342#ifdef DEBUGGING
8aa23a47
YO
3343 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3344 OP(nxt + 1) = OPTIMIZED; /* was count. */
3345 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3346 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3347#endif
c277df42 3348#if 0
8aa23a47
YO
3349 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3350 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3351 if (nnxt == nxt) {
3352 if (reg_off_by_arg[OP(nxt1)])
3353 ARG_SET(nxt1, nxt2 - nxt1);
3354 else if (nxt2 - nxt1 < U16_MAX)
3355 NEXT_OFF(nxt1) = nxt2 - nxt1;
3356 else
3357 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3358 }
8aa23a47 3359 nxt1 = nnxt;
c277df42 3360 }
5d1c421c 3361#endif
8aa23a47
YO
3362 /* Optimize again: */
3363 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3364 NULL, stopparen, recursed, NULL, 0,depth+1);
3365 }
3366 else
3367 oscan->flags = 0;
3368 }
3369 else if ((OP(oscan) == CURLYX)
3370 && (flags & SCF_WHILEM_VISITED_POS)
3371 /* See the comment on a similar expression above.
3372 However, this time it not a subexpression
3373 we care about, but the expression itself. */
3374 && (maxcount == REG_INFTY)
3375 && data && ++data->whilem_c < 16) {
3376 /* This stays as CURLYX, we can put the count/of pair. */
3377 /* Find WHILEM (as in regexec.c) */
3378 regnode *nxt = oscan + NEXT_OFF(oscan);
3379
3380 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3381 nxt += ARG(nxt);
3382 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3383 | (RExC_whilem_seen << 4)); /* On WHILEM */
3384 }
3385 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3386 pars++;
3387 if (flags & SCF_DO_SUBSTR) {
3388 SV *last_str = NULL;
3389 int counted = mincount != 0;
a0ed51b3 3390
8aa23a47
YO
3391 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3392#if defined(SPARC64_GCC_WORKAROUND)
3393 I32 b = 0;
3394 STRLEN l = 0;
3395 const char *s = NULL;
3396 I32 old = 0;
b515a41d 3397
8aa23a47
YO
3398 if (pos_before >= data->last_start_min)
3399 b = pos_before;
3400 else
3401 b = data->last_start_min;
b515a41d 3402
8aa23a47
YO
3403 l = 0;
3404 s = SvPV_const(data->last_found, l);
3405 old = b - data->last_start_min;
3406
3407#else
3408 I32 b = pos_before >= data->last_start_min
3409 ? pos_before : data->last_start_min;
3410 STRLEN l;
3411 const char * const s = SvPV_const(data->last_found, l);
3412 I32 old = b - data->last_start_min;
3413#endif
3414
3415 if (UTF)
3416 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3417 l -= old;
3418 /* Get the added string: */
740cce10 3419 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3420 if (deltanext == 0 && pos_before == b) {
3421 /* What was added is a constant string */
3422 if (mincount > 1) {
3423 SvGROW(last_str, (mincount * l) + 1);
3424 repeatcpy(SvPVX(last_str) + l,
3425 SvPVX_const(last_str), l, mincount - 1);
3426 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3427 /* Add additional parts. */
3428 SvCUR_set(data->last_found,
3429 SvCUR(data->last_found) - l);
3430 sv_catsv(data->last_found, last_str);
3431 {
3432 SV * sv = data->last_found;
3433 MAGIC *mg =
3434 SvUTF8(sv) && SvMAGICAL(sv) ?
3435 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436 if (mg && mg->mg_len >= 0)
bd94e887 3437 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3438 }
8aa23a47 3439 data->last_end += l * (mincount - 1);
b515a41d 3440 }
8aa23a47
YO
3441 } else {
3442 /* start offset must point into the last copy */
3443 data->last_start_min += minnext * (mincount - 1);
3444 data->last_start_max += is_inf ? I32_MAX
3445 : (maxcount - 1) * (minnext + data->pos_delta);
3446 }
c277df42 3447 }
8aa23a47
YO
3448 /* It is counted once already... */
3449 data->pos_min += minnext * (mincount - counted);
3450 data->pos_delta += - counted * deltanext +
3451 (minnext + deltanext) * maxcount - minnext * mincount;
3452 if (mincount != maxcount) {
3453 /* Cannot extend fixed substrings found inside
3454 the group. */
304ee84b 3455 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3456 if (mincount && last_str) {
3457 SV * const sv = data->last_found;
3458 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3459 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3460
3461 if (mg)
3462 mg->mg_len = -1;
3463 sv_setsv(sv, last_str);
3464 data->last_end = data->pos_min;
3465 data->last_start_min =
3466 data->pos_min - CHR_SVLEN(last_str);
3467 data->last_start_max = is_inf
3468 ? I32_MAX
3469 : data->pos_min + data->pos_delta
3470 - CHR_SVLEN(last_str);
3471 }
3472 data->longest = &(data->longest_float);
3473 }
3474 SvREFCNT_dec(last_str);
c277df42 3475 }
8aa23a47
YO
3476 if (data && (fl & SF_HAS_EVAL))
3477 data->flags |= SF_HAS_EVAL;
3478 optimize_curly_tail:
3479 if (OP(oscan) != CURLYX) {
3480 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3481 && NEXT_OFF(next))
3482 NEXT_OFF(oscan) += NEXT_OFF(next);
3483 }
3484 continue;
3485 default: /* REF and CLUMP only? */
3486 if (flags & SCF_DO_SUBSTR) {
304ee84b 3487 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3488 data->longest = &(data->longest_float);
3489 }
3490 is_inf = is_inf_internal = 1;
3491 if (flags & SCF_DO_STCLASS_OR)
3492 cl_anything(pRExC_state, data->start_class);
3493 flags &= ~SCF_DO_STCLASS;
3494 break;
c277df42 3495 }
8aa23a47 3496 }
e1d1eefb
YO
3497 else if (OP(scan) == LNBREAK) {
3498 if (flags & SCF_DO_STCLASS) {
3499 int value = 0;
3500 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3501 if (flags & SCF_DO_STCLASS_AND) {
3502 for (value = 0; value < 256; value++)
e64b1bd1 3503 if (!is_VERTWS_cp(value))
b9a59e08
KW
3504 ANYOF_BITMAP_CLEAR(data->start_class, value);
3505 }
3506 else {
e1d1eefb 3507 for (value = 0; value < 256; value++)
e64b1bd1 3508 if (is_VERTWS_cp(value))
b9a59e08
KW
3509 ANYOF_BITMAP_SET(data->start_class, value);
3510 }
e1d1eefb
YO
3511 if (flags & SCF_DO_STCLASS_OR)
3512 cl_and(data->start_class, and_withp);
3513 flags &= ~SCF_DO_STCLASS;
3514 }
3515 min += 1;
f9a79580 3516 delta += 1;
e1d1eefb
YO
3517 if (flags & SCF_DO_SUBSTR) {
3518 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3519 data->pos_min += 1;
f9a79580 3520 data->pos_delta += 1;
e1d1eefb
YO
3521 data->longest = &(data->longest_float);
3522 }
e1d1eefb 3523 }
f9a79580
RGS
3524 else if (OP(scan) == FOLDCHAR) {
3525 int d = ARG(scan)==0xDF ? 1 : 2;
3526 flags &= ~SCF_DO_STCLASS;
3527 min += 1;
3528 delta += d;
3529 if (flags & SCF_DO_SUBSTR) {
3530 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3531 data->pos_min += 1;
3532 data->pos_delta += d;
3533 data->longest = &(data->longest_float);
3534 }
3535 }
e52fc539 3536 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3537 int value = 0;
653099ff 3538
8aa23a47 3539 if (flags & SCF_DO_SUBSTR) {
304ee84b 3540 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3541 data->pos_min++;
3542 }
3543 min++;
3544 if (flags & SCF_DO_STCLASS) {
3545 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3546
8aa23a47
YO
3547 /* Some of the logic below assumes that switching
3548 locale on will only add false positives. */
3549 switch (PL_regkind[OP(scan)]) {
3550 case SANY:
3551 default:
3552 do_default:
3553 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3554 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3555 cl_anything(pRExC_state, data->start_class);
3556 break;
3557 case REG_ANY:
3558 if (OP(scan) == SANY)
3559 goto do_default;
3560 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3561 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3562 || (data->start_class->flags & ANYOF_CLASS));
3563 cl_anything(pRExC_state, data->start_class);
653099ff 3564 }
8aa23a47
YO
3565 if (flags & SCF_DO_STCLASS_AND || !value)
3566 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3567 break;
3568 case ANYOF:
3569 if (flags & SCF_DO_STCLASS_AND)
3570 cl_and(data->start_class,
3571 (struct regnode_charclass_class*)scan);
653099ff 3572 else
8aa23a47
YO
3573 cl_or(pRExC_state, data->start_class,
3574 (struct regnode_charclass_class*)scan);
3575 break;
3576 case ALNUM:
3577 if (flags & SCF_DO_STCLASS_AND) {
3578 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3579 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3580 if (FLAGS(scan) & USE_UNI) {
3581 for (value = 0; value < 256; value++) {
3582 if (!isWORDCHAR_L1(value)) {
3583 ANYOF_BITMAP_CLEAR(data->start_class, value);
3584 }
3585 }
3586 } else {
3587 for (value = 0; value < 256; value++) {
3588 if (!isALNUM(value)) {
3589 ANYOF_BITMAP_CLEAR(data->start_class, value);
3590 }
3591 }
3592 }
8aa23a47 3593 }
653099ff 3594 }
8aa23a47
YO
3595 else {
3596 if (data->start_class->flags & ANYOF_LOCALE)
3597 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3598 else if (FLAGS(scan) & USE_UNI) {
3599 for (value = 0; value < 256; value++) {
3600 if (isWORDCHAR_L1(value)) {
3601 ANYOF_BITMAP_SET(data->start_class, value);
3602 }
3603 }
3604 } else {
3605 for (value = 0; value < 256; value++) {
3606 if (isALNUM(value)) {
3607 ANYOF_BITMAP_SET(data->start_class, value);
3608 }
3609 }
3610 }
8aa23a47
YO
3611 }
3612 break;
3613 case ALNUML:
3614 if (flags & SCF_DO_STCLASS_AND) {
3615 if (data->start_class->flags & ANYOF_LOCALE)
3616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3617 }
3618 else {
3619 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3620 data->start_class->flags |= ANYOF_LOCALE;
3621 }
3622 break;
3623 case NALNUM:
3624 if (flags & SCF_DO_STCLASS_AND) {
3625 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3626 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3627 if (FLAGS(scan) & USE_UNI) {
3628 for (value = 0; value < 256; value++) {
3629 if (isWORDCHAR_L1(value)) {
3630 ANYOF_BITMAP_CLEAR(data->start_class, value);
3631 }
3632 }
3633 } else {
3634 for (value = 0; value < 256; value++) {
3635 if (isALNUM(value)) {
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3637 }
3638 }
3639 }
653099ff
GS
3640 }
3641 }
8aa23a47
YO
3642 else {
3643 if (data->start_class->flags & ANYOF_LOCALE)
3644 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3645 else {
3646 for (value = 0; value < 256; value++)
3647 if (!isALNUM(value))
b9a59e08 3648 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3649 }
653099ff 3650 }
8aa23a47
YO
3651 break;
3652 case NALNUML:
3653 if (flags & SCF_DO_STCLASS_AND) {
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3656 }
8aa23a47
YO
3657 else {
3658 data->start_class->flags |= ANYOF_LOCALE;
3659 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3660 }
3661 break;
3662 case SPACE:
3663 if (flags & SCF_DO_STCLASS_AND) {
3664 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3665 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3666 if (FLAGS(scan) & USE_UNI) {
3667 for (value = 0; value < 256; value++) {
3668 if (!isSPACE_L1(value)) {
3669 ANYOF_BITMAP_CLEAR(data->start_class, value);
3670 }
3671 }
3672 } else {
3673 for (value = 0; value < 256; value++) {
3674 if (!isSPACE(value)) {
3675 ANYOF_BITMAP_CLEAR(data->start_class, value);
3676 }
3677 }
3678 }
653099ff
GS
3679 }
3680 }
8aa23a47 3681 else {
a12cf05f 3682 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3683 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3684 }
3685 else if (FLAGS(scan) & USE_UNI) {
3686 for (value = 0; value < 256; value++) {
3687 if (isSPACE_L1(value)) {
3688 ANYOF_BITMAP_SET(data->start_class, value);
3689 }
3690 }
3691 } else {
3692 for (value = 0; value < 256; value++) {
3693 if (isSPACE(value)) {
3694 ANYOF_BITMAP_SET(data->start_class, value);
3695 }
3696 }
8aa23a47 3697 }
653099ff 3698 }
8aa23a47
YO
3699 break;
3700 case SPACEL:
3701 if (flags & SCF_DO_STCLASS_AND) {
3702 if (data->start_class->flags & ANYOF_LOCALE)
3703 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3704 }
3705 else {
3706 data->start_class->flags |= ANYOF_LOCALE;
3707 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3708 }
3709 break;
3710 case NSPACE:
3711 if (flags & SCF_DO_STCLASS_AND) {
3712 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3714 if (FLAGS(scan) & USE_UNI) {
3715 for (value = 0; value < 256; value++) {
3716 if (isSPACE_L1(value)) {
3717 ANYOF_BITMAP_CLEAR(data->start_class, value);
3718 }
3719 }
3720 } else {
3721 for (value = 0; value < 256; value++) {
3722 if (isSPACE(value)) {
3723 ANYOF_BITMAP_CLEAR(data->start_class, value);
3724 }
3725 }
3726 }
653099ff 3727 }
8aa23a47
YO
3728 }
3729 else {
3730 if (data->start_class->flags & ANYOF_LOCALE)
3731 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3732 else if (FLAGS(scan) & USE_UNI) {
3733 for (value = 0; value < 256; value++) {
3734 if (!isSPACE_L1(value)) {
3735 ANYOF_BITMAP_SET(data->start_class, value);
3736 }
3737 }
3738 }
3739 else {
3740 for (value = 0; value < 256; value++) {
3741 if (!isSPACE(value)) {
3742 ANYOF_BITMAP_SET(data->start_class, value);
3743 }
3744 }
3745 }
653099ff 3746 }
8aa23a47
YO
3747 break;
3748 case NSPACEL:
3749 if (flags & SCF_DO_STCLASS_AND) {
3750 if (data->start_class->flags & ANYOF_LOCALE) {
3751 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3752 for (value = 0; value < 256; value++)
3753 if (!isSPACE(value))
3754 ANYOF_BITMAP_CLEAR(data->start_class, value);
3755 }
653099ff 3756 }
8aa23a47
YO
3757 else {
3758 data->start_class->flags |= ANYOF_LOCALE;
3759 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3760 }
3761 break;
3762 case DIGIT: