This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [perl #40302] sigtrap(3pm) man page is vague
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
786e8c11 174#define FULL_TRIE_STUDY
07be1b83
YO
175#define TRIE_STCLASS
176#endif
2c2d71f5
JH
177/* Length of a variant. */
178
179typedef struct scan_data_t {
180 I32 len_min;
181 I32 len_delta;
182 I32 pos_min;
183 I32 pos_delta;
184 SV *last_found;
185 I32 last_end; /* min value, <0 unless valid. */
186 I32 last_start_min;
187 I32 last_start_max;
188 SV **longest; /* Either &l_fixed, or &l_float. */
189 SV *longest_fixed;
190 I32 offset_fixed;
191 SV *longest_float;
192 I32 offset_float_min;
193 I32 offset_float_max;
194 I32 flags;
195 I32 whilem_c;
cb434fcc 196 I32 *last_closep;
653099ff 197 struct regnode_charclass_class *start_class;
2c2d71f5
JH
198} scan_data_t;
199
a687059c 200/*
e50aee73 201 * Forward declarations for pregcomp()'s friends.
a687059c 202 */
a0d0e21e 203
27da23d5
JH
204static const scan_data_t zero_scan_data =
205 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
206
207#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
208#define SF_BEFORE_SEOL 0x0001
209#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
210#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
212
09b7f37c
CB
213#ifdef NO_UNARY_PLUS
214# define SF_FIX_SHIFT_EOL (0+2)
215# define SF_FL_SHIFT_EOL (0+4)
216#else
217# define SF_FIX_SHIFT_EOL (+2)
218# define SF_FL_SHIFT_EOL (+4)
219#endif
c277df42
IZ
220
221#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223
224#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
226#define SF_IS_INF 0x0040
227#define SF_HAS_PAR 0x0080
228#define SF_IN_PAR 0x0100
229#define SF_HAS_EVAL 0x0200
230#define SCF_DO_SUBSTR 0x0400
653099ff
GS
231#define SCF_DO_STCLASS_AND 0x0800
232#define SCF_DO_STCLASS_OR 0x1000
233#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 234#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 235
786e8c11
YO
236#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
237
07be1b83 238
eb160463 239#define UTF (RExC_utf8 != 0)
e2509266
JH
240#define LOC ((RExC_flags & PMf_LOCALE) != 0)
241#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 242
ffc61ed2 243#define OOB_UNICODE 12345678
93733859 244#define OOB_NAMEDCLASS -1
b8c5462f 245
a0ed51b3
LW
246#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
247#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248
8615cb43 249
b45f050a
JF
250/* length of regex to show in messages that don't mark a position within */
251#define RegexLengthToShowInErrorMessages 127
252
253/*
254 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
255 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
256 * op/pragma/warn/regcomp.
257 */
7253e4e3
RK
258#define MARKER1 "<-- HERE" /* marker as it appears in the description */
259#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 260
7253e4e3 261#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
262
263/*
264 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
265 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 * "...".
267 */
ccb2c380 268#define FAIL(msg) STMT_START { \
bfed75c6 269 const char *ellipses = ""; \
ccb2c380
MP
270 IV len = RExC_end - RExC_precomp; \
271 \
272 if (!SIZE_ONLY) \
273 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
274 if (len > RegexLengthToShowInErrorMessages) { \
275 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
276 len = RegexLengthToShowInErrorMessages - 10; \
277 ellipses = "..."; \
278 } \
279 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
280 msg, (int)len, RExC_precomp, ellipses); \
281} STMT_END
8615cb43 282
b45f050a 283/*
b45f050a
JF
284 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
285 */
ccb2c380 286#define Simple_vFAIL(m) STMT_START { \
a28509cc 287 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
288 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
289 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
290} STMT_END
b45f050a
JF
291
292/*
293 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
294 */
ccb2c380
MP
295#define vFAIL(m) STMT_START { \
296 if (!SIZE_ONLY) \
297 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
298 Simple_vFAIL(m); \
299} STMT_END
b45f050a
JF
300
301/*
302 * Like Simple_vFAIL(), but accepts two arguments.
303 */
ccb2c380 304#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 305 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
306 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
307 (int)offset, RExC_precomp, RExC_precomp + offset); \
308} STMT_END
b45f050a
JF
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
312 */
ccb2c380
MP
313#define vFAIL2(m,a1) STMT_START { \
314 if (!SIZE_ONLY) \
315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
316 Simple_vFAIL2(m, a1); \
317} STMT_END
b45f050a
JF
318
319
320/*
321 * Like Simple_vFAIL(), but accepts three arguments.
322 */
ccb2c380 323#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 324 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
325 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
326 (int)offset, RExC_precomp, RExC_precomp + offset); \
327} STMT_END
b45f050a
JF
328
329/*
330 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
331 */
ccb2c380
MP
332#define vFAIL3(m,a1,a2) STMT_START { \
333 if (!SIZE_ONLY) \
334 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
335 Simple_vFAIL3(m, a1, a2); \
336} STMT_END
b45f050a
JF
337
338/*
339 * Like Simple_vFAIL(), but accepts four arguments.
340 */
ccb2c380 341#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 342 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
343 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
344 (int)offset, RExC_precomp, RExC_precomp + offset); \
345} STMT_END
b45f050a 346
ccb2c380 347#define vWARN(loc,m) STMT_START { \
a28509cc 348 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
349 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
350 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351} STMT_END
352
353#define vWARNdep(loc,m) STMT_START { \
a28509cc 354 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
355 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
356 "%s" REPORT_LOCATION, \
357 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
358} STMT_END
359
360
361#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 362 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
363 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
364 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365} STMT_END
366
367#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 368 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
369 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
370 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371} STMT_END
372
373#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 374 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
375 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
376 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 380 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
381 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
382 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
383} STMT_END
9d1d55b5 384
8615cb43 385
cd439c50 386/* Allow for side effects in s */
ccb2c380
MP
387#define REGC(c,s) STMT_START { \
388 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389} STMT_END
cd439c50 390
fac92740
MJD
391/* Macros for recording node offsets. 20001227 mjd@plover.com
392 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
393 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
394 * Element 0 holds the number n.
07be1b83 395 * Position is 1 indexed.
fac92740
MJD
396 */
397
ccb2c380
MP
398#define Set_Node_Offset_To_R(node,byte) STMT_START { \
399 if (! SIZE_ONLY) { \
400 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 401 __LINE__, (node), (int)(byte))); \
ccb2c380 402 if((node) < 0) { \
551405c4 403 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
404 } else { \
405 RExC_offsets[2*(node)-1] = (byte); \
406 } \
407 } \
408} STMT_END
409
410#define Set_Node_Offset(node,byte) \
411 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
412#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
413
414#define Set_Node_Length_To_R(node,len) STMT_START { \
415 if (! SIZE_ONLY) { \
416 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 417 __LINE__, (int)(node), (int)(len))); \
ccb2c380 418 if((node) < 0) { \
551405c4 419 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
420 } else { \
421 RExC_offsets[2*(node)] = (len); \
422 } \
423 } \
424} STMT_END
425
426#define Set_Node_Length(node,len) \
427 Set_Node_Length_To_R((node)-RExC_emit_start, len)
428#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
429#define Set_Node_Cur_Length(node) \
430 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
431
432/* Get offsets and lengths */
433#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
434#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
435
07be1b83
YO
436#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
438 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
439} STMT_END
440
441
442#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
443#define EXPERIMENTAL_INPLACESCAN
444#endif
445
acfe0abc 446static void clear_re(pTHX_ void *r);
4327152a 447
653099ff 448/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 449 Update the longest found anchored substring and the longest found
653099ff
GS
450 floating substrings if needed. */
451
4327152a 452STATIC void
097eb12c 453S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 454{
e1ec3a88
AL
455 const STRLEN l = CHR_SVLEN(data->last_found);
456 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 457
c277df42 458 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 459 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
460 if (*data->longest == data->longest_fixed) {
461 data->offset_fixed = l ? data->last_start_min : data->pos_min;
462 if (data->flags & SF_BEFORE_EOL)
b81d288d 463 data->flags
c277df42
IZ
464 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
465 else
466 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
467 }
468 else {
c277df42 469 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
470 data->offset_float_max = (l
471 ? data->last_start_max
c277df42 472 : data->pos_min + data->pos_delta);
9051bda5
HS
473 if ((U32)data->offset_float_max > (U32)I32_MAX)
474 data->offset_float_max = I32_MAX;
c277df42 475 if (data->flags & SF_BEFORE_EOL)
b81d288d 476 data->flags
c277df42
IZ
477 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
478 else
479 data->flags &= ~SF_FL_BEFORE_EOL;
480 }
481 }
482 SvCUR_set(data->last_found, 0);
0eda9292 483 {
a28509cc 484 SV * const sv = data->last_found;
097eb12c
AL
485 if (SvUTF8(sv) && SvMAGICAL(sv)) {
486 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
487 if (mg)
488 mg->mg_len = 0;
489 }
0eda9292 490 }
c277df42
IZ
491 data->last_end = -1;
492 data->flags &= ~SF_BEFORE_EOL;
493}
494
653099ff
GS
495/* Can match anything (initialization) */
496STATIC void
097eb12c 497S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 498{
653099ff 499 ANYOF_CLASS_ZERO(cl);
f8bef550 500 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 501 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
502 if (LOC)
503 cl->flags |= ANYOF_LOCALE;
504}
505
506/* Can match anything (initialization) */
507STATIC int
5f66b61c 508S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
509{
510 int value;
511
aaa51d5e 512 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
513 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
514 return 1;
1aa99e6b
IH
515 if (!(cl->flags & ANYOF_UNICODE_ALL))
516 return 0;
10edeb5d 517 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 518 return 0;
653099ff
GS
519 return 1;
520}
521
522/* Can match anything (initialization) */
523STATIC void
097eb12c 524S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 525{
8ecf7187 526 Zero(cl, 1, struct regnode_charclass_class);
653099ff 527 cl->type = ANYOF;
830247a4 528 cl_anything(pRExC_state, cl);
653099ff
GS
529}
530
531STATIC void
097eb12c 532S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 533{
8ecf7187 534 Zero(cl, 1, struct regnode_charclass_class);
653099ff 535 cl->type = ANYOF;
830247a4 536 cl_anything(pRExC_state, cl);
653099ff
GS
537 if (LOC)
538 cl->flags |= ANYOF_LOCALE;
539}
540
541/* 'And' a given class with another one. Can create false positives */
542/* We assume that cl is not inverted */
543STATIC void
5f66b61c 544S_cl_and(struct regnode_charclass_class *cl,
a28509cc 545 const struct regnode_charclass_class *and_with)
653099ff 546{
653099ff
GS
547 if (!(and_with->flags & ANYOF_CLASS)
548 && !(cl->flags & ANYOF_CLASS)
549 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
550 && !(and_with->flags & ANYOF_FOLD)
551 && !(cl->flags & ANYOF_FOLD)) {
552 int i;
553
554 if (and_with->flags & ANYOF_INVERT)
555 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
556 cl->bitmap[i] &= ~and_with->bitmap[i];
557 else
558 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
559 cl->bitmap[i] &= and_with->bitmap[i];
560 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
561 if (!(and_with->flags & ANYOF_EOS))
562 cl->flags &= ~ANYOF_EOS;
1aa99e6b 563
14ebb1a2
JH
564 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
565 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 cl->flags |= ANYOF_UNICODE;
568 ARG_SET(cl, ARG(and_with));
569 }
14ebb1a2
JH
570 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
571 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 572 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
573 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
574 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 575 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
576}
577
578/* 'OR' a given class with another one. Can create false positives */
579/* We assume that cl is not inverted */
580STATIC void
097eb12c 581S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 582{
653099ff
GS
583 if (or_with->flags & ANYOF_INVERT) {
584 /* We do not use
585 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
586 * <= (B1 | !B2) | (CL1 | !CL2)
587 * which is wasteful if CL2 is small, but we ignore CL2:
588 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
589 * XXXX Can we handle case-fold? Unclear:
590 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
591 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
592 */
593 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
594 && !(or_with->flags & ANYOF_FOLD)
595 && !(cl->flags & ANYOF_FOLD) ) {
596 int i;
597
598 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
599 cl->bitmap[i] |= ~or_with->bitmap[i];
600 } /* XXXX: logic is complicated otherwise */
601 else {
830247a4 602 cl_anything(pRExC_state, cl);
653099ff
GS
603 }
604 } else {
605 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
606 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 607 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
608 || (cl->flags & ANYOF_FOLD)) ) {
609 int i;
610
611 /* OR char bitmap and class bitmap separately */
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= or_with->bitmap[i];
614 if (or_with->flags & ANYOF_CLASS) {
615 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
616 cl->classflags[i] |= or_with->classflags[i];
617 cl->flags |= ANYOF_CLASS;
618 }
619 }
620 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 621 cl_anything(pRExC_state, cl);
653099ff
GS
622 }
623 }
624 if (or_with->flags & ANYOF_EOS)
625 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
626
627 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
628 ARG(cl) != ARG(or_with)) {
629 cl->flags |= ANYOF_UNICODE_ALL;
630 cl->flags &= ~ANYOF_UNICODE;
631 }
632 if (or_with->flags & ANYOF_UNICODE_ALL) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
635 }
653099ff
GS
636}
637
a3621e74
YO
638#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
639#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
640#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
641#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
642
3dab1dad
YO
643
644#ifdef DEBUGGING
07be1b83 645/*
3dab1dad
YO
646 dump_trie(trie)
647 dump_trie_interim_list(trie,next_alloc)
648 dump_trie_interim_table(trie,next_alloc)
649
650 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
651 The _interim_ variants are used for debugging the interim
652 tables that are used to generate the final compressed
653 representation which is what dump_trie expects.
654
3dab1dad
YO
655 Part of the reason for their existance is to provide a form
656 of documentation as to how the different representations function.
07be1b83
YO
657
658*/
3dab1dad
YO
659
660/*
661 dump_trie(trie)
662 Dumps the final compressed table form of the trie to Perl_debug_log.
663 Used for debugging make_trie().
664*/
665
666STATIC void
667S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
668{
669 U32 state;
ab3bbdeb
YO
670 SV *sv=sv_newmortal();
671 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
672 GET_RE_DEBUG_FLAGS_DECL;
673
ab3bbdeb 674
3dab1dad
YO
675 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
676 (int)depth * 2 + 2,"",
677 "Match","Base","Ofs" );
678
679 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 680 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 681 if ( tmp ) {
ab3bbdeb
YO
682 PerlIO_printf( Perl_debug_log, "%*s",
683 colwidth,
ddc5bc0f 684 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
685 PL_colors[0], PL_colors[1],
686 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
687 PERL_PV_ESCAPE_FIRSTCHAR
688 )
689 );
3dab1dad
YO
690 }
691 }
692 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
693 (int)depth * 2 + 2,"");
694
695 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 696 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
697 PerlIO_printf( Perl_debug_log, "\n");
698
786e8c11 699 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 700 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
701
702 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
703
704 if ( trie->states[ state ].wordnum ) {
705 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
706 } else {
707 PerlIO_printf( Perl_debug_log, "%6s", "" );
708 }
709
710 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
711
712 if ( base ) {
713 U32 ofs = 0;
714
715 while( ( base + ofs < trie->uniquecharcount ) ||
716 ( base + ofs - trie->uniquecharcount < trie->lasttrans
717 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
718 ofs++;
719
720 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
721
722 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
723 if ( ( base + ofs >= trie->uniquecharcount ) &&
724 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
725 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
726 {
ab3bbdeb
YO
727 PerlIO_printf( Perl_debug_log, "%*"UVXf,
728 colwidth,
3dab1dad
YO
729 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
730 } else {
ab3bbdeb 731 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
732 }
733 }
734
735 PerlIO_printf( Perl_debug_log, "]");
736
737 }
738 PerlIO_printf( Perl_debug_log, "\n" );
739 }
740}
741/*
742 dump_trie_interim_list(trie,next_alloc)
743 Dumps a fully constructed but uncompressed trie in list form.
744 List tries normally only are used for construction when the number of
745 possible chars (trie->uniquecharcount) is very high.
746 Used for debugging make_trie().
747*/
748STATIC void
749S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
750{
751 U32 state;
ab3bbdeb
YO
752 SV *sv=sv_newmortal();
753 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
754 GET_RE_DEBUG_FLAGS_DECL;
755 /* print out the table precompression. */
ab3bbdeb
YO
756 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
757 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
758 "------:-----+-----------------\n" );
3dab1dad
YO
759
760 for( state=1 ; state < next_alloc ; state ++ ) {
761 U16 charid;
762
ab3bbdeb 763 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
764 (int)depth * 2 + 2,"", (UV)state );
765 if ( ! trie->states[ state ].wordnum ) {
766 PerlIO_printf( Perl_debug_log, "%5s| ","");
767 } else {
768 PerlIO_printf( Perl_debug_log, "W%4x| ",
769 trie->states[ state ].wordnum
770 );
771 }
772 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 773 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
774 if ( tmp ) {
775 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
776 colwidth,
ddc5bc0f 777 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
778 PL_colors[0], PL_colors[1],
779 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
780 PERL_PV_ESCAPE_FIRSTCHAR
781 ) ,
3dab1dad
YO
782 TRIE_LIST_ITEM(state,charid).forid,
783 (UV)TRIE_LIST_ITEM(state,charid).newstate
784 );
785 }
ab3bbdeb
YO
786 }
787 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
788 }
789}
790
791/*
792 dump_trie_interim_table(trie,next_alloc)
793 Dumps a fully constructed but uncompressed trie in table form.
794 This is the normal DFA style state transition table, with a few
795 twists to facilitate compression later.
796 Used for debugging make_trie().
797*/
798STATIC void
799S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
800{
801 U32 state;
802 U16 charid;
ab3bbdeb
YO
803 SV *sv=sv_newmortal();
804 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
805 GET_RE_DEBUG_FLAGS_DECL;
806
807 /*
808 print out the table precompression so that we can do a visual check
809 that they are identical.
810 */
811
812 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
813
814 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 815 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 816 if ( tmp ) {
ab3bbdeb
YO
817 PerlIO_printf( Perl_debug_log, "%*s",
818 colwidth,
ddc5bc0f 819 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
820 PL_colors[0], PL_colors[1],
821 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
822 PERL_PV_ESCAPE_FIRSTCHAR
823 )
824 );
3dab1dad
YO
825 }
826 }
827
828 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
829
830 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 831 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
832 }
833
834 PerlIO_printf( Perl_debug_log, "\n" );
835
836 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
837
838 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
839 (int)depth * 2 + 2,"",
840 (UV)TRIE_NODENUM( state ) );
841
842 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
843 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
844 if (v)
845 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
846 else
847 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
848 }
849 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
850 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
851 } else {
852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
853 trie->states[ TRIE_NODENUM( state ) ].wordnum );
854 }
855 }
07be1b83 856}
3dab1dad
YO
857
858#endif
859
786e8c11
YO
860/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
861 startbranch: the first branch in the whole branch sequence
862 first : start branch of sequence of branch-exact nodes.
863 May be the same as startbranch
864 last : Thing following the last branch.
865 May be the same as tail.
866 tail : item following the branch sequence
867 count : words in the sequence
868 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
869 depth : indent depth
3dab1dad 870
786e8c11 871Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 872
786e8c11
YO
873A trie is an N'ary tree where the branches are determined by digital
874decomposition of the key. IE, at the root node you look up the 1st character and
875follow that branch repeat until you find the end of the branches. Nodes can be
876marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 877
786e8c11 878 /he|she|his|hers/
72f13be8 879
786e8c11
YO
880would convert into the following structure. Numbers represent states, letters
881following numbers represent valid transitions on the letter from that state, if
882the number is in square brackets it represents an accepting state, otherwise it
883will be in parenthesis.
07be1b83 884
786e8c11
YO
885 +-h->+-e->[3]-+-r->(8)-+-s->[9]
886 | |
887 | (2)
888 | |
889 (1) +-i->(6)-+-s->[7]
890 |
891 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 892
786e8c11
YO
893 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
894
895This shows that when matching against the string 'hers' we will begin at state 1
896read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
897then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
898is also accepting. Thus we know that we can match both 'he' and 'hers' with a
899single traverse. We store a mapping from accepting to state to which word was
900matched, and then when we have multiple possibilities we try to complete the
901rest of the regex in the order in which they occured in the alternation.
902
903The only prior NFA like behaviour that would be changed by the TRIE support is
904the silent ignoring of duplicate alternations which are of the form:
905
906 / (DUPE|DUPE) X? (?{ ... }) Y /x
907
908Thus EVAL blocks follwing a trie may be called a different number of times with
909and without the optimisation. With the optimisations dupes will be silently
910ignored. This inconsistant behaviour of EVAL type nodes is well established as
911the following demonstrates:
912
913 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
914
915which prints out 'word' three times, but
916
917 'words'=~/(word|word|word)(?{ print $1 })S/
918
919which doesnt print it out at all. This is due to other optimisations kicking in.
920
921Example of what happens on a structural level:
922
923The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
924
925 1: CURLYM[1] {1,32767}(18)
926 5: BRANCH(8)
927 6: EXACT <ac>(16)
928 8: BRANCH(11)
929 9: EXACT <ad>(16)
930 11: BRANCH(14)
931 12: EXACT <ab>(16)
932 16: SUCCEED(0)
933 17: NOTHING(18)
934 18: END(0)
935
936This would be optimizable with startbranch=5, first=5, last=16, tail=16
937and should turn into:
938
939 1: CURLYM[1] {1,32767}(18)
940 5: TRIE(16)
941 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
942 <ac>
943 <ad>
944 <ab>
945 16: SUCCEED(0)
946 17: NOTHING(18)
947 18: END(0)
948
949Cases where tail != last would be like /(?foo|bar)baz/:
950
951 1: BRANCH(4)
952 2: EXACT <foo>(8)
953 4: BRANCH(7)
954 5: EXACT <bar>(8)
955 7: TAIL(8)
956 8: EXACT <baz>(10)
957 10: END(0)
958
959which would be optimizable with startbranch=1, first=1, last=7, tail=8
960and would end up looking like:
961
962 1: TRIE(8)
963 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
964 <foo>
965 <bar>
966 7: TAIL(8)
967 8: EXACT <baz>(10)
968 10: END(0)
969
970 d = uvuni_to_utf8_flags(d, uv, 0);
971
972is the recommended Unicode-aware way of saying
973
974 *(d++) = uv;
975*/
976
977#define TRIE_STORE_REVCHAR \
978 STMT_START { \
979 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
980 if (UTF) SvUTF8_on(tmp); \
981 av_push( TRIE_REVCHARMAP(trie), tmp ); \
982 } STMT_END
983
984#define TRIE_READ_CHAR STMT_START { \
985 wordlen++; \
986 if ( UTF ) { \
987 if ( folder ) { \
988 if ( foldlen > 0 ) { \
989 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
990 foldlen -= len; \
991 scan += len; \
992 len = 0; \
993 } else { \
994 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
995 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
996 foldlen -= UNISKIP( uvc ); \
997 scan = foldbuf + UNISKIP( uvc ); \
998 } \
999 } else { \
1000 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1001 } \
1002 } else { \
1003 uvc = (U32)*uc; \
1004 len = 1; \
1005 } \
1006} STMT_END
1007
1008
1009
1010#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1011 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1012 TRIE_LIST_LEN( state ) *= 2; \
1013 Renew( trie->states[ state ].trans.list, \
1014 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1015 } \
1016 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1017 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1018 TRIE_LIST_CUR( state )++; \
1019} STMT_END
07be1b83 1020
786e8c11
YO
1021#define TRIE_LIST_NEW(state) STMT_START { \
1022 Newxz( trie->states[ state ].trans.list, \
1023 4, reg_trie_trans_le ); \
1024 TRIE_LIST_CUR( state ) = 1; \
1025 TRIE_LIST_LEN( state ) = 4; \
1026} STMT_END
07be1b83 1027
786e8c11
YO
1028#define TRIE_HANDLE_WORD(state) STMT_START { \
1029 U16 dupe= trie->states[ state ].wordnum; \
1030 regnode * const noper_next = regnext( noper ); \
1031 \
1032 if (trie->wordlen) \
1033 trie->wordlen[ curword ] = wordlen; \
1034 DEBUG_r({ \
1035 /* store the word for dumping */ \
1036 SV* tmp; \
1037 if (OP(noper) != NOTHING) \
1038 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1039 else \
1040 tmp = newSVpvn( "", 0 ); \
1041 if ( UTF ) SvUTF8_on( tmp ); \
1042 av_push( trie->words, tmp ); \
1043 }); \
1044 \
1045 curword++; \
1046 \
1047 if ( noper_next < tail ) { \
1048 if (!trie->jump) \
1049 Newxz( trie->jump, word_count + 1, U16); \
1050 trie->jump[curword] = (U16)(tail - noper_next); \
1051 if (!jumper) \
1052 jumper = noper_next; \
1053 if (!nextbranch) \
1054 nextbranch= regnext(cur); \
1055 } \
1056 \
1057 if ( dupe ) { \
1058 /* So it's a dupe. This means we need to maintain a */\
1059 /* linked-list from the first to the next. */\
1060 /* we only allocate the nextword buffer when there */\
1061 /* a dupe, so first time we have to do the allocation */\
1062 if (!trie->nextword) \
1063 Newxz( trie->nextword, word_count + 1, U16); \
1064 while ( trie->nextword[dupe] ) \
1065 dupe= trie->nextword[dupe]; \
1066 trie->nextword[dupe]= curword; \
1067 } else { \
1068 /* we haven't inserted this word yet. */ \
1069 trie->states[ state ].wordnum = curword; \
1070 } \
1071} STMT_END
07be1b83 1072
3dab1dad 1073
786e8c11
YO
1074#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1075 ( ( base + charid >= ucharcount \
1076 && base + charid < ubound \
1077 && state == trie->trans[ base - ucharcount + charid ].check \
1078 && trie->trans[ base - ucharcount + charid ].next ) \
1079 ? trie->trans[ base - ucharcount + charid ].next \
1080 : ( state==1 ? special : 0 ) \
1081 )
3dab1dad 1082
786e8c11
YO
1083#define MADE_TRIE 1
1084#define MADE_JUMP_TRIE 2
1085#define MADE_EXACT_TRIE 4
3dab1dad 1086
a3621e74 1087STATIC I32
786e8c11 1088S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1089{
27da23d5 1090 dVAR;
a3621e74
YO
1091 /* first pass, loop through and scan words */
1092 reg_trie_data *trie;
1093 regnode *cur;
9f7f3913 1094 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1095 STRLEN len = 0;
1096 UV uvc = 0;
1097 U16 curword = 0;
1098 U32 next_alloc = 0;
786e8c11
YO
1099 regnode *jumper = NULL;
1100 regnode *nextbranch = NULL;
a3621e74 1101 /* we just use folder as a flag in utf8 */
e1ec3a88 1102 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1103 ? PL_fold
1104 : ( flags == EXACTFL
1105 ? PL_fold_locale
1106 : NULL
1107 )
1108 );
1109
e1ec3a88 1110 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1111 SV *re_trie_maxbuff;
3dab1dad
YO
1112#ifndef DEBUGGING
1113 /* these are only used during construction but are useful during
8e11feef 1114 * debugging so we store them in the struct when debugging.
8e11feef 1115 */
3dab1dad 1116 STRLEN trie_charcount=0;
3dab1dad
YO
1117 AV *trie_revcharmap;
1118#endif
a3621e74 1119 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1120#ifndef DEBUGGING
1121 PERL_UNUSED_ARG(depth);
1122#endif
a3621e74 1123
a02a5408 1124 Newxz( trie, 1, reg_trie_data );
a3621e74 1125 trie->refcount = 1;
3dab1dad 1126 trie->startstate = 1;
786e8c11 1127 trie->wordcount = word_count;
a3621e74 1128 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1129 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1130 if (!(UTF && folder))
1131 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1132 DEBUG_r({
1133 trie->words = newAV();
a3621e74 1134 });
3dab1dad 1135 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1136
0111c4fd 1137 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1138 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1139 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1140 }
3dab1dad
YO
1141 DEBUG_OPTIMISE_r({
1142 PerlIO_printf( Perl_debug_log,
786e8c11 1143 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1144 (int)depth * 2 + 2, "",
1145 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1146 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1147 (int)depth);
3dab1dad 1148 });
a3621e74
YO
1149 /* -- First loop and Setup --
1150
1151 We first traverse the branches and scan each word to determine if it
1152 contains widechars, and how many unique chars there are, this is
1153 important as we have to build a table with at least as many columns as we
1154 have unique chars.
1155
1156 We use an array of integers to represent the character codes 0..255
1157 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1158 native representation of the character value as the key and IV's for the
1159 coded index.
1160
1161 *TODO* If we keep track of how many times each character is used we can
1162 remap the columns so that the table compression later on is more
1163 efficient in terms of memory by ensuring most common value is in the
1164 middle and the least common are on the outside. IMO this would be better
1165 than a most to least common mapping as theres a decent chance the most
1166 common letter will share a node with the least common, meaning the node
1167 will not be compressable. With a middle is most common approach the worst
1168 case is when we have the least common nodes twice.
1169
1170 */
1171
a3621e74 1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1173 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1174 const U8 *uc = (U8*)STRING( noper );
a28509cc 1175 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1176 STRLEN foldlen = 0;
1177 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1178 const U8 *scan = (U8*)NULL;
07be1b83 1179 U32 wordlen = 0; /* required init */
3dab1dad 1180 STRLEN chars=0;
a3621e74 1181
3dab1dad
YO
1182 if (OP(noper) == NOTHING) {
1183 trie->minlen= 0;
1184 continue;
1185 }
1186 if (trie->bitmap) {
1187 TRIE_BITMAP_SET(trie,*uc);
1188 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1189 }
a3621e74 1190 for ( ; uc < e ; uc += len ) {
3dab1dad 1191 TRIE_CHARCOUNT(trie)++;
a3621e74 1192 TRIE_READ_CHAR;
3dab1dad 1193 chars++;
a3621e74
YO
1194 if ( uvc < 256 ) {
1195 if ( !trie->charmap[ uvc ] ) {
1196 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1197 if ( folder )
1198 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1199 TRIE_STORE_REVCHAR;
a3621e74
YO
1200 }
1201 } else {
1202 SV** svpp;
1203 if ( !trie->widecharmap )
1204 trie->widecharmap = newHV();
1205
1206 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1207
1208 if ( !svpp )
e4584336 1209 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1210
1211 if ( !SvTRUE( *svpp ) ) {
1212 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1213 TRIE_STORE_REVCHAR;
a3621e74
YO
1214 }
1215 }
1216 }
3dab1dad
YO
1217 if( cur == first ) {
1218 trie->minlen=chars;
1219 trie->maxlen=chars;
1220 } else if (chars < trie->minlen) {
1221 trie->minlen=chars;
1222 } else if (chars > trie->maxlen) {
1223 trie->maxlen=chars;
1224 }
1225
a3621e74
YO
1226 } /* end first pass */
1227 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1228 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1229 (int)depth * 2 + 2,"",
85c3142d 1230 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1231 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1232 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1233 );
786e8c11 1234 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1235
1236 /*
1237 We now know what we are dealing with in terms of unique chars and
1238 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1239 representation using a flat table will take. If it's over a reasonable
1240 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1241 conservative but potentially much slower representation using an array
1242 of lists.
1243
1244 At the end we convert both representations into the same compressed
1245 form that will be used in regexec.c for matching with. The latter
1246 is a form that cannot be used to construct with but has memory
1247 properties similar to the list form and access properties similar
1248 to the table form making it both suitable for fast searches and
1249 small enough that its feasable to store for the duration of a program.
1250
1251 See the comment in the code where the compressed table is produced
1252 inplace from the flat tabe representation for an explanation of how
1253 the compression works.
1254
1255 */
1256
1257
3dab1dad 1258 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1259 /*
1260 Second Pass -- Array Of Lists Representation
1261
1262 Each state will be represented by a list of charid:state records
1263 (reg_trie_trans_le) the first such element holds the CUR and LEN
1264 points of the allocated array. (See defines above).
1265
1266 We build the initial structure using the lists, and then convert
1267 it into the compressed table form which allows faster lookups
1268 (but cant be modified once converted).
a3621e74
YO
1269 */
1270
a3621e74
YO
1271 STRLEN transcount = 1;
1272
3dab1dad 1273 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1274 TRIE_LIST_NEW(1);
1275 next_alloc = 2;
1276
1277 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1278
c445ea15
AL
1279 regnode * const noper = NEXTOPER( cur );
1280 U8 *uc = (U8*)STRING( noper );
1281 const U8 * const e = uc + STR_LEN( noper );
1282 U32 state = 1; /* required init */
1283 U16 charid = 0; /* sanity init */
1284 U8 *scan = (U8*)NULL; /* sanity init */
1285 STRLEN foldlen = 0; /* required init */
07be1b83 1286 U32 wordlen = 0; /* required init */
c445ea15
AL
1287 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1288
3dab1dad 1289 if (OP(noper) != NOTHING) {
786e8c11 1290 for ( ; uc < e ; uc += len ) {
c445ea15 1291
786e8c11 1292 TRIE_READ_CHAR;
c445ea15 1293
786e8c11
YO
1294 if ( uvc < 256 ) {
1295 charid = trie->charmap[ uvc ];
c445ea15 1296 } else {
786e8c11
YO
1297 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1298 if ( !svpp ) {
1299 charid = 0;
1300 } else {
1301 charid=(U16)SvIV( *svpp );
1302 }
c445ea15 1303 }
786e8c11
YO
1304 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1305 if ( charid ) {
a3621e74 1306
786e8c11
YO
1307 U16 check;
1308 U32 newstate = 0;
a3621e74 1309
786e8c11
YO
1310 charid--;
1311 if ( !trie->states[ state ].trans.list ) {
1312 TRIE_LIST_NEW( state );
c445ea15 1313 }
786e8c11
YO
1314 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1315 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1316 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1317 break;
1318 }
1319 }
1320 if ( ! newstate ) {
1321 newstate = next_alloc++;
1322 TRIE_LIST_PUSH( state, charid, newstate );
1323 transcount++;
1324 }
1325 state = newstate;
1326 } else {
1327 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1328 }
a28509cc 1329 }
c445ea15 1330 }
3dab1dad 1331 TRIE_HANDLE_WORD(state);
a3621e74
YO
1332
1333 } /* end second pass */
1334
786e8c11 1335 trie->laststate = next_alloc;
a3621e74
YO
1336 Renew( trie->states, next_alloc, reg_trie_state );
1337
3dab1dad
YO
1338 /* and now dump it out before we compress it */
1339 DEBUG_TRIE_COMPILE_MORE_r(
1340 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1341 );
a3621e74 1342
a02a5408 1343 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1344 {
1345 U32 state;
a3621e74
YO
1346 U32 tp = 0;
1347 U32 zp = 0;
1348
1349
1350 for( state=1 ; state < next_alloc ; state ++ ) {
1351 U32 base=0;
1352
1353 /*
1354 DEBUG_TRIE_COMPILE_MORE_r(
1355 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1356 );
1357 */
1358
1359 if (trie->states[state].trans.list) {
1360 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1361 U16 maxid=minid;
a28509cc 1362 U16 idx;
a3621e74
YO
1363
1364 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1365 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1366 if ( forid < minid ) {
1367 minid=forid;
1368 } else if ( forid > maxid ) {
1369 maxid=forid;
1370 }
a3621e74
YO
1371 }
1372 if ( transcount < tp + maxid - minid + 1) {
1373 transcount *= 2;
1374 Renew( trie->trans, transcount, reg_trie_trans );
1375 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1376 }
1377 base = trie->uniquecharcount + tp - minid;
1378 if ( maxid == minid ) {
1379 U32 set = 0;
1380 for ( ; zp < tp ; zp++ ) {
1381 if ( ! trie->trans[ zp ].next ) {
1382 base = trie->uniquecharcount + zp - minid;
1383 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1384 trie->trans[ zp ].check = state;
1385 set = 1;
1386 break;
1387 }
1388 }
1389 if ( !set ) {
1390 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1391 trie->trans[ tp ].check = state;
1392 tp++;
1393 zp = tp;
1394 }
1395 } else {
1396 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1397 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1398 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1399 trie->trans[ tid ].check = state;
1400 }
1401 tp += ( maxid - minid + 1 );
1402 }
1403 Safefree(trie->states[ state ].trans.list);
1404 }
1405 /*
1406 DEBUG_TRIE_COMPILE_MORE_r(
1407 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1408 );
1409 */
1410 trie->states[ state ].trans.base=base;
1411 }
cc601c31 1412 trie->lasttrans = tp + 1;
a3621e74
YO
1413 }
1414 } else {
1415 /*
1416 Second Pass -- Flat Table Representation.
1417
1418 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1419 We know that we will need Charcount+1 trans at most to store the data
1420 (one row per char at worst case) So we preallocate both structures
1421 assuming worst case.
1422
1423 We then construct the trie using only the .next slots of the entry
1424 structs.
1425
1426 We use the .check field of the first entry of the node temporarily to
1427 make compression both faster and easier by keeping track of how many non
1428 zero fields are in the node.
1429
1430 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1431 transition.
1432
1433 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1434 number representing the first entry of the node, and state as a
1435 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1436 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1437 are 2 entrys per node. eg:
1438
1439 A B A B
1440 1. 2 4 1. 3 7
1441 2. 0 3 3. 0 5
1442 3. 0 0 5. 0 0
1443 4. 0 0 7. 0 0
1444
1445 The table is internally in the right hand, idx form. However as we also
1446 have to deal with the states array which is indexed by nodenum we have to
1447 use TRIE_NODENUM() to convert.
1448
1449 */
1450
3dab1dad
YO
1451
1452 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1453 reg_trie_trans );
3dab1dad 1454 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1455 next_alloc = trie->uniquecharcount + 1;
1456
3dab1dad 1457
a3621e74
YO
1458 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1459
c445ea15 1460 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1461 const U8 *uc = (U8*)STRING( noper );
1462 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1463
1464 U32 state = 1; /* required init */
1465
1466 U16 charid = 0; /* sanity init */
1467 U32 accept_state = 0; /* sanity init */
1468 U8 *scan = (U8*)NULL; /* sanity init */
1469
1470 STRLEN foldlen = 0; /* required init */
07be1b83 1471 U32 wordlen = 0; /* required init */
a3621e74
YO
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1473
3dab1dad 1474 if ( OP(noper) != NOTHING ) {
786e8c11 1475 for ( ; uc < e ; uc += len ) {
a3621e74 1476
786e8c11 1477 TRIE_READ_CHAR;
a3621e74 1478
786e8c11
YO
1479 if ( uvc < 256 ) {
1480 charid = trie->charmap[ uvc ];
1481 } else {
1482 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1483 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1484 }
786e8c11
YO
1485 if ( charid ) {
1486 charid--;
1487 if ( !trie->trans[ state + charid ].next ) {
1488 trie->trans[ state + charid ].next = next_alloc;
1489 trie->trans[ state ].check++;
1490 next_alloc += trie->uniquecharcount;
1491 }
1492 state = trie->trans[ state + charid ].next;
1493 } else {
1494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1495 }
1496 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1497 }
a3621e74 1498 }
3dab1dad
YO
1499 accept_state = TRIE_NODENUM( state );
1500 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1501
1502 } /* end second pass */
1503
3dab1dad
YO
1504 /* and now dump it out before we compress it */
1505 DEBUG_TRIE_COMPILE_MORE_r(
1506 dump_trie_interim_table(trie,next_alloc,depth+1)
1507 );
a3621e74 1508
a3621e74
YO
1509 {
1510 /*
1511 * Inplace compress the table.*
1512
1513 For sparse data sets the table constructed by the trie algorithm will
1514 be mostly 0/FAIL transitions or to put it another way mostly empty.
1515 (Note that leaf nodes will not contain any transitions.)
1516
1517 This algorithm compresses the tables by eliminating most such
1518 transitions, at the cost of a modest bit of extra work during lookup:
1519
1520 - Each states[] entry contains a .base field which indicates the
1521 index in the state[] array wheres its transition data is stored.
1522
1523 - If .base is 0 there are no valid transitions from that node.
1524
1525 - If .base is nonzero then charid is added to it to find an entry in
1526 the trans array.
1527
1528 -If trans[states[state].base+charid].check!=state then the
1529 transition is taken to be a 0/Fail transition. Thus if there are fail
1530 transitions at the front of the node then the .base offset will point
1531 somewhere inside the previous nodes data (or maybe even into a node
1532 even earlier), but the .check field determines if the transition is
1533 valid.
1534
786e8c11 1535 XXX - wrong maybe?
a3621e74
YO
1536 The following process inplace converts the table to the compressed
1537 table: We first do not compress the root node 1,and mark its all its
1538 .check pointers as 1 and set its .base pointer as 1 as well. This
1539 allows to do a DFA construction from the compressed table later, and
1540 ensures that any .base pointers we calculate later are greater than
1541 0.
1542
1543 - We set 'pos' to indicate the first entry of the second node.
1544
1545 - We then iterate over the columns of the node, finding the first and
1546 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1547 and set the .check pointers accordingly, and advance pos
1548 appropriately and repreat for the next node. Note that when we copy
1549 the next pointers we have to convert them from the original
1550 NODEIDX form to NODENUM form as the former is not valid post
1551 compression.
1552
1553 - If a node has no transitions used we mark its base as 0 and do not
1554 advance the pos pointer.
1555
1556 - If a node only has one transition we use a second pointer into the
1557 structure to fill in allocated fail transitions from other states.
1558 This pointer is independent of the main pointer and scans forward
1559 looking for null transitions that are allocated to a state. When it
1560 finds one it writes the single transition into the "hole". If the
786e8c11 1561 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1562
1563 - Once compressed we can Renew/realloc the structures to release the
1564 excess space.
1565
1566 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1567 specifically Fig 3.47 and the associated pseudocode.
1568
1569 demq
1570 */
a3b680e6 1571 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1572 U32 state, charid;
a3621e74 1573 U32 pos = 0, zp=0;
786e8c11 1574 trie->laststate = laststate;
a3621e74
YO
1575
1576 for ( state = 1 ; state < laststate ; state++ ) {
1577 U8 flag = 0;
a28509cc
AL
1578 const U32 stateidx = TRIE_NODEIDX( state );
1579 const U32 o_used = trie->trans[ stateidx ].check;
1580 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1581 trie->trans[ stateidx ].check = 0;
1582
1583 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1584 if ( flag || trie->trans[ stateidx + charid ].next ) {
1585 if ( trie->trans[ stateidx + charid ].next ) {
1586 if (o_used == 1) {
1587 for ( ; zp < pos ; zp++ ) {
1588 if ( ! trie->trans[ zp ].next ) {
1589 break;
1590 }
1591 }
1592 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1593 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1594 trie->trans[ zp ].check = state;
1595 if ( ++zp > pos ) pos = zp;
1596 break;
1597 }
1598 used--;
1599 }
1600 if ( !flag ) {
1601 flag = 1;
1602 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1603 }
1604 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1605 trie->trans[ pos ].check = state;
1606 pos++;
1607 }
1608 }
1609 }
cc601c31 1610 trie->lasttrans = pos + 1;
a3621e74
YO
1611 Renew( trie->states, laststate + 1, reg_trie_state);
1612 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1613 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1614 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1615 (int)depth * 2 + 2,"",
1616 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1617 (IV)next_alloc,
1618 (IV)pos,
a3621e74
YO
1619 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1620 );
1621
1622 } /* end table compress */
1623 }
cc601c31
YO
1624 /* resize the trans array to remove unused space */
1625 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1626
3dab1dad
YO
1627 /* and now dump out the compressed format */
1628 DEBUG_TRIE_COMPILE_r(
1629 dump_trie(trie,depth+1)
1630 );
07be1b83 1631
3dab1dad
YO
1632 { /* Modify the program and insert the new TRIE node*/
1633 regnode *convert;
1634 U8 nodetype =(U8)(flags & 0xFF);
1635 char *str=NULL;
786e8c11 1636
07be1b83 1637#ifdef DEBUGGING
786e8c11 1638
07be1b83
YO
1639 U32 mjd_offset;
1640 U32 mjd_nodelen;
1641#endif
a3621e74 1642 /*
3dab1dad
YO
1643 This means we convert either the first branch or the first Exact,
1644 depending on whether the thing following (in 'last') is a branch
1645 or not and whther first is the startbranch (ie is it a sub part of
1646 the alternation or is it the whole thing.)
1647 Assuming its a sub part we conver the EXACT otherwise we convert
1648 the whole branch sequence, including the first.
a3621e74 1649 */
3dab1dad
YO
1650 /* Find the node we are going to overwrite */
1651 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1652 /* whole branch chain */
3dab1dad 1653 convert = first;
07be1b83
YO
1654 DEBUG_r({
1655 const regnode *nop = NEXTOPER( convert );
1656 mjd_offset= Node_Offset((nop));
1657 mjd_nodelen= Node_Length((nop));
1658 });
1659 } else {
1660 /* branch sub-chain */
3dab1dad
YO
1661 convert = NEXTOPER( first );
1662 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1663 DEBUG_r({
1664 mjd_offset= Node_Offset((convert));
1665 mjd_nodelen= Node_Length((convert));
1666 });
1667 }
1668 DEBUG_OPTIMISE_r(
1669 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1670 (int)depth * 2 + 2, "",
786e8c11 1671 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1672 );
a3621e74 1673
3dab1dad
YO
1674 /* But first we check to see if there is a common prefix we can
1675 split out as an EXACT and put in front of the TRIE node. */
1676 trie->startstate= 1;
786e8c11 1677 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1678 U32 state;
1679 DEBUG_OPTIMISE_r(
8e11feef
RGS
1680 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1681 (int)depth * 2 + 2, "",
786e8c11 1682 (UV)trie->laststate)
8e11feef 1683 );
786e8c11 1684 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1685 U32 ofs = 0;
8e11feef
RGS
1686 I32 idx = -1;
1687 U32 count = 0;
1688 const U32 base = trie->states[ state ].trans.base;
a3621e74 1689
3dab1dad 1690 if ( trie->states[state].wordnum )
8e11feef 1691 count = 1;
a3621e74 1692
8e11feef 1693 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1694 if ( ( base + ofs >= trie->uniquecharcount ) &&
1695 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1696 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1697 {
3dab1dad 1698 if ( ++count > 1 ) {
8e11feef 1699 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1700 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1701 if ( state == 1 ) break;
3dab1dad
YO
1702 if ( count == 2 ) {
1703 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1704 DEBUG_OPTIMISE_r(
8e11feef
RGS
1705 PerlIO_printf(Perl_debug_log,
1706 "%*sNew Start State=%"UVuf" Class: [",
1707 (int)depth * 2 + 2, "",
786e8c11 1708 (UV)state));
be8e71aa
YO
1709 if (idx >= 0) {
1710 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1711 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1712
3dab1dad 1713 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1714 if ( folder )
1715 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1716 DEBUG_OPTIMISE_r(
07be1b83 1717 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1718 );
8e11feef
RGS
1719 }
1720 }
1721 TRIE_BITMAP_SET(trie,*ch);
1722 if ( folder )
1723 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1724 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1725 }
1726 idx = ofs;
1727 }
3dab1dad
YO
1728 }
1729 if ( count == 1 ) {
1730 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1731 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1732 DEBUG_OPTIMISE_r(
8e11feef
RGS
1733 PerlIO_printf( Perl_debug_log,
1734 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1735 (int)depth * 2 + 2, "",
786e8c11 1736 (UV)state, (UV)idx, ch)
3dab1dad
YO
1737 );
1738 if ( state==1 ) {
1739 OP( convert ) = nodetype;
1740 str=STRING(convert);
1741 STR_LEN(convert)=0;
1742 }
1743 *str++=*ch;
1744 STR_LEN(convert)++;
a3621e74 1745
8e11feef 1746 } else {
f9049ba1 1747#ifdef DEBUGGING
8e11feef
RGS
1748 if (state>1)
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1750#endif
8e11feef
RGS
1751 break;
1752 }
1753 }
3dab1dad 1754 if (str) {
8e11feef 1755 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1756 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1757 trie->startstate = state;
07be1b83
YO
1758 trie->minlen -= (state - 1);
1759 trie->maxlen -= (state - 1);
1760 DEBUG_r({
1761 regnode *fix = convert;
1762 mjd_nodelen++;
1763 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1764 while( ++fix < n ) {
1765 Set_Node_Offset_Length(fix, 0, 0);
1766 }
1767 });
8e11feef
RGS
1768 if (trie->maxlen) {
1769 convert = n;
1770 } else {
3dab1dad
YO
1771 NEXT_OFF(convert) = (U16)(tail - convert);
1772 }
1773 }
1774 }
1775 if ( trie->maxlen ) {
8e11feef
RGS
1776 NEXT_OFF( convert ) = (U16)(tail - convert);
1777 ARG_SET( convert, data_slot );
786e8c11
YO
1778 /* Store the offset to the first unabsorbed branch in
1779 jump[0], which is otherwise unused by the jump logic.
1780 We use this when dumping a trie and during optimisation. */
1781 if (trie->jump)
1782 trie->jump[0] = (U16)(tail - nextbranch);
1783 if (!jumper)
85c3142d 1784 jumper = last;
786e8c11
YO
1785 /* XXXX */
1786 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
85c3142d 1787 ((char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1788 {
1789 OP( convert ) = TRIEC;
1790 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1791 Safefree(trie->bitmap);
1792 trie->bitmap= NULL;
1793 } else
1794 OP( convert ) = TRIE;
a3621e74 1795
3dab1dad
YO
1796 /* store the type in the flags */
1797 convert->flags = nodetype;
1798 /* XXX We really should free up the resource in trie now, as we wont use them */
1799 }
a3621e74
YO
1800 /* needed for dumping*/
1801 DEBUG_r({
786e8c11
YO
1802 regnode *optimize = convert
1803 + NODE_STEP_REGNODE
1804 + regarglen[ OP( convert ) ];
07be1b83
YO
1805 regnode *opt = convert;
1806 while (++opt<optimize) {
1807 Set_Node_Offset_Length(opt,0,0);
1808 }
786e8c11
YO
1809 /*
1810 Try to clean up some of the debris left after the
1811 optimisation.
a3621e74 1812 */
786e8c11 1813 while( optimize < jumper ) {
07be1b83 1814 mjd_nodelen += Node_Length((optimize));
a3621e74 1815 OP( optimize ) = OPTIMIZED;
07be1b83 1816 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1817 optimize++;
1818 }
07be1b83 1819 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1820 });
1821 } /* end node insert */
07be1b83 1822#ifndef DEBUGGING
6e8b4190 1823 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1824#endif
786e8c11
YO
1825 return trie->jump
1826 ? MADE_JUMP_TRIE
1827 : trie->startstate>1
1828 ? MADE_EXACT_TRIE
1829 : MADE_TRIE;
1830}
1831
1832STATIC void
1833S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1834{
1835/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1836
1837 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1838 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1839 ISBN 0-201-10088-6
1840
1841 We find the fail state for each state in the trie, this state is the longest proper
1842 suffix of the current states 'word' that is also a proper prefix of another word in our
1843 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1844 the DFA not to have to restart after its tried and failed a word at a given point, it
1845 simply continues as though it had been matching the other word in the first place.
1846 Consider
1847 'abcdgu'=~/abcdefg|cdgu/
1848 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1849 fail, which would bring use to the state representing 'd' in the second word where we would
1850 try 'g' and succeed, prodceding to match 'cdgu'.
1851 */
1852 /* add a fail transition */
1853 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1854 U32 *q;
1855 const U32 ucharcount = trie->uniquecharcount;
1856 const U32 numstates = trie->laststate;
1857 const U32 ubound = trie->lasttrans + ucharcount;
1858 U32 q_read = 0;
1859 U32 q_write = 0;
1860 U32 charid;
1861 U32 base = trie->states[ 1 ].trans.base;
1862 U32 *fail;
1863 reg_ac_data *aho;
1864 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1865 GET_RE_DEBUG_FLAGS_DECL;
1866#ifndef DEBUGGING
1867 PERL_UNUSED_ARG(depth);
1868#endif
1869
1870
1871 ARG_SET( stclass, data_slot );
1872 Newxz( aho, 1, reg_ac_data );
1873 RExC_rx->data->data[ data_slot ] = (void*)aho;
1874 aho->trie=trie;
1875 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1876 (trie->laststate+1)*sizeof(reg_trie_state));
1877 Newxz( q, numstates, U32);
1878 Newxz( aho->fail, numstates, U32 );
1879 aho->refcount = 1;
1880 fail = aho->fail;
1881 /* initialize fail[0..1] to be 1 so that we always have
1882 a valid final fail state */
1883 fail[ 0 ] = fail[ 1 ] = 1;
1884
1885 for ( charid = 0; charid < ucharcount ; charid++ ) {
1886 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1887 if ( newstate ) {
1888 q[ q_write ] = newstate;
1889 /* set to point at the root */
1890 fail[ q[ q_write++ ] ]=1;
1891 }
1892 }
1893 while ( q_read < q_write) {
1894 const U32 cur = q[ q_read++ % numstates ];
1895 base = trie->states[ cur ].trans.base;
1896
1897 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1898 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1899 if (ch_state) {
1900 U32 fail_state = cur;
1901 U32 fail_base;
1902 do {
1903 fail_state = fail[ fail_state ];
1904 fail_base = aho->states[ fail_state ].trans.base;
1905 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1906
1907 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1908 fail[ ch_state ] = fail_state;
1909 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1910 {
1911 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1912 }
1913 q[ q_write++ % numstates] = ch_state;
1914 }
1915 }
1916 }
1917 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
1918 when we fail in state 1, this allows us to use the
1919 charclass scan to find a valid start char. This is based on the principle
1920 that theres a good chance the string being searched contains lots of stuff
1921 that cant be a start char.
1922 */
1923 fail[ 0 ] = fail[ 1 ] = 0;
1924 DEBUG_TRIE_COMPILE_r({
1925 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
1926 for( q_read=1; q_read<numstates; q_read++ ) {
1927 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
1928 }
1929 PerlIO_printf(Perl_debug_log, "\n");
1930 });
1931 Safefree(q);
1932 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
1933}
1934
786e8c11 1935
a3621e74 1936/*
5d1c421c
JH
1937 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1938 * These need to be revisited when a newer toolchain becomes available.
1939 */
1940#if defined(__sparc64__) && defined(__GNUC__)
1941# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1942# undef SPARC64_GCC_WORKAROUND
1943# define SPARC64_GCC_WORKAROUND 1
1944# endif
1945#endif
1946
07be1b83
YO
1947#define DEBUG_PEEP(str,scan,depth) \
1948 DEBUG_OPTIMISE_r({ \
1949 SV * const mysv=sv_newmortal(); \
1950 regnode *Next = regnext(scan); \
1951 regprop(RExC_rx, mysv, scan); \
1952 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1953 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1954 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1955 });
1956
1957#define JOIN_EXACT(scan,min,flags) \
1958 if (PL_regkind[OP(scan)] == EXACT) \
1959 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1960
be8e71aa 1961STATIC U32
07be1b83
YO
1962S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1963 /* Merge several consecutive EXACTish nodes into one. */
1964 regnode *n = regnext(scan);
1965 U32 stringok = 1;
1966 regnode *next = scan + NODE_SZ_STR(scan);
1967 U32 merged = 0;
1968 U32 stopnow = 0;
1969#ifdef DEBUGGING
1970 regnode *stop = scan;
72f13be8 1971 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 1972#else
d47053eb
RGS
1973 PERL_UNUSED_ARG(depth);
1974#endif
1975#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
1976 PERL_UNUSED_ARG(flags);
1977 PERL_UNUSED_ARG(val);
07be1b83 1978#endif
07be1b83
YO
1979 DEBUG_PEEP("join",scan,depth);
1980
1981 /* Skip NOTHING, merge EXACT*. */
1982 while (n &&
1983 ( PL_regkind[OP(n)] == NOTHING ||
1984 (stringok && (OP(n) == OP(scan))))
1985 && NEXT_OFF(n)
1986 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1987
1988 if (OP(n) == TAIL || n > next)
1989 stringok = 0;
1990 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
1991 DEBUG_PEEP("skip:",n,depth);
1992 NEXT_OFF(scan) += NEXT_OFF(n);
1993 next = n + NODE_STEP_REGNODE;
1994#ifdef DEBUGGING
1995 if (stringok)
1996 stop = n;
1997#endif
1998 n = regnext(n);
1999 }
2000 else if (stringok) {
786e8c11 2001 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2002 regnode * const nnext = regnext(n);
2003
2004 DEBUG_PEEP("merg",n,depth);
2005
2006 merged++;
2007 if (oldl + STR_LEN(n) > U8_MAX)
2008 break;
2009 NEXT_OFF(scan) += NEXT_OFF(n);
2010 STR_LEN(scan) += STR_LEN(n);
2011 next = n + NODE_SZ_STR(n);
2012 /* Now we can overwrite *n : */
2013 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2014#ifdef DEBUGGING
2015 stop = next - 1;
2016#endif
2017 n = nnext;
2018 if (stopnow) break;
2019 }
2020
d47053eb
RGS
2021#ifdef EXPERIMENTAL_INPLACESCAN
2022 if (flags && !NEXT_OFF(n)) {
2023 DEBUG_PEEP("atch", val, depth);
2024 if (reg_off_by_arg[OP(n)]) {
2025 ARG_SET(n, val - n);
2026 }
2027 else {
2028 NEXT_OFF(n) = val - n;
2029 }
2030 stopnow = 1;
2031 }
07be1b83
YO
2032#endif
2033 }
2034
2035 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2036 /*
2037 Two problematic code points in Unicode casefolding of EXACT nodes:
2038
2039 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2040 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2041
2042 which casefold to
2043
2044 Unicode UTF-8
2045
2046 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2047 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2048
2049 This means that in case-insensitive matching (or "loose matching",
2050 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2051 length of the above casefolded versions) can match a target string
2052 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2053 This would rather mess up the minimum length computation.
2054
2055 What we'll do is to look for the tail four bytes, and then peek
2056 at the preceding two bytes to see whether we need to decrease
2057 the minimum length by four (six minus two).
2058
2059 Thanks to the design of UTF-8, there cannot be false matches:
2060 A sequence of valid UTF-8 bytes cannot be a subsequence of
2061 another valid sequence of UTF-8 bytes.
2062
2063 */
2064 char * const s0 = STRING(scan), *s, *t;
2065 char * const s1 = s0 + STR_LEN(scan) - 1;
2066 char * const s2 = s1 - 4;
e294cc5d
JH
2067#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2068 const char t0[] = "\xaf\x49\xaf\x42";
2069#else
07be1b83 2070 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2071#endif
07be1b83
YO
2072 const char * const t1 = t0 + 3;
2073
2074 for (s = s0 + 2;
2075 s < s2 && (t = ninstr(s, s1, t0, t1));
2076 s = t + 4) {
e294cc5d
JH
2077#ifdef EBCDIC
2078 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2079 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2080#else
07be1b83
YO
2081 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2082 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2083#endif
07be1b83
YO
2084 *min -= 4;
2085 }
2086 }
2087
2088#ifdef DEBUGGING
2089 /* Allow dumping */
2090 n = scan + NODE_SZ_STR(scan);
2091 while (n <= stop) {
2092 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2093 OP(n) = OPTIMIZED;
2094 NEXT_OFF(n) = 0;
2095 }
2096 n++;
2097 }
2098#endif
2099 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2100 return stopnow;
2101}
2102
653099ff
GS
2103/* REx optimizer. Converts nodes into quickier variants "in place".
2104 Finds fixed substrings. */
2105
a0288114 2106/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2107 to the position after last scanned or to NULL. */
2108
07be1b83
YO
2109
2110
76e3520e 2111STATIC I32
9a957fbc
AL
2112S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2113 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2114 /* scanp: Start here (read-write). */
2115 /* deltap: Write maxlen-minlen here. */
2116 /* last: Stop before this one. */
2117{
97aff369 2118 dVAR;
c277df42
IZ
2119 I32 min = 0, pars = 0, code;
2120 regnode *scan = *scanp, *next;
2121 I32 delta = 0;
2122 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2123 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2124 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2125 scan_data_t data_fake;
653099ff 2126 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2127 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2128 regnode *first_non_open = scan;
2129
a3621e74
YO
2130
2131 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2132#ifdef DEBUGGING
2133 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2134#endif
786e8c11
YO
2135 if ( depth == 0 ) {
2136 while (first_non_open && OP(first_non_open) == OPEN)
2137 first_non_open=regnext(first_non_open);
2138 }
2139
b81d288d 2140
c277df42
IZ
2141 while (scan && OP(scan) != END && scan < last) {
2142 /* Peephole optimizer: */
07be1b83 2143 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2144
07be1b83 2145 JOIN_EXACT(scan,&min,0);
a3621e74 2146
653099ff
GS
2147 /* Follow the next-chain of the current node and optimize
2148 away all the NOTHINGs from it. */
c277df42 2149 if (OP(scan) != CURLYX) {
a3b680e6 2150 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2151 ? I32_MAX
2152 /* I32 may be smaller than U16 on CRAYs! */
2153 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2154 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2155 int noff;
2156 regnode *n = scan;
b81d288d 2157
c277df42
IZ
2158 /* Skip NOTHING and LONGJMP. */
2159 while ((n = regnext(n))
3dab1dad 2160 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2161 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2162 && off + noff < max)
2163 off += noff;
2164 if (reg_off_by_arg[OP(scan)])
2165 ARG(scan) = off;
b81d288d 2166 else
c277df42
IZ
2167 NEXT_OFF(scan) = off;
2168 }
a3621e74 2169
07be1b83 2170
3dab1dad 2171
653099ff
GS
2172 /* The principal pseudo-switch. Cannot be a switch, since we
2173 look into several different things. */
b81d288d 2174 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2175 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2176 next = regnext(scan);
2177 code = OP(scan);
a3621e74 2178 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2179
2180 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2181 /* NOTE - There is similar code to this block below for handling
2182 TRIE nodes on a re-study. If you change stuff here check there
2183 too. */
c277df42 2184 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2185 struct regnode_charclass_class accum;
d4c19fe8 2186 regnode * const startbranch=scan;
c277df42 2187
653099ff 2188 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2189 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2190 if (flags & SCF_DO_STCLASS)
830247a4 2191 cl_init_zero(pRExC_state, &accum);
a3621e74 2192
c277df42 2193 while (OP(scan) == code) {
830247a4 2194 I32 deltanext, minnext, f = 0, fake;
653099ff 2195 struct regnode_charclass_class this_class;
c277df42
IZ
2196
2197 num++;
2198 data_fake.flags = 0;
b81d288d 2199 if (data) {
2c2d71f5 2200 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2201 data_fake.last_closep = data->last_closep;
2202 }
2203 else
2204 data_fake.last_closep = &fake;
c277df42
IZ
2205 next = regnext(scan);
2206 scan = NEXTOPER(scan);
2207 if (code != BRANCH)
2208 scan = NEXTOPER(scan);
653099ff 2209 if (flags & SCF_DO_STCLASS) {
830247a4 2210 cl_init(pRExC_state, &this_class);
653099ff
GS
2211 data_fake.start_class = &this_class;
2212 f = SCF_DO_STCLASS_AND;
b81d288d 2213 }
e1901655
IZ
2214 if (flags & SCF_WHILEM_VISITED_POS)
2215 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2216
653099ff 2217 /* we suppose the run is continuous, last=next...*/
830247a4 2218 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2219 next, &data_fake, f,depth+1);
b81d288d 2220 if (min1 > minnext)
c277df42
IZ
2221 min1 = minnext;
2222 if (max1 < minnext + deltanext)
2223 max1 = minnext + deltanext;
2224 if (deltanext == I32_MAX)
aca2d497 2225 is_inf = is_inf_internal = 1;
c277df42
IZ
2226 scan = next;
2227 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2228 pars++;
3dab1dad
YO
2229 if (data) {
2230 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2231 data->flags |= SF_HAS_EVAL;
2c2d71f5 2232 data->whilem_c = data_fake.whilem_c;
3dab1dad 2233 }
653099ff 2234 if (flags & SCF_DO_STCLASS)
830247a4 2235 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2236 if (code == SUSPEND)
c277df42
IZ
2237 break;
2238 }
2239 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2240 min1 = 0;
2241 if (flags & SCF_DO_SUBSTR) {
2242 data->pos_min += min1;
2243 data->pos_delta += max1 - min1;
2244 if (max1 != min1 || is_inf)
2245 data->longest = &(data->longest_float);
2246 }
2247 min += min1;
2248 delta += max1 - min1;
653099ff 2249 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2250 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2251 if (min1) {
2252 cl_and(data->start_class, &and_with);
2253 flags &= ~SCF_DO_STCLASS;
2254 }
2255 }
2256 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2257 if (min1) {
2258 cl_and(data->start_class, &accum);
653099ff 2259 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2260 }
2261 else {
b81d288d 2262 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2263 * data->start_class */
2264 StructCopy(data->start_class, &and_with,
2265 struct regnode_charclass_class);
2266 flags &= ~SCF_DO_STCLASS_AND;
2267 StructCopy(&accum, data->start_class,
2268 struct regnode_charclass_class);
2269 flags |= SCF_DO_STCLASS_OR;
2270 data->start_class->flags |= ANYOF_EOS;
2271 }
653099ff 2272 }
a3621e74 2273
786e8c11 2274 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2275 /* demq.
2276
2277 Assuming this was/is a branch we are dealing with: 'scan' now
2278 points at the item that follows the branch sequence, whatever
2279 it is. We now start at the beginning of the sequence and look
2280 for subsequences of
2281
786e8c11
YO
2282 BRANCH->EXACT=>x1
2283 BRANCH->EXACT=>x2
2284 tail
a3621e74
YO
2285
2286 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2287
2288 If we can find such a subseqence we need to turn the first
2289 element into a trie and then add the subsequent branch exact
2290 strings to the trie.
2291
2292 We have two cases
2293
786e8c11 2294 1. patterns where the whole set of branch can be converted.
a3621e74 2295
786e8c11 2296 2. patterns where only a subset can be converted.
a3621e74
YO
2297
2298 In case 1 we can replace the whole set with a single regop
2299 for the trie. In case 2 we need to keep the start and end
2300 branchs so
2301
2302 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2303 becomes BRANCH TRIE; BRANCH X;
2304
786e8c11
YO
2305 There is an additional case, that being where there is a
2306 common prefix, which gets split out into an EXACT like node
2307 preceding the TRIE node.
2308
2309 If x(1..n)==tail then we can do a simple trie, if not we make
2310 a "jump" trie, such that when we match the appropriate word
2311 we "jump" to the appopriate tail node. Essentailly we turn
2312 a nested if into a case structure of sorts.
a3621e74
YO
2313
2314 */
786e8c11 2315
3dab1dad 2316 int made=0;
0111c4fd
RGS
2317 if (!re_trie_maxbuff) {
2318 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2319 if (!SvIOK(re_trie_maxbuff))
2320 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2321 }
786e8c11 2322 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2323 regnode *cur;
2324 regnode *first = (regnode *)NULL;
2325 regnode *last = (regnode *)NULL;
2326 regnode *tail = scan;
2327 U8 optype = 0;
2328 U32 count=0;
2329
2330#ifdef DEBUGGING
c445ea15 2331 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2332#endif
2333 /* var tail is used because there may be a TAIL
2334 regop in the way. Ie, the exacts will point to the
2335 thing following the TAIL, but the last branch will
2336 point at the TAIL. So we advance tail. If we
2337 have nested (?:) we may have to move through several
2338 tails.
2339 */
2340
2341 while ( OP( tail ) == TAIL ) {
2342 /* this is the TAIL generated by (?:) */
2343 tail = regnext( tail );
2344 }
2345
3dab1dad 2346
a3621e74 2347 DEBUG_OPTIMISE_r({
32fc9b6a 2348 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2349 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2350 (int)depth * 2 + 2, "",
2351 "Looking for TRIE'able sequences. Tail node is: ",
2352 SvPV_nolen_const( mysv )
a3621e74
YO
2353 );
2354 });
3dab1dad 2355
a3621e74
YO
2356 /*
2357
2358 step through the branches, cur represents each
2359 branch, noper is the first thing to be matched
2360 as part of that branch and noper_next is the
2361 regnext() of that node. if noper is an EXACT
2362 and noper_next is the same as scan (our current
2363 position in the regex) then the EXACT branch is
2364 a possible optimization target. Once we have
2365 two or more consequetive such branches we can
2366 create a trie of the EXACT's contents and stich
2367 it in place. If the sequence represents all of
2368 the branches we eliminate the whole thing and
2369 replace it with a single TRIE. If it is a
2370 subsequence then we need to stitch it in. This
2371 means the first branch has to remain, and needs
2372 to be repointed at the item on the branch chain
2373 following the last branch optimized. This could
2374 be either a BRANCH, in which case the
2375 subsequence is internal, or it could be the
2376 item following the branch sequence in which
2377 case the subsequence is at the end.
2378
2379 */
2380
2381 /* dont use tail as the end marker for this traverse */
2382 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2383 regnode * const noper = NEXTOPER( cur );
2384 regnode * const noper_next = regnext( noper );
a3621e74 2385
a3621e74 2386 DEBUG_OPTIMISE_r({
32fc9b6a 2387 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2388 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2389 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2390
32fc9b6a 2391 regprop(RExC_rx, mysv, noper);
a3621e74 2392 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2393 SvPV_nolen_const(mysv));
a3621e74
YO
2394
2395 if ( noper_next ) {
32fc9b6a 2396 regprop(RExC_rx, mysv, noper_next );
a3621e74 2397 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2398 SvPV_nolen_const(mysv));
a3621e74 2399 }
3dab1dad
YO
2400 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2401 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2402 });
3dab1dad
YO
2403 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2404 : PL_regkind[ OP( noper ) ] == EXACT )
2405 || OP(noper) == NOTHING )
786e8c11
YO
2406#ifdef NOJUMPTRIE
2407 && noper_next == tail
2408#endif
2409 && count < U16_MAX)
a3621e74
YO
2410 {
2411 count++;
3dab1dad
YO
2412 if ( !first || optype == NOTHING ) {
2413 if (!first) first = cur;
a3621e74
YO
2414 optype = OP( noper );
2415 } else {
a3621e74 2416 last = cur;
a3621e74
YO
2417 }
2418 } else {
2419 if ( last ) {
786e8c11
YO
2420 make_trie( pRExC_state,
2421 startbranch, first, cur, tail, count,
2422 optype, depth+1 );
a3621e74 2423 }
3dab1dad 2424 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2425#ifdef NOJUMPTRIE
2426 && noper_next == tail
2427#endif
2428 ){
a3621e74
YO
2429 count = 1;
2430 first = cur;
2431 optype = OP( noper );
2432 } else {
2433 count = 0;
2434 first = NULL;
2435 optype = 0;
2436 }
2437 last = NULL;
2438 }
2439 }
2440 DEBUG_OPTIMISE_r({
32fc9b6a 2441 regprop(RExC_rx, mysv, cur);
a3621e74 2442 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2443 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2444 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2445
2446 });
2447 if ( last ) {
786e8c11 2448 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2449#ifdef TRIE_STUDY_OPT
786e8c11
YO
2450 if ( ((made == MADE_EXACT_TRIE &&
2451 startbranch == first)
2452 || ( first_non_open == first )) &&
2453 depth==0 )
2454 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2455#endif
07be1b83 2456 }
a3621e74 2457 }
3dab1dad
YO
2458
2459 } /* do trie */
786e8c11 2460
a0ed51b3 2461 }
a3621e74 2462 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2463 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2464 } else /* single branch is optimized. */
c277df42
IZ
2465 scan = NEXTOPER(scan);
2466 continue;
a0ed51b3
LW
2467 }
2468 else if (OP(scan) == EXACT) {
cd439c50 2469 I32 l = STR_LEN(scan);
c445ea15 2470 UV uc;
a0ed51b3 2471 if (UTF) {
a3b680e6 2472 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2473 l = utf8_length(s, s + l);
9041c2e3 2474 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2475 } else {
2476 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2477 }
2478 min += l;
c277df42 2479 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2480 /* The code below prefers earlier match for fixed
2481 offset, later match for variable offset. */
2482 if (data->last_end == -1) { /* Update the start info. */
2483 data->last_start_min = data->pos_min;
2484 data->last_start_max = is_inf
b81d288d 2485 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2486 }
cd439c50 2487 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2488 if (UTF)
2489 SvUTF8_on(data->last_found);
0eda9292 2490 {
9a957fbc 2491 SV * const sv = data->last_found;
a28509cc 2492 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2493 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2494 if (mg && mg->mg_len >= 0)
5e43f467
JH
2495 mg->mg_len += utf8_length((U8*)STRING(scan),
2496 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2497 }
c277df42
IZ
2498 data->last_end = data->pos_min + l;
2499 data->pos_min += l; /* As in the first entry. */
2500 data->flags &= ~SF_BEFORE_EOL;
2501 }
653099ff
GS
2502 if (flags & SCF_DO_STCLASS_AND) {
2503 /* Check whether it is compatible with what we know already! */
2504 int compat = 1;
2505
1aa99e6b 2506 if (uc >= 0x100 ||
516a5887 2507 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2508 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2509 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2510 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2511 )
653099ff
GS
2512 compat = 0;
2513 ANYOF_CLASS_ZERO(data->start_class);
2514 ANYOF_BITMAP_ZERO(data->start_class);
2515 if (compat)
1aa99e6b 2516 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2517 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2518 if (uc < 0x100)
2519 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2520 }
2521 else if (flags & SCF_DO_STCLASS_OR) {
2522 /* false positive possible if the class is case-folded */
1aa99e6b 2523 if (uc < 0x100)
9b877dbb
IH
2524 ANYOF_BITMAP_SET(data->start_class, uc);
2525 else
2526 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2527 data->start_class->flags &= ~ANYOF_EOS;
2528 cl_and(data->start_class, &and_with);
2529 }
2530 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2531 }
3dab1dad 2532 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2533 I32 l = STR_LEN(scan);
1aa99e6b 2534 UV uc = *((U8*)STRING(scan));
653099ff
GS
2535
2536 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2537 if (flags & SCF_DO_SUBSTR) {
2538 assert(data);
830247a4 2539 scan_commit(pRExC_state, data);
ecaa9b9c 2540 }
a0ed51b3 2541 if (UTF) {
6136c704 2542 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2543 l = utf8_length(s, s + l);
9041c2e3 2544 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2545 }
2546 min += l;
ecaa9b9c 2547 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2548 data->pos_min += l;
653099ff
GS
2549 if (flags & SCF_DO_STCLASS_AND) {
2550 /* Check whether it is compatible with what we know already! */
2551 int compat = 1;
2552
1aa99e6b 2553 if (uc >= 0x100 ||
516a5887 2554 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2555 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2556 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2557 compat = 0;
2558 ANYOF_CLASS_ZERO(data->start_class);
2559 ANYOF_BITMAP_ZERO(data->start_class);
2560 if (compat) {
1aa99e6b 2561 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2562 data->start_class->flags &= ~ANYOF_EOS;
2563 data->start_class->flags |= ANYOF_FOLD;
2564 if (OP(scan) == EXACTFL)
2565 data->start_class->flags |= ANYOF_LOCALE;
2566 }
2567 }
2568 else if (flags & SCF_DO_STCLASS_OR) {
2569 if (data->start_class->flags & ANYOF_FOLD) {
2570 /* false positive possible if the class is case-folded.
2571 Assume that the locale settings are the same... */
1aa99e6b
IH
2572 if (uc < 0x100)
2573 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2574 data->start_class->flags &= ~ANYOF_EOS;
2575 }
2576 cl_and(data->start_class, &and_with);
2577 }
2578 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2579 }
bfed75c6 2580 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2581 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2582 I32 f = flags, pos_before = 0;
d4c19fe8 2583 regnode * const oscan = scan;
653099ff
GS
2584 struct regnode_charclass_class this_class;
2585 struct regnode_charclass_class *oclass = NULL;
727f22e3 2586 I32 next_is_eval = 0;
653099ff 2587
3dab1dad 2588 switch (PL_regkind[OP(scan)]) {
653099ff 2589 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2590 scan = NEXTOPER(scan);
2591 goto finish;
2592 case PLUS:
653099ff 2593 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2594 next = NEXTOPER(scan);
653099ff 2595 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2596 mincount = 1;
2597 maxcount = REG_INFTY;
c277df42
IZ
2598 next = regnext(scan);
2599 scan = NEXTOPER(scan);
2600 goto do_curly;
2601 }
2602 }
2603 if (flags & SCF_DO_SUBSTR)
2604 data->pos_min++;
2605 min++;
2606 /* Fall through. */
2607 case STAR:
653099ff
GS
2608 if (flags & SCF_DO_STCLASS) {
2609 mincount = 0;
b81d288d 2610 maxcount = REG_INFTY;
653099ff
GS
2611 next = regnext(scan);
2612 scan = NEXTOPER(scan);
2613 goto do_curly;
2614 }
b81d288d 2615 is_inf = is_inf_internal = 1;
c277df42
IZ
2616 scan = regnext(scan);
2617 if (flags & SCF_DO_SUBSTR) {
830247a4 2618 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2619 data->longest = &(data->longest_float);
2620 }
2621 goto optimize_curly_tail;
2622 case CURLY:
b81d288d 2623 mincount = ARG1(scan);
c277df42
IZ
2624 maxcount = ARG2(scan);
2625 next = regnext(scan);
cb434fcc
IZ
2626 if (OP(scan) == CURLYX) {
2627 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2628 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2629 }
c277df42 2630 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2631 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2632 do_curly:
2633 if (flags & SCF_DO_SUBSTR) {
830247a4 2634 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2635 pos_before = data->pos_min;
2636 }
2637 if (data) {
2638 fl = data->flags;
2639 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2640 if (is_inf)
2641 data->flags |= SF_IS_INF;
2642 }
653099ff 2643 if (flags & SCF_DO_STCLASS) {
830247a4 2644 cl_init(pRExC_state, &this_class);
653099ff
GS
2645 oclass = data->start_class;
2646 data->start_class = &this_class;
2647 f |= SCF_DO_STCLASS_AND;
2648 f &= ~SCF_DO_STCLASS_OR;
2649 }
e1901655
IZ
2650 /* These are the cases when once a subexpression
2651 fails at a particular position, it cannot succeed
2652 even after backtracking at the enclosing scope.
b81d288d 2653
e1901655
IZ
2654 XXXX what if minimal match and we are at the
2655 initial run of {n,m}? */
2656 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2657 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2658
c277df42 2659 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2660 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2661 (mincount == 0
2662 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2663
2664 if (flags & SCF_DO_STCLASS)
2665 data->start_class = oclass;
2666 if (mincount == 0 || minnext == 0) {
2667 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2668 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2669 }
2670 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2671 /* Switch to OR mode: cache the old value of
653099ff
GS
2672 * data->start_class */
2673 StructCopy(data->start_class, &and_with,
2674 struct regnode_charclass_class);
2675 flags &= ~SCF_DO_STCLASS_AND;
2676 StructCopy(&this_class, data->start_class,
2677 struct regnode_charclass_class);
2678 flags |= SCF_DO_STCLASS_OR;
2679 data->start_class->flags |= ANYOF_EOS;
2680 }
2681 } else { /* Non-zero len */
2682 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2683 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2684 cl_and(data->start_class, &and_with);
2685 }
2686 else if (flags & SCF_DO_STCLASS_AND)
2687 cl_and(data->start_class, &this_class);
2688 flags &= ~SCF_DO_STCLASS;
2689 }
c277df42
IZ
2690 if (!scan) /* It was not CURLYX, but CURLY. */
2691 scan = next;
041457d9
DM
2692 if ( /* ? quantifier ok, except for (?{ ... }) */
2693 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2694 && (minnext == 0) && (deltanext == 0)
99799961 2695 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2696 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2697 && ckWARN(WARN_REGEXP))
b45f050a 2698 {
830247a4 2699 vWARN(RExC_parse,
b45f050a
JF
2700 "Quantifier unexpected on zero-length expression");
2701 }
2702
c277df42 2703 min += minnext * mincount;
b81d288d 2704 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2705 && (minnext + deltanext) > 0)
2706 || deltanext == I32_MAX);
aca2d497 2707 is_inf |= is_inf_internal;
c277df42
IZ
2708 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2709
2710 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2711 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2712 && data->flags & SF_IN_PAR
2713 && !(data->flags & SF_HAS_EVAL)
2714 && !deltanext && minnext == 1 ) {
2715 /* Try to optimize to CURLYN. */
2716 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2717 regnode * const nxt1 = nxt;
497b47a8
JH
2718#ifdef DEBUGGING
2719 regnode *nxt2;
2720#endif
c277df42
IZ
2721
2722 /* Skip open. */
2723 nxt = regnext(nxt);
bfed75c6 2724 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2725 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2726 && STR_LEN(nxt) == 1))
c277df42 2727 goto nogo;
497b47a8 2728#ifdef DEBUGGING
c277df42 2729 nxt2 = nxt;
497b47a8 2730#endif
c277df42 2731 nxt = regnext(nxt);
b81d288d 2732 if (OP(nxt) != CLOSE)
c277df42
IZ
2733 goto nogo;
2734 /* Now we know that nxt2 is the only contents: */
eb160463 2735 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2736 OP(oscan) = CURLYN;
2737 OP(nxt1) = NOTHING; /* was OPEN. */
2738#ifdef DEBUGGING
2739 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2740 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2741 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2742 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2743 OP(nxt + 1) = OPTIMIZED; /* was count. */
2744 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2745#endif
c277df42 2746 }
c277df42
IZ
2747 nogo:
2748
2749 /* Try optimization CURLYX => CURLYM. */
b81d288d 2750 if ( OP(oscan) == CURLYX && data
c277df42 2751 && !(data->flags & SF_HAS_PAR)
c277df42 2752 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2753 && !deltanext /* atom is fixed width */
2754 && minnext != 0 /* CURLYM can't handle zero width */
2755 ) {
c277df42
IZ
2756 /* XXXX How to optimize if data == 0? */
2757 /* Optimize to a simpler form. */
2758 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2759 regnode *nxt2;
2760
2761 OP(oscan) = CURLYM;
2762 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2763 && (OP(nxt2) != WHILEM))
c277df42
IZ
2764 nxt = nxt2;
2765 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2766 /* Need to optimize away parenths. */
2767 if (data->flags & SF_IN_PAR) {
2768 /* Set the parenth number. */
2769 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2770
b81d288d 2771 if (OP(nxt) != CLOSE)
b45f050a 2772 FAIL("Panic opt close");
eb160463 2773 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2774 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2775 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2776#ifdef DEBUGGING
2777 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2778 OP(nxt + 1) = OPTIMIZED; /* was count. */
2779 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2780 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2781#endif
c277df42
IZ
2782#if 0
2783 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2784 regnode *nnxt = regnext(nxt1);
b81d288d 2785
c277df42
IZ
2786 if (nnxt == nxt) {
2787 if (reg_off_by_arg[OP(nxt1)])
2788 ARG_SET(nxt1, nxt2 - nxt1);
2789 else if (nxt2 - nxt1 < U16_MAX)
2790 NEXT_OFF(nxt1) = nxt2 - nxt1;
2791 else
2792 OP(nxt) = NOTHING; /* Cannot beautify */
2793 }
2794 nxt1 = nnxt;
2795 }
2796#endif
2797 /* Optimize again: */
b81d288d 2798 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2799 NULL, 0,depth+1);
a0ed51b3
LW
2800 }
2801 else
c277df42 2802 oscan->flags = 0;
c277df42 2803 }
e1901655
IZ
2804 else if ((OP(oscan) == CURLYX)
2805 && (flags & SCF_WHILEM_VISITED_POS)
2806 /* See the comment on a similar expression above.
2807 However, this time it not a subexpression
2808 we care about, but the expression itself. */
2809 && (maxcount == REG_INFTY)
2810 && data && ++data->whilem_c < 16) {
2811 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2812 /* Find WHILEM (as in regexec.c) */
2813 regnode *nxt = oscan + NEXT_OFF(oscan);
2814
2815 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2816 nxt += ARG(nxt);
eb160463
GS
2817 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2818 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2819 }
b81d288d 2820 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2821 pars++;
2822 if (flags & SCF_DO_SUBSTR) {
c445ea15 2823 SV *last_str = NULL;
c277df42
IZ
2824 int counted = mincount != 0;
2825
2826 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2827#if defined(SPARC64_GCC_WORKAROUND)
2828 I32 b = 0;
2829 STRLEN l = 0;
cfd0369c 2830 const char *s = NULL;
5d1c421c
JH
2831 I32 old = 0;
2832
2833 if (pos_before >= data->last_start_min)
2834 b = pos_before;
2835 else
2836 b = data->last_start_min;
2837
2838 l = 0;
cfd0369c 2839 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2840 old = b - data->last_start_min;
2841
2842#else
b81d288d 2843 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2844 ? pos_before : data->last_start_min;
2845 STRLEN l;
d4c19fe8 2846 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2847 I32 old = b - data->last_start_min;
5d1c421c 2848#endif
a0ed51b3
LW
2849
2850 if (UTF)
2851 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2852
a0ed51b3 2853 l -= old;
c277df42 2854 /* Get the added string: */
79cb57f6 2855 last_str = newSVpvn(s + old, l);
0e933229
IH
2856 if (UTF)
2857 SvUTF8_on(last_str);
c277df42
IZ
2858 if (deltanext == 0 && pos_before == b) {
2859 /* What was added is a constant string */
2860 if (mincount > 1) {
2861 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2862 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2863 SvPVX_const(last_str), l, mincount - 1);
b162af07 2864 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2865 /* Add additional parts. */
b81d288d 2866 SvCUR_set(data->last_found,
c277df42
IZ
2867 SvCUR(data->last_found) - l);
2868 sv_catsv(data->last_found, last_str);
0eda9292
JH
2869 {
2870 SV * sv = data->last_found;
2871 MAGIC *mg =
2872 SvUTF8(sv) && SvMAGICAL(sv) ?
2873 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2874 if (mg && mg->mg_len >= 0)
2875 mg->mg_len += CHR_SVLEN(last_str);
2876 }
c277df42
IZ
2877 data->last_end += l * (mincount - 1);
2878 }
2a8d9689
HS
2879 } else {
2880 /* start offset must point into the last copy */
2881 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2882 data->last_start_max += is_inf ? I32_MAX
2883 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2884 }
2885 }
2886 /* It is counted once already... */
2887 data->pos_min += minnext * (mincount - counted);
2888 data->pos_delta += - counted * deltanext +
2889 (minnext + deltanext) * maxcount - minnext * mincount;
2890 if (mincount != maxcount) {
653099ff
GS
2891 /* Cannot extend fixed substrings found inside
2892 the group. */
830247a4 2893 scan_commit(pRExC_state,data);
c277df42 2894 if (mincount && last_str) {
d4c19fe8
AL
2895 SV * const sv = data->last_found;
2896 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2897 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2898
2899 if (mg)
2900 mg->mg_len = -1;
2901 sv_setsv(sv, last_str);
c277df42 2902 data->last_end = data->pos_min;
b81d288d 2903 data->last_start_min =
a0ed51b3 2904 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2905 data->last_start_max = is_inf
2906 ? I32_MAX
c277df42 2907 : data->pos_min + data->pos_delta
a0ed51b3 2908 - CHR_SVLEN(last_str);
c277df42
IZ
2909 }
2910 data->longest = &(data->longest_float);
2911 }
aca2d497 2912 SvREFCNT_dec(last_str);
c277df42 2913 }
405ff068 2914 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2915 data->flags |= SF_HAS_EVAL;
2916 optimize_curly_tail:
c277df42 2917 if (OP(oscan) != CURLYX) {
3dab1dad 2918 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2919 && NEXT_OFF(next))
2920 NEXT_OFF(oscan) += NEXT_OFF(next);
2921 }
c277df42 2922 continue;
653099ff 2923 default: /* REF and CLUMP only? */
c277df42 2924 if (flags & SCF_DO_SUBSTR) {
830247a4 2925 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2926 data->longest = &(data->longest_float);
2927 }
aca2d497 2928 is_inf = is_inf_internal = 1;
653099ff 2929 if (flags & SCF_DO_STCLASS_OR)
830247a4 2930 cl_anything(pRExC_state, data->start_class);
653099ff 2931 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2932 break;
2933 }
a0ed51b3 2934 }
bfed75c6 2935 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2936 int value = 0;
653099ff 2937
c277df42 2938 if (flags & SCF_DO_SUBSTR) {
830247a4 2939 scan_commit(pRExC_state,data);
c277df42
IZ
2940 data->pos_min++;
2941 }
2942 min++;
653099ff
GS
2943 if (flags & SCF_DO_STCLASS) {
2944 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2945
2946 /* Some of the logic below assumes that switching
2947 locale on will only add false positives. */
3dab1dad 2948 switch (PL_regkind[OP(scan)]) {
653099ff 2949 case SANY:
653099ff
GS
2950 default:
2951 do_default:
2952 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2953 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2954 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2955 break;
2956 case REG_ANY:
2957 if (OP(scan) == SANY)
2958 goto do_default;
2959 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2960 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2961 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2962 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2963 }
2964 if (flags & SCF_DO_STCLASS_AND || !value)
2965 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2966 break;
2967 case ANYOF:
2968 if (flags & SCF_DO_STCLASS_AND)
2969 cl_and(data->start_class,
2970 (struct regnode_charclass_class*)scan);
2971 else
830247a4 2972 cl_or(pRExC_state, data->start_class,
653099ff
GS
2973 (struct regnode_charclass_class*)scan);
2974 break;
2975 case ALNUM:
2976 if (flags & SCF_DO_STCLASS_AND) {
2977 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2978 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2979 for (value = 0; value < 256; value++)
2980 if (!isALNUM(value))
2981 ANYOF_BITMAP_CLEAR(data->start_class, value);
2982 }
2983 }
2984 else {
2985 if (data->start_class->flags & ANYOF_LOCALE)
2986 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2987 else {
2988 for (value = 0; value < 256; value++)
2989 if (isALNUM(value))
b81d288d 2990 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2991 }
2992 }
2993 break;
2994 case ALNUML:
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 if (data->start_class->flags & ANYOF_LOCALE)
2997 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2998 }
2999 else {
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3001 data->start_class->flags |= ANYOF_LOCALE;
3002 }
3003 break;
3004 case NALNUM:
3005 if (flags & SCF_DO_STCLASS_AND) {
3006 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3007 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3008 for (value = 0; value < 256; value++)
3009 if (isALNUM(value))
3010 ANYOF_BITMAP_CLEAR(data->start_class, value);
3011 }
3012 }
3013 else {
3014 if (data->start_class->flags & ANYOF_LOCALE)
3015 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3016 else {
3017 for (value = 0; value < 256; value++)
3018 if (!isALNUM(value))
b81d288d 3019 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3020 }
3021 }
3022 break;
3023 case NALNUML:
3024 if (flags & SCF_DO_STCLASS_AND) {
3025 if (data->start_class->flags & ANYOF_LOCALE)
3026 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3027 }
3028 else {
3029 data->start_class->flags |= ANYOF_LOCALE;
3030 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3031 }
3032 break;
3033 case SPACE:
3034 if (flags & SCF_DO_STCLASS_AND) {
3035 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3036 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3037 for (value = 0; value < 256; value++)
3038 if (!isSPACE(value))
3039 ANYOF_BITMAP_CLEAR(data->start_class, value);
3040 }
3041 }
3042 else {
3043 if (data->start_class->flags & ANYOF_LOCALE)
3044 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3045 else {
3046 for (value = 0; value < 256; value++)
3047 if (isSPACE(value))
b81d288d 3048 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3049 }
3050 }
3051 break;
3052 case SPACEL:
3053 if (flags & SCF_DO_STCLASS_AND) {
3054 if (data->start_class->flags & ANYOF_LOCALE)
3055 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3056 }
3057 else {
3058 data->start_class->flags |= ANYOF_LOCALE;
3059 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3060 }
3061 break;
3062 case NSPACE:
3063 if (flags & SCF_DO_STCLASS_AND) {
3064 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3065 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3066 for (value = 0; value < 256; value++)
3067 if (isSPACE(value))
3068 ANYOF_BITMAP_CLEAR(data->start_class, value);
3069 }
3070 }
3071 else {
3072 if (data->start_class->flags & ANYOF_LOCALE)
3073 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3074 else {
3075 for (value = 0; value < 256; value++)
3076 if (!isSPACE(value))
b81d288d 3077 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3078 }
3079 }
3080 break;
3081 case NSPACEL:
3082 if (flags & SCF_DO_STCLASS_AND) {
3083 if (data->start_class->flags & ANYOF_LOCALE) {
3084 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3085 for (value = 0; value < 256; value++)
3086 if (!isSPACE(value))
3087 ANYOF_BITMAP_CLEAR(data->start_class, value);
3088 }
3089 }
3090 else {
3091 data->start_class->flags |= ANYOF_LOCALE;
3092 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3093 }
3094 break;
3095 case DIGIT:
3096 if (flags & SCF_DO_STCLASS_AND) {
3097 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3098 for (value = 0; value < 256; value++)
3099 if (!isDIGIT(value))
3100 ANYOF_BITMAP_CLEAR(data->start_class, value);
3101 }
3102 else {
3103 if (data->start_class->flags & ANYOF_LOCALE)
3104 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3105 else {
3106 for (value = 0; value < 256; value++)
3107 if (isDIGIT(value))
b81d288d 3108 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3109 }
3110 }
3111 break;
3112 case NDIGIT:
3113 if (flags & SCF_DO_STCLASS_AND) {
3114 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3115 for (value = 0; value < 256; value++)
3116 if (isDIGIT(value))
3117 ANYOF_BITMAP_CLEAR(data->start_class, value);
3118 }
3119 else {
3120 if (data->start_class->flags & ANYOF_LOCALE)
3121 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3122 else {
3123 for (value = 0; value < 256; value++)
3124 if (!isDIGIT(value))
b81d288d 3125 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3126 }
3127 }
3128 break;
3129 }
3130 if (flags & SCF_DO_STCLASS_OR)
3131 cl_and(data->start_class, &and_with);
3132 flags &= ~SCF_DO_STCLASS;
3133 }
a0ed51b3 3134 }
3dab1dad 3135 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3136 data->flags |= (OP(scan) == MEOL
3137 ? SF_BEFORE_MEOL
3138 : SF_BEFORE_SEOL);
a0ed51b3 3139 }
3dab1dad 3140 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3141 /* Lookbehind, or need to calculate parens/evals/stclass: */
3142 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3143 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3144 /* Lookahead/lookbehind */
cb434fcc 3145 I32 deltanext, minnext, fake = 0;
c277df42 3146 regnode *nscan;
653099ff
GS
3147 struct regnode_charclass_class intrnl;
3148 int f = 0;
c277df42
IZ
3149
3150 data_fake.flags = 0;
b81d288d 3151 if (data) {
2c2d71f5 3152 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3153 data_fake.last_closep = data->last_closep;
3154 }
3155 else
3156 data_fake.last_closep = &fake;
653099ff
GS
3157 if ( flags & SCF_DO_STCLASS && !scan->flags
3158 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3159 cl_init(pRExC_state, &intrnl);
653099ff 3160 data_fake.start_class = &intrnl;
e1901655 3161 f |= SCF_DO_STCLASS_AND;
653099ff 3162 }
e1901655
IZ
3163 if (flags & SCF_WHILEM_VISITED_POS)
3164 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3165 next = regnext(scan);
3166 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3167 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3168 if (scan->flags) {
3169 if (deltanext) {
9baa0206 3170 vFAIL("Variable length lookbehind not implemented");
a0ed51b3 3171 }
786e8c11 3172 else if (minnext > (I32)U8_MAX) {
9baa0206 3173 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3174 }
eb160463 3175 scan->flags = (U8)minnext;
c277df42 3176 }
be8e71aa
YO
3177 if (data) {
3178 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3179 pars++;
3180 if (data_fake.flags & SF_HAS_EVAL)
3181 data->flags |= SF_HAS_EVAL;
2c2d71f5 3182 data->whilem_c = data_fake.whilem_c;
be8e71aa 3183 }
e1901655 3184 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3185 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3186
3187 cl_and(data->start_class, &intrnl);
3188 if (was)
3189 data->start_class->flags |= ANYOF_EOS;
3190 }
a0ed51b3
LW
3191 }
3192 else if (OP(scan) == OPEN) {
c277df42 3193 pars++;
a0ed51b3 3194 }
cb434fcc 3195 else if (OP(scan) == CLOSE) {
eb160463 3196 if ((I32)ARG(scan) == is_par) {
cb434fcc 3197 next = regnext(scan);
c277df42 3198
cb434fcc
IZ
3199 if ( next && (OP(next) != WHILEM) && next < last)
3200 is_par = 0; /* Disable optimization */
3201 }
3202 if (data)
3203 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3204 }
3205 else if (OP(scan) == EVAL) {
c277df42
IZ
3206 if (data)
3207 data->flags |= SF_HAS_EVAL;
3208 }
96776eda 3209 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3210 if (flags & SCF_DO_SUBSTR) {
830247a4 3211 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3212 data->longest = &(data->longest_float);
3213 }
3214 is_inf = is_inf_internal = 1;
653099ff 3215 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3216 cl_anything(pRExC_state, data->start_class);
96776eda 3217 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3218 }
786e8c11
YO
3219#ifdef TRIE_STUDY_OPT
3220#ifdef FULL_TRIE_STUDY
3221 else if (PL_regkind[OP(scan)] == TRIE) {
3222 /* NOTE - There is similar code to this block above for handling
3223 BRANCH nodes on the initial study. If you change stuff here
3224 check there too. */
3225 regnode *tail= regnext(scan);
3226 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3227 I32 max1 = 0, min1 = I32_MAX;
3228 struct regnode_charclass_class accum;
3229
3230 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3231 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
3232 if (flags & SCF_DO_STCLASS)
3233 cl_init_zero(pRExC_state, &accum);
3234
3235 if (!trie->jump) {
3236 min1= trie->minlen;
3237 max1= trie->maxlen;
3238 } else {
3239 const regnode *nextbranch= NULL;
3240 U32 word;
3241
3242 for ( word=1 ; word <= trie->wordcount ; word++)
3243 {
3244 I32 deltanext=0, minnext=0, f = 0, fake;
3245 struct regnode_charclass_class this_class;
3246
3247 data_fake.flags = 0;
3248 if (data) {
3249 data_fake.whilem_c = data->whilem_c;
3250 data_fake.last_closep = data->last_closep;
3251 }
3252 else
3253 data_fake.last_closep = &fake;
3254
3255 if (flags & SCF_DO_STCLASS) {
3256 cl_init(pRExC_state, &this_class);
3257 data_fake.start_class = &this_class;
3258 f = SCF_DO_STCLASS_AND;
3259 }
3260 if (flags & SCF_WHILEM_VISITED_POS)
3261 f |= SCF_WHILEM_VISITED_POS;
3262
3263 if (trie->jump[word]) {
3264 if (!nextbranch)
3265 nextbranch = tail - trie->jump[0];
3266 scan= tail - trie->jump[word];
3267 /* We go from the jump point to the branch that follows
3268 it. Note this means we need the vestigal unused branches
3269 even though they arent otherwise used.
3270 */
3271 minnext = study_chunk(pRExC_state, &scan, &deltanext,
3272 (regnode *)nextbranch, &data_fake, f,depth+1);
3273 }
3274 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3275 nextbranch= regnext((regnode*)nextbranch);
3276
3277 if (min1 > (I32)(minnext + trie->minlen))
3278 min1 = minnext + trie->minlen;
3279 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3280 max1 = minnext + deltanext + trie->maxlen;
3281 if (deltanext == I32_MAX)
3282 is_inf = is_inf_internal = 1;
3283
3284 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3285 pars++;
3286
3287 if (data) {
3288 if (data_fake.flags & SF_HAS_EVAL)
3289 data->flags |= SF_HAS_EVAL;
3290 data->whilem_c = data_fake.whilem_c;
3291 }
3292 if (flags & SCF_DO_STCLASS)
3293 cl_or(pRExC_state, &accum, &this_class);
3294 }
3295 }
3296 if (flags & SCF_DO_SUBSTR) {
3297 data->pos_min += min1;
3298 data->pos_delta += max1 - min1;
3299 if (max1 != min1 || is_inf)
3300 data->longest = &(data->longest_float);
3301 }
3302 min += min1;
3303 delta += max1 - min1;
3304 if (flags & SCF_DO_STCLASS_OR) {
3305 cl_or(pRExC_state, data->start_class, &accum);
3306 if (min1) {
3307 cl_and(data->start_class, &and_with);
3308 flags &= ~SCF_DO_STCLASS;
3309 }
3310 }
3311 else if (flags & SCF_DO_STCLASS_AND) {
3312 if (min1) {
3313 cl_and(data->start_class, &accum);
3314 flags &= ~SCF_DO_STCLASS;
3315 }
3316 else {
3317 /* Switch to OR mode: cache the old value of
3318 * data->start_class */
3319 StructCopy(data->start_class, &and_with,
3320 struct regnode_charclass_class);
3321 flags &= ~SCF_DO_STCLASS_AND;
3322 StructCopy(&accum, data->start_class,
3323 struct regnode_charclass_class);
3324 flags |= SCF_DO_STCLASS_OR;
3325 data->start_class->flags |= ANYOF_EOS;
3326 }
3327 }
3328 scan= tail;
3329 continue;
3330 }
3331#else
3332 else if (PL_regkind[OP(scan)] == TRIE) {
3333 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3334 U8*bang=NULL;
3335
3336 min += trie->minlen;
3337 delta += (trie->maxlen - trie->minlen);
3338 flags &= ~SCF_DO_STCLASS; /* xxx */
3339 if (flags & SCF_DO_SUBSTR) {
3340 scan_commit(pRExC_state,data); /* Cannot expect anything... */
3341 data->pos_min += trie->minlen;
3342 data->pos_delta += (trie->maxlen - trie->minlen);
3343 if (trie->maxlen != trie->minlen)
3344 data->longest = &(data->longest_float);
3345 }
3346 if (trie->jump) /* no more substrings -- for now /grr*/
3347 flags &= ~SCF_DO_SUBSTR;
3348 }
3349#endif /* old or new */
3350#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3351 /* Else: zero-length, ignore. */
3352 scan = regnext(scan);
3353 }
3354
3355 finish:
3356 *scanp = scan;
aca2d497 3357 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3358 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3359 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3360 if (is_par > (I32)U8_MAX)
c277df42
IZ
3361 is_par = 0;
3362 if (is_par && pars==1 && data) {
3363 data->flags |= SF_IN_PAR;
3364 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3365 }
3366 else if (pars && data) {
c277df42
IZ
3367 data->flags |= SF_HAS_PAR;
3368 data->flags &= ~SF_IN_PAR;
3369 }
653099ff
GS
3370 if (flags & SCF_DO_STCLASS_OR)
3371 cl_and(data->start_class, &and_with);
786e8c11
YO
3372 if (flags & SCF_TRIE_RESTUDY)
3373 data->flags |= SCF_TRIE_RESTUDY;
c277df42
IZ
3374 return min;
3375}
3376
76e3520e 3377STATIC I32
5f66b61c 3378S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3379{
830247a4 3380 if (RExC_rx->data) {
b81d288d
AB
3381 Renewc(RExC_rx->data,
3382 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3383 char, struct reg_data);
830247a4
IZ
3384 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3385 RExC_rx->data->count += n;
a0ed51b3
LW
3386 }
3387 else {
a02a5408 3388 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3389 char, struct reg_data);
a02a5408 3390 Newx(RExC_rx->data->what, n, U8);
830247a4 3391 RExC_rx->data->count = n;
c277df42 3392 }
830247a4
IZ
3393 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3394 return RExC_rx->data->count - n;
c277df42
IZ
3395}
3396
76234dfb 3397#ifndef PERL_IN_XSUB_RE
d88dccdf 3398void
864dbfa3 3399Perl_reginitcolors(pTHX)
d88dccdf 3400{
97aff369 3401 dVAR;
1df70142 3402 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3403 if (s) {
1df70142
AL
3404 char *t = savepv(s);
3405 int i = 0;
3406 PL_colors[0] = t;
d88dccdf 3407 while (++i < 6) {
1df70142
AL
3408 t = strchr(t, '\t');
3409 if (t) {
3410 *t = '\0';
3411 PL_colors[i] = ++t;
d88dccdf
IZ
3412 }
3413 else
1df70142 3414 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3415 }
3416 } else {
1df70142 3417 int i = 0;
b81d288d 3418 while (i < 6)
06b5626a 3419 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3420 }
3421 PL_colorset = 1;
3422}
76234dfb 3423#endif
8615cb43 3424
07be1b83 3425
786e8c11
YO
3426#ifdef TRIE_STUDY_OPT
3427#define CHECK_RESTUDY_GOTO \
3428 if ( \
3429 (data.flags & SCF_TRIE_RESTUDY) \
3430 && ! restudied++ \
3431 ) goto reStudy
3432#else
3433#define CHECK_RESTUDY_GOTO
3434#endif
a687059c 3435/*
e50aee73 3436 - pregcomp - compile a regular expression into internal code
a687059c
LW
3437 *
3438 * We can't allocate space until we know how big the compiled form will be,
3439 * but we can't compile it (and thus know how big it is) until we've got a
3440 * place to put the code. So we cheat: we compile it twice, once with code
3441 * generation turned off and size counting turned on, and once "for real".
3442 * This also means that we don't allocate space until we are sure that the
3443 * thing really will compile successfully, and we never have to move the
3444 * code and thus invalidate pointers into it. (Note that it has to be in
3445 * one piece because free() must be able to free it all.) [NB: not true in perl]
3446 *
3447 * Beware that the optimization-preparation code in here knows about some
3448 * of the structure of the compiled regexp. [I'll say.]
3449 */
3450regexp *
864dbfa3 3451Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3452{
97aff369 3453 dVAR;
a0d0e21e 3454 register regexp *r;
c277df42 3455 regnode *scan;
c277df42 3456 regnode *first;
a0d0e21e 3457 I32 flags;
a0d0e21e
LW
3458 I32 minlen = 0;
3459 I32 sawplus = 0;
3460 I32 sawopen = 0;
2c2d71f5 3461 scan_data_t data;
830247a4 3462 RExC_state_t RExC_state;
be8e71aa 3463 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3464#ifdef TRIE_STUDY_OPT
3465 int restudied= 0;
3466 RExC_state_t copyRExC_state;
3467#endif
a0d0e21e 3468
a3621e74
YO
3469 GET_RE_DEBUG_FLAGS_DECL;
3470
a0d0e21e 3471 if (exp == NULL)
c277df42 3472 FAIL("NULL regexp argument");
a0d0e21e 3473
a5961de5 3474 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3475
5cfc7842 3476 RExC_precomp = exp;
a3621e74
YO
3477 DEBUG_r(if (!PL_colorset) reginitcolors());
3478 DEBUG_COMPILE_r({
ab3bbdeb
YO
3479 SV *dsv= sv_newmortal();
3480 RE_PV_QUOTED_DECL(s, RExC_utf8,
3481 dsv, RExC_precomp, (xend - exp), 60);
3482 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3483 PL_colors[4],PL_colors[5],s);
a5961de5 3484 });
e2509266 3485 RExC_flags = pm->op_pmflags;
830247a4 3486 RExC_sawback = 0;
bbce6d69 3487
830247a4
IZ
3488 RExC_seen = 0;
3489 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3490 RExC_seen_evals = 0;
3491 RExC_extralen = 0;
c277df42 3492
bbce6d69 3493 /* First pass: determine size, legality. */
830247a4 3494 RExC_parse = exp;
fac92740 3495 RExC_start = exp;
830247a4
IZ
3496 RExC_end = xend;
3497 RExC_naughty = 0;
3498 RExC_npar = 1;
3499 RExC_size = 0L;
3500 RExC_emit = &PL_regdummy;
3501 RExC_whilem_seen = 0;
85ddcde9
JH
3502#if 0 /* REGC() is (currently) a NOP at the first pass.
3503 * Clever compilers notice this and complain. --jhi */
830247a4 3504 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3505#endif
3dab1dad
YO
3506 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3507 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3508 RExC_precomp = NULL;
a0d0e21e
LW
3509 return(NULL);
3510 }
3dab1dad
YO
3511 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3512 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3513 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3514 DEBUG_PARSE_r({
3515 RExC_lastnum=0;
3516 RExC_lastparse=NULL;
3517 });
c277df42 3518
07be1b83 3519
c277df42
IZ
3520 /* Small enough for pointer-storage convention?
3521 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3522 if (RExC_size >= 0x10000L && RExC_extralen)
3523 RExC_size += RExC_extralen;
c277df42 3524 else
830247a4
IZ
3525 RExC_extralen = 0;
3526 if (RExC_whilem_seen > 15)
3527 RExC_whilem_seen = 15;
a0d0e21e 3528
bbce6d69 3529 /* Allocate space and initialize. */
a02a5408 3530 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3531 char, regexp);
a0d0e21e 3532 if (r == NULL)
b45f050a
JF
3533 FAIL("Regexp out of space");
3534
0f79a09d
GS
3535#ifdef DEBUGGING
3536 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3537 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3538#endif
c277df42 3539 r->refcnt = 1;
bbce6d69 3540 r->prelen = xend - exp;
5cfc7842 3541 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3542 r->subbeg = NULL;
f8c7b90f 3543#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3544 r->saved_copy = NULL;
ed252734 3545#endif
cf93c79d 3546 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3547 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3548 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3549
3550 r->substrs = 0; /* Useful during FAIL. */
3551 r->startp = 0; /* Useful during FAIL. */
3552 r->endp = 0; /* Useful during FAIL. */
3553
a02a5408 3554 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3555 if (r->offsets) {
2af232bd 3556 r->offsets[0] = RExC_size;
fac92740 3557 }
a3621e74 3558 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3559 "%s %"UVuf" bytes for offset annotations.\n",
3560 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3561 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3562
830247a4 3563 RExC_rx = r;
bbce6d69 3564
3565 /* Second pass: emit code. */
e2509266 3566 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3567 RExC_parse = exp;
3568 RExC_end = xend;
3569 RExC_naughty = 0;
3570 RExC_npar = 1;
fac92740 3571 RExC_emit_start = r->program;
830247a4 3572 RExC_emit = r->program;
2cd61cdb 3573 /* Store the count of eval-groups for security checks: */
786e8c11 3574 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3575 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3576 r->data = 0;
3dab1dad 3577 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3578 return(NULL);
07be1b83
YO
3579 /* XXXX To minimize changes to RE engine we always allocate
3580 3-units-long substrs field. */
3581 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3582
07be1b83 3583reStudy:
786e8c11 3584 minlen=sawplus=sawopen=0;
07be1b83
YO
3585 Zero(r->substrs, 1, struct reg_substr_data);
3586 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3587
07be1b83
YO
3588#ifdef TRIE_STUDY_OPT
3589 if ( restudied ) {
3590 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3591 RExC_state=copyRExC_state;
3592 if (data.longest_fixed)
3593 SvREFCNT_dec(data.longest_fixed);
3594 if (data.longest_float)
3595 SvREFCNT_dec(data.longest_float);
3596 if (data.last_found)
3597 SvREFCNT_dec(data.last_found);
3598 } else {
3599 copyRExC_state=RExC_state;
3600 }
3601#endif
a0d0e21e 3602 /* Dig out information for optimizations. */
cf93c79d 3603 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3604 pm->op_pmflags = RExC_flags;
a0ed51b3 3605 if (UTF)
5ff6fc6d 3606 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3607 r->regstclass = NULL;
830247a4 3608 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3609 r->reganch |= ROPT_NAUGHTY;
c277df42 3610 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3611
653099ff 3612 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3613 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3614 I32 fake;
c5254dd6 3615 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3616 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3617 int stclass_flag;
07be1b83 3618 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3619
3620 first = scan;
c277df42 3621 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3622 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3623 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3624 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3625 /* for now we can't handle lookbehind IFMATCH*/
3626 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3627 (OP(first) == PLUS) ||
3628 (OP(first) == MINMOD) ||
653099ff 3629 /* An {n,m} with n>0 */
07be1b83
YO
3630 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3631 {
786e8c11 3632
a0d0e21e
LW
3633 if (OP(first) == PLUS)
3634 sawplus = 1;
3635 else
3dab1dad 3636 first += regarglen[OP(first)];
07be1b83
YO
3637 if (OP(first) == IFMATCH) {
3638 first = NEXTOPER(first);
3639 first += EXTRA_STEP_2ARGS;
7c167cea 3640 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3641 first = NEXTOPER(first);
a687059c
LW
3642 }
3643
a0d0e21e
LW
3644 /* Starting-point info. */
3645 again:
786e8c11 3646 DEBUG_PEEP("first:",first,0);
07be1b83 3647 /* Ignore EXACT as we deal with it later. */
3dab1dad 3648 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3649 if (OP(first) == EXACT)
6f207bd3 3650 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3651 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3652 r->regstclass = first;
3653 }
07be1b83 3654#ifdef TRIE_STCLASS
786e8c11 3655 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3656 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3657 {
786e8c11 3658 regnode *trie_op;
07be1b83 3659 /* this can happen only on restudy */
786e8c11
YO
3660 if ( OP(first) == TRIE ) {
3661 struct regnode_1 *trieop;
3662 Newxz(trieop,1,struct regnode_1);
3663 StructCopy(first,trieop,struct regnode_1);
3664