This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] More regex optimisations and debug enhancements (including Andys stuff...
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad
YO
125#ifdef DEBUGGING
126 char *lastparse;
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
27da23d5
JH
198static const scan_data_t zero_scan_data =
199 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
eb160463 230#define UTF (RExC_utf8 != 0)
e2509266
JH
231#define LOC ((RExC_flags & PMf_LOCALE) != 0)
232#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
ccb2c380 259#define FAIL(msg) STMT_START { \
bfed75c6 260 const char *ellipses = ""; \
ccb2c380
MP
261 IV len = RExC_end - RExC_precomp; \
262 \
263 if (!SIZE_ONLY) \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
268 ellipses = "..."; \
269 } \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
272} STMT_END
8615cb43 273
b45f050a 274/*
b45f050a
JF
275 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
276 */
ccb2c380 277#define Simple_vFAIL(m) STMT_START { \
a28509cc 278 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
279 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
280 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
281} STMT_END
b45f050a
JF
282
283/*
284 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
285 */
ccb2c380
MP
286#define vFAIL(m) STMT_START { \
287 if (!SIZE_ONLY) \
288 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 Simple_vFAIL(m); \
290} STMT_END
b45f050a
JF
291
292/*
293 * Like Simple_vFAIL(), but accepts two arguments.
294 */
ccb2c380 295#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 296 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
297 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
298 (int)offset, RExC_precomp, RExC_precomp + offset); \
299} STMT_END
b45f050a
JF
300
301/*
302 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
303 */
ccb2c380
MP
304#define vFAIL2(m,a1) STMT_START { \
305 if (!SIZE_ONLY) \
306 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
307 Simple_vFAIL2(m, a1); \
308} STMT_END
b45f050a
JF
309
310
311/*
312 * Like Simple_vFAIL(), but accepts three arguments.
313 */
ccb2c380 314#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 315 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
316 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
317 (int)offset, RExC_precomp, RExC_precomp + offset); \
318} STMT_END
b45f050a
JF
319
320/*
321 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
322 */
ccb2c380
MP
323#define vFAIL3(m,a1,a2) STMT_START { \
324 if (!SIZE_ONLY) \
325 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
326 Simple_vFAIL3(m, a1, a2); \
327} STMT_END
b45f050a
JF
328
329/*
330 * Like Simple_vFAIL(), but accepts four arguments.
331 */
ccb2c380 332#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 333 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
334 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
335 (int)offset, RExC_precomp, RExC_precomp + offset); \
336} STMT_END
b45f050a 337
ccb2c380 338#define vWARN(loc,m) STMT_START { \
a28509cc 339 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
340 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
341 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
342} STMT_END
343
344#define vWARNdep(loc,m) STMT_START { \
a28509cc 345 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
346 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
347 "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351
352#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 353 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
354 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
355 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 359 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
360 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
361 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
362} STMT_END
363
364#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 365 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
366 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
367 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
369
370#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 371 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
373 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
9d1d55b5 375
8615cb43 376
cd439c50 377/* Allow for side effects in s */
ccb2c380
MP
378#define REGC(c,s) STMT_START { \
379 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
380} STMT_END
cd439c50 381
fac92740
MJD
382/* Macros for recording node offsets. 20001227 mjd@plover.com
383 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
384 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
385 * Element 0 holds the number n.
386 */
387
388#define MJD_OFFSET_DEBUG(x)
a3621e74 389/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
390
391
392#define Set_Node_Offset_To_R(node,byte) STMT_START { \
393 if (! SIZE_ONLY) { \
394 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
395 __LINE__, (node), (byte))); \
396 if((node) < 0) { \
551405c4 397 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
398 } else { \
399 RExC_offsets[2*(node)-1] = (byte); \
400 } \
401 } \
402} STMT_END
403
404#define Set_Node_Offset(node,byte) \
405 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
406#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
407
408#define Set_Node_Length_To_R(node,len) STMT_START { \
409 if (! SIZE_ONLY) { \
410 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 411 __LINE__, (int)(node), (int)(len))); \
ccb2c380 412 if((node) < 0) { \
551405c4 413 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
414 } else { \
415 RExC_offsets[2*(node)] = (len); \
416 } \
417 } \
418} STMT_END
419
420#define Set_Node_Length(node,len) \
421 Set_Node_Length_To_R((node)-RExC_emit_start, len)
422#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
423#define Set_Node_Cur_Length(node) \
424 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
425
426/* Get offsets and lengths */
427#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
428#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
429
acfe0abc 430static void clear_re(pTHX_ void *r);
4327152a 431
653099ff
GS
432/* Mark that we cannot extend a found fixed substring at this point.
433 Updata the longest found anchored substring and the longest found
434 floating substrings if needed. */
435
4327152a 436STATIC void
097eb12c 437S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 438{
e1ec3a88
AL
439 const STRLEN l = CHR_SVLEN(data->last_found);
440 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 441
c277df42 442 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 443 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
444 if (*data->longest == data->longest_fixed) {
445 data->offset_fixed = l ? data->last_start_min : data->pos_min;
446 if (data->flags & SF_BEFORE_EOL)
b81d288d 447 data->flags
c277df42
IZ
448 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
449 else
450 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
451 }
452 else {
c277df42 453 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
454 data->offset_float_max = (l
455 ? data->last_start_max
c277df42 456 : data->pos_min + data->pos_delta);
9051bda5
HS
457 if ((U32)data->offset_float_max > (U32)I32_MAX)
458 data->offset_float_max = I32_MAX;
c277df42 459 if (data->flags & SF_BEFORE_EOL)
b81d288d 460 data->flags
c277df42
IZ
461 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
462 else
463 data->flags &= ~SF_FL_BEFORE_EOL;
464 }
465 }
466 SvCUR_set(data->last_found, 0);
0eda9292 467 {
a28509cc 468 SV * const sv = data->last_found;
097eb12c
AL
469 if (SvUTF8(sv) && SvMAGICAL(sv)) {
470 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
471 if (mg)
472 mg->mg_len = 0;
473 }
0eda9292 474 }
c277df42
IZ
475 data->last_end = -1;
476 data->flags &= ~SF_BEFORE_EOL;
477}
478
653099ff
GS
479/* Can match anything (initialization) */
480STATIC void
097eb12c 481S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 482{
653099ff 483 ANYOF_CLASS_ZERO(cl);
f8bef550 484 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 485 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
486 if (LOC)
487 cl->flags |= ANYOF_LOCALE;
488}
489
490/* Can match anything (initialization) */
491STATIC int
5f66b61c 492S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
493{
494 int value;
495
aaa51d5e 496 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
497 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
498 return 1;
1aa99e6b
IH
499 if (!(cl->flags & ANYOF_UNICODE_ALL))
500 return 0;
f8bef550
NC
501 if (!ANYOF_BITMAP_TESTALLSET(cl))
502 return 0;
653099ff
GS
503 return 1;
504}
505
506/* Can match anything (initialization) */
507STATIC void
097eb12c 508S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 509{
8ecf7187 510 Zero(cl, 1, struct regnode_charclass_class);
653099ff 511 cl->type = ANYOF;
830247a4 512 cl_anything(pRExC_state, cl);
653099ff
GS
513}
514
515STATIC void
097eb12c 516S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 517{
8ecf7187 518 Zero(cl, 1, struct regnode_charclass_class);
653099ff 519 cl->type = ANYOF;
830247a4 520 cl_anything(pRExC_state, cl);
653099ff
GS
521 if (LOC)
522 cl->flags |= ANYOF_LOCALE;
523}
524
525/* 'And' a given class with another one. Can create false positives */
526/* We assume that cl is not inverted */
527STATIC void
5f66b61c 528S_cl_and(struct regnode_charclass_class *cl,
a28509cc 529 const struct regnode_charclass_class *and_with)
653099ff 530{
653099ff
GS
531 if (!(and_with->flags & ANYOF_CLASS)
532 && !(cl->flags & ANYOF_CLASS)
533 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
534 && !(and_with->flags & ANYOF_FOLD)
535 && !(cl->flags & ANYOF_FOLD)) {
536 int i;
537
538 if (and_with->flags & ANYOF_INVERT)
539 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
540 cl->bitmap[i] &= ~and_with->bitmap[i];
541 else
542 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
543 cl->bitmap[i] &= and_with->bitmap[i];
544 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
545 if (!(and_with->flags & ANYOF_EOS))
546 cl->flags &= ~ANYOF_EOS;
1aa99e6b 547
14ebb1a2
JH
548 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
549 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
550 cl->flags &= ~ANYOF_UNICODE_ALL;
551 cl->flags |= ANYOF_UNICODE;
552 ARG_SET(cl, ARG(and_with));
553 }
14ebb1a2
JH
554 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
555 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 556 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
557 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
558 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 559 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
560}
561
562/* 'OR' a given class with another one. Can create false positives */
563/* We assume that cl is not inverted */
564STATIC void
097eb12c 565S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 566{
653099ff
GS
567 if (or_with->flags & ANYOF_INVERT) {
568 /* We do not use
569 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
570 * <= (B1 | !B2) | (CL1 | !CL2)
571 * which is wasteful if CL2 is small, but we ignore CL2:
572 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
573 * XXXX Can we handle case-fold? Unclear:
574 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
575 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
576 */
577 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
578 && !(or_with->flags & ANYOF_FOLD)
579 && !(cl->flags & ANYOF_FOLD) ) {
580 int i;
581
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] |= ~or_with->bitmap[i];
584 } /* XXXX: logic is complicated otherwise */
585 else {
830247a4 586 cl_anything(pRExC_state, cl);
653099ff
GS
587 }
588 } else {
589 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
590 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 591 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
592 || (cl->flags & ANYOF_FOLD)) ) {
593 int i;
594
595 /* OR char bitmap and class bitmap separately */
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= or_with->bitmap[i];
598 if (or_with->flags & ANYOF_CLASS) {
599 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
600 cl->classflags[i] |= or_with->classflags[i];
601 cl->flags |= ANYOF_CLASS;
602 }
603 }
604 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 605 cl_anything(pRExC_state, cl);
653099ff
GS
606 }
607 }
608 if (or_with->flags & ANYOF_EOS)
609 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
610
611 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
612 ARG(cl) != ARG(or_with)) {
613 cl->flags |= ANYOF_UNICODE_ALL;
614 cl->flags &= ~ANYOF_UNICODE;
615 }
616 if (or_with->flags & ANYOF_UNICODE_ALL) {
617 cl->flags |= ANYOF_UNICODE_ALL;
618 cl->flags &= ~ANYOF_UNICODE;
619 }
653099ff
GS
620}
621
5d1c421c 622/*
a3621e74 623
3dab1dad 624 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
625 startbranch: the first branch in the whole branch sequence
626 first : start branch of sequence of branch-exact nodes.
627 May be the same as startbranch
628 last : Thing following the last branch.
629 May be the same as tail.
630 tail : item following the branch sequence
631 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 632 depth : indent depth
a3621e74
YO
633
634Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
635
636A trie is an N'ary tree where the branches are determined by digital
637decomposition of the key. IE, at the root node you look up the 1st character and
638follow that branch repeat until you find the end of the branches. Nodes can be
639marked as "accepting" meaning they represent a complete word. Eg:
640
641 /he|she|his|hers/
642
643would convert into the following structure. Numbers represent states, letters
644following numbers represent valid transitions on the letter from that state, if
645the number is in square brackets it represents an accepting state, otherwise it
646will be in parenthesis.
647
648 +-h->+-e->[3]-+-r->(8)-+-s->[9]
649 | |
650 | (2)
651 | |
652 (1) +-i->(6)-+-s->[7]
653 |
654 +-s->(3)-+-h->(4)-+-e->[5]
655
656 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
657
658This shows that when matching against the string 'hers' we will begin at state 1
659read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
660then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
661is also accepting. Thus we know that we can match both 'he' and 'hers' with a
662single traverse. We store a mapping from accepting to state to which word was
663matched, and then when we have multiple possibilities we try to complete the
664rest of the regex in the order in which they occured in the alternation.
665
666The only prior NFA like behaviour that would be changed by the TRIE support is
667the silent ignoring of duplicate alternations which are of the form:
668
669 / (DUPE|DUPE) X? (?{ ... }) Y /x
670
671Thus EVAL blocks follwing a trie may be called a different number of times with
672and without the optimisation. With the optimisations dupes will be silently
673ignored. This inconsistant behaviour of EVAL type nodes is well established as
674the following demonstrates:
675
676 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
677
678which prints out 'word' three times, but
679
680 'words'=~/(word|word|word)(?{ print $1 })S/
681
682which doesnt print it out at all. This is due to other optimisations kicking in.
683
684Example of what happens on a structural level:
685
686The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
687
688 1: CURLYM[1] {1,32767}(18)
689 5: BRANCH(8)
690 6: EXACT <ac>(16)
691 8: BRANCH(11)
692 9: EXACT <ad>(16)
693 11: BRANCH(14)
694 12: EXACT <ab>(16)
695 16: SUCCEED(0)
696 17: NOTHING(18)
697 18: END(0)
698
699This would be optimizable with startbranch=5, first=5, last=16, tail=16
700and should turn into:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: TRIE(16)
704 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
705 <ac>
706 <ad>
707 <ab>
708 16: SUCCEED(0)
709 17: NOTHING(18)
710 18: END(0)
711
712Cases where tail != last would be like /(?foo|bar)baz/:
713
714 1: BRANCH(4)
715 2: EXACT <foo>(8)
716 4: BRANCH(7)
717 5: EXACT <bar>(8)
718 7: TAIL(8)
719 8: EXACT <baz>(10)
720 10: END(0)
721
722which would be optimizable with startbranch=1, first=1, last=7, tail=8
723and would end up looking like:
724
725 1: TRIE(8)
726 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
727 <foo>
728 <bar>
729 7: TAIL(8)
730 8: EXACT <baz>(10)
731 10: END(0)
732
3dab1dad
YO
733 d = uvuni_to_utf8_flags(d, uv, 0);
734
735is the recommended Unicode-aware way of saying
736
737 *(d++) = uv;
a3621e74
YO
738*/
739
3dab1dad
YO
740#define TRIE_STORE_REVCHAR \
741 STMT_START { \
742 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
743 av_push( TRIE_REVCHARMAP(trie), tmp ); \
744 } STMT_END
a3621e74
YO
745
746#define TRIE_READ_CHAR STMT_START { \
747 if ( UTF ) { \
748 if ( folder ) { \
749 if ( foldlen > 0 ) { \
750 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
751 foldlen -= len; \
752 scan += len; \
753 len = 0; \
754 } else { \
e1ec3a88 755 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
756 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
757 foldlen -= UNISKIP( uvc ); \
758 scan = foldbuf + UNISKIP( uvc ); \
759 } \
760 } else { \
e1ec3a88 761 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
762 } \
763 } else { \
764 uvc = (U32)*uc; \
765 len = 1; \
766 } \
767} STMT_END
768
769
770#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
771#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
772#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
773#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
774
775#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
776 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
777 TRIE_LIST_LEN( state ) *= 2; \
778 Renew( trie->states[ state ].trans.list, \
779 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
780 } \
781 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
782 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
783 TRIE_LIST_CUR( state )++; \
784} STMT_END
785
786#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 787 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
788 4, reg_trie_trans_le ); \
789 TRIE_LIST_CUR( state ) = 1; \
790 TRIE_LIST_LEN( state ) = 4; \
791} STMT_END
792
3dab1dad
YO
793#define TRIE_HANDLE_WORD(state) STMT_START { \
794 if ( !trie->states[ state ].wordnum ) { \
795 /* we havent inserted this word into the structure yet. */\
796 trie->states[ state ].wordnum = ++curword; \
797 DEBUG_r({ \
798 /* store the word for dumping */ \
799 SV* tmp; \
800 if (OP(noper) != NOTHING ) \
801 tmp=newSVpvn( STRING( noper ), STR_LEN( noper ) );\
802 else \
803 tmp=newSVpvn( "", 0 ); \
804 if ( UTF ) SvUTF8_on( tmp ); \
805 av_push( trie->words, tmp ); \
806 }); \
807 } else { \
808 NOOP; /* It's a dupe. So ignore it. */ \
809 } \
810} STMT_END
811
812#ifdef DEBUGGING
813/*
814 dump_trie(trie)
815 dump_trie_interim_list(trie,next_alloc)
816 dump_trie_interim_table(trie,next_alloc)
817
818 These routines dump out a trie in a somewhat readable format.
819 The _interim_ variants are used for debugging the interim
820 tables that are used to generate the final compressed
821 representation which is what dump_trie expects.
822
823 Part of the reason for their existance is to provide a form
824 of documentation as to how the different representations function.
825
826*/
827
828/*
829 dump_trie(trie)
830 Dumps the final compressed table form of the trie to Perl_debug_log.
831 Used for debugging make_trie().
832*/
833
834STATIC void
835S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
836{
837 U32 state;
838 GET_RE_DEBUG_FLAGS_DECL;
839
840 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
841 (int)depth * 2 + 2,"",
842 "Match","Base","Ofs" );
843
844 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
845 SV **tmp = av_fetch( trie->revcharmap, state, 0);
846 if ( tmp ) {
847 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
848 }
849 }
850 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
851 (int)depth * 2 + 2,"");
852
853 for( state = 0 ; state < trie->uniquecharcount ; state++ )
854 PerlIO_printf( Perl_debug_log, "-----");
855 PerlIO_printf( Perl_debug_log, "\n");
856
857 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
858 const U32 base = trie->states[ state ].trans.base;
859
860 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
861
862 if ( trie->states[ state ].wordnum ) {
863 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
864 } else {
865 PerlIO_printf( Perl_debug_log, "%6s", "" );
866 }
867
868 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
869
870 if ( base ) {
871 U32 ofs = 0;
872
873 while( ( base + ofs < trie->uniquecharcount ) ||
874 ( base + ofs - trie->uniquecharcount < trie->lasttrans
875 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
876 ofs++;
877
878 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
879
880 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
881 if ( ( base + ofs >= trie->uniquecharcount ) &&
882 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
883 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
884 {
885 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
886 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
887 } else {
888 PerlIO_printf( Perl_debug_log, "%4s "," ." );
889 }
890 }
891
892 PerlIO_printf( Perl_debug_log, "]");
893
894 }
895 PerlIO_printf( Perl_debug_log, "\n" );
896 }
897}
898/*
899 dump_trie_interim_list(trie,next_alloc)
900 Dumps a fully constructed but uncompressed trie in list form.
901 List tries normally only are used for construction when the number of
902 possible chars (trie->uniquecharcount) is very high.
903 Used for debugging make_trie().
904*/
905STATIC void
906S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
907{
908 U32 state;
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
913 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
918 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
928 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
929 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
930 SvPV_nolen_const( *tmp ),
931 TRIE_LIST_ITEM(state,charid).forid,
932 (UV)TRIE_LIST_ITEM(state,charid).newstate
933 );
934 }
935
936 }
937}
938
939/*
940 dump_trie_interim_table(trie,next_alloc)
941 Dumps a fully constructed but uncompressed trie in table form.
942 This is the normal DFA style state transition table, with a few
943 twists to facilitate compression later.
944 Used for debugging make_trie().
945*/
946STATIC void
947S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
948{
949 U32 state;
950 U16 charid;
951 GET_RE_DEBUG_FLAGS_DECL;
952
953 /*
954 print out the table precompression so that we can do a visual check
955 that they are identical.
956 */
957
958 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
959
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
961 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
962 if ( tmp ) {
963 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
964 }
965 }
966
967 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
968
969 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
970 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
971 }
972
973 PerlIO_printf( Perl_debug_log, "\n" );
974
975 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
976
977 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
978 (int)depth * 2 + 2,"",
979 (UV)TRIE_NODENUM( state ) );
980
981 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
982 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
983 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
984 }
985 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
986 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
987 } else {
988 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
989 trie->states[ TRIE_NODENUM( state ) ].wordnum );
990 }
991 }
992}
993
994#endif
995
996
997
998
999
a3621e74 1000STATIC I32
3dab1dad 1001S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1002{
27da23d5 1003 dVAR;
a3621e74
YO
1004 /* first pass, loop through and scan words */
1005 reg_trie_data *trie;
1006 regnode *cur;
9f7f3913 1007 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1008 STRLEN len = 0;
1009 UV uvc = 0;
1010 U16 curword = 0;
1011 U32 next_alloc = 0;
1012 /* we just use folder as a flag in utf8 */
e1ec3a88 1013 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1014 ? PL_fold
1015 : ( flags == EXACTFL
1016 ? PL_fold_locale
1017 : NULL
1018 )
1019 );
1020
e1ec3a88 1021 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1022 SV *re_trie_maxbuff;
3dab1dad
YO
1023#ifndef DEBUGGING
1024 /* these are only used during construction but are useful during
1025 debugging so we store them in the struct when debugging.
1026 Wordcount is actually superfluous in debugging as we have
1027 (AV*)trie->words to use for it, but thats not available when
1028 not debugging... We could make the macro use the AV during
1029 debugging tho...
1030 */
1031 U16 trie_wordcount=0;
1032 STRLEN trie_charcount=0;
1033 U32 trie_laststate=0;
1034 AV *trie_revcharmap;
1035#endif
a3621e74
YO
1036 GET_RE_DEBUG_FLAGS_DECL;
1037
a02a5408 1038 Newxz( trie, 1, reg_trie_data );
a3621e74 1039 trie->refcount = 1;
3dab1dad 1040 trie->startstate = 1;
a3621e74 1041 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1042 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1043 if (!(UTF && folder))
1044 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1045 DEBUG_r({
1046 trie->words = newAV();
a3621e74 1047 });
3dab1dad 1048 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1049
0111c4fd 1050 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1051 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1052 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1053 }
3dab1dad
YO
1054 DEBUG_OPTIMISE_r({
1055 PerlIO_printf( Perl_debug_log,
1056 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1057 (int)depth * 2 + 2, "",
1058 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1059 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1060 });
a3621e74
YO
1061 /* -- First loop and Setup --
1062
1063 We first traverse the branches and scan each word to determine if it
1064 contains widechars, and how many unique chars there are, this is
1065 important as we have to build a table with at least as many columns as we
1066 have unique chars.
1067
1068 We use an array of integers to represent the character codes 0..255
1069 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1070 native representation of the character value as the key and IV's for the
1071 coded index.
1072
1073 *TODO* If we keep track of how many times each character is used we can
1074 remap the columns so that the table compression later on is more
1075 efficient in terms of memory by ensuring most common value is in the
1076 middle and the least common are on the outside. IMO this would be better
1077 than a most to least common mapping as theres a decent chance the most
1078 common letter will share a node with the least common, meaning the node
1079 will not be compressable. With a middle is most common approach the worst
1080 case is when we have the least common nodes twice.
1081
1082 */
1083
a3621e74 1084 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1085 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1086 const U8 *uc = (U8*)STRING( noper );
a28509cc 1087 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1088 STRLEN foldlen = 0;
1089 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1090 const U8 *scan = (U8*)NULL;
3dab1dad 1091 STRLEN chars=0;
a3621e74 1092
3dab1dad
YO
1093 TRIE_WORDCOUNT(trie)++;
1094 if (OP(noper) == NOTHING) {
1095 trie->minlen= 0;
1096 continue;
1097 }
1098 if (trie->bitmap) {
1099 TRIE_BITMAP_SET(trie,*uc);
1100 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1101 }
a3621e74 1102 for ( ; uc < e ; uc += len ) {
3dab1dad 1103 TRIE_CHARCOUNT(trie)++;
a3621e74 1104 TRIE_READ_CHAR;
3dab1dad 1105 chars++;
a3621e74
YO
1106 if ( uvc < 256 ) {
1107 if ( !trie->charmap[ uvc ] ) {
1108 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1109 if ( folder )
1110 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1111 TRIE_STORE_REVCHAR;
a3621e74
YO
1112 }
1113 } else {
1114 SV** svpp;
1115 if ( !trie->widecharmap )
1116 trie->widecharmap = newHV();
1117
1118 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1119
1120 if ( !svpp )
e4584336 1121 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1122
1123 if ( !SvTRUE( *svpp ) ) {
1124 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1125 TRIE_STORE_REVCHAR;
a3621e74
YO
1126 }
1127 }
1128 }
3dab1dad
YO
1129 if( cur == first ) {
1130 trie->minlen=chars;
1131 trie->maxlen=chars;
1132 } else if (chars < trie->minlen) {
1133 trie->minlen=chars;
1134 } else if (chars > trie->maxlen) {
1135 trie->maxlen=chars;
1136 }
1137
a3621e74
YO
1138 } /* end first pass */
1139 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1140 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1141 (int)depth * 2 + 2,"",
1142 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1143 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
a3621e74
YO
1144 );
1145
1146
1147 /*
1148 We now know what we are dealing with in terms of unique chars and
1149 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1150 representation using a flat table will take. If it's over a reasonable
1151 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1152 conservative but potentially much slower representation using an array
1153 of lists.
1154
1155 At the end we convert both representations into the same compressed
1156 form that will be used in regexec.c for matching with. The latter
1157 is a form that cannot be used to construct with but has memory
1158 properties similar to the list form and access properties similar
1159 to the table form making it both suitable for fast searches and
1160 small enough that its feasable to store for the duration of a program.
1161
1162 See the comment in the code where the compressed table is produced
1163 inplace from the flat tabe representation for an explanation of how
1164 the compression works.
1165
1166 */
1167
1168
3dab1dad 1169 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1170 /*
1171 Second Pass -- Array Of Lists Representation
1172
1173 Each state will be represented by a list of charid:state records
1174 (reg_trie_trans_le) the first such element holds the CUR and LEN
1175 points of the allocated array. (See defines above).
1176
1177 We build the initial structure using the lists, and then convert
1178 it into the compressed table form which allows faster lookups
1179 (but cant be modified once converted).
a3621e74
YO
1180 */
1181
a3621e74
YO
1182 STRLEN transcount = 1;
1183
3dab1dad 1184 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1185 TRIE_LIST_NEW(1);
1186 next_alloc = 2;
1187
1188 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1189
c445ea15
AL
1190 regnode * const noper = NEXTOPER( cur );
1191 U8 *uc = (U8*)STRING( noper );
1192 const U8 * const e = uc + STR_LEN( noper );
1193 U32 state = 1; /* required init */
1194 U16 charid = 0; /* sanity init */
1195 U8 *scan = (U8*)NULL; /* sanity init */
1196 STRLEN foldlen = 0; /* required init */
1197 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1198
3dab1dad 1199 if (OP(noper) != NOTHING) {
c445ea15
AL
1200 for ( ; uc < e ; uc += len ) {
1201
1202 TRIE_READ_CHAR;
1203
1204 if ( uvc < 256 ) {
1205 charid = trie->charmap[ uvc ];
1206 } else {
1207 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1208 if ( !svpp ) {
1209 charid = 0;
1210 } else {
1211 charid=(U16)SvIV( *svpp );
1212 }
1213 }
1214 if ( charid ) {
a3621e74 1215
c445ea15
AL
1216 U16 check;
1217 U32 newstate = 0;
a3621e74 1218
c445ea15
AL
1219 charid--;
1220 if ( !trie->states[ state ].trans.list ) {
1221 TRIE_LIST_NEW( state );
1222 }
1223 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1224 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1225 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1226 break;
1227 }
1228 }
1229 if ( ! newstate ) {
1230 newstate = next_alloc++;
1231 TRIE_LIST_PUSH( state, charid, newstate );
1232 transcount++;
1233 }
1234 state = newstate;
1235 } else {
1236 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1237 }
c445ea15
AL
1238 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1239 }
c445ea15 1240 }
3dab1dad 1241 TRIE_HANDLE_WORD(state);
a3621e74
YO
1242
1243 } /* end second pass */
1244
3dab1dad 1245 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1246 Renew( trie->states, next_alloc, reg_trie_state );
1247
3dab1dad
YO
1248 /* and now dump it out before we compress it */
1249 DEBUG_TRIE_COMPILE_MORE_r(
1250 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1251 );
a3621e74 1252
a02a5408 1253 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1254 {
1255 U32 state;
a3621e74
YO
1256 U32 tp = 0;
1257 U32 zp = 0;
1258
1259
1260 for( state=1 ; state < next_alloc ; state ++ ) {
1261 U32 base=0;
1262
1263 /*
1264 DEBUG_TRIE_COMPILE_MORE_r(
1265 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1266 );
1267 */
1268
1269 if (trie->states[state].trans.list) {
1270 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1271 U16 maxid=minid;
a28509cc 1272 U16 idx;
a3621e74
YO
1273
1274 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1275 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1276 if ( forid < minid ) {
1277 minid=forid;
1278 } else if ( forid > maxid ) {
1279 maxid=forid;
1280 }
a3621e74
YO
1281 }
1282 if ( transcount < tp + maxid - minid + 1) {
1283 transcount *= 2;
1284 Renew( trie->trans, transcount, reg_trie_trans );
1285 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1286 }
1287 base = trie->uniquecharcount + tp - minid;
1288 if ( maxid == minid ) {
1289 U32 set = 0;
1290 for ( ; zp < tp ; zp++ ) {
1291 if ( ! trie->trans[ zp ].next ) {
1292 base = trie->uniquecharcount + zp - minid;
1293 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1294 trie->trans[ zp ].check = state;
1295 set = 1;
1296 break;
1297 }
1298 }
1299 if ( !set ) {
1300 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1301 trie->trans[ tp ].check = state;
1302 tp++;
1303 zp = tp;
1304 }
1305 } else {
1306 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1307 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1308 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1309 trie->trans[ tid ].check = state;
1310 }
1311 tp += ( maxid - minid + 1 );
1312 }
1313 Safefree(trie->states[ state ].trans.list);
1314 }
1315 /*
1316 DEBUG_TRIE_COMPILE_MORE_r(
1317 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1318 );
1319 */
1320 trie->states[ state ].trans.base=base;
1321 }
cc601c31 1322 trie->lasttrans = tp + 1;
a3621e74
YO
1323 }
1324 } else {
1325 /*
1326 Second Pass -- Flat Table Representation.
1327
1328 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1329 We know that we will need Charcount+1 trans at most to store the data
1330 (one row per char at worst case) So we preallocate both structures
1331 assuming worst case.
1332
1333 We then construct the trie using only the .next slots of the entry
1334 structs.
1335
1336 We use the .check field of the first entry of the node temporarily to
1337 make compression both faster and easier by keeping track of how many non
1338 zero fields are in the node.
1339
1340 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1341 transition.
1342
1343 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1344 number representing the first entry of the node, and state as a
1345 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1346 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1347 are 2 entrys per node. eg:
1348
1349 A B A B
1350 1. 2 4 1. 3 7
1351 2. 0 3 3. 0 5
1352 3. 0 0 5. 0 0
1353 4. 0 0 7. 0 0
1354
1355 The table is internally in the right hand, idx form. However as we also
1356 have to deal with the states array which is indexed by nodenum we have to
1357 use TRIE_NODENUM() to convert.
1358
1359 */
1360
3dab1dad
YO
1361
1362 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1363 reg_trie_trans );
3dab1dad 1364 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1365 next_alloc = trie->uniquecharcount + 1;
1366
3dab1dad 1367
a3621e74
YO
1368 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1369
c445ea15 1370 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1371 const U8 *uc = (U8*)STRING( noper );
1372 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1373
1374 U32 state = 1; /* required init */
1375
1376 U16 charid = 0; /* sanity init */
1377 U32 accept_state = 0; /* sanity init */
1378 U8 *scan = (U8*)NULL; /* sanity init */
1379
1380 STRLEN foldlen = 0; /* required init */
1381 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1382
3dab1dad 1383 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1384 for ( ; uc < e ; uc += len ) {
1385
1386 TRIE_READ_CHAR;
1387
1388 if ( uvc < 256 ) {
1389 charid = trie->charmap[ uvc ];
1390 } else {
c445ea15
AL
1391 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1392 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1393 }
1394 if ( charid ) {
1395 charid--;
1396 if ( !trie->trans[ state + charid ].next ) {
1397 trie->trans[ state + charid ].next = next_alloc;
1398 trie->trans[ state ].check++;
1399 next_alloc += trie->uniquecharcount;
1400 }
1401 state = trie->trans[ state + charid ].next;
1402 } else {
e4584336 1403 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1404 }
1405 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1406 }
a3621e74 1407 }
3dab1dad
YO
1408 accept_state = TRIE_NODENUM( state );
1409 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1410
1411 } /* end second pass */
1412
3dab1dad
YO
1413 /* and now dump it out before we compress it */
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 dump_trie_interim_table(trie,next_alloc,depth+1)
1416 );
a3621e74 1417
a3621e74
YO
1418 {
1419 /*
1420 * Inplace compress the table.*
1421
1422 For sparse data sets the table constructed by the trie algorithm will
1423 be mostly 0/FAIL transitions or to put it another way mostly empty.
1424 (Note that leaf nodes will not contain any transitions.)
1425
1426 This algorithm compresses the tables by eliminating most such
1427 transitions, at the cost of a modest bit of extra work during lookup:
1428
1429 - Each states[] entry contains a .base field which indicates the
1430 index in the state[] array wheres its transition data is stored.
1431
1432 - If .base is 0 there are no valid transitions from that node.
1433
1434 - If .base is nonzero then charid is added to it to find an entry in
1435 the trans array.
1436
1437 -If trans[states[state].base+charid].check!=state then the
1438 transition is taken to be a 0/Fail transition. Thus if there are fail
1439 transitions at the front of the node then the .base offset will point
1440 somewhere inside the previous nodes data (or maybe even into a node
1441 even earlier), but the .check field determines if the transition is
1442 valid.
1443
1444 The following process inplace converts the table to the compressed
1445 table: We first do not compress the root node 1,and mark its all its
1446 .check pointers as 1 and set its .base pointer as 1 as well. This
1447 allows to do a DFA construction from the compressed table later, and
1448 ensures that any .base pointers we calculate later are greater than
1449 0.
1450
1451 - We set 'pos' to indicate the first entry of the second node.
1452
1453 - We then iterate over the columns of the node, finding the first and
1454 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1455 and set the .check pointers accordingly, and advance pos
1456 appropriately and repreat for the next node. Note that when we copy
1457 the next pointers we have to convert them from the original
1458 NODEIDX form to NODENUM form as the former is not valid post
1459 compression.
1460
1461 - If a node has no transitions used we mark its base as 0 and do not
1462 advance the pos pointer.
1463
1464 - If a node only has one transition we use a second pointer into the
1465 structure to fill in allocated fail transitions from other states.
1466 This pointer is independent of the main pointer and scans forward
1467 looking for null transitions that are allocated to a state. When it
1468 finds one it writes the single transition into the "hole". If the
1469 pointer doesnt find one the single transition is appeneded as normal.
1470
1471 - Once compressed we can Renew/realloc the structures to release the
1472 excess space.
1473
1474 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1475 specifically Fig 3.47 and the associated pseudocode.
1476
1477 demq
1478 */
a3b680e6 1479 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1480 U32 state, charid;
a3621e74 1481 U32 pos = 0, zp=0;
3dab1dad 1482 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1483
1484 for ( state = 1 ; state < laststate ; state++ ) {
1485 U8 flag = 0;
a28509cc
AL
1486 const U32 stateidx = TRIE_NODEIDX( state );
1487 const U32 o_used = trie->trans[ stateidx ].check;
1488 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1489 trie->trans[ stateidx ].check = 0;
1490
1491 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1492 if ( flag || trie->trans[ stateidx + charid ].next ) {
1493 if ( trie->trans[ stateidx + charid ].next ) {
1494 if (o_used == 1) {
1495 for ( ; zp < pos ; zp++ ) {
1496 if ( ! trie->trans[ zp ].next ) {
1497 break;
1498 }
1499 }
1500 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1501 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1502 trie->trans[ zp ].check = state;
1503 if ( ++zp > pos ) pos = zp;
1504 break;
1505 }
1506 used--;
1507 }
1508 if ( !flag ) {
1509 flag = 1;
1510 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1511 }
1512 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1513 trie->trans[ pos ].check = state;
1514 pos++;
1515 }
1516 }
1517 }
cc601c31 1518 trie->lasttrans = pos + 1;
a3621e74
YO
1519 Renew( trie->states, laststate + 1, reg_trie_state);
1520 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1521 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1522 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1523 (int)depth * 2 + 2,"",
1524 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1525 (IV)next_alloc,
1526 (IV)pos,
a3621e74
YO
1527 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1528 );
1529
1530 } /* end table compress */
1531 }
cc601c31
YO
1532 /* resize the trans array to remove unused space */
1533 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1534
3dab1dad
YO
1535 /* and now dump out the compressed format */
1536 DEBUG_TRIE_COMPILE_r(
1537 dump_trie(trie,depth+1)
1538 );
1539
1540 { /* Modify the program and insert the new TRIE node*/
1541 regnode *convert;
1542 U8 nodetype =(U8)(flags & 0xFF);
1543 char *str=NULL;
a3621e74 1544 /*
3dab1dad
YO
1545 This means we convert either the first branch or the first Exact,
1546 depending on whether the thing following (in 'last') is a branch
1547 or not and whther first is the startbranch (ie is it a sub part of
1548 the alternation or is it the whole thing.)
1549 Assuming its a sub part we conver the EXACT otherwise we convert
1550 the whole branch sequence, including the first.
a3621e74 1551 */
3dab1dad
YO
1552 /* Find the node we are going to overwrite */
1553 if ( first == startbranch && OP( last ) != BRANCH ) {
1554 convert = first;
a3621e74 1555 } else {
3dab1dad
YO
1556 convert = NEXTOPER( first );
1557 NEXT_OFF( first ) = (U16)(last - first);
a3621e74
YO
1558 }
1559
3dab1dad
YO
1560 /* But first we check to see if there is a common prefix we can
1561 split out as an EXACT and put in front of the TRIE node. */
1562 trie->startstate= 1;
1563 if ( trie->bitmap && !trie->widecharmap ) {
1564 U32 state;
1565 DEBUG_OPTIMISE_r(
1566 PerlIO_printf( Perl_debug_log,"%*sLaststate:%d\n",
1567 (int)depth * 2 + 2,"",
1568 TRIE_LASTSTATE(trie)));
1569 for( state= 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1570 U32 ofs = 0;
3dab1dad
YO
1571 I32 idx= -1;
1572 U32 count= 0;
1573 const U32 base= trie->states[ state ].trans.base;
a3621e74 1574
3dab1dad
YO
1575 if ( trie->states[state].wordnum )
1576 count =1;
a3621e74 1577
3dab1dad
YO
1578 for ( ofs= 0 ; ofs < trie->uniquecharcount ; ofs++ )
1579 {
a3621e74 1580
cc601c31
YO
1581 if ( ( base + ofs >= trie->uniquecharcount ) &&
1582 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1583 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1584 {
3dab1dad
YO
1585 if ( ++count > 1 ) {
1586 SV **tmp= av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1587 const char *ch= SvPV_nolen_const( *tmp );
1588 if (state==1) break;
1589 if ( count == 2 ) {
1590 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1591 DEBUG_OPTIMISE_r(
1592 PerlIO_printf( Perl_debug_log,"%*sNew Start State=%d Class: [",
1593 (int)depth * 2 + 2,"",
1594 state));
1595 if (idx>-1) {
1596 SV **tmp= av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1597 const char *ch= SvPV_nolen_const( *tmp );
1598
1599 TRIE_BITMAP_SET(trie,*ch);
1600 if ( folder )
1601 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1602 DEBUG_OPTIMISE_r(
1603 PerlIO_printf( Perl_debug_log,"%s", ch)
1604 );
a3621e74
YO
1605 }
1606 }
3dab1dad
YO
1607 TRIE_BITMAP_SET(trie,*ch);
1608 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *ch ]);
1609 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
a3621e74 1610 }
3dab1dad 1611 idx= ofs;
a3621e74 1612 }
3dab1dad
YO
1613 }
1614 if ( count == 1 ) {
1615 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1616 const char *ch= SvPV_nolen_const( *tmp );
1617 DEBUG_OPTIMISE_r(
1618 PerlIO_printf( Perl_debug_log,"%*sPrefix State: %d Idx:%d Char='%s'\n",
1619 (int)depth * 2 + 2,"",
1620 state, idx, ch)
1621 );
1622 if ( state==1 ) {
1623 OP( convert ) = nodetype;
1624 str=STRING(convert);
1625 STR_LEN(convert)=0;
1626 }
1627 *str++=*ch;
1628 STR_LEN(convert)++;
a3621e74 1629
a3621e74 1630 } else {
3dab1dad
YO
1631 if (state>1)
1632 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1633 break;
a3621e74 1634 }
3dab1dad
YO
1635 }
1636 if (str) {
1637 regnode *n= convert+NODE_SZ_STR(convert);
1638 NEXT_OFF(convert)= NODE_SZ_STR(convert);
1639 trie->startstate= state;
1640 trie->minlen-= (state-1);
1641 trie->maxlen-= (state-1);
1642 if (trie->maxlen)
1643 convert= n;
1644 else {
1645 NEXT_OFF(convert) = (U16)(tail - convert);
1646 }
1647 }
1648 }
1649 if ( trie->maxlen ) {
1650 OP( convert ) = TRIE;
a3621e74
YO
1651 NEXT_OFF( convert ) = (U16)(tail - convert);
1652 ARG_SET( convert, data_slot );
1653
3dab1dad
YO
1654 /* store the type in the flags */
1655 convert->flags = nodetype;
1656 /* XXX We really should free up the resource in trie now, as we wont use them */
1657 }
a3621e74
YO
1658 /* needed for dumping*/
1659 DEBUG_r({
1660 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1661 /* We now need to mark all of the space originally used by the
1662 branches as optimized away. This keeps the dumpuntil from
1663 throwing a wobbly as it doesnt use regnext() to traverse the
1664 opcodes.
1665 */
1666 while( optimize < last ) {
1667 OP( optimize ) = OPTIMIZED;
1668 optimize++;
1669 }
1670 });
1671 } /* end node insert */
1672 return 1;
1673}
1674
a3621e74 1675/*
5d1c421c
JH
1676 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1677 * These need to be revisited when a newer toolchain becomes available.
1678 */
1679#if defined(__sparc64__) && defined(__GNUC__)
1680# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1681# undef SPARC64_GCC_WORKAROUND
1682# define SPARC64_GCC_WORKAROUND 1
1683# endif
1684#endif
1685
653099ff
GS
1686/* REx optimizer. Converts nodes into quickier variants "in place".
1687 Finds fixed substrings. */
1688
a0288114 1689/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1690 to the position after last scanned or to NULL. */
1691
76e3520e 1692STATIC I32
9a957fbc
AL
1693S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1694 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1695 /* scanp: Start here (read-write). */
1696 /* deltap: Write maxlen-minlen here. */
1697 /* last: Stop before this one. */
1698{
97aff369 1699 dVAR;
c277df42
IZ
1700 I32 min = 0, pars = 0, code;
1701 regnode *scan = *scanp, *next;
1702 I32 delta = 0;
1703 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1704 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1705 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1706 scan_data_t data_fake;
653099ff 1707 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1708 SV *re_trie_maxbuff = NULL;
1709
1710 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1711
3dab1dad 1712PEEP:
c277df42 1713 while (scan && OP(scan) != END && scan < last) {
3dab1dad
YO
1714 #ifdef DEBUGGING
1715 int merged=0;
1716 #endif
c277df42 1717 /* Peephole optimizer: */
a3621e74 1718 DEBUG_OPTIMISE_r({
c445ea15 1719 SV * const mysv=sv_newmortal();
32fc9b6a 1720 regprop(RExC_rx, mysv, scan);
3dab1dad
YO
1721 PerlIO_printf(Perl_debug_log, "%*s%4s~ %s (%d)\n",
1722 (int)depth*2, "",
1723 scan==*scanp ? "Peep" : "",
1724 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
a3621e74 1725 });
3dab1dad 1726 if (PL_regkind[OP(scan)] == EXACT) {
653099ff 1727 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1728 regnode *n = regnext(scan);
1729 U32 stringok = 1;
1730#ifdef DEBUGGING
1731 regnode *stop = scan;
b81d288d 1732#endif
cd439c50 1733 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1734 /* Skip NOTHING, merge EXACT*. */
1735 while (n &&
3dab1dad 1736 ( PL_regkind[OP(n)] == NOTHING ||
c277df42
IZ
1737 (stringok && (OP(n) == OP(scan))))
1738 && NEXT_OFF(n)
1739 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1740 if (OP(n) == TAIL || n > next)
1741 stringok = 0;
3dab1dad
YO
1742 if (PL_regkind[OP(n)] == NOTHING) {
1743 DEBUG_OPTIMISE_r({
1744 SV * const mysv=sv_newmortal();
1745 regprop(RExC_rx, mysv, n);
1746 PerlIO_printf(Perl_debug_log, "%*sskip: %s (%d)\n",
1747 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1748 });
c277df42
IZ
1749 NEXT_OFF(scan) += NEXT_OFF(n);
1750 next = n + NODE_STEP_REGNODE;
1751#ifdef DEBUGGING
1752 if (stringok)
1753 stop = n;
b81d288d 1754#endif
c277df42 1755 n = regnext(n);
a0ed51b3 1756 }
f49d4d0f 1757 else if (stringok) {
a3b680e6 1758 const int oldl = STR_LEN(scan);
c445ea15 1759 regnode * const nnext = regnext(n);
3dab1dad
YO
1760 DEBUG_OPTIMISE_r({
1761 SV * const mysv=sv_newmortal();
1762 regprop(RExC_rx, mysv, n);
1763 PerlIO_printf(Perl_debug_log, "%*s mrg: %s (%d)\n",
1764 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1765 merged++;
1766 });
b81d288d 1767 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1768 break;
1769 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1770 STR_LEN(scan) += STR_LEN(n);
1771 next = n + NODE_SZ_STR(n);
c277df42 1772 /* Now we can overwrite *n : */
f49d4d0f 1773 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1774#ifdef DEBUGGING
f49d4d0f 1775 stop = next - 1;
b81d288d 1776#endif
c277df42
IZ
1777 n = nnext;
1778 }
1779 }
61a36c01 1780
a3621e74 1781 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1782/*
1783 Two problematic code points in Unicode casefolding of EXACT nodes:
1784
1785 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1786 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1787
1788 which casefold to
1789
1790 Unicode UTF-8
1791
1792 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1793 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1794
1795 This means that in case-insensitive matching (or "loose matching",
1796 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1797 length of the above casefolded versions) can match a target string
1798 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1799 This would rather mess up the minimum length computation.
1800
1801 What we'll do is to look for the tail four bytes, and then peek
1802 at the preceding two bytes to see whether we need to decrease
1803 the minimum length by four (six minus two).
1804
1805 Thanks to the design of UTF-8, there cannot be false matches:
1806 A sequence of valid UTF-8 bytes cannot be a subsequence of
1807 another valid sequence of UTF-8 bytes.
1808
1809*/
c445ea15
AL
1810 char * const s0 = STRING(scan), *s, *t;
1811 char * const s1 = s0 + STR_LEN(scan) - 1;
1812 char * const s2 = s1 - 4;
d4c19fe8 1813 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1814 const char * const t1 = t0 + 3;
2af232bd 1815
61a36c01
JH
1816 for (s = s0 + 2;
1817 s < s2 && (t = ninstr(s, s1, t0, t1));
1818 s = t + 4) {
1819 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1820 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1821 min -= 4;
1822 }
1823 }
1824
c277df42
IZ
1825#ifdef DEBUGGING
1826 /* Allow dumping */
cd439c50 1827 n = scan + NODE_SZ_STR(scan);
c277df42 1828 while (n <= stop) {
3dab1dad 1829 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1830 OP(n) = OPTIMIZED;
1831 NEXT_OFF(n) = 0;
1832 }
1833 n++;
1834 }
653099ff 1835#endif
c277df42 1836 }
a3621e74
YO
1837
1838
1839
653099ff
GS
1840 /* Follow the next-chain of the current node and optimize
1841 away all the NOTHINGs from it. */
c277df42 1842 if (OP(scan) != CURLYX) {
a3b680e6 1843 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1844 ? I32_MAX
1845 /* I32 may be smaller than U16 on CRAYs! */
1846 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1847 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1848 int noff;
1849 regnode *n = scan;
b81d288d 1850
c277df42
IZ
1851 /* Skip NOTHING and LONGJMP. */
1852 while ((n = regnext(n))
3dab1dad 1853 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1854 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1855 && off + noff < max)
1856 off += noff;
1857 if (reg_off_by_arg[OP(scan)])
1858 ARG(scan) = off;
b81d288d 1859 else
c277df42
IZ
1860 NEXT_OFF(scan) = off;
1861 }
a3621e74 1862
3dab1dad
YO
1863 DEBUG_OPTIMISE_r({if (merged){
1864 SV * const mysv=sv_newmortal();
1865 regprop(RExC_rx, mysv, scan);
1866 PerlIO_printf(Perl_debug_log, "%*s res: %s (%d)\n",
1867 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
1868 }});
1869
653099ff
GS
1870 /* The principal pseudo-switch. Cannot be a switch, since we
1871 look into several different things. */
b81d288d 1872 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1873 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1874 next = regnext(scan);
1875 code = OP(scan);
a3621e74 1876 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1877
1878 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1879 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1880 struct regnode_charclass_class accum;
d4c19fe8 1881 regnode * const startbranch=scan;
c277df42 1882
653099ff 1883 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1884 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1885 if (flags & SCF_DO_STCLASS)
830247a4 1886 cl_init_zero(pRExC_state, &accum);
a3621e74 1887
c277df42 1888 while (OP(scan) == code) {
830247a4 1889 I32 deltanext, minnext, f = 0, fake;
653099ff 1890 struct regnode_charclass_class this_class;
c277df42
IZ
1891
1892 num++;
1893 data_fake.flags = 0;
b81d288d 1894 if (data) {
2c2d71f5 1895 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1896 data_fake.last_closep = data->last_closep;
1897 }
1898 else
1899 data_fake.last_closep = &fake;
c277df42
IZ
1900 next = regnext(scan);
1901 scan = NEXTOPER(scan);
1902 if (code != BRANCH)
1903 scan = NEXTOPER(scan);
653099ff 1904 if (flags & SCF_DO_STCLASS) {
830247a4 1905 cl_init(pRExC_state, &this_class);
653099ff
GS
1906 data_fake.start_class = &this_class;
1907 f = SCF_DO_STCLASS_AND;
b81d288d 1908 }
e1901655
IZ
1909 if (flags & SCF_WHILEM_VISITED_POS)
1910 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1911
653099ff 1912 /* we suppose the run is continuous, last=next...*/
830247a4 1913 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1914 next, &data_fake, f,depth+1);
b81d288d 1915 if (min1 > minnext)
c277df42
IZ
1916 min1 = minnext;
1917 if (max1 < minnext + deltanext)
1918 max1 = minnext + deltanext;
1919 if (deltanext == I32_MAX)
aca2d497 1920 is_inf = is_inf_internal = 1;
c277df42
IZ
1921 scan = next;
1922 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1923 pars++;
3dab1dad
YO
1924 if (data) {
1925 if (data_fake.flags & SF_HAS_EVAL)
c277df42 1926 data->flags |= SF_HAS_EVAL;
2c2d71f5 1927 data->whilem_c = data_fake.whilem_c;
3dab1dad 1928 }
653099ff 1929 if (flags & SCF_DO_STCLASS)
830247a4 1930 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1931 if (code == SUSPEND)
c277df42
IZ
1932 break;
1933 }
1934 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1935 min1 = 0;
1936 if (flags & SCF_DO_SUBSTR) {
1937 data->pos_min += min1;
1938 data->pos_delta += max1 - min1;
1939 if (max1 != min1 || is_inf)
1940 data->longest = &(data->longest_float);
1941 }
1942 min += min1;
1943 delta += max1 - min1;
653099ff 1944 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1945 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1946 if (min1) {
1947 cl_and(data->start_class, &and_with);
1948 flags &= ~SCF_DO_STCLASS;
1949 }
1950 }
1951 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1952 if (min1) {
1953 cl_and(data->start_class, &accum);
653099ff 1954 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1955 }
1956 else {
b81d288d 1957 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1958 * data->start_class */
1959 StructCopy(data->start_class, &and_with,
1960 struct regnode_charclass_class);
1961 flags &= ~SCF_DO_STCLASS_AND;
1962 StructCopy(&accum, data->start_class,
1963 struct regnode_charclass_class);
1964 flags |= SCF_DO_STCLASS_OR;
1965 data->start_class->flags |= ANYOF_EOS;
1966 }
653099ff 1967 }
a3621e74
YO
1968
1969 /* demq.
1970
1971 Assuming this was/is a branch we are dealing with: 'scan' now
1972 points at the item that follows the branch sequence, whatever
1973 it is. We now start at the beginning of the sequence and look
1974 for subsequences of
1975
1976 BRANCH->EXACT=>X
1977 BRANCH->EXACT=>X
1978
1979 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1980
1981 If we can find such a subseqence we need to turn the first
1982 element into a trie and then add the subsequent branch exact
1983 strings to the trie.
1984
1985 We have two cases
1986
1987 1. patterns where the whole set of branch can be converted to a trie,
1988
1989 2. patterns where only a subset of the alternations can be
1990 converted to a trie.
1991
1992 In case 1 we can replace the whole set with a single regop
1993 for the trie. In case 2 we need to keep the start and end
1994 branchs so
1995
1996 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1997 becomes BRANCH TRIE; BRANCH X;
1998
1999 Hypthetically when we know the regex isnt anchored we can
2000 turn a case 1 into a DFA and let it rip... Every time it finds a match
2001 it would just call its tail, no WHILEM/CURLY needed.
2002
2003 */
0111c4fd 2004 if (DO_TRIE) {
3dab1dad 2005 int made=0;
0111c4fd
RGS
2006 if (!re_trie_maxbuff) {
2007 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2008 if (!SvIOK(re_trie_maxbuff))
2009 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2010 }
a3621e74
YO
2011 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2012 regnode *cur;
2013 regnode *first = (regnode *)NULL;
2014 regnode *last = (regnode *)NULL;
2015 regnode *tail = scan;
2016 U8 optype = 0;
2017 U32 count=0;
2018
2019#ifdef DEBUGGING
c445ea15 2020 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2021#endif
2022 /* var tail is used because there may be a TAIL
2023 regop in the way. Ie, the exacts will point to the
2024 thing following the TAIL, but the last branch will
2025 point at the TAIL. So we advance tail. If we
2026 have nested (?:) we may have to move through several
2027 tails.
2028 */
2029
2030 while ( OP( tail ) == TAIL ) {
2031 /* this is the TAIL generated by (?:) */
2032 tail = regnext( tail );
2033 }
2034
3dab1dad 2035
a3621e74 2036 DEBUG_OPTIMISE_r({
32fc9b6a 2037 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2038 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2039 (int)depth * 2 + 2, "",
2040 "Looking for TRIE'able sequences. Tail node is: ",
2041 SvPV_nolen_const( mysv )
a3621e74
YO
2042 );
2043 });
3dab1dad 2044
a3621e74
YO
2045 /*
2046
2047 step through the branches, cur represents each
2048 branch, noper is the first thing to be matched
2049 as part of that branch and noper_next is the
2050 regnext() of that node. if noper is an EXACT
2051 and noper_next is the same as scan (our current
2052 position in the regex) then the EXACT branch is
2053 a possible optimization target. Once we have
2054 two or more consequetive such branches we can
2055 create a trie of the EXACT's contents and stich
2056 it in place. If the sequence represents all of
2057 the branches we eliminate the whole thing and
2058 replace it with a single TRIE. If it is a
2059 subsequence then we need to stitch it in. This
2060 means the first branch has to remain, and needs
2061 to be repointed at the item on the branch chain
2062 following the last branch optimized. This could
2063 be either a BRANCH, in which case the
2064 subsequence is internal, or it could be the
2065 item following the branch sequence in which
2066 case the subsequence is at the end.
2067
2068 */
2069
2070 /* dont use tail as the end marker for this traverse */
2071 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2072 regnode * const noper = NEXTOPER( cur );
2073 regnode * const noper_next = regnext( noper );
a3621e74 2074
a3621e74 2075 DEBUG_OPTIMISE_r({
32fc9b6a 2076 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2077 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2078 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2079
32fc9b6a 2080 regprop(RExC_rx, mysv, noper);
a3621e74 2081 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2082 SvPV_nolen_const(mysv));
a3621e74
YO
2083
2084 if ( noper_next ) {
32fc9b6a 2085 regprop(RExC_rx, mysv, noper_next );
a3621e74 2086 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2087 SvPV_nolen_const(mysv));
a3621e74 2088 }
3dab1dad
YO
2089 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2090 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2091 });
3dab1dad
YO
2092 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2093 : PL_regkind[ OP( noper ) ] == EXACT )
2094 || OP(noper) == NOTHING )
a3621e74
YO
2095 && noper_next == tail && count<U16_MAX)
2096 {
2097 count++;
3dab1dad
YO
2098 if ( !first || optype == NOTHING ) {
2099 if (!first) first = cur;
a3621e74
YO
2100 optype = OP( noper );
2101 } else {
a3621e74 2102 last = cur;
a3621e74
YO
2103 }
2104 } else {
2105 if ( last ) {
3dab1dad 2106 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2107 }
3dab1dad 2108 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2109 && noper_next == tail )
2110 {
2111 count = 1;
2112 first = cur;
2113 optype = OP( noper );
2114 } else {
2115 count = 0;
2116 first = NULL;
2117 optype = 0;
2118 }
2119 last = NULL;
2120 }
2121 }
2122 DEBUG_OPTIMISE_r({
32fc9b6a 2123 regprop(RExC_rx, mysv, cur);
a3621e74 2124 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2125 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2126 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2127
2128 });
2129 if ( last ) {
3dab1dad
YO
2130 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2131#ifdef TRIE_STUDY_OPT
2132 if ( OP(first)!=TRIE && startbranch == first ) {
2133
a3621e74 2134 }
3dab1dad 2135#endif
a3621e74
YO
2136 }
2137 }
3dab1dad
YO
2138
2139 } /* do trie */
a0ed51b3 2140 }
a3621e74 2141 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2142 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2143 } else /* single branch is optimized. */
c277df42
IZ
2144 scan = NEXTOPER(scan);
2145 continue;
a0ed51b3
LW
2146 }
2147 else if (OP(scan) == EXACT) {
cd439c50 2148 I32 l = STR_LEN(scan);
c445ea15 2149 UV uc;
a0ed51b3 2150 if (UTF) {
a3b680e6 2151 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2152 l = utf8_length(s, s + l);
9041c2e3 2153 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2154 } else {
2155 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2156 }
2157 min += l;
c277df42 2158 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2159 /* The code below prefers earlier match for fixed
2160 offset, later match for variable offset. */
2161 if (data->last_end == -1) { /* Update the start info. */
2162 data->last_start_min = data->pos_min;
2163 data->last_start_max = is_inf
b81d288d 2164 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2165 }
cd439c50 2166 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2167 if (UTF)
2168 SvUTF8_on(data->last_found);
0eda9292 2169 {
9a957fbc 2170 SV * const sv = data->last_found;
a28509cc 2171 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2172 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2173 if (mg && mg->mg_len >= 0)
5e43f467
JH
2174 mg->mg_len += utf8_length((U8*)STRING(scan),
2175 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2176 }
c277df42
IZ
2177 data->last_end = data->pos_min + l;
2178 data->pos_min += l; /* As in the first entry. */
2179 data->flags &= ~SF_BEFORE_EOL;
2180 }
653099ff
GS
2181 if (flags & SCF_DO_STCLASS_AND) {
2182 /* Check whether it is compatible with what we know already! */
2183 int compat = 1;
2184
1aa99e6b 2185 if (uc >= 0x100 ||
516a5887 2186 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2187 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2188 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2189 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2190 )
653099ff
GS
2191 compat = 0;
2192 ANYOF_CLASS_ZERO(data->start_class);
2193 ANYOF_BITMAP_ZERO(data->start_class);
2194 if (compat)
1aa99e6b 2195 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2196 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2197 if (uc < 0x100)
2198 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2199 }
2200 else if (flags & SCF_DO_STCLASS_OR) {
2201 /* false positive possible if the class is case-folded */
1aa99e6b 2202 if (uc < 0x100)
9b877dbb
IH
2203 ANYOF_BITMAP_SET(data->start_class, uc);
2204 else
2205 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2206 data->start_class->flags &= ~ANYOF_EOS;
2207 cl_and(data->start_class, &and_with);
2208 }
2209 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2210 }
3dab1dad 2211 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2212 I32 l = STR_LEN(scan);
1aa99e6b 2213 UV uc = *((U8*)STRING(scan));
653099ff
GS
2214
2215 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2216 if (flags & SCF_DO_SUBSTR) {
2217 assert(data);
830247a4 2218 scan_commit(pRExC_state, data);
ecaa9b9c 2219 }
a0ed51b3 2220 if (UTF) {
6136c704 2221 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2222 l = utf8_length(s, s + l);
9041c2e3 2223 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2224 }
2225 min += l;
ecaa9b9c 2226 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2227 data->pos_min += l;
653099ff
GS
2228 if (flags & SCF_DO_STCLASS_AND) {
2229 /* Check whether it is compatible with what we know already! */
2230 int compat = 1;
2231
1aa99e6b 2232 if (uc >= 0x100 ||
516a5887 2233 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2234 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2235 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2236 compat = 0;
2237 ANYOF_CLASS_ZERO(data->start_class);
2238 ANYOF_BITMAP_ZERO(data->start_class);
2239 if (compat) {
1aa99e6b 2240 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2241 data->start_class->flags &= ~ANYOF_EOS;
2242 data->start_class->flags |= ANYOF_FOLD;
2243 if (OP(scan) == EXACTFL)
2244 data->start_class->flags |= ANYOF_LOCALE;
2245 }
2246 }
2247 else if (flags & SCF_DO_STCLASS_OR) {
2248 if (data->start_class->flags & ANYOF_FOLD) {
2249 /* false positive possible if the class is case-folded.
2250 Assume that the locale settings are the same... */
1aa99e6b
IH
2251 if (uc < 0x100)
2252 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2253 data->start_class->flags &= ~ANYOF_EOS;
2254 }
2255 cl_and(data->start_class, &and_with);
2256 }
2257 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2258 }
3dab1dad
YO
2259#ifdef TRIE_STUDY_OPT
2260 else if (OP(scan) == TRIE) {
2261 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2262 min += trie->minlen;
2263 flags &= ~SCF_DO_STCLASS; /* xxx */
2264 if (flags & SCF_DO_SUBSTR) {
2265 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2266 data->pos_min += trie->minlen;
2267 data->pos_delta+= (trie->maxlen-trie->minlen);
2268 }
2269 }
2270#endif
bfed75c6 2271 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2272 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2273 I32 f = flags, pos_before = 0;
d4c19fe8 2274 regnode * const oscan = scan;
653099ff
GS
2275 struct regnode_charclass_class this_class;
2276 struct regnode_charclass_class *oclass = NULL;
727f22e3 2277 I32 next_is_eval = 0;
653099ff 2278
3dab1dad 2279 switch (PL_regkind[OP(scan)]) {
653099ff 2280 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2281 scan = NEXTOPER(scan);
2282 goto finish;
2283 case PLUS:
653099ff 2284 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2285 next = NEXTOPER(scan);
653099ff 2286 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2287 mincount = 1;
2288 maxcount = REG_INFTY;
c277df42
IZ
2289 next = regnext(scan);
2290 scan = NEXTOPER(scan);
2291 goto do_curly;
2292 }
2293 }
2294 if (flags & SCF_DO_SUBSTR)
2295 data->pos_min++;
2296 min++;
2297 /* Fall through. */
2298 case STAR:
653099ff
GS
2299 if (flags & SCF_DO_STCLASS) {
2300 mincount = 0;
b81d288d 2301 maxcount = REG_INFTY;
653099ff
GS
2302 next = regnext(scan);
2303 scan = NEXTOPER(scan);
2304 goto do_curly;
2305 }
b81d288d 2306 is_inf = is_inf_internal = 1;
c277df42
IZ
2307 scan = regnext(scan);
2308 if (flags & SCF_DO_SUBSTR) {
830247a4 2309 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2310 data->longest = &(data->longest_float);
2311 }
2312 goto optimize_curly_tail;
2313 case CURLY:
b81d288d 2314 mincount = ARG1(scan);
c277df42
IZ
2315 maxcount = ARG2(scan);
2316 next = regnext(scan);
cb434fcc
IZ
2317 if (OP(scan) == CURLYX) {
2318 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2319 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2320 }
c277df42 2321 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2322 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2323 do_curly:
2324 if (flags & SCF_DO_SUBSTR) {
830247a4 2325 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2326 pos_before = data->pos_min;
2327 }
2328 if (data) {
2329 fl = data->flags;
2330 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2331 if (is_inf)
2332 data->flags |= SF_IS_INF;
2333 }
653099ff 2334 if (flags & SCF_DO_STCLASS) {
830247a4 2335 cl_init(pRExC_state, &this_class);
653099ff
GS
2336 oclass = data->start_class;
2337 data->start_class = &this_class;
2338 f |= SCF_DO_STCLASS_AND;
2339 f &= ~SCF_DO_STCLASS_OR;
2340 }
e1901655
IZ
2341 /* These are the cases when once a subexpression
2342 fails at a particular position, it cannot succeed
2343 even after backtracking at the enclosing scope.
b81d288d 2344
e1901655
IZ
2345 XXXX what if minimal match and we are at the
2346 initial run of {n,m}? */
2347 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2348 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2349
c277df42 2350 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2351 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2352 (mincount == 0
2353 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2354
2355 if (flags & SCF_DO_STCLASS)
2356 data->start_class = oclass;
2357 if (mincount == 0 || minnext == 0) {
2358 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2359 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2360 }
2361 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2362 /* Switch to OR mode: cache the old value of
653099ff
GS
2363 * data->start_class */
2364 StructCopy(data->start_class, &and_with,
2365 struct regnode_charclass_class);
2366 flags &= ~SCF_DO_STCLASS_AND;
2367 StructCopy(&this_class, data->start_class,
2368 struct regnode_charclass_class);
2369 flags |= SCF_DO_STCLASS_OR;
2370 data->start_class->flags |= ANYOF_EOS;
2371 }
2372 } else { /* Non-zero len */
2373 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2374 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2375 cl_and(data->start_class, &and_with);
2376 }
2377 else if (flags & SCF_DO_STCLASS_AND)
2378 cl_and(data->start_class, &this_class);
2379 flags &= ~SCF_DO_STCLASS;
2380 }
c277df42
IZ
2381 if (!scan) /* It was not CURLYX, but CURLY. */
2382 scan = next;
041457d9
DM
2383 if ( /* ? quantifier ok, except for (?{ ... }) */
2384 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2385 && (minnext == 0) && (deltanext == 0)
99799961 2386 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2387 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2388 && ckWARN(WARN_REGEXP))
b45f050a 2389 {
830247a4 2390 vWARN(RExC_parse,
b45f050a
JF
2391 "Quantifier unexpected on zero-length expression");
2392 }
2393
c277df42 2394 min += minnext * mincount;
b81d288d 2395 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2396 && (minnext + deltanext) > 0)
2397 || deltanext == I32_MAX);
aca2d497 2398 is_inf |= is_inf_internal;
c277df42
IZ
2399 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2400
2401 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2402 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2403 && data->flags & SF_IN_PAR
2404 && !(data->flags & SF_HAS_EVAL)
2405 && !deltanext && minnext == 1 ) {
2406 /* Try to optimize to CURLYN. */
2407 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2408 regnode * const nxt1 = nxt;
497b47a8
JH
2409#ifdef DEBUGGING
2410 regnode *nxt2;
2411#endif
c277df42
IZ
2412
2413 /* Skip open. */
2414 nxt = regnext(nxt);
bfed75c6 2415 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2416 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2417 && STR_LEN(nxt) == 1))
c277df42 2418 goto nogo;
497b47a8 2419#ifdef DEBUGGING
c277df42 2420 nxt2 = nxt;
497b47a8 2421#endif
c277df42 2422 nxt = regnext(nxt);
b81d288d 2423 if (OP(nxt) != CLOSE)
c277df42
IZ
2424 goto nogo;
2425 /* Now we know that nxt2 is the only contents: */
eb160463 2426 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2427 OP(oscan) = CURLYN;
2428 OP(nxt1) = NOTHING; /* was OPEN. */
2429#ifdef DEBUGGING
2430 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2431 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2432 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2433 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2434 OP(nxt + 1) = OPTIMIZED; /* was count. */
2435 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2436#endif
c277df42 2437 }
c277df42
IZ
2438 nogo:
2439
2440 /* Try optimization CURLYX => CURLYM. */
b81d288d 2441 if ( OP(oscan) == CURLYX && data
c277df42 2442 && !(data->flags & SF_HAS_PAR)
c277df42 2443 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2444 && !deltanext /* atom is fixed width */
2445 && minnext != 0 /* CURLYM can't handle zero width */
2446 ) {
c277df42
IZ
2447 /* XXXX How to optimize if data == 0? */
2448 /* Optimize to a simpler form. */
2449 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2450 regnode *nxt2;
2451
2452 OP(oscan) = CURLYM;
2453 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2454 && (OP(nxt2) != WHILEM))
c277df42
IZ
2455 nxt = nxt2;
2456 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2457 /* Need to optimize away parenths. */
2458 if (data->flags & SF_IN_PAR) {
2459 /* Set the parenth number. */
2460 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2461
b81d288d 2462 if (OP(nxt) != CLOSE)
b45f050a 2463 FAIL("Panic opt close");
eb160463 2464 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2465 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2466 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2467#ifdef DEBUGGING
2468 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2469 OP(nxt + 1) = OPTIMIZED; /* was count. */
2470 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2471 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2472#endif
c277df42
IZ
2473#if 0
2474 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2475 regnode *nnxt = regnext(nxt1);
b81d288d 2476
c277df42
IZ
2477 if (nnxt == nxt) {
2478 if (reg_off_by_arg[OP(nxt1)])
2479 ARG_SET(nxt1, nxt2 - nxt1);
2480 else if (nxt2 - nxt1 < U16_MAX)
2481 NEXT_OFF(nxt1) = nxt2 - nxt1;
2482 else
2483 OP(nxt) = NOTHING; /* Cannot beautify */
2484 }
2485 nxt1 = nnxt;
2486 }
2487#endif
2488 /* Optimize again: */
b81d288d 2489 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2490 NULL, 0,depth+1);
a0ed51b3
LW
2491 }
2492 else
c277df42 2493 oscan->flags = 0;
c277df42 2494 }
e1901655
IZ
2495 else if ((OP(oscan) == CURLYX)
2496 && (flags & SCF_WHILEM_VISITED_POS)
2497 /* See the comment on a similar expression above.
2498 However, this time it not a subexpression
2499 we care about, but the expression itself. */
2500 && (maxcount == REG_INFTY)
2501 && data && ++data->whilem_c < 16) {
2502 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2503 /* Find WHILEM (as in regexec.c) */
2504 regnode *nxt = oscan + NEXT_OFF(oscan);
2505
2506 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2507 nxt += ARG(nxt);
eb160463
GS
2508 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2509 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2510 }
b81d288d 2511 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2512 pars++;
2513 if (flags & SCF_DO_SUBSTR) {
c445ea15 2514 SV *last_str = NULL;
c277df42
IZ
2515 int counted = mincount != 0;
2516
2517 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2518#if defined(SPARC64_GCC_WORKAROUND)
2519 I32 b = 0;
2520 STRLEN l = 0;
cfd0369c 2521 const char *s = NULL;
5d1c421c
JH
2522 I32 old = 0;
2523
2524 if (pos_before >= data->last_start_min)
2525 b = pos_before;
2526 else
2527 b = data->last_start_min;
2528
2529 l = 0;
cfd0369c 2530 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2531 old = b - data->last_start_min;
2532
2533#else
b81d288d 2534 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2535 ? pos_before : data->last_start_min;
2536 STRLEN l;
d4c19fe8 2537 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2538 I32 old = b - data->last_start_min;
5d1c421c 2539#endif
a0ed51b3
LW
2540
2541 if (UTF)
2542 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2543
a0ed51b3 2544 l -= old;
c277df42 2545 /* Get the added string: */
79cb57f6 2546 last_str = newSVpvn(s + old, l);
0e933229
IH
2547 if (UTF)
2548 SvUTF8_on(last_str);
c277df42
IZ
2549 if (deltanext == 0 && pos_before == b) {
2550 /* What was added is a constant string */
2551 if (mincount > 1) {
2552 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2553 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2554 SvPVX_const(last_str), l, mincount - 1);
b162af07 2555 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2556 /* Add additional parts. */
b81d288d 2557 SvCUR_set(data->last_found,
c277df42
IZ
2558 SvCUR(data->last_found) - l);
2559 sv_catsv(data->last_found, last_str);
0eda9292
JH
2560 {
2561 SV * sv = data->last_found;
2562 MAGIC *mg =
2563 SvUTF8(sv) && SvMAGICAL(sv) ?
2564 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2565 if (mg && mg->mg_len >= 0)
2566 mg->mg_len += CHR_SVLEN(last_str);
2567 }
c277df42
IZ
2568 data->last_end += l * (mincount - 1);
2569 }
2a8d9689
HS
2570 } else {
2571 /* start offset must point into the last copy */
2572 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2573 data->last_start_max += is_inf ? I32_MAX
2574 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2575 }
2576 }
2577 /* It is counted once already... */
2578 data->pos_min += minnext * (mincount - counted);
2579 data->pos_delta += - counted * deltanext +
2580 (minnext + deltanext) * maxcount - minnext * mincount;
2581 if (mincount != maxcount) {
653099ff
GS
2582 /* Cannot extend fixed substrings found inside
2583 the group. */
830247a4 2584 scan_commit(pRExC_state,data);
c277df42 2585 if (mincount && last_str) {
d4c19fe8
AL
2586 SV * const sv = data->last_found;
2587 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2588 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2589
2590 if (mg)
2591 mg->mg_len = -1;
2592 sv_setsv(sv, last_str);
c277df42 2593 data->last_end = data->pos_min;
b81d288d 2594 data->last_start_min =
a0ed51b3 2595 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2596 data->last_start_max = is_inf
2597 ? I32_MAX
c277df42 2598 : data->pos_min + data->pos_delta
a0ed51b3 2599 - CHR_SVLEN(last_str);
c277df42
IZ
2600 }
2601 data->longest = &(data->longest_float);
2602 }
aca2d497 2603 SvREFCNT_dec(last_str);
c277df42 2604 }
405ff068 2605 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2606 data->flags |= SF_HAS_EVAL;
2607 optimize_curly_tail:
c277df42 2608 if (OP(oscan) != CURLYX) {
3dab1dad 2609 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2610 && NEXT_OFF(next))
2611 NEXT_OFF(oscan) += NEXT_OFF(next);
2612 }
c277df42 2613 continue;
653099ff 2614 default: /* REF and CLUMP only? */
c277df42 2615 if (flags & SCF_DO_SUBSTR) {
830247a4 2616 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2617 data->longest = &(data->longest_float);
2618 }
aca2d497 2619 is_inf = is_inf_internal = 1;
653099ff 2620 if (flags & SCF_DO_STCLASS_OR)
830247a4 2621 cl_anything(pRExC_state, data->start_class);
653099ff 2622 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2623 break;
2624 }
a0ed51b3 2625 }
bfed75c6 2626 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2627 int value = 0;
653099ff 2628
c277df42 2629 if (flags & SCF_DO_SUBSTR) {
830247a4 2630 scan_commit(pRExC_state,data);
c277df42
IZ
2631 data->pos_min++;
2632 }
2633 min++;
653099ff
GS
2634 if (flags & SCF_DO_STCLASS) {
2635 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2636
2637 /* Some of the logic below assumes that switching
2638 locale on will only add false positives. */
3dab1dad 2639 switch (PL_regkind[OP(scan)]) {
653099ff 2640 case SANY:
653099ff
GS
2641 default:
2642 do_default:
2643 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2645 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2646 break;
2647 case REG_ANY:
2648 if (OP(scan) == SANY)
2649 goto do_default;
2650 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2651 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2652 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2653 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2654 }
2655 if (flags & SCF_DO_STCLASS_AND || !value)
2656 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2657 break;
2658 case ANYOF:
2659 if (flags & SCF_DO_STCLASS_AND)
2660 cl_and(data->start_class,
2661 (struct regnode_charclass_class*)scan);
2662 else
830247a4 2663 cl_or(pRExC_state, data->start_class,
653099ff
GS
2664 (struct regnode_charclass_class*)scan);
2665 break;
2666 case ALNUM:
2667 if (flags & SCF_DO_STCLASS_AND) {
2668 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2669 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2670 for (value = 0; value < 256; value++)
2671 if (!isALNUM(value))
2672 ANYOF_BITMAP_CLEAR(data->start_class, value);
2673 }
2674 }
2675 else {
2676 if (data->start_class->flags & ANYOF_LOCALE)
2677 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2678 else {
2679 for (value = 0; value < 256; value++)
2680 if (isALNUM(value))
b81d288d 2681 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2682 }
2683 }
2684 break;
2685 case ALNUML:
2686 if (flags & SCF_DO_STCLASS_AND) {
2687 if (data->start_class->flags & ANYOF_LOCALE)
2688 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2689 }
2690 else {
2691 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2692 data->start_class->flags |= ANYOF_LOCALE;
2693 }
2694 break;
2695 case NALNUM:
2696 if (flags & SCF_DO_STCLASS_AND) {
2697 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2698 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2699 for (value = 0; value < 256; value++)
2700 if (isALNUM(value))
2701 ANYOF_BITMAP_CLEAR(data->start_class, value);
2702 }
2703 }
2704 else {
2705 if (data->start_class->flags & ANYOF_LOCALE)
2706 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2707 else {
2708 for (value = 0; value < 256; value++)
2709 if (!isALNUM(value))
b81d288d 2710 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2711 }
2712 }
2713 break;
2714 case NALNUML:
2715 if (flags & SCF_DO_STCLASS_AND) {
2716 if (data->start_class->flags & ANYOF_LOCALE)
2717 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2718 }
2719 else {
2720 data->start_class->flags |= ANYOF_LOCALE;
2721 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2722 }
2723 break;
2724 case SPACE:
2725 if (flags & SCF_DO_STCLASS_AND) {
2726 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2727 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2728 for (value = 0; value < 256; value++)
2729 if (!isSPACE(value))
2730 ANYOF_BITMAP_CLEAR(data->start_class, value);
2731 }
2732 }
2733 else {
2734 if (data->start_class->flags & ANYOF_LOCALE)
2735 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2736 else {
2737 for (value = 0; value < 256; value++)
2738 if (isSPACE(value))
b81d288d 2739 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2740 }
2741 }
2742 break;
2743 case SPACEL:
2744 if (flags & SCF_DO_STCLASS_AND) {
2745 if (data->start_class->flags & ANYOF_LOCALE)
2746 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2747 }
2748 else {
2749 data->start_class->flags |= ANYOF_LOCALE;
2750 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2751 }
2752 break;
2753 case NSPACE:
2754 if (flags & SCF_DO_STCLASS_AND) {
2755 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2756 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2757 for (value = 0; value < 256; value++)
2758 if (isSPACE(value))
2759 ANYOF_BITMAP_CLEAR(data->start_class, value);
2760 }
2761 }
2762 else {
2763 if (data->start_class->flags & ANYOF_LOCALE)
2764 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2765 else {
2766 for (value = 0; value < 256; value++)
2767 if (!isSPACE(value))
b81d288d 2768 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2769 }
2770 }
2771 break;
2772 case NSPACEL:
2773 if (flags & SCF_DO_STCLASS_AND) {
2774 if (data->start_class->flags & ANYOF_LOCALE) {
2775 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2776 for (value = 0; value < 256; value++)
2777 if (!isSPACE(value))
2778 ANYOF_BITMAP_CLEAR(data->start_class, value);
2779 }
2780 }
2781 else {
2782 data->start_class->flags |= ANYOF_LOCALE;
2783 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2784 }
2785 break;
2786 case DIGIT:
2787 if (flags & SCF_DO_STCLASS_AND) {
2788 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2789 for (value = 0; value < 256; value++)
2790 if (!isDIGIT(value))
2791 ANYOF_BITMAP_CLEAR(data->start_class, value);
2792 }
2793 else {
2794 if (data->start_class->flags & ANYOF_LOCALE)
2795 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2796 else {
2797 for (value = 0; value < 256; value++)
2798 if (isDIGIT(value))
b81d288d 2799 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2800 }
2801 }
2802 break;
2803 case NDIGIT:
2804 if (flags & SCF_DO_STCLASS_AND) {
2805 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2806 for (value = 0; value < 256; value++)
2807 if (isDIGIT(value))
2808 ANYOF_BITMAP_CLEAR(data->start_class, value);
2809 }
2810 else {
2811 if (data->start_class->flags & ANYOF_LOCALE)
2812 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2813 else {
2814 for (value = 0; value < 256; value++)
2815 if (!isDIGIT(value))
b81d288d 2816 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2817 }
2818 }
2819 break;
2820 }
2821 if (flags & SCF_DO_STCLASS_OR)
2822 cl_and(data->start_class, &and_with);
2823 flags &= ~SCF_DO_STCLASS;
2824 }
a0ed51b3 2825 }
3dab1dad 2826 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2827 data->flags |= (OP(scan) == MEOL
2828 ? SF_BEFORE_MEOL
2829 : SF_BEFORE_SEOL);
a0ed51b3 2830 }
3dab1dad 2831 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
2832 /* Lookbehind, or need to calculate parens/evals/stclass: */
2833 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2834 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2835 /* Lookahead/lookbehind */
cb434fcc 2836 I32 deltanext, minnext, fake = 0;
c277df42 2837 regnode *nscan;
653099ff
GS
2838 struct regnode_charclass_class intrnl;
2839 int f = 0;
c277df42
IZ
2840
2841 data_fake.flags = 0;
b81d288d 2842 if (data) {
2c2d71f5 2843 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2844 data_fake.last_closep = data->last_closep;
2845 }
2846 else
2847 data_fake.last_closep = &fake;
653099ff
GS
2848 if ( flags & SCF_DO_STCLASS && !scan->flags
2849 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2850 cl_init(pRExC_state, &intrnl);
653099ff 2851 data_fake.start_class = &intrnl;
e1901655 2852 f |= SCF_DO_STCLASS_AND;
653099ff 2853 }
e1901655
IZ
2854 if (flags & SCF_WHILEM_VISITED_POS)
2855 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2856 next = regnext(scan);
2857 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2858 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2859 if (scan->flags) {
2860 if (deltanext) {
9baa0206 2861 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2862 }
2863 else if (minnext > U8_MAX) {
9baa0206 2864 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2865 }
eb160463 2866 scan->flags = (U8)minnext;
c277df42
IZ
2867 }
2868 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2869 pars++;
405ff068 2870 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2871 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2872 if (data)
2873 data->whilem_c = data_fake.whilem_c;
e1901655 2874 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2875 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2876
2877 cl_and(data->start_class, &intrnl);
2878 if (was)
2879 data->start_class->flags |= ANYOF_EOS;
2880 }
a0ed51b3
LW
2881 }
2882 else if (OP(scan) == OPEN) {
c277df42 2883 pars++;
a0ed51b3 2884 }
cb434fcc 2885 else if (OP(scan) == CLOSE) {
eb160463 2886 if ((I32)ARG(scan) == is_par) {
cb434fcc 2887 next = regnext(scan);
c277df42 2888
cb434fcc
IZ
2889 if ( next && (OP(next) != WHILEM) && next < last)
2890 is_par = 0; /* Disable optimization */
2891 }
2892 if (data)
2893 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2894 }
2895 else if (OP(scan) == EVAL) {
c277df42
IZ
2896 if (data)
2897 data->flags |= SF_HAS_EVAL;
2898 }
96776eda 2899 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2900 if (flags & SCF_DO_SUBSTR) {
830247a4 2901 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2902 data->longest = &(data->longest_float);
2903 }
2904 is_inf = is_inf_internal = 1;
653099ff 2905 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2906 cl_anything(pRExC_state, data->start_class);
96776eda 2907 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2908 }
c277df42
IZ
2909 /* Else: zero-length, ignore. */
2910 scan = regnext(scan);
2911 }
2912
2913 finish:
2914 *scanp = scan;
aca2d497 2915 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2916 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2917 data->pos_delta = I32_MAX - data->pos_min;
2918 if (is_par > U8_MAX)
2919 is_par = 0;
2920 if (is_par && pars==1 && data) {
2921 data->flags |= SF_IN_PAR;
2922 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2923 }
2924 else if (pars && data) {
c277df42
IZ
2925 data->flags |= SF_HAS_PAR;
2926 data->flags &= ~SF_IN_PAR;
2927 }
653099ff
GS
2928 if (flags & SCF_DO_STCLASS_OR)
2929 cl_and(data->start_class, &and_with);
c277df42
IZ
2930 return min;
2931}
2932
76e3520e 2933STATIC I32
5f66b61c 2934S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2935{
830247a4 2936 if (RExC_rx->data) {
b81d288d
AB
2937 Renewc(RExC_rx->data,
2938 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2939 char, struct reg_data);
830247a4
IZ
2940 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2941 RExC_rx->data->count += n;
a0ed51b3
LW
2942 }
2943 else {
a02a5408 2944 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2945 char, struct reg_data);
a02a5408 2946 Newx(RExC_rx->data->what, n, U8);
830247a4 2947 RExC_rx->data->count = n;
c277df42 2948 }
830247a4
IZ
2949 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2950 return RExC_rx->data->count - n;
c277df42
IZ
2951}
2952
76234dfb 2953#ifndef PERL_IN_XSUB_RE
d88dccdf 2954void
864dbfa3 2955Perl_reginitcolors(pTHX)
d88dccdf 2956{
97aff369 2957 dVAR;
1df70142 2958 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2959 if (s) {
1df70142
AL
2960 char *t = savepv(s);
2961 int i = 0;
2962 PL_colors[0] = t;
d88dccdf 2963 while (++i < 6) {
1df70142
AL
2964 t = strchr(t, '\t');
2965 if (t) {
2966 *t = '\0';
2967 PL_colors[i] = ++t;
d88dccdf
IZ
2968 }
2969 else
1df70142 2970 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2971 }
2972 } else {
1df70142 2973 int i = 0;
b81d288d 2974 while (i < 6)
06b5626a 2975 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2976 }
2977 PL_colorset = 1;
2978}
76234dfb 2979#endif
8615cb43 2980
a687059c 2981/*
e50aee73 2982 - pregcomp - compile a regular expression into internal code
a687059c
LW
2983 *
2984 * We can't allocate space until we know how big the compiled form will be,
2985 * but we can't compile it (and thus know how big it is) until we've got a
2986 * place to put the code. So we cheat: we compile it twice, once with code
2987 * generation turned off and size counting turned on, and once "for real".
2988 * This also means that we don't allocate space until we are sure that the
2989 * thing really will compile successfully, and we never have to move the
2990 * code and thus invalidate pointers into it. (Note that it has to be in
2991 * one piece because free() must be able to free it all.) [NB: not true in perl]
2992 *
2993 * Beware that the optimization-preparation code in here knows about some
2994 * of the structure of the compiled regexp. [I'll say.]
2995 */
2996regexp *
864dbfa3 2997Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2998{
97aff369 2999 dVAR;
a0d0e21e 3000 register regexp *r;
c277df42 3001 regnode *scan;
c277df42 3002 regnode *first;
a0d0e21e 3003 I32 flags;
a0d0e21e
LW
3004 I32 minlen = 0;
3005 I32 sawplus = 0;
3006 I32 sawopen = 0;
2c2d71f5 3007 scan_data_t data;
830247a4
IZ
3008 RExC_state_t RExC_state;
3009 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 3010
a3621e74
YO
3011 GET_RE_DEBUG_FLAGS_DECL;
3012
a0d0e21e 3013 if (exp == NULL)
c277df42 3014 FAIL("NULL regexp argument");
a0d0e21e 3015
a5961de5 3016 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3017
5cfc7842 3018 RExC_precomp = exp;
a3621e74
YO
3019 DEBUG_r(if (!PL_colorset) reginitcolors());
3020 DEBUG_COMPILE_r({
3021 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3022 PL_colors[4],PL_colors[5],PL_colors[0],
3023 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3024 });
e2509266 3025 RExC_flags = pm->op_pmflags;
830247a4 3026 RExC_sawback = 0;
bbce6d69 3027
830247a4
IZ
3028 RExC_seen = 0;
3029 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3030 RExC_seen_evals = 0;
3031 RExC_extralen = 0;
c277df42 3032
bbce6d69 3033 /* First pass: determine size, legality. */
830247a4 3034 RExC_parse = exp;
fac92740 3035 RExC_start = exp;
830247a4
IZ
3036 RExC_end = xend;
3037 RExC_naughty = 0;
3038 RExC_npar = 1;
3039 RExC_size = 0L;
3040 RExC_emit = &PL_regdummy;
3041 RExC_whilem_seen = 0;
85ddcde9
JH
3042#if 0 /* REGC() is (currently) a NOP at the first pass.
3043 * Clever compilers notice this and complain. --jhi */
830247a4 3044 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3045#endif
3dab1dad
YO
3046 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3047 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3048 RExC_precomp = NULL;
a0d0e21e
LW
3049 return(NULL);
3050 }
3dab1dad
YO
3051 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3052 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3053 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
c277df42 3054
c277df42
IZ
3055 /* Small enough for pointer-storage convention?
3056 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3057 if (RExC_size >= 0x10000L && RExC_extralen)
3058 RExC_size += RExC_extralen;
c277df42 3059 else
830247a4
IZ
3060 RExC_extralen = 0;
3061 if (RExC_whilem_seen > 15)
3062 RExC_whilem_seen = 15;
a0d0e21e 3063
bbce6d69 3064 /* Allocate space and initialize. */
a02a5408 3065 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3066 char, regexp);
a0d0e21e 3067 if (r == NULL)
b45f050a
JF
3068 FAIL("Regexp out of space");
3069
0f79a09d
GS
3070#ifdef DEBUGGING
3071 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3072 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3073#endif
c277df42 3074 r->refcnt = 1;
bbce6d69 3075 r->prelen = xend - exp;
5cfc7842 3076 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3077 r->subbeg = NULL;
f8c7b90f 3078#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3079 r->saved_copy = NULL;
ed252734 3080#endif
cf93c79d 3081 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3082 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3083 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3084
3085 r->substrs = 0; /* Useful during FAIL. */
3086 r->startp = 0; /* Useful during FAIL. */
3087 r->endp = 0; /* Useful during FAIL. */
3088
a02a5408 3089 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3090 if (r->offsets) {
2af232bd 3091 r->offsets[0] = RExC_size;
fac92740 3092 }
a3621e74 3093 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3094 "%s %"UVuf" bytes for offset annotations.\n",
3095 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3096 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3097
830247a4 3098 RExC_rx = r;
bbce6d69 3099
3100 /* Second pass: emit code. */
e2509266 3101 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3102 RExC_parse = exp;
3103 RExC_end = xend;
3104 RExC_naughty = 0;
3105 RExC_npar = 1;
fac92740 3106 RExC_emit_start = r->program;
830247a4 3107 RExC_emit = r->program;
2cd61cdb 3108 /* Store the count of eval-groups for security checks: */
eb160463 3109 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3110 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3111 r->data = 0;
3dab1dad 3112 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e
LW
3113 return(NULL);
3114
a3621e74 3115
a0d0e21e 3116 /* Dig out information for optimizations. */
cf93c79d 3117 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3118 pm->op_pmflags = RExC_flags;
a0ed51b3 3119 if (UTF)
5ff6fc6d 3120 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3121 r->regstclass = NULL;
830247a4 3122 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3123 r->reganch |= ROPT_NAUGHTY;
c277df42 3124 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
3125
3126 /* XXXX To minimize changes to RE engine we always allocate
3127 3-units-long substrs field. */
a02a5408 3128 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 3129
2c2d71f5 3130 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 3131 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3132 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3133 I32 fake;
c5254dd6 3134 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
3135 struct regnode_charclass_class ch_class;
3136 int stclass_flag;
cb434fcc 3137 I32 last_close = 0;
a0d0e21e
LW
3138
3139 first = scan;
c277df42 3140 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3141 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3142 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
3143 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3144 (OP(first) == PLUS) ||
3145 (OP(first) == MINMOD) ||
653099ff 3146 /* An {n,m} with n>0 */
3dab1dad 3147 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
3148 if (OP(first) == PLUS)
3149 sawplus = 1;
3150 else
3dab1dad 3151 first += regarglen[OP(first)];
a0d0e21e 3152 first = NEXTOPER(first);
a687059c
LW
3153 }
3154
a0d0e21e
LW
3155 /* Starting-point info. */
3156 again:
3dab1dad 3157 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3158 if (OP(first) == EXACT)
6f207bd3 3159 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3160 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3161 r->regstclass = first;
3162 }
bfed75c6 3163 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3164 r->regstclass = first;
3dab1dad
YO
3165 else if (PL_regkind[OP(first)] == BOUND ||
3166 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3167 r->regstclass = first;
3dab1dad 3168 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3169 r->reganch |= (OP(first) == MBOL
3170 ? ROPT_ANCH_MBOL
3171 : (OP(first) == SBOL
3172 ? ROPT_ANCH_SBOL
3173 : ROPT_ANCH_BOL));
a0d0e21e 3174 first = NEXTOPER(first);
774d564b 3175 goto again;
3176 }
3177 else if (OP(first) == GPOS) {
3178 r->reganch |= ROPT_ANCH_GPOS;
3179 first = NEXTOPER(first);
3180 goto again;
a0d0e21e 3181 }
e09294f4 3182 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3183 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3184 !(r->reganch & ROPT_ANCH) )
3185 {
3186 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3187 const int type =
3188 (OP(NEXTOPER(first)) == REG_ANY)
3189 ? ROPT_ANCH_MBOL
3190 : ROPT_ANCH_SBOL;
cad2e5aa 3191 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3192 first = NEXTOPER(first);
774d564b 3193 goto again;
a0d0e21e 3194 }
b81d288d 3195 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3196 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3197 /* x+ must match at the 1st pos of run of x's */
3198 r->reganch |= ROPT_SKIP;
a0d0e21e 3199
c277df42 3200 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3201 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3202 (IV)(first - scan + 1)));
a0d0e21e
LW
3203 /*
3204 * If there's something expensive in the r.e., find the
3205 * longest literal string that must appear and make it the
3206 * regmust. Resolve ties in favor of later strings, since
3207 * the regstart check works with the beginning of the r.e.
3208 * and avoiding duplication strengthens checking. Not a
3209 * strong reason, but sufficient in the absence of others.
3210 * [Now we resolve ties in favor of the earlier string if
c277df42 3211 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3212 * earlier string may buy us something the later one won't.]
3213 */
a0d0e21e 3214 minlen = 0;
a687059c 3215
396482e1
GA
3216 data.longest_fixed = newSVpvs("");
3217 data.longest_float = newSVpvs("");
3218 data.last_found = newSVpvs("");
c277df42
IZ
3219 data.longest = &(data.longest_fixed);
3220 first = scan;
653099ff 3221 if (!r->regstclass) {
830247a4 3222 cl_init(pRExC_state, &ch_class);
653099ff
GS
3223 data.start_class = &ch_class;
3224 stclass_flag = SCF_DO_STCLASS_AND;
3225 } else /* XXXX Check for BOUND? */
3226 stclass_flag = 0;
cb434fcc 3227 data.last_closep = &last_close;
653099ff 3228
830247a4 3229 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3230 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3231 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3232 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3233 && !RExC_seen_zerolen
3234 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3235 r->reganch |= ROPT_CHECK_ALL;
830247a4 3236 scan_commit(pRExC_state, &data);
c277df42
IZ
3237 SvREFCNT_dec(data.last_found);
3238
a0ed51b3 3239 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3240 if (longest_float_length
c277df42
IZ
3241 || (data.flags & SF_FL_BEFORE_EOL
3242 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3243 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3244 int t;
3245
a0ed51b3 3246 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3247 && data.offset_fixed == data.offset_float_min
3248 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3249 goto remove_float; /* As in (a)+. */
3250
33b8afdf
JH
3251 if (SvUTF8(data.longest_float)) {
3252 r->float_utf8 = data.longest_float;
c445ea15 3253 r->float_substr = NULL;
33b8afdf
JH
3254 } else {
3255 r->float_substr = data.longest_float;
c445ea15 3256 r->float_utf8 = NULL;
33b8afdf 3257 }
c277df42
IZ
3258 r->float_min_offset = data.offset_float_min;
3259 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3260 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3261 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3262 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3263 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3264 }
3265 else {
aca2d497 3266 remove_float:
c445ea15 3267 r->float_substr = r->float_utf8 = NULL;
c277df42 3268 SvREFCNT_dec(data.longest_float);
c5254dd6 3269 longest_float_length = 0;
a0d0e21e 3270 }
c277df42 3271
a0ed51b3 3272 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3273 if (longest_fixed_length
c277df42
IZ
3274 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3275 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3276 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3277 int t;
3278
33b8afdf
JH
3279 if (SvUTF8(data.longest_fixed)) {
3280 r->anchored_utf8 = data.longest_fixed;
c445ea15 3281 r->anchored_substr = NULL;
33b8afdf
JH
3282 } else {
3283 r->anchored_substr = data.longest_fixed;
c445ea15 3284 r->anchored_utf8 = NULL;
33b8afdf 3285 }
c277df42 3286 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3287 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3288 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3289 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3290 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3291 }
3292 else {
c445ea15 3293 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3294 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3295 longest_fixed_length = 0;
a0d0e21e 3296 }
b81d288d 3297 if (r->regstclass
ffc61ed2 3298 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3299 r->regstclass = NULL;
33b8afdf
JH
3300 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3301 && stclass_flag
653099ff 3302 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3303 && !cl_is_anything(data.start_class))
3304 {
1df70142 3305 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3306
a02a5408 3307 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3308 struct regnode_charclass_class);
3309 StructCopy(data.start_class,
830247a4 3310 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3311 struct regnode_charclass_class);
830247a4 3312 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3313 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3314 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3315 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3316 PerlIO_printf(Perl_debug_log,
a0288114 3317 "synthetic stclass \"%s\".\n",
3f7c398e 3318 SvPVX_const(sv));});
653099ff 3319 }
c277df42
IZ
3320
3321 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3322 if (longest_fixed_length > longest_float_length) {
c277df42 3323 r->check_substr = r->anchored_substr;
33b8afdf 3324 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3325 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3326 if (r->reganch & ROPT_ANCH_SINGLE)
3327 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3328 }
3329 else {
c277df42 3330 r->check_substr = r->float_substr;
33b8afdf 3331 r->check_utf8 = r->float_utf8;
c277df42
IZ
3332 r->check_offset_min = data.offset_float_min;
3333 r->check_offset_max = data.offset_float_max;
a0d0e21e 3334 }
30382c73
IZ
3335 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3336 This should be changed ASAP! */
33b8afdf 3337 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3338 r->reganch |= RE_USE_INTUIT;
33b8afdf 3339 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3340 r->reganch |= RE_INTUIT_TAIL;
3341 }
a0ed51b3
LW
3342 }
3343 else {
c277df42
IZ
3344 /* Several toplevels. Best we can is to set minlen. */
3345 I32 fake;
653099ff 3346 struct regnode_charclass_class ch_class;
cb434fcc 3347 I32 last_close = 0;
c277df42 3348
a3621e74 3349 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3350 scan = r->program + 1;
830247a4 3351 cl_init(pRExC_state, &ch_class);
653099ff 3352 data.start_class = &ch_class;
cb434fcc 3353 data.last_closep = &last_close;
a3621e74 3354 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3355 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3356 = r->float_substr = r->float_utf8 = NULL;
653099ff 3357 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3358 && !cl_is_anything(data.start_class))
3359 {
1df70142 3360 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3361
a02a5408 3362 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3363 struct regnode_charclass_class);
3364 StructCopy(data.start_class,
830247a4 3365 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3366 struct regnode_charclass_class);
830247a4 3367 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3368 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3369 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3370 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3371 PerlIO_printf(Perl_debug_log,
a0288114 3372 "synthetic stclass \"%s\".\n",
3f7c398e 3373 SvPVX_const(sv));});
653099ff 3374 }
a0d0e21e
LW
3375 }
3376
a0d0e21e 3377 r->minlen = minlen;
b81d288d 3378 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3379 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3380 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3381 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3382 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3383 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3384 if (RExC_seen & REG_SEEN_CANY)
3385 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3386 Newxz(r->startp, RExC_npar, I32);
3387 Newxz(r->endp, RExC_npar, I32);
3dab1dad
YO
3388 DEBUG_COMPILE_r({
3389 if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
3390 PerlIO_printf(Perl_debug_log,"Final program:\n");
3391 regdump(r);
3392 });
a0d0e21e 3393 return(r);
a687059c
LW
3394}
3395
3dab1dad
YO
3396
3397#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3398 int rem=(int)(RExC_end - RExC_parse); \
3399 int cut; \
3400 int num; \
3401 int iscut=0; \
3402 if (rem>10) { \
3403 rem=10; \
3404 iscut=1; \
3405 } \
3406 cut=10-rem; \
3407 if (RExC_lastparse!=RExC_parse) \
3408 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3409 rem, RExC_parse, \
3410 cut + 4, \
3411 iscut ? "..." : "<" \
3412 ); \
3413 else \
3414 PerlIO_printf(Perl_debug_log,"%16s",""); \
3415 \
3416 if (SIZE_ONLY) \
3417 num=RExC_size; \
3418 else \
3419 num=REG_NODE_NUM(RExC_emit); \
3420 if (RExC_lastnum!=num) \
3421 PerlIO_printf(Perl_debug_log,"%4d",num); \
3422 else \
3423 PerlIO_printf(Perl_debug_log,"%4s",""); \
3424 PerlIO_printf(Perl_debug_log,"%*s%-4s", \
3425 10+(depth*2),"", \
3426 (funcname) \
3427 ); \
3428 RExC_lastnum=num; \
3429 RExC_lastparse=RExC_parse; \
3430})
3431
3432#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3433 DEBUG_PARSE_MSG((funcname)); \
3434 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3435})
a687059c
LW
3436/*
3437 - reg - regular expression, i.e. main body or parenthesized thing
3438 *
3439 * Caller must absorb opening parenthesis.
3440 *
3441 * Combining parenthesis handling with the base level of regular expression
3442 * is a trifle forced, but the need to tie the tails of the branches to what
3443 * follows makes it hard to avoid.
3444 */
3dab1dad
YO
3445#define REGTAIL(x,y,z) regtail(x,y,z,depth+1)
3446
76e3520e 3447STATIC regnode *
3dab1dad 3448S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3449 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3450{
27da23d5 3451 dVAR;
c277df42
IZ
3452 register regnode *ret; /* Will be the head of the group. */
3453 register regnode *br;
3454 register regnode *lastbr;
cbbf8932 3455 register regnode *ender = NULL;
a0d0e21e 3456 register I32 parno = 0;
cbbf8932
AL
3457 I32 flags;
3458 const I32 oregflags = RExC_flags;
6136c704
AL
3459 bool have_branch = 0;
3460 bool is_open = 0;
9d1d55b5
JP
3461
3462 /* for (?g), (?gc), and (?o) warnings; warning
3463 about (?c) will warn about (?g) -- japhy */
3464
6136c704
AL
3465#define WASTED_O 0x01
3466#define WASTED_G 0x02
3467#define WASTED_C 0x04
3468#define WASTED_GC (0x02|0x04)
cbbf8932 3469 I32 wastedflags = 0x00;
9d1d55b5 3470
fac92740 3471 char * parse_start = RExC_parse; /* MJD */
a28509cc 3472 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3473
3dab1dad
YO
3474 GET_RE_DEBUG_FLAGS_DECL;
3475 DEBUG_PARSE("reg ");
3476
3477
821b33a5 3478 *flagp = 0; /* Tentatively. */
a0d0e21e 3479
9d1d55b5 3480
a0d0e21e
LW
3481 /* Make an OPEN node, if parenthesized. */
3482 if (paren) {
fac92740 3483 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3484 U32 posflags = 0, negflags = 0;
3485 U32 *flagsp = &posflags;
6136c704 3486 bool is_logical = 0;
a28509cc 3487 const char * const seqstart = RExC_parse;
ca9dfc88 3488
830247a4
IZ
3489 RExC_parse++;
3490 paren = *RExC_parse++;
c277df42 3491 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3492 switch (paren) {
fac92740 3493 case '<': /* (?<...) */
830247a4 3494 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3495 if (*RExC_parse == '!')
c277df42 3496 paren = ',';
b81d288d 3497 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3498 goto unknown;
830247a4 3499 RExC_parse++;
fac92740
MJD
3500 case '=': /* (?=...) */
3501 case '!': /* (?!...) */
830247a4 3502 RExC_seen_zerolen++;
fac92740
MJD
3503 case ':': /* (?:...) */
3504 case '>': /* (?>...) */
a0d0e21e 3505 break;
fac92740
MJD
3506 case '$': /* (?$...) */
3507 case '@': /* (?@...) */
8615cb43 3508 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3509 break;
fac92740 3510 case '#': /* (?#...) */
830247a4
IZ
3511 while (*RExC_parse && *RExC_parse != ')')
3512 RExC_parse++;
3513 if (*RExC_parse != ')')
c277df42 3514 FAIL("Sequence (?#... not terminated");
830247a4 3515 nextchar(pRExC_state);
a0d0e21e
LW
3516 *flagp = TRYAGAIN;
3517 return NULL;
fac92740 3518 case 'p': /* (?p...) */
9014280d 3519 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3520 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3521 /* FALL THROUGH*/
fac92740 3522 case '?': /* (??...) */
6136c704 3523 is_logical = 1;
438a3801
YST
3524 if (*RExC_parse != '{')
3525 goto unknown;
830247a4 3526 paren = *RExC_parse++;
0f5d15d6 3527 /* FALL THROUGH */
fac92740 3528 case '{': /* (?{...}) */
c277df42 3529 {
c277df42
IZ
3530 I32 count = 1, n = 0;
3531 char c;
830247a4 3532 char *s = RExC_parse;
c277df42 3533
830247a4
IZ
3534 RExC_seen_zerolen++;
3535 RExC_seen |= REG_SEEN_EVAL;
3536 while (count && (c = *RExC_parse)) {
6136c704
AL
3537 if (c == '\\') {
3538 if (RExC_parse[1])
3539 RExC_parse++;
3540 }
b81d288d 3541 else if (c == '{')
c277df42 3542 count++;
b81d288d 3543 else if (c == '}')
c277df42 3544 count--;
830247a4 3545 RExC_parse++;
c277df42 3546 }
6136c704 3547 if (*RExC_parse != ')') {
b81d288d 3548 RExC_parse = s;
b45f050a
JF
3549 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3550 }
c277df42 3551 if (!SIZE_ONLY) {
f3548bdc 3552 PAD *pad;
6136c704
AL
3553 OP_4tree *sop, *rop;
3554 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3555
569233ed
SB
3556 ENTER;
3557 Perl_save_re_context(aTHX);
f3548bdc 3558 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3559 sop->op_private |= OPpREFCOUNTED;
3560 /* re_dup will OpREFCNT_inc */
3561 OpREFCNT_set(sop, 1);
569233ed 3562 LEAVE;
c277df42 3563
830247a4
IZ
3564 n = add_data(pRExC_state, 3, "nop");
3565 RExC_rx->data->data[n] = (void*)rop;
3566 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3567 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3568 SvREFCNT_dec(sv);
a0ed51b3 3569 }
e24b16f9 3570 else { /* First pass */
830247a4 3571 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3572 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3573 /* No compiled RE interpolated, has runtime
3574 components ===> unsafe. */
3575 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3576 if (PL_tainting && PL_tainted)
cc6b7395 3577 FAIL("Eval-group in insecure regular expression");
54df2634 3578#if PERL_VERSION > 8
923e4eb5 3579 if (IN_PERL_COMPILETIME)
b5c19bd7 3580 PL_cv_has_eval = 1;
54df2634 3581#endif
c277df42 3582 }
b5c19bd7 3583
830247a4 3584 nextchar(pRExC_state);
6136c704 3585 if (is_logical) {
830247a4 3586 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3587 if (!SIZE_ONLY)
3588 ret->flags = 2;
3dab1dad 3589 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3590 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3591 return ret;
3592 }
ccb2c380
MP
3593 ret = reganode(pRExC_state, EVAL, n);
3594 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3595 Set_Node_Offset(ret, parse_start);
3596 return ret;
c277df42 3597 }
fac92740 3598 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3599 {
fac92740 3600 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3601 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3602 || RExC_parse[1] == '<'
830247a4 3603 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3604 I32 flag;
3605
830247a4 3606 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3607 if (!SIZE_ONLY)
3608 ret->flags = 1;
3dab1dad 3609 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3610 goto insert_if;
b81d288d 3611 }
a0ed51b3 3612 }
830247a4 3613 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3614 /* (?(1)...) */
6136c704 3615 char c;
830247a4 3616 parno = atoi(RExC_parse++);
c277df42 3617
830247a4
IZ
3618 while (isDIGIT(*RExC_parse))
3619 RExC_parse++;
fac92740 3620 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3621
830247a4 3622 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3623 vFAIL("Switch condition not recognized");
c277df42 3624 insert_if:
3dab1dad
YO
3625 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3626 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3627 if (br == NULL)
830247a4 3628 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3629 else
3dab1dad 3630 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3631 c = *nextchar(pRExC_state);
d1b80229
IZ
3632 if (flags&HASWIDTH)
3633 *flagp |= HASWIDTH;
c277df42 3634 if (c == '|') {
830247a4 3635 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3636 regbranch(pRExC_state, &flags, 1,depth+1);
3637 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3638 if (flags&HASWIDTH)
3639 *flagp |= HASWIDTH;
830247a4 3640 c = *nextchar(pRExC_state);
a0ed51b3
LW
3641 }
3642 else
c277df42
IZ
3643 lastbr = NULL;
3644 if (c != ')')
8615cb43 3645 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3646 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3647 REGTAIL(pRExC_state, br, ender);
c277df42 3648 if (lastbr) {
3dab1dad
YO
3649 REGTAIL(pRExC_state, lastbr, ender);
3650 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3651 }
3652 else
3dab1dad 3653 REGTAIL(pRExC_state, ret, ender);
c277df42 3654 return ret;
a0ed51b3
LW
3655 }
3656 else {
830247a4 3657 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3658 }
3659 }
1b1626e4 3660 case 0:
830247a4 3661 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3662 vFAIL("Sequence (? incomplete");
1b1626e4 3663 break;
a0d0e21e 3664 default:
830247a4 3665 --RExC_parse;
fac92740 3666 parse_flags: /* (?i) */
830247a4 3667 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3668 /* (?g), (?gc) and (?o) are useless here
3669 and must be globally applied -- japhy */
3670
3671 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3672 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3673 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3674 if (! (wastedflags & wflagbit) ) {
3675 wastedflags |= wflagbit;
3676 vWARN5(
3677 RExC_parse + 1,
3678 "Useless (%s%c) - %suse /%c modifier",
3679 flagsp == &negflags ? "?-" : "?",
3680 *RExC_parse,
3681 flagsp == &negflags ? "don't " : "",
3682 *RExC_parse
3683 );
3684 }
3685 }
3686 }
3687 else if (*RExC_parse == 'c') {
3688 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3689 if (! (wastedflags & WASTED_C) ) {
3690 wastedflags |= WASTED_GC;
9d1d55b5
JP
3691 vWARN3(
3692 RExC_parse + 1,
3693 "Useless (%sc) - %suse /gc modifier",
3694 flagsp == &negflags ? "?-" : "?",
3695 flagsp == &negflags ? "don't " : ""
3696 );
3697 }
3698 }
3699 }
3700 else { pmflag(flagsp, *RExC_parse); }
3701
830247a4 3702 ++RExC_parse;
ca9dfc88 3703 }
830247a4 3704 if (*RExC_parse == '-') {
ca9dfc88 3705 flagsp = &negflags;
9d1d55b5 3706 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3707 ++RExC_parse;
ca9dfc88 3708 goto parse_flags;
48c036b1 3709 }
e2509266
JH
3710 RExC_flags |= posflags;
3711 RExC_flags &= ~negflags;
830247a4
IZ
3712 if (*RExC_parse == ':') {
3713 RExC_parse++;
ca9dfc88
IZ
3714 paren = ':';
3715 break;
3716 }
c277df42 3717 unknown:
830247a4
IZ
3718 if (*RExC_parse != ')') {
3719 RExC_parse++;
3720 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3721 }
830247a4 3722 nextchar(pRExC_state);
a0d0e21e
LW
3723 *flagp = TRYAGAIN;
3724 return NULL;
3725 }
3726 }
fac92740 3727 else { /* (...) */
830247a4
IZ
3728 parno = RExC_npar;
3729 RExC_npar++;
3730 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3731 Set_Node_Length(ret, 1); /* MJD */
3732 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3733 is_open = 1;
a0d0e21e 3734 }
a0ed51b3 3735 }
fac92740 3736 else /* ! paren */
a0d0e21e
LW
3737 ret = NULL;
3738
3739 /* Pick up the branches, linking them together. */
fac92740 3740 parse_start = RExC_parse; /* MJD */
3dab1dad 3741 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 3742 /* branch_len = (paren != 0); */
2af232bd 3743
a0d0e21e
LW
3744 if (br == NULL)
3745 return(NULL);
830247a4
IZ
3746 if (*RExC_parse == '|') {
3747 if (!SIZE_ONLY && RExC_extralen) {
3748 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3749 }
fac92740 3750 else { /* MJD */
830247a4 3751 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3752 Set_Node_Length(br, paren != 0);
3753 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3754 }
c277df42
IZ
3755 have_branch = 1;
3756 if (SIZE_ONLY)
830247a4 3757 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3758 }
3759 else if (paren == ':') {
c277df42
IZ
3760 *flagp |= flags&SIMPLE;
3761 }
6136c704 3762 if (is_open) { /* Starts with OPEN. */
3dab1dad 3763 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3764 }
3765 else if (paren != '?') /* Not Conditional */
a0d0e21e 3766 ret = br;
32a0ca98 3767 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3768 lastbr = br;
830247a4
IZ
3769 while (*RExC_parse == '|') {
3770 if (!SIZE_ONLY && RExC_extralen) {
3771 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 3772 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3773 }
3774 if (SIZE_ONLY)
830247a4
IZ
3775 RExC_extralen += 2; /* Account for LONGJMP. */
3776 nextchar(pRExC_state);
3dab1dad 3777 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 3778
a687059c 3779 if (br == NULL)
a0d0e21e 3780 return(NULL);
3dab1dad 3781 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3782 lastbr = br;
821b33a5
IZ
3783 if (flags&HASWIDTH)
3784 *flagp |= HASWIDTH;
a687059c 3785 *flagp |= flags&SPSTART;
a0d0e21e
LW
3786 }
3787
c277df42
IZ
3788 if (have_branch || paren != ':') {
3789 /* Make a closing node, and hook it on the end. */
3790 switch (paren) {
3791 case ':':
830247a4 3792 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3793 break;
3794 case 1:
830247a4 3795 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3796 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3797 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3798 break;
3799 case '<':
c277df42
IZ
3800 case ',':
3801 case '=':
3802 case '!':
c277df42 3803 *flagp &= ~HASWIDTH;
821b33a5
IZ
3804 /* FALL THROUGH */
3805 case '>':
830247a4 3806 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3807 break;
3808 case 0:
830247a4 3809 ender = reg_node(pRExC_state, END);
c277df42
IZ
3810 break;
3811 }
3dab1dad 3812 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 3813
9674d46a 3814 if (have_branch && !SIZE_ONLY) {
c277df42 3815 /* Hook the tails of the branches to the closing node. */
3dab1dad 3816 U8 exact= PSEUDO;
9674d46a
AL
3817 for (br = ret; br; br = regnext(br)) {
3818 const U8 op = PL_regkind[OP(br)];
3dab1dad 3819 U8 exact_ret;
9674d46a 3820 if (op == BRANCH) {
3dab1dad 3821 exact_ret=regtail_study(pRExC_state, NEXTOPER(br), ender,depth+1);
9674d46a
AL
3822 }
3823 else if (op == BRANCHJ) {
3dab1dad 3824 exact_ret=regtail_study(pRExC_state, NEXTOPER(NEXTOPER(br)), ender,depth+1);
9674d46a 3825 }
3dab1dad
YO
3826 if ( exact == PSEUDO )
3827 exact= exact_ret;
3828 else if ( exact != exact_ret )
3829 exact= 0;
c277df42
IZ
3830 }
3831 }
a0d0e21e 3832 }
c277df42
IZ
3833
3834 {
e1ec3a88
AL
3835 const char *p;
3836 static const char parens[] = "=!<,>";
c277df42
IZ
3837
3838 if (paren && (p = strchr(parens, paren))) {
eb160463 3839 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3840 int flag = (p - parens) > 1;
3841
3842 if (paren == '>')
3843 node = SUSPEND, flag = 0;
830247a4 3844 reginsert(pRExC_state, node,ret);
45948336
EP
3845 Set_Node_Cur_Length(ret);
3846 Set_Node_Offset(ret, parse_start + 1);
c277df42 3847 ret->flags = flag;
3dab1dad 3848 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3849 }
a0d0e21e
LW
3850 }
3851
3852 /* Check for proper termination. */
ce3e6498 3853 if (paren) {
e2509266 3854 RExC_flags = oregflags;
830247a4
IZ
3855 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3856 RExC_parse = oregcomp_parse;
380a0633 3857 vFAIL("Unmatched (");
ce3e6498 3858 }
a0ed51b3 3859 }
830247a4
IZ
3860 else if (!paren && RExC_parse < RExC_end) {
3861 if (*RExC_parse == ')') {
3862 RExC_parse++;
380a0633 3863 vFAIL("Unmatched )");
a0ed51b3
LW
3864 }
3865 else
b45f050a 3866 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3867 /* NOTREACHED */
3868 }
a687059c 3869
a0d0e21e 3870 return(ret);
a687059c
LW
3871}
3872
3873/*
3874 - regbranch - one alternative of an | operator
3875 *
3876 * Implements the concatenation operator.
3877 */
76e3520e 3878STATIC regnode *
3dab1dad 3879S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 3880{
97aff369 3881 dVAR;
c277df42
IZ
3882 register regnode *ret;
3883 register regnode *chain = NULL;
3884 register regnode *latest;
3885 I32 flags = 0, c = 0;
3dab1dad
YO
3886 GET_RE_DEBUG_FLAGS_DECL;
3887 DEBUG_PARSE("brnc");
b81d288d 3888 if (first)
c277df42
IZ
3889 ret = NULL;
3890 else {
b81d288d 3891 if (!SIZE_ONLY && RExC_extralen)
830247a4 3892 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3893 else {
830247a4 3894 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3895 Set_Node_Length(ret, 1);
3896 }
c277df42
IZ
3897 }
3898
b81d288d 3899 if (!first && SIZE_ONLY)
830247a4 3900 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3901
c277df42 3902 *flagp = WORST; /* Tentatively. */
a0d0e21e 3903
830247a4
IZ
3904 RExC_parse--;
3905 nextchar(pRExC_state);
3906 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3907 flags &= ~TRYAGAIN;
3dab1dad 3908 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
3909 if (latest == NULL) {
3910 if (flags & TRYAGAIN)
3911 continue;
3912 return(NULL);
a0ed51b3
LW
3913 }
3914 else if (ret == NULL)
c277df42 3915 ret = latest;
a0d0e21e 3916 *flagp |= flags&HASWIDTH;
c277df42 3917 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3918 *flagp |= flags&SPSTART;
3919 else {
830247a4 3920 RExC_naughty++;
3dab1dad 3921 REGTAIL(pRExC_state, chain, latest);
a687059c 3922 }
a0d0e21e 3923 chain = latest;
c277df42
IZ
3924 c++;
3925 }
3926 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3927 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3928 if (ret == NULL)
3929 ret = chain;
3930 }
3931 if (c == 1) {
3932 *flagp |= flags&SIMPLE;
a0d0e21e 3933 }
a687059c 3934
d4c19fe8 3935 return ret;
a687059c
LW
3936}
3937
3938/*
3939 - regpiece - something followed by possible [*+?]
3940 *
3941 * Note that the branching code sequences used for ? and the general cases
3942 * of * and + are somewhat optimized: they use the same NOTHING node as
3943 * both the endmarker for their branch list and the body of the last branch.
3944 * It might seem that this node could be dispensed with entirely, but the
3945 * endmarker role is not redundant.
3946 */
76e3520e 3947STATIC regnode *
3dab1dad 3948S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 3949{
97aff369 3950 dVAR;
c277df42 3951 register regnode *ret;
a0d0e21e
LW
3952 register char op;
3953 register char *next;
3954 I32 flags;
1df70142 3955 const char * const origparse = RExC_parse;
a0d0e21e
LW
3956 char *maxpos;
3957 I32 min;
c277df42 3958 I32 max = REG_INFTY;
fac92740 3959 char *parse_start;
3dab1dad
YO
3960 GET_RE_DEBUG_FLAGS_DECL;
3961 DEBUG_PARSE("piec");
a0d0e21e 3962
3dab1dad 3963 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
3964 if (ret == NULL) {
3965 if (flags & TRYAGAIN)
3966 *flagp |= TRYAGAIN;
3967 return(NULL);
3968 }
3969
830247a4 3970 op = *RExC_parse;
a0d0e21e 3971
830247a4 3972 if (op == '{' && regcurly(RExC_parse)) {
3dab1dad 3973 const char *maxpos = NULL;
fac92740 3974 parse_start = RExC_parse; /* MJD */
830247a4 3975 next = RExC_parse + 1;
a0d0e21e
LW
3976 while (isDIGIT(*next) || *next == ',') {
3977 if (*next == ',') {
3978 if (maxpos)
3979 break;
3980 else
3981 maxpos = next;
a687059c 3982 }
a0d0e21e
LW
3983 next++;
3984 }
3985 if (*next == '}') { /* got one */
3986 if (!maxpos)
3987 maxpos = next;
830247a4
IZ
3988 RExC_parse++;
3989 min = atoi(RExC_parse);
a0d0e21e
LW
3990 if (*maxpos == ',')
3991 maxpos++;
3992 else
830247a4 3993 maxpos = RExC_parse;
a0d0e21e
LW
3994 max = atoi(maxpos);
3995 if (!max && *maxpos != '0')
c277df42
IZ
3996 max = REG_INFTY; /* meaning "infinity" */
3997 else if (max >= REG_INFTY)
8615cb43 3998 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3999 RExC_parse = next;
4000 nextchar(pRExC_state);
a0d0e21e
LW
4001
4002 do_curly:
4003 if ((flags&SIMPLE)) {
830247a4
IZ
4004 RExC_naughty += 2 + RExC_naughty / 2;
4005 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4006 Set_Node_Offset(ret, parse_start+1); /* MJD */
4007 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4008 }
4009 else {
3dab1dad 4010 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4011
4012 w->flags = 0;
3dab1dad 4013 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4014 if (!SIZE_ONLY && RExC_extralen) {
4015 reginsert(pRExC_state, LONGJMP,ret);
4016 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4017 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4018 }
830247a4 4019 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4020 /* MJD hk */
4021 Set_Node_Offset(ret, parse_start+1);
2af232bd 4022 Set_Node_Length(ret,
fac92740 4023 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4024
830247a4 4025 if (!SIZE_ONLY && RExC_extralen)
c277df42 4026 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4027 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4028 if (SIZE_ONLY)
830247a4
IZ
4029 RExC_whilem_seen++, RExC_extralen += 3;
4030 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4031 }
c277df42 4032 ret->flags = 0;
a0d0e21e
LW
4033
4034 if (min > 0)
821b33a5
IZ
4035 *flagp = WORST;
4036 if (max > 0)
4037 *flagp |= HASWIDTH;
a0d0e21e 4038 if (max && max < min)
8615cb43 4039 vFAIL("Can't do {n,m} with n > m");
c277df42 4040 if (!SIZE_ONLY) {
eb160463
GS
4041 ARG1_SET(ret, (U16)min);
4042 ARG2_SET(ret, (U16)max);
a687059c 4043 }
a687059c 4044
a0d0e21e 4045 goto nest_check;
a687059c 4046 }
a0d0e21e 4047 }
a687059c 4048
a0d0e21e
LW
4049 if (!ISMULT1(op)) {
4050 *flagp = flags;
a687059c 4051 return(ret);
a0d0e21e 4052 }
bb20fd44 4053
c277df42 4054#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4055
4056 /* if this is reinstated, don't forget to put this back into perldiag:
4057
4058 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4059
4060 (F) The part of the regexp subject to either the * or + quantifier
4061 could match an empty string. The {#} shows in the regular
4062 expression about where the problem was discovered.
4063
4064 */
4065
bb20fd44 4066 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4067 vFAIL("Regexp *+ operand could be empty");
b81d288d 4068#endif
bb20fd44 4069
fac92740 4070 parse_start = RExC_parse;
830247a4 4071 nextchar(pRExC_state);
a0d0e21e 4072
821b33a5 4073 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4074
4075 if (op == '*' && (flags&SIMPLE)) {
830247a4 4076 reginsert(pRExC_state, STAR, ret);
c277df42 4077 ret->flags = 0;
830247a4 4078 RExC_naughty += 4;
a0d0e21e
LW
4079 }
4080 else if (op == '*') {
4081 min = 0;
4082 goto do_curly;
a0ed51b3
LW
4083 }
4084 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4085 reginsert(pRExC_state, PLUS, ret);
c277df42 4086 ret->flags = 0;
830247a4 4087 RExC_naughty += 3;
a0d0e21e
LW
4088 }
4089 else if (op == '+') {
4090 min = 1;
4091 goto do_curly;
a0ed51b3
LW
4092 }
4093 else if (op == '?') {
a0d0e21e
LW
4094 min = 0; max = 1;
4095 goto do_curly;
4096 }
4097 nest_check:
041457d9 4098 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4099 vWARN3(RExC_parse,
b45f050a 4100 "%.*s matches null string many times",
afd78fd5 4101 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4102 origparse);
a0d0e21e
LW
4103 }
4104
830247a4
IZ
4105 if (*RExC_parse == '?') {
4106 nextchar(pRExC_state);
4107 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4108 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4109 }
830247a4
IZ
4110 if (ISMULT2(RExC_parse)) {
4111 RExC_parse++;
b45f050a
JF
4112 vFAIL("Nested quantifiers");
4113 }
a0d0e21e
LW
4114
4115 return(ret);
a687059c
LW
4116}
4117
4118/*
4119 - regatom - the lowest level
4120 *
4121 * Optimization: gobbles an entire sequence of ordinary characters so that
4122 * it can turn them into a single node, which is smaller to store and
4123 * faster to run. Backslashed characters are exceptions, each becoming a
4124 * separate node; the code is simpler that way and it's not worth fixing.
4125 *
7f6f358c
YO
4126 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4127 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4128 */
76e3520e 4129STATIC regnode *
3dab1dad 4130S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4131{
97aff369 4132 dVAR;
cbbf8932 4133 register regnode *ret = NULL;
a0d0e21e 4134 I32 flags;
45948336 4135 char *parse_start = RExC_parse;
3dab1dad
YO
4136 GET_RE_DEBUG_FLAGS_DECL;
4137 DEBUG_PARSE("atom");
a0d0e21e
LW
4138 *flagp = WORST; /* Tentatively. */
4139
4140tryagain:
830247a4 4141 switch (*RExC_parse) {
a0d0e21e 4142 case '^':
830247a4
IZ
4143 RExC_seen_zerolen++;
4144 nextchar(pRExC_state);
e2509266 4145 if (RExC_flags & PMf_MULTILINE)
830247a4 4146 ret = reg_node(pRExC_state, MBOL);
e2509266 4147 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4148 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4149 else
830247a4 4150 ret = reg_node(pRExC_state, BOL);
fac92740 4151 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4152 break;
4153 case '$':
830247a4 4154 nextchar(pRExC_state);
b81d288d 4155 if (*RExC_parse)
830247a4 4156 RExC_seen_zerolen++;
e2509266 4157 if (RExC_flags & PMf_MULTILINE)
830247a4 4158 ret = reg_node(pRExC_state, MEOL);
e2509266 4159 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4160 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4161 else
830247a4 4162 ret = reg_node(pRExC_state, EOL);
fac92740 4163 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4164 break;
4165 case '.':
830247a4 4166 nextchar(pRExC_state);
e2509266 4167 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4168 ret = reg_node(pRExC_state, SANY);
4169 else
4170 ret = reg_node(pRExC_state, REG_ANY);
4171 *flagp |= HASWIDTH|SIMPLE;
830247a4 4172 RExC_naughty++;
fac92740 4173 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4174 break;
4175 case '[':
b45f050a 4176 {
3dab1dad
YO
4177 char * const oregcomp_parse = ++RExC_parse;
4178 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4179 if (*RExC_parse != ']') {
4180 RExC_parse = oregcomp_parse;
b45f050a
JF
4181 vFAIL("Unmatched [");
4182 }
830247a4 4183 nextchar(pRExC_state);
a0d0e21e 4184 *flagp |= HASWIDTH|SIMPLE;
fac92740 4185 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4186 break;
b45f050a 4187 }
a0d0e21e 4188 case '(':
830247a4 4189 nextchar(pRExC_state);
3dab1dad 4190 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4191 if (ret == NULL) {
bf93d4cc 4192 if (flags & TRYAGAIN) {
830247a4 4193 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4194 /* Make parent create an empty node if needed. */
4195 *flagp |= TRYAGAIN;
4196 return(NULL);
4197 }
a0d0e21e 4198 goto tryagain;
bf93d4cc 4199 }
a0d0e21e
LW
4200 return(NULL);
4201 }
c277df42 4202 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4203 break;
4204 case '|':
4205 case ')':
4206 if (flags & TRYAGAIN) {
4207 *flagp |= TRYAGAIN;
4208 return NULL;
4209 }
b45f050a 4210 vFAIL("Internal urp");
a0d0e21e
LW
4211 /* Supposed to be caught earlier. */
4212 break;
85afd4ae 4213 case '{':
830247a4
IZ
4214 if (!regcurly(RExC_parse)) {
4215 RExC_parse++;
85afd4ae
CS
4216 goto defchar;
4217 }
4218 /* FALL THROUGH */
a0d0e21e
LW
4219 case '?':
4220 case '+':
4221 case '*':
830247a4 4222 RExC_parse++;
b45f050a 4223 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4224 break;
4225 case '\\':
830247a4 4226 switch (*++RExC_parse) {
a0d0e21e 4227 case 'A':
830247a4
IZ
4228 RExC_seen_zerolen++;
4229 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4230 *flagp |= SIMPLE;
830247a4 4231 nextchar(pRExC_state);
fac92740 4232 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4233 break;
4234 case 'G':
830247a4
IZ
4235 ret = reg_node(pRExC_state, GPOS);
4236 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4237 *flagp |= SIMPLE;
830247a4 4238 nextchar(pRExC_state);
fac92740 4239 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4240 break;
4241 case 'Z':
830247a4 4242 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4243 *flagp |= SIMPLE;
a1917ab9 4244 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4245 nextchar(pRExC_state);
a0d0e21e 4246 break;
b85d18e9 4247 case 'z':
830247a4 4248 ret = reg_node(pRExC_state, EOS);
b85d18e9 4249 *flagp |= SIMPLE;
830247a4
IZ
4250 RExC_seen_zerolen++; /* Do not optimize RE away */
4251 nextchar(pRExC_state);
fac92740 4252 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4253 break;
4a2d328f 4254 case 'C':
f33976b4
DB
4255 ret = reg_node(pRExC_state, CANY);
4256 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4257 *flagp |= HASWIDTH|SIMPLE;
830247a4 4258 nextchar(pRExC_state);
fac92740 4259 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4260 break;
4261 case 'X':
830247a4 4262 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4263 *flagp |= HASWIDTH;
830247a4 4264 nextchar(pRExC_state);
fac92740 4265 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4266 break;
a0d0e21e 4267 case 'w':
eb160463 4268 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4269 *flagp |= HASWIDTH|SIMPLE;
830247a4 4270 nextchar(pRExC_state);
fac92740 4271 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4272 break;
4273 case 'W':
eb160463 4274 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4275 *flagp |= HASWIDTH|SIMPLE;
830247a4 4276 nextchar(pRExC_state);
fac92740 4277 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4278 break;
4279 case 'b':
830247a4
IZ
4280 RExC_seen_zerolen++;
4281 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4282 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4283 *flagp |= SIMPLE;
830247a4 4284 nextchar(pRExC_state);
fac92740 4285 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4286 break;
4287 case 'B':
830247a4
IZ
4288 RExC_seen_zerolen++;
4289 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4290 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4291 *flagp |= SIMPLE;
830247a4 4292 nextchar(pRExC_state);
fac92740 4293 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4294 break;
4295 case 's':
eb160463 4296 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4297 *flagp |= HASWIDTH|SIMPLE;
830247a4 4298 nextchar(pRExC_state);
fac92740 4299 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4300 break;
4301 case 'S':
eb160463 4302 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4303 *flagp |= HASWIDTH|SIMPLE;
830247a4 4304 nextchar(pRExC_state);
fac92740 4305 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4306 break;
4307 case 'd':
ffc61ed2 4308 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4309 *flagp |= HASWIDTH|SIMPLE;
830247a4 4310 nextchar(pRExC_state);
fac92740 4311 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4312 break;
4313 case 'D':
ffc61ed2 4314 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4315 *flagp |= HASWIDTH|SIMPLE;
830247a4 4316 nextchar(pRExC_state);
fac92740 4317 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4318 break;
a14b48bc
LW
4319 case 'p':
4320 case 'P':
3568d838 4321 {
3dab1dad 4322 char* const oldregxend = RExC_end;
ccb2c380 4323 char* parse_start = RExC_parse - 2;
a14b48bc 4324
830247a4 4325 if (RExC_parse[1] == '{') {
3568d838 4326 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4327 RExC_end = strchr(RExC_parse, '}');
4328 if (!RExC_end) {
3dab1dad 4329 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4330 RExC_parse += 2;
4331 RExC_end = oldregxend;
0da60cf5 4332 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4333 }
830247a4 4334 RExC_end++;
a14b48bc 4335 }
af6f566e 4336 else {
830247a4 4337 RExC_end = RExC_parse + 2;
af6f566e
HS
4338 if (RExC_end > oldregxend)
4339 RExC_end = oldregxend;
4340 }
830247a4 4341 RExC_parse--;
a14b48bc 4342
3dab1dad 4343 ret = regclass(pRExC_state,depth+1);
a14b48bc 4344
830247a4
IZ
4345 RExC_end = oldregxend;
4346 RExC_parse--;
ccb2c380
MP
4347
4348 Set_Node_Offset(ret, parse_start + 2);
4349 Set_Node_Cur_Length(ret);
830247a4 4350 nextchar(pRExC_state);
a14b48bc
LW
4351 *flagp |= HASWIDTH|SIMPLE;
4352 }
4353 break;
a0d0e21e
LW
4354 case 'n':
4355 case 'r':
4356 case 't':
4357 case 'f':
4358 case 'e':
4359 case 'a':
4360 case 'x':
4361 case 'c':
4362 case '0':
4363 goto defchar;
4364 case '1': case '2': case '3': case '4':
4365 case '5': case '6': case '7': case '8': case '9':
4366 {
1df70142 4367 const I32 num = atoi(RExC_parse);
a0d0e21e 4368
830247a4 4369 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4370 goto defchar;
4371 else {
3dab1dad 4372 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4373 while (isDIGIT(*RExC_parse))
4374 RExC_parse++;
b45f050a 4375
eb160463 4376 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4377 vFAIL("Reference to nonexistent group");
830247a4 4378 RExC_sawback = 1;
eb160463
GS
4379 ret = reganode(pRExC_state,
4380 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4381 num);
a0d0e21e 4382 *flagp |= HASWIDTH;
2af232bd 4383
fac92740 4384 /* override incorrect value set in reganode MJD */
2af232bd 4385 Set_Node_Offset(ret, parse_start+1);
fac92740 4386 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4387 RExC_parse--;
4388 nextchar(pRExC_state);
a0d0e21e
LW
4389 }
4390 }
4391 break;
4392 case '\0':
830247a4 4393 if (RExC_parse >= RExC_end)
b45f050a 4394 FAIL("Trailing \\");
a0d0e21e
LW
4395 /* FALL THROUGH */
4396 default:
a0288114 4397 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4398 back into the quick-grab loop below */
45948336 4399 parse_start--;
a0d0e21e
LW
4400 goto defchar;
4401 }
4402 break;
4633a7c4
LW
4403
4404 case '#':
e2509266 4405 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4406 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4407 RExC_parse++;
830247a4 4408 if (RExC_parse < RExC_end)
4633a7c4
LW
4409 goto tryagain;
4410 }
4411 /* FALL THROUGH */
4412
a0d0e21e 4413 default: {
ba210ebe 4414 register STRLEN len;
58ae7d3f 4415 register UV ender;
a0d0e21e 4416 register char *p;
3dab1dad 4417 char *s;
80aecb99 4418 STRLEN foldlen;
89ebb4a3 4419 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4420
4421 parse_start = RExC_parse - 1;
a0d0e21e 4422
830247a4 4423 RExC_parse++;
a0d0e21e
LW
4424
4425 defchar:
58ae7d3f 4426 ender = 0;
eb160463
GS
4427 ret = reg_node(pRExC_state,
4428 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4429 s = STRING(ret);
830247a4
IZ
4430 for (len = 0, p = RExC_parse - 1;
4431 len < 127 && p < RExC_end;
a0d0e21e
LW
4432 len++)
4433 {
3dab1dad 4434 char * const oldp = p;
5b5a24f7 4435
e2509266 4436 if (RExC_flags & PMf_EXTENDED)
830247a4 4437 p = regwhite(p, RExC_end);
a0d0e21e
LW
4438 switch (*p) {
4439 case '^':
4440 case '$':
4441 case '.':
4442 case '[':
4443 case '(':
4444 case ')':
4445 case '|':
4446 goto loopdone;
4447 case '\\':
4448 switch (*++p) {
4449 case 'A':
1ed8eac0
JF
4450 case 'C':
4451 case 'X':
a0d0e21e
LW
4452 case 'G':
4453 case 'Z':
b85d18e9 4454 case 'z':
a0d0e21e
LW
4455 case 'w':
4456 case 'W':
4457 case 'b':
4458 case 'B':
4459 case 's':
4460 case 'S':
4461 case 'd':
4462 case 'D':
a14b48bc
LW
4463 case 'p':
4464 case 'P':
a0d0e21e
LW
4465 --p;
4466 goto loopdone;
4467 case 'n':
4468 ender = '\n';
4469 p++;
a687059c 4470 break;
a0d0e21e
LW
4471 case 'r':
4472 ender = '\r';
4473 p++;
a687059c 4474 break;
a0d0e21e
LW
4475 case 't':
4476 ender = '\t';
4477 p++;
a687059c 4478 break;
a0d0e21e
LW
4479 case 'f':
4480 ender = '\f';
4481 p++;
a687059c 4482 break;
a0d0e21e 4483 case 'e':
c7f1f016 4484 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4485 p++;
a687059c 4486 break;
a0d0e21e 4487 case 'a':
c7f1f016 4488 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4489 p++;
a687059c 4490 break;
a0d0e21e 4491 case 'x':
a0ed51b3 4492 if (*++p == '{') {
1df70142 4493 char* const e = strchr(p, '}');
b81d288d 4494
b45f050a 4495 if (!e) {
830247a4 4496 RExC_parse = p + 1;
b45f050a
JF
4497 vFAIL("Missing right brace on \\x{}");
4498 }
de5f0749 4499 else {
a4c04bdc
NC
4500 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4501 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4502 STRLEN numlen = e - p - 1;
53305cf1 4503 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4504 if (ender > 0xff)
4505 RExC_utf8 = 1;
a0ed51b3
LW
4506 p = e + 1;
4507 }
a0ed51b3
LW
4508 }
4509 else {
a4c04bdc 4510 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4511 STRLEN numlen = 2;
53305cf1 4512 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4513 p += numlen;
4514 }
a687059c 4515 break;
a0d0e21e
LW
4516 case 'c':
4517 p++;
bbce6d69 4518 ender = UCHARAT(p++);
4519 ender = toCTRL(ender);
a687059c 4520 break;
a0d0e21e
LW
4521 case '0': case '1': case '2': case '3':case '4':
4522 case '5': case '6': case '7': case '8':case '9':
4523 if (*p == '0' ||
830247a4 4524 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4525 I32 flags = 0;
1df70142 4526 STRLEN numlen = 3;
53305cf1 4527 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4528 p += numlen;
4529 }
4530 else {
4531 --p;
4532 goto loopdone;
a687059c
LW
4533 }
4534 break;
a0d0e21e 4535 case '\0':
830247a4 4536 if (p >= RExC_end)
b45f050a 4537 FAIL("Trailing \\");
a687059c 4538 /* FALL THROUGH */
a0d0e21e 4539 default:
041457d9 4540 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4541 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4542 goto normal_default;
a0d0e21e
LW
4543 }
4544 break;
a687059c 4545 default:
a0ed51b3 4546 normal_default:
fd400ab9 4547 if (UTF8_IS_START(*p) && UTF) {
1df70142 4548 STRLEN numlen;
5e12f4fb 4549 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4550 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4551 p += numlen;
4552 }
4553 else
4554 ender = *p++;
a0d0e21e 4555 break;
a687059c 4556 }
e2509266 4557 if (RExC_flags & PMf_EXTENDED)
830247a4 4558 p = regwhite(p, RExC_end);
60a8b682
JH
4559 if (UTF && FOLD) {
4560 /* Prime the casefolded buffer. */
ac7e0132 4561 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4562 }
a0d0e21e
LW
4563 if (ISMULT2(p)) { /* Back off on ?+*. */
4564 if (len)
4565 p = oldp;
16ea2a2e 4566 else if (UTF) {
80aecb99 4567 if (FOLD) {
60a8b682 4568 /* Emit all the Unicode characters. */
1df70142 4569 STRLEN numlen;
80aecb99
JH
4570 for (foldbuf = tmpbuf;
4571 foldlen;
4572 foldlen -= numlen) {
4573 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4574 if (numlen > 0) {
71207a34 4575 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4576 s += unilen;
4577 len += unilen;
4578 /* In EBCDIC the numlen
4579 * and unilen can differ. */
9dc45d57 4580 foldbuf += numlen;
47654450
JH
4581 if (numlen >= foldlen)
4582 break;
9dc45d57
JH
4583 }
4584 else
4585 break; /* "Can't happen." */
80aecb99
JH
4586 }
4587 }
4588 else {
71207a34 4589 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4590 if (unilen > 0) {
0ebc6274
JH
4591 s += unilen;
4592 len += unilen;
9dc45d57 4593 }
80aecb99 4594 }
a0ed51b3 4595 }
a0d0e21e
LW
4596 else {
4597 len++;
eb160463 4598 REGC((char)ender, s++);
a0d0e21e
LW
4599 }
4600 break;
a687059c 4601 }
16ea2a2e 4602 if (UTF) {
80aecb99 4603 if (FOLD) {
60a8b682 4604 /* Emit all the Unicode characters. */
1df70142 4605 STRLEN numlen;
80aecb99
JH
4606 for (foldbuf = tmpbuf;
4607 foldlen;
4608 foldlen -= numlen) {
4609 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4610 if (numlen > 0) {
71207a34 4611 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4612 len += unilen;
4613 s += unilen;
4614 /* In EBCDIC the numlen
4615 * and unilen can differ. */
9dc45d57 4616 foldbuf += numlen;
47654450
JH
4617 if (numlen >= foldlen)
4618 break;
9dc45d57
JH
4619 }
4620 else
4621 break;
80aecb99
JH
4622 }
4623 }
4624 else {
71207a34 4625 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4626 if (unilen > 0) {
0ebc6274
JH
4627 s += unilen;
4628 len += unilen;
9dc45d57 4629 }
80aecb99
JH
4630 }
4631 len--;
a0ed51b3
LW
4632 }
4633 else
eb160463 4634 REGC((char)ender, s++);
a0d0e21e
LW
4635 }
4636 loopdone:
830247a4 4637 RExC_parse = p - 1;
fac92740 4638 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4639 nextchar(pRExC_state);
793db0cb
JH
4640 {
4641 /* len is STRLEN which is unsigned, need to copy to signed */
4642 IV iv = len;
4643 if (iv < 0)
4644 vFAIL("Internal disaster");
4645 }
a0d0e21e
LW
4646 if (len > 0)
4647 *flagp |= HASWIDTH;
090f7165 4648 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4649 *flagp |= SIMPLE;
3dab1dad 4650
cd439c50 4651 if (SIZE_ONLY)
830247a4 4652 RExC_size += STR_SZ(len);
3dab1dad
YO
4653 else {
4654 STR_LEN(ret) = len;
830247a4 4655 RExC_emit += STR_SZ(len);
a687059c 4656 }
3dab1dad 4657 }
a0d0e21e
LW
4658 break;
4659 }
a687059c 4660
60a8b682
JH
4661 /* If the encoding pragma is in effect recode the text of
4662 * any EXACT-kind nodes. */
3dab1dad
YO
4663 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4664 const STRLEN oldlen = STR_LEN(ret);
4665 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
4666
4667 if (RExC_utf8)
4668 SvUTF8_on(sv);
4669 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4670 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4671 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4672
4673 if (SvUTF8(sv))
4674 RExC_utf8 = 1;
4675 if (!SIZE_ONLY) {
a3621e74
YO
4676 GET_RE_DEBUG_FLAGS_DECL;
4677 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4678 (int)oldlen, STRING(ret),
4679 (int)newlen, s));
4680 Copy(s, STRING(ret), newlen, char);
4681 STR_LEN(ret) += newlen - oldlen;
4682 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4683 } else
4684 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4685 }
a72c7584
JH
4686 }
4687
a0d0e21e 4688 return(ret);
a687059c
LW
4689}
4690
873ef191 4691STATIC char *
5f66b61c 4692S_regwhite(char *p, const char *e)
5b5a24f7
CS
4693{
4694 while (p < e) {
4695 if (isSPACE(*p))
4696 ++p;
4697 else if (*p == '#') {
4698 do {
4699 p++;
4700 } while (p < e && *p != '\n');
4701 }
4702 else
4703 break;
4704 }
4705 return p;
4706}
4707
b8c5462f
JH
4708/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4709 Character classes ([:foo:]) can also be negated ([:^foo:]).
4710 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4711 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4712 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4713
4714#define POSIXCC_DONE(c) ((c) == ':')
4715#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4716#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4717
b8c5462f 4718STATIC I32
830247a4 4719S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4720{
97aff369 4721 dVAR;
936ed897 4722 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4723
830247a4 4724 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4725 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4726 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4727 const char c = UCHARAT(RExC_parse);
097eb12c 4728 char* const s = RExC_parse++;
b81d288d 4729
9a86a77b 4730 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4731 RExC_parse++;
4732 if (RExC_parse == RExC_end)
620e46c5 4733 /* Grandfather lone [:, [=, [. */
830247a4 4734 RExC_parse = s;
620e46c5 4735 else {
3dab1dad 4736 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
4737 assert(*t == c);
4738
9a86a77b 4739 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 4740 const char *posixcc = s + 1;
830247a4 4741 RExC_parse++; /* skip over the ending ] */
3dab1dad 4742
b8c5462f 4743 if (*s == ':') {
1df70142
AL
4744 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4745 const I32 skip = t - posixcc;
80916619
NC
4746
4747 /* Initially switch on the length of the name. */
4748 switch (skip) {
4749 case 4:
3dab1dad
YO
4750 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
4751 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 4752 break;
80916619
NC
4753 case 5:
4754 /* Names all of length 5. */
4755 /* alnum alpha ascii blank cntrl digit graph lower
4756 print punct space upper */
4757 /* Offset 4 gives the best switch position. */
4758 switch (posixcc[4]) {
4759 case 'a':
3dab1dad
YO
4760 if (memEQ(posixcc, "alph", 4)) /* alpha */
4761 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
4762 break;
4763 case 'e':
3dab1dad
YO
4764 if (memEQ(posixcc, "spac", 4)) /* space */
4765 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
4766 break;
4767 case 'h':
3dab1dad
YO
4768 if (memEQ(posixcc, "grap", 4)) /* graph */
4769 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
4770 break;
4771 case 'i':
3dab1dad
YO
4772 if (memEQ(posixcc, "asci", 4)) /* ascii */
4773 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
4774 break;
4775 case 'k':
3dab1dad
YO
4776 if (memEQ(posixcc, "blan", 4)) /* blank */
4777 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
4778 break;
4779 case 'l':
3dab1dad
YO
4780 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
4781 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
4782 break;
4783 case 'm':
3dab1dad
YO
4784 if (memEQ(posixcc, "alnu", 4)) /* alnum */
4785 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
4786 break;
4787 case 'r':
3dab1dad
YO
4788 if (memEQ(posixcc, "lowe", 4)) /* lower */
4789 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4790 else if (memEQ(posixcc, "uppe", 4)) /* upper */
4791 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
4792 break;
4793 case 't':
3dab1dad
YO
4794 if (memEQ(posixcc, "digi", 4)) /* digit */
4795 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4796 else if (memEQ(posixcc, "prin", 4)) /* print */
4797 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4798 else if (memEQ(posixcc, "punc", 4)) /* punct */
4799 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 4800 break;
b8c5462f
JH
4801 }
4802 break;
80916619 4803 case 6:
3dab1dad
YO
4804 if (memEQ(posixcc, "xdigit", 6))
4805 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4806 break;
4807 }
80916619
NC
4808
4809 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4810 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4811 t - s - 1, s + 1);
80916619
NC
4812 assert (posixcc[skip] == ':');
4813 assert (posixcc[skip+1] == ']');
b45f050a 4814 } else if (!SIZE_ONLY) {
b8c5462f 4815 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4816
830247a4 4817 /* adjust RExC_parse so the warning shows after
b45f050a 4818 the class closes */
9a86a77b 4819 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4820 RExC_parse++;
b45f050a
JF
4821 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4822 }
b8c5462f
JH
4823 } else {
4824 /* Maternal grandfather:
4825 * "[:" ending in ":" but not in ":]" */
830247a4 4826 RExC_parse = s;
767d463e 4827 }
620e46c5
JH
4828 }
4829 }
4830
b8c5462f
JH
4831 return namedclass;
4832}
4833
4834STATIC void
830247a4 4835S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4836{
97aff369 4837 dVAR;
3dab1dad 4838 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4839 const char *s = RExC_parse;
4840 const char c = *s++;
b8c5462f 4841
3dab1dad 4842 while (isALNUM(*s))
b8c5462f
JH
4843 s++;
4844 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4845 if (ckWARN(WARN_REGEXP))
4846 vWARN3(s+2,
4847 "POSIX syntax [%c %c] belongs inside character classes",
4848 c, c);
b45f050a
JF
4849
4850 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4851 if (POSIXCC_NOTYET(c)) {
830247a4 4852 /* adjust RExC_parse so the error shows after
b45f050a 4853 the class closes */
9a86a77b 4854 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 4855 NOOP;
b45f050a
JF
4856 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4857 }
b8c5462f
JH
4858 }
4859 }
620e46c5
JH
4860}
4861
7f6f358c
YO
4862
4863/*
4864 parse a class specification and produce either an ANYOF node that
4865 matches the pattern. If the pattern matches a single char only and
4866 that char is < 256 then we produce an EXACT node instead.
4867*/
76e3520e 4868STATIC regnode *
3dab1dad 4869S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 4870{
97aff369 4871 dVAR;
ffc61ed2 4872 register UV value;
9a86a77b 4873 register UV nextvalue;
3568d838 4874 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4875 register IV range = 0;
c277df42 4876 register regnode *ret;
ba210ebe 4877 STRLEN numlen;
ffc61ed2 4878 IV namedclass;
cbbf8932 4879 char *rangebegin = NULL;
936ed897 4880 bool need_class = 0;
c445ea15 4881 SV *listsv = NULL;
ffc61ed2 4882 UV n;
9e55ce06 4883 bool optimize_invert = TRUE;
cbbf8932 4884 AV* unicode_alternate = NULL;
1b2d223b
JH
4885#ifdef EBCDIC
4886 UV literal_endpoint = 0;
4887#endif
7f6f358c 4888 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 4889
3dab1dad 4890 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 4891 case we need to change the emitted regop to an EXACT. */
3dab1dad
YO
4892 GET_RE_DEBUG_FLAGS_DECL;
4893 DEBUG_PARSE("clas");
7f6f358c
YO
4894
4895 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
4896 ret = reganode(pRExC_state, ANYOF, 0);
4897
4898 if (!SIZE_ONLY)
4899 ANYOF_FLAGS(ret) = 0;
4900
9a86a77b 4901 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4902 RExC_naughty++;
4903 RExC_parse++;
4904 if (!SIZE_ONLY)
4905 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4906 }
a0d0e21e 4907
73060fc4 4908 if (SIZE_ONLY) {
830247a4 4909 RExC_size += ANYOF_SKIP;
73060fc4
JH
4910 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4911 }
936ed897 4912 else {
830247a4 4913 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4914 if (FOLD)
4915 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4916 if (LOC)
4917 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4918 ANYOF_BITMAP_ZERO(ret);
396482e1 4919 listsv = newSVpvs("# comment\n");
a0d0e21e 4920 }
b8c5462f 4921
9a86a77b
JH
4922 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4923
b938889d 4924 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4925 checkposixcc(pRExC_state);
b8c5462f 4926
f064b6ad
HS
4927 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4928 if (UCHARAT(RExC_parse) == ']')
4929 goto charclassloop;
ffc61ed2 4930
9a86a77b 4931 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4932
4933 charclassloop:
4934
4935 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4936
73b437c8 4937 if (!range)
830247a4 4938 rangebegin = RExC_parse;
ffc61ed2 4939 if (UTF) {
5e12f4fb 4940 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 4941 RExC_end - RExC_parse,
9f7f3913 4942 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4943 RExC_parse += numlen;
4944 }
4945 else
4946 value = UCHARAT(RExC_parse++);
7f6f358c 4947
9a86a77b
JH
4948 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4949 if (value == '[' && POSIXCC(nextvalue))
830247a4 4950 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4951 else if (value == '\\') {
ffc61ed2 4952 if (UTF) {
5e12f4fb 4953 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 4954 RExC_end - RExC_parse,
9f7f3913 4955 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4956 RExC_parse += numlen;
4957 }
4958 else
4959 value = UCHARAT(RExC_parse++);
470c3474 4960 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4961 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4962 * be a problem later if we want switch on Unicode.
4963 * A similar issue a little bit later when switching on
4964 * namedclass. --jhi */
ffc61ed2 4965 switch ((I32)value) {
b8c5462f
JH
4966 case 'w': namedclass = ANYOF_ALNUM; break;
4967 case 'W': namedclass = ANYOF_NALNUM; break;
4968 case 's': namedclass = ANYOF_SPACE; break;
4969 case 'S': namedclass = ANYOF_NSPACE; break;
4970 case 'd': namedclass = ANYOF_DIGIT; break;
4971 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4972 case 'p':
4973 case 'P':
3dab1dad
YO
4974 {
4975 char *e;
af6f566e 4976 if (RExC_parse >= RExC_end)
2a4859cd 4977 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4978 if (*RExC_parse == '{') {
1df70142 4979 const U8 c = (U8)value;
ffc61ed2
JH
4980 e = strchr(RExC_parse++, '}');
4981 if (!e)
0da60cf5 4982 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4983 while (isSPACE(UCHARAT(RExC_parse)))
4984 RExC_parse++;
4985 if (e == RExC_parse)
0da60cf5 4986 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4987 n = e - RExC_parse;
ab13f0c7
JH
4988 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4989 n--;
ffc61ed2
JH
4990 }
4991 else {
4992 e = RExC_parse;
4993 n = 1;
4994 }
4995 if (!SIZE_ONLY) {
ab13f0c7
JH
4996 if (UCHARAT(RExC_parse) == '^') {
4997 RExC_parse++;
4998 n--;
4999 value = value == 'p' ? 'P' : 'p'; /* toggle */
5000 while (isSPACE(UCHARAT(RExC_parse))) {
5001 RExC_parse++;
5002 n--;
5003 }
5004 }
097eb12c
AL
5005 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5006 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5007 }
5008 RExC_parse = e + 1;
5009 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5010 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5011 }
f81125e2 5012 break;
b8c5462f
JH
5013 case 'n': value = '\n'; break;
5014 case 'r': value = '\r'; break;
5015 case 't': value = '\t'; break;
5016 case 'f': value = '\f'; break;
5017 case 'b': value = '\b'; break;
c7f1f016
NIS
5018 case 'e': value = ASCII_TO_NATIVE('\033');break;
5019 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5020 case 'x':
ffc61ed2 5021 if (*RExC_parse == '{') {
a4c04bdc
NC
5022 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5023 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5024 char * const e = strchr(RExC_parse++, '}');
b81d288d 5025 if (!e)
ffc61ed2 5026 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5027
5028 numlen = e - RExC_parse;
5029 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5030 RExC_parse = e + 1;
5031 }
5032 else {
a4c04bdc 5033 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5034 numlen = 2;
5035 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5036 RExC_parse += numlen;
5037 }
b8c5462f
JH
5038 break;
5039 case 'c':
830247a4 5040 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5041 value = toCTRL(value);
5042 break;
5043 case '0': case '1': case '2': case '3': case '4':
5044 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5045 {
5046 I32 flags = 0;
5047 numlen = 3;
5048 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5049 RExC_parse += numlen;
b8c5462f 5050 break;
53305cf1 5051 }
1028017a 5052 default:
041457d9 5053 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5054 vWARN2(RExC_parse,
5055 "Unrecognized escape \\%c in character class passed through",
5056 (int)value);
1028017a 5057 break;
b8c5462f 5058 }
ffc61ed2 5059 } /* end of \blah */
1b2d223b
JH
5060#ifdef EBCDIC
5061 else
5062 literal_endpoint++;
5063#endif
ffc61ed2
JH
5064
5065 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5066
5067 if (!SIZE_ONLY && !need_class)
936ed897 5068 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5069
936ed897 5070 need_class = 1;
ffc61ed2
JH
5071
5072 /* a bad range like a-\d, a-[:digit:] ? */
5073 if (range) {
73b437c8 5074 if (!SIZE_ONLY) {
afd78fd5 5075 if (ckWARN(WARN_REGEXP)) {
097eb12c 5076 const int w =
afd78fd5
JH
5077 RExC_parse >= rangebegin ?
5078 RExC_parse - rangebegin : 0;
830247a4 5079 vWARN4(RExC_parse,
b45f050a 5080 "False [] range \"%*.*s\"",
097eb12c 5081 w, w, rangebegin);
afd78fd5 5082 }
3568d838
JH
5083 if (prevvalue < 256) {
5084 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5085 ANYOF_BITMAP_SET(ret, '-');
5086 }
5087 else {
5088 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5089 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5090 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5091 }
b8c5462f 5092 }
ffc61ed2
JH
5093
5094 range = 0; /* this was not a true range */
73b437c8 5095 }
ffc61ed2 5096
73b437c8 5097 if (!SIZE_ONLY) {
c49a72a9
NC
5098 const char *what = NULL;
5099 char yesno = 0;
5100
3568d838
JH
5101 if (namedclass > OOB_NAMEDCLASS)
5102 optimize_invert = FALSE;
e2962f66
JH
5103 /* Possible truncation here but in some 64-bit environments
5104 * the compiler gets heartburn about switch on 64-bit values.
5105 * A similar issue a little earlier when switching on value.
98f323fa 5106 * --jhi */
e2962f66 5107 switch ((I32)namedclass) {
73b437c8
JH
5108 case ANYOF_ALNUM:
5109 if (LOC)
936ed897 5110 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5111 else {
5112 for (value = 0; value < 256; value++)
5113 if (isALNUM(value))
936ed897 5114 ANYOF_BITMAP_SET(ret, value);
73b437c8 5115 }
c49a72a9
NC
5116 yesno = '+';
5117 what = "Word";
73b437c8
JH
5118 break;
5119 case ANYOF_NALNUM:
5120 if (LOC)
936ed897 5121 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5122 else {
5123 for (value = 0; value < 256; value++)
5124 if (!isALNUM(value))
936ed897 5125 ANYOF_BITMAP_SET(ret, value);
73b437c8 5126 }
c49a72a9
NC
5127 yesno = '!';
5128 what = "Word";
73b437c8 5129 break;
ffc61ed2 5130 case ANYOF_ALNUMC:
73b437c8 5131 if (LOC)
ffc61ed2 5132 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5133 else {
5134 for (value = 0; value < 256; value++)
ffc61ed2 5135 if (isALNUMC(value))
936ed897 5136 ANYOF_BITMAP_SET(ret, value);
73b437c8 5137 }
c49a72a9
NC
5138 yesno = '+';
5139 what = "Alnum";
73b437c8
JH
5140 break;
5141 case ANYOF_NALNUMC:
5142 if (LOC)
936ed897 5143 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5144 else {
5145 for (value = 0; value < 256; value++)
5146 if (!isALNUMC(value))
936ed897 5147 ANYOF_BITMAP_SET(ret, value);
73b437c8 5148 }
c49a72a9
NC
5149 yesno = '!';
5150 what = "Alnum";
73b437c8
JH
5151 break;
5152 case ANYOF_ALPHA:
5153 if (LOC)
936ed897 5154 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5155 else {
5156 for (value = 0; value < 256; value++)
5157 if (isALPHA(value))
936ed897 5158 ANYOF_BITMAP_SET(ret, value);
73b437c8 5159 }
c49a72a9
NC
5160 yesno = '+';
5161 what = "Alpha";
73b437c8
JH
5162 break;
5163 case ANYOF_NALPHA:
5164 if (LOC)
936ed897 5165 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5166 else {
5167 for (value = 0; value < 256; value++)
5168 if (!isALPHA(value))
936ed897 5169 ANYOF_BITMAP_SET(ret, value);
73b437c8 5170 }
c49a72a9
NC
5171 yesno = '!';
5172 what = "Alpha";
73b437c8
JH
5173 break;
5174 case ANYOF_ASCII:
5175 if (LOC)
936ed897 5176 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5177 else {
c7f1f016 5178#ifndef EBCDIC
1ba5c669
JH
5179 for (value = 0; value < 128; value++)
5180 ANYOF_BITMAP_SET(ret, value);
5181#else /* EBCDIC */
ffbc6a93 5182 for (value = 0; value < 256; value++) {
3a3c4447
JH
5183 if (isASCII(value))
5184 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5185 }
1ba5c669 5186#endif /* EBCDIC */
73b437c8 5187 }
c49a72a9
NC
5188 yesno = '+';
5189 what = "ASCII";
73b437c8
JH
5190 break;
5191 case ANYOF_NASCII:
5192 if (LOC)
936ed897 5193 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5194 else {
c7f1f016 5195#ifndef EBCDIC
1ba5c669
JH
5196 for (value = 128; value < 256; value++)
5197 ANYOF_BITMAP_SET(ret, value);
5198#else /* EBCDIC */
ffbc6a93 5199 for (value = 0; value < 256; value++) {
3a3c4447
JH
5200 if (!isASCII(value))
5201 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5202 }
1ba5c669 5203#endif /* EBCDIC */
73b437c8 5204 }
c49a72a9
NC
5205 yesno = '!';
5206 what = "ASCII";
73b437c8 5207 break;
aaa51d5e
JF
5208 case ANYOF_BLANK:
5209 if (LOC)
5210 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5211 else {
5212 for (value = 0; value < 256; value++)
5213 if (isBLANK(value))
5214 ANYOF_BITMAP_SET(ret, value);
5215 }
c49a72a9
NC
5216 yesno = '+';
5217 what = "Blank";
aaa51d5e
JF
5218 break;
5219 case ANYOF_NBLANK:
5220 if (LOC)
5221 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5222 else {
5223 for (value = 0; value < 256; value++)
5224 if (!isBLANK(value))
5225 ANYOF_BITMAP_SET(ret, value);
5226 }
c49a72a9
NC
5227 yesno = '!';
5228 what = "Blank";
aaa51d5e 5229 break;
73b437c8
JH
5230 case ANYOF_CNTRL:
5231 if (LOC)
936ed897 5232 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5233 else {
5234 for (value = 0; value < 256; value++)
5235 if (isCNTRL(value))
936ed897 5236 ANYOF_BITMAP_SET(ret, value);
73b437c8 5237 }
c49a72a9
NC
5238 yesno = '+';
5239 what = "Cntrl";
73b437c8
JH
5240 break;
5241 case ANYOF_NCNTRL:
5242 if (LOC)
936ed897 5243 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5244 else {
5245 for (value = 0; value < 256; value++)
5246 if (!isCNTRL(value))
936ed897 5247 ANYOF_BITMAP_SET(ret, value);
73b437c8 5248 }
c49a72a9
NC
5249 yesno = '!';
5250 what = "Cntrl";
ffc61ed2
JH
5251 break;
5252 case ANYOF_DIGIT:
5253 if (LOC)
5254 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5255 else {
5256 /* consecutive digits assumed */
5257 for (value = '0'; value <= '9'; value++)
5258 ANYOF_BITMAP_SET(ret, value);
5259 }
c49a72a9
NC
5260 yesno = '+';
5261 what = "Digit";
ffc61ed2
JH
5262 break;
5263 case ANYOF_NDIGIT:
5264 if (LOC)
5265 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5266 else {
5267 /* consecutive digits assumed */
5268 for (value = 0; value < '0'; value++)
5269 ANYOF_BITMAP_SET(ret, value);
5270 for (value = '9' + 1; value < 256; value++)
5271 ANYOF_BITMAP_SET(ret, value);
5272 }
c49a72a9
NC
5273 yesno = '!';
5274 what = "Digit";
73b437c8
JH
5275 break;
5276 case ANYOF_GRAPH:
5277 if (LOC)
936ed897 5278 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5279 else {
5280 for (value = 0; value < 256; value++)
5281 if (isGRAPH(value))
936ed897 5282 ANYOF_BITMAP_SET(ret, value);
73b437c8 5283 }
c49a72a9
NC
5284 yesno = '+';
5285 what = "Graph";
73b437c8
JH
5286 break;
5287 case ANYOF_NGRAPH:
5288 if (LOC)
936ed897 5289 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5290 else {
5291 for (value = 0; value < 256; value++)
5292 if (!isGRAPH(value))
936ed897 5293 ANYOF_BITMAP_SET(ret, value);
73b437c8 5294 }
c49a72a9
NC
5295 yesno = '!';
5296 what = "Graph";
73b437c8
JH
5297 break;
5298 case ANYOF_LOWER:
5299 if (LOC)
936ed897 5300 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5301 else {
5302 for (value = 0; value < 256; value++)
5303 if (isLOWER(value))
936ed897 5304 ANYOF_BITMAP_SET(ret, value);
73b437c8 5305 }
c49a72a9
NC
5306 yesno = '+';
5307 what = "Lower";
73b437c8
JH
5308 break;
5309 case ANYOF_NLOWER:
5310 if (LOC)
936ed897 5311 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5312 else {
5313 for (value = 0; value < 256; value++)
5314 if (!isLOWER(value))
936ed897 5315 ANYOF_BITMAP_SET(ret, value);
73b437c8 5316 }
c49a72a9
NC
5317 yesno = '!';
5318 what = "Lower";
73b437c8
JH
5319 break;
5320 case ANYOF_PRINT:
5321 if (LOC)
936ed897 5322 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5323 else {
5324 for (value = 0; value < 256; value++)
5325 if (isPRINT(value))
936ed897 5326 ANYOF_BITMAP_SET(ret, value);
73b437c8 5327 }
c49a72a9
NC
5328 yesno = '+';
5329 what = "Print";
73b437c8
JH
5330 break;
5331 case ANYOF_NPRINT:
5332 if (LOC)
936ed897 5333 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5334 else {
5335 for (value = 0; value < 256; value++)
5336 if (!isPRINT(value))
936ed897 5337 ANYOF_BITMAP_SET(ret, value);
73b437c8 5338 }
c49a72a9
NC
5339 yesno = '!';
5340 what = "Print";
73b437c8 5341 break;
aaa51d5e
JF
5342 case ANYOF_PSXSPC:
5343 if (LOC)
5344 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5345 else {
5346 for (value = 0; value < 256; value++)
5347 if (isPSXSPC(value))
5348 ANYOF_BITMAP_SET(ret, value);
5349 }
c49a72a9
NC
5350 yesno = '+';
5351 what = "Space";
aaa51d5e
JF
5352 break;
5353 case ANYOF_NPSXSPC:
5354 if (LOC)
5355 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5356 else {
5357 for (value = 0; value < 256; value++)
5358 if (!isPSXSPC(value))
5359 ANYOF_BITMAP_SET(ret, value);
5360 }
c49a72a9
NC
5361 yesno = '!';
5362 what = "Space";
aaa51d5e 5363 break;
73b437c8
JH
5364 case ANYOF_PUNCT:
5365 if (LOC)
936ed897 5366 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5367 else {
5368 for (value = 0; value < 256; value++)
5369 if (isPUNCT(value))
936ed897 5370 ANYOF_BITMAP_SET(ret, value);
73b437c8 5371 }
c49a72a9
NC
5372 yesno = '+';
5373 what = "Punct";
73b437c8
JH
5374 break;
5375 case ANYOF_NPUNCT:
5376 if (LOC)
936ed897 5377 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5378 else {
5379 for (value = 0; value < 256; value++)
5380 if (!isPUNCT(value))
936ed897 5381 ANYOF_BITMAP_SET(ret, value);
73b437c8 5382 }
c49a72a9
NC
5383 yesno = '!';
5384 what = "Punct";
ffc61ed2
JH
5385 break;
5386 case ANYOF_SPACE:
5387 if (LOC)
5388 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5389 else {
5390 for (value = 0; value < 256; value++)
5391 if (isSPACE(value))
5392 ANYOF_BITMAP_SET(ret, value);
5393 }
c49a72a9
NC
5394 yesno = '+';
5395 what = "SpacePerl";
ffc61ed2
JH
5396 break;
5397 case ANYOF_NSPACE:
5398 if (LOC)
5399 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5400 else {
5401 for (value = 0; value < 256; value++)
5402 if (!isSPACE(value))
5403 ANYOF_BITMAP_SET(ret, value);
5404 }
c49a72a9
NC
5405 yesno = '!';
5406 what = "SpacePerl";
73b437c8
JH
5407 break;
5408 case ANYOF_UPPER:
5409 if (LOC)
936ed897 5410 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5411 else {
5412 for (value = 0; value < 256; value++)
5413 if (isUPPER(value))
936ed897 5414 ANYOF_BITMAP_SET(ret, value);
73b437c8 5415 }
c49a72a9
NC
5416 yesno = '+';
5417 what = "Upper";
73b437c8
JH
5418 break;
5419 case ANYOF_NUPPER:
5420 if (LOC)
936ed897 5421 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5422 else {
5423 for (value = 0; value < 256; value++)
5424 if (!isUPPER(value))
936ed897 5425 ANYOF_BITMAP_SET(ret, value);
73b437c8 5426 }
c49a72a9
NC
5427 yesno = '!';
5428 what = "Upper";
73b437c8
JH
5429 break;
5430 case ANYOF_XDIGIT:
5431 if (LOC)
936ed897 5432 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5433 else {
5434 for (value = 0; value < 256; value++)
5435 if (isXDIGIT(value))
936ed897 5436 ANYOF_BITMAP_SET(ret, value);
73b437c8 5437 }
c49a72a9
NC
5438 yesno = '+';
5439 what = "XDigit";
73b437c8
JH
5440 break;
5441 case ANYOF_NXDIGIT:
5442 if (LOC)
936ed897 5443 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5444 else {
5445 for (value = 0; value < 256; value++)
5446 if (!isXDIGIT(value))
936ed897 5447 ANYOF_BITMAP_SET(ret, value);
73b437c8 5448 }
c49a72a9
NC
5449 yesno = '!';
5450 what = "XDigit";
73b437c8 5451 break;
f81125e2
JP
5452 case ANYOF_MAX:
5453 /* this is to handle \p and \P */
5454 break;
73b437c8 5455 default:
b45f050a 5456 vFAIL("Invalid [::] class");
73b437c8 5457 break;
b8c5462f 5458 }
c49a72a9
NC
5459 if (what) {
5460 /* Strings such as "+utf8::isWord\n" */
5461 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5462 }
b8c5462f 5463 if (LOC)
936ed897 5464 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5465 continue;
a0d0e21e 5466 }
ffc61ed2
JH
5467 } /* end of namedclass \blah */
5468
a0d0e21e 5469 if (range) {
eb160463 5470 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5471 const int w = RExC_parse - rangebegin;
5472 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5473 range = 0; /* not a valid range */
73b437c8 5474 }
a0d0e21e
LW
5475 }
5476 else {
3568d838 5477 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5478 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5479 RExC_parse[1] != ']') {
5480 RExC_parse++;
ffc61ed2
JH
5481
5482 /* a bad range like \w-, [:word:]- ? */
5483 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5484 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5485 const int w =
afd78fd5
JH
5486 RExC_parse >= rangebegin ?
5487 RExC_parse - rangebegin : 0;
830247a4 5488 vWARN4(RExC_parse,
b45f050a 5489 "False [] range \"%*.*s\"",
097eb12c 5490 w, w, rangebegin);
afd78fd5 5491 }
73b437c8 5492 if (!SIZE_ONLY)
936ed897 5493 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5494 } else
ffc61ed2
JH
5495 range = 1; /* yeah, it's a range! */
5496 continue; /* but do it the next time */
a0d0e21e 5497 }
a687059c 5498 }
ffc61ed2 5499
93733859 5500 /* now is the next time */
7f6f358c 5501 stored += (value - prevvalue + 1);
ae5c130c 5502 if (!SIZE_ONLY) {
3568d838 5503 if (prevvalue < 256) {
1df70142 5504 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5505 IV i;
3568d838 5506#ifdef EBCDIC
1b2d223b
JH
5507 /* In EBCDIC [\x89-\x91] should include
5508 * the \x8e but [i-j] should not. */
5509 if (literal_endpoint == 2 &&
5510 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5511 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5512 {
3568d838
JH
5513 if (isLOWER(prevvalue)) {
5514 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5515 if (isLOWER(i))
5516 ANYOF_BITMAP_SET(ret, i);
5517 } else {
3568d838 5518 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5519 if (isUPPER(i))
5520 ANYOF_BITMAP_SET(ret, i);
5521 }
8ada0baa 5522 }
ffc61ed2 5523 else
8ada0baa 5524#endif
a5961de5
JH
5525 for (i = prevvalue; i <= ceilvalue; i++)
5526 ANYOF_BITMAP_SET(ret, i);
3568d838 5527 }
a5961de5 5528 if (value > 255 || UTF) {
1df70142
AL
5529 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5530 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5531
ffc61ed2 5532 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5533 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5534 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5535 prevnatvalue, natvalue);
5536 }
5537 else if (prevnatvalue == natvalue) {
5538 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5539 if (FOLD) {
89ebb4a3 5540 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5541 STRLEN foldlen;
1df70142 5542 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5543
c840d2a2
JH
5544 /* If folding and foldable and a single
5545 * character, insert also the folded version
5546 * to the charclass. */
9e55ce06 5547 if (f != value) {
eb160463 5548 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5549 Perl_sv_catpvf(aTHX_ listsv,
5550 "%04"UVxf"\n", f);
5551 else {
5552 /* Any multicharacter foldings
5553 * require the following transform:
5554 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5555 * where E folds into "pq" and F folds
5556 * into "rst", all other characters
5557 * fold to single characters. We save
5558 * away these multicharacter foldings,
5559 * to be later saved as part of the
5560 * additional "s" data. */
5561 SV *sv;
5562
5563 if (!unicode_alternate)
5564 unicode_alternate = newAV();
5565 sv = newSVpvn((char*)foldbuf, foldlen);
5566 SvUTF8_on(sv);
5567 av_push(unicode_alternate, sv);
5568 }
5569 }
254ba52a 5570
60a8b682
JH
5571 /* If folding and the value is one of the Greek
5572 * sigmas insert a few more sigmas to make the
5573 * folding rules of the sigmas to work right.
5574 * Note that not all the possible combinations
5575 * are handled here: some of them are handled
9e55ce06
JH
5576 * by the standard folding rules, and some of
5577 * them (literal or EXACTF cases) are handled
5578 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5579 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5580 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5581 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5582 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5583 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5584 }
5585 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5586 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5587 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5588 }
5589 }
ffc61ed2 5590 }
1b2d223b
JH
5591#ifdef EBCDIC
5592 literal_endpoint = 0;
5593#endif
8ada0baa 5594 }
ffc61ed2
JH
5595
5596 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5597 }
ffc61ed2 5598
936ed897 5599 if (need_class) {
4f66b38d 5600 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5601 if (SIZE_ONLY)
830247a4 5602 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5603 else
830247a4 5604 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5605 }
ffc61ed2 5606
7f6f358c
YO
5607
5608 if (SIZE_ONLY)
5609 return ret;
5610 /****** !SIZE_ONLY AFTER HERE *********/
5611
5612 if( stored == 1 && value < 256
5613 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5614 ) {
5615 /* optimize single char class to an EXACT node
5616 but *only* when its not a UTF/high char */
5617 RExC_emit = orig_emit;
5618 ret = reg_node(pRExC_state,
5619 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5620 *STRING(ret)= (char)value;
5621 STR_LEN(ret)= 1;
5622 RExC_emit += STR_SZ(1);
5623 return ret;
5624 }
ae5c130c 5625 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5626 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5627 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5628 ) {
a0ed51b3 5629 for (value = 0; value < 256; ++value) {
936ed897 5630 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5631 UV fold = PL_fold[value];
ffc61ed2
JH
5632
5633 if (fold != value)
5634 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5635 }
5636 }
936ed897 5637 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5638 }
ffc61ed2 5639
ae5c130c 5640 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5641 if (optimize_invert &&
ffc61ed2
JH
5642 /* If the only flag is inversion. */
5643 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5644 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5645 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5646 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5647 }
7f6f358c 5648 {
097eb12c 5649 AV * const av = newAV();
ffc61ed2 5650 SV *rv;
9e55ce06 5651 /* The 0th element stores the character class description
6a0407ee 5652 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5653 * to initialize the appropriate swash (which gets stored in
5654 * the 1st element), and also useful for dumping the regnode.
5655 * The 2nd element stores the multicharacter foldings,
6a0407ee 5656 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5657 av_store(av, 0, listsv);
5658 av_store(av, 1, NULL);
9e55ce06 5659 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5660 rv = newRV_noinc((SV*)av);
19860706 5661 n = add_data(pRExC_state, 1, "s");
830247a4 5662 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5663 ARG_SET(ret, n);
a0ed51b3 5664 }
a0ed51b3
LW
5665 return ret;
5666}
5667
76e3520e 5668STATIC char*
830247a4 5669S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5670{
097eb12c 5671 char* const retval = RExC_parse++;
a0d0e21e 5672
4633a7c4 5673 for (;;) {
830247a4
IZ
5674 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5675 RExC_parse[2] == '#') {
e994fd66
AE
5676 while (*RExC_parse != ')') {
5677 if (RExC_parse == RExC_end)
5678 FAIL("Sequence (?#... not terminated");
830247a4 5679 RExC_parse++;
e994fd66 5680 }
830247a4 5681 RExC_parse++;
4633a7c4
LW
5682 continue;
5683 }
e2509266 5684 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5685 if (isSPACE(*RExC_parse)) {
5686 RExC_parse++;
748a9306
LW
5687 continue;
5688 }
830247a4 5689 else if (*RExC_parse == '#') {
e994fd66
AE
5690 while (RExC_parse < RExC_end)
5691 if (*RExC_parse++ == '\n') break;
748a9306
LW
5692 continue;
5693 }
748a9306 5694 }
4633a7c4 5695 return retval;
a0d0e21e 5696 }
a687059c
LW
5697}
5698
5699/*
c277df42 5700- reg_node - emit a node
a0d0e21e 5701*/
76e3520e 5702STATIC regnode * /* Location. */
830247a4 5703S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5704{
97aff369 5705 dVAR;
c277df42 5706 register regnode *ptr;
504618e9 5707 regnode * const ret = RExC_emit;
a687059c 5708
c277df42 5709 if (SIZE_ONLY) {
830247a4
IZ
5710 SIZE_ALIGN(RExC_size);
5711 RExC_size += 1;
a0d0e21e
LW
5712 return(ret);
5713 }
c277df42 5714 NODE_ALIGN_FILL(ret);
a0d0e21e 5715 ptr = ret;
c277df42 5716 FILL_ADVANCE_NODE(ptr, op);
fac92740 5717 if (RExC_offsets) { /* MJD */
ccb2c380 5718 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5719 "reg_node", __LINE__,
5720 reg_name[op],
5721 RExC_emit - RExC_emit_start > RExC_offsets[0]
5722 ? "Overwriting end of array!\n" : "OK",
5723 RExC_emit - RExC_emit_start,
5724 RExC_parse - RExC_start,
5725 RExC_offsets[0]));
ccb2c380 5726 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5727 }
5728
830247a4 5729 RExC_emit = ptr;
a687059c 5730
a0d0e21e 5731 return(ret);
a687059c
LW
5732}
5733
5734/*
a0d0e21e
LW
5735- reganode - emit a node with an argument
5736*/
76e3520e 5737STATIC regnode * /* Location. */
830247a4 5738S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5739{
97aff369 5740 dVAR;
c277df42 5741 register regnode *ptr;
504618e9 5742 regnode * const ret = RExC_emit;
fe14fcc3 5743
c277df42 5744 if (SIZE_ONLY) {
830247a4
IZ
5745 SIZE_ALIGN(RExC_size);
5746 RExC_size += 2;
a0d0e21e
LW
5747 return(ret);
5748 }
fe14fcc3 5749
c277df42 5750 NODE_ALIGN_FILL(ret);
a0d0e21e 5751 ptr = ret;
c277df42 5752 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5753 if (RExC_offsets) { /* MJD */
ccb2c380 5754 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5755 "reganode",
ccb2c380
MP
5756 __LINE__,
5757 reg_name[op],
fac92740
MJD
5758 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5759 "Overwriting end of array!\n" : "OK",
5760 RExC_emit - RExC_emit_start,
5761 RExC_parse - RExC_start,
5762 RExC_offsets[0]));
ccb2c380 5763 Set_Cur_Node_Offset;
fac92740
MJD
5764 }
5765
830247a4 5766 RExC_emit = ptr;
fe14fcc3 5767
a0d0e21e 5768 return(ret);
fe14fcc3
LW
5769}
5770
5771/*
cd439c50 5772- reguni - emit (if appropriate) a Unicode character
a0ed51b3 5773*/
71207a34
AL
5774STATIC STRLEN
5775S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 5776{
97aff369 5777 dVAR;
71207a34 5778 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5779}
5780
5781/*
a0d0e21e
LW
5782- reginsert - insert an operator in front of already-emitted operand
5783*
5784* Means relocating the operand.
5785*/
76e3520e 5786STATIC void
830247a4 5787S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5788{
97aff369 5789 dVAR;
c277df42
IZ
5790 register regnode *src;
5791 register regnode *dst;
5792 register regnode *place;
504618e9 5793 const int offset = regarglen[(U8)op];
b81d288d 5794
22c35a8c 5795/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5796
5797 if (SIZE_ONLY) {
830247a4 5798 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5799 return;
5800 }
a687059c 5801
830247a4
IZ
5802 src = RExC_emit;
5803 RExC_emit += NODE_STEP_REGNODE + offset;
5804 dst = RExC_emit;
fac92740 5805 while (src > opnd) {
c277df42 5806 StructCopy(--src, --dst, regnode);
fac92740 5807 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5808 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5809 "reg_insert",
ccb2c380
MP
5810 __LINE__,
5811 reg_name[op],
fac92740
MJD
5812 dst - RExC_emit_start > RExC_offsets[0]
5813 ? "Overwriting end of array!\n" : "OK",
5814 src - RExC_emit_start,
5815 dst - RExC_emit_start,
5816 RExC_offsets[0]));
ccb2c380
MP
5817 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5818 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5819 }
5820 }
5821
a0d0e21e
LW
5822
5823 place = opnd; /* Op node, where operand used to be. */
fac92740 5824 if (RExC_offsets) { /* MJD */
ccb2c380 5825 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5826 "reginsert",
ccb2c380
MP
5827 __LINE__,
5828 reg_name[op],
fac92740
MJD
5829 place - RExC_emit_start > RExC_offsets[0]
5830 ? "Overwriting end of array!\n" : "OK",
5831 place - RExC_emit_start,
5832 RExC_parse - RExC_start,
5833 RExC_offsets[0]));
ccb2c380 5834 Set_Node_Offset(place, RExC_parse);
45948336 5835 Set_Node_Length(place, 1);
fac92740 5836 }
c277df42
IZ
5837 src = NEXTOPER(place);
5838 FILL_ADVANCE_NODE(place, op);
5839 Zero(src, offset, regnode);
a687059c
LW
5840}
5841
5842/*
c277df42 5843- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 5844- SEE ALSO: regtail_study
a0d0e21e 5845*/
097eb12c 5846/* TODO: All three parms should be const */
76e3520e 5847STATIC void
3dab1dad 5848S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 5849{
97aff369 5850 dVAR;
c277df42 5851 register regnode *scan;
3dab1dad 5852 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 5853
c277df42 5854 if (SIZE_ONLY)
a0d0e21e
LW
5855 return;
5856
5857 /* Find last node. */
5858 scan = p;
5859 for (;;) {
504618e9 5860 regnode * const temp = regnext(scan);
3dab1dad
YO
5861 DEBUG_PARSE_r({
5862 SV * const mysv=sv_newmortal();
5863 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
5864 regprop(RExC_rx, mysv, scan);
5865 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
5866 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
5867 });
5868 if (temp == NULL)
5869 break;
5870 scan = temp;
5871 }
5872
5873 if (reg_off_by_arg[OP(scan)]) {
5874 ARG_SET(scan, val - scan);
5875 }
5876 else {
5877 NEXT_OFF(scan) = val - scan;
5878 }
5879}
5880
5881/*
5882- regtail_study - set the next-pointer at the end of a node chain of p to val.
5883- Look for optimizable sequences at the same time.
5884- currently only looks for EXACT chains.
5885*/
5886/* TODO: All four parms should be const */
5887STATIC U8
5888S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
5889{
5890 dVAR;
5891 register regnode *scan;
5892 U8 exact= PSEUDO;
5893 GET_RE_DEBUG_FLAGS_DECL;
5894
5895 if (SIZE_ONLY)
5896 return exact;
5897
5898 /* Find last node. */
5899
5900 scan = p;
5901 for (;;) {
5902 regnode * const temp = regnext(scan);
5903 if ( exact ) {
5904 switch (OP(scan)) {
5905 case EXACT:
5906 case EXACTF:
5907 case EXACTFL:
5908 if( exact == PSEUDO )
5909 exact= OP(scan);
5910 else if ( exact != OP(scan) )
5911 exact= 0;
5912 case NOTHING:
5913 break;
5914 default:
5915 exact= 0;
5916 }
5917 }
5918 DEBUG_PARSE_r({
5919 SV * const mysv=sv_newmortal();
5920 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
5921 regprop(RExC_rx, mysv, scan);
5922 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
5923 SvPV_nolen_const(mysv),
5924 reg_name[exact],
5925 REG_NODE_NUM(scan));
5926 });
a0d0e21e
LW
5927 if (temp == NULL)
5928 break;
5929 scan = temp;
5930 }
a687059c 5931
c277df42
IZ
5932 if (reg_off_by_arg[OP(scan)]) {
5933 ARG_SET(scan, val - scan);
a0ed51b3
LW
5934 }
5935 else {
c277df42
IZ
5936 NEXT_OFF(scan) = val - scan;
5937 }
3dab1dad
YO
5938
5939 return exact;
a687059c
LW
5940}
5941
5942/*
a687059c
LW
5943 - regcurly - a little FSA that accepts {\d+,?\d*}
5944 */
79072805 5945STATIC I32
5f66b61c 5946S_regcurly(register const char *s)
a687059c
LW
5947{
5948 if (*s++ != '{')
5949 return FALSE;
f0fcb552 5950 if (!isDIGIT(*s))
a687059c 5951 return FALSE;
f0fcb552 5952 while (isDIGIT(*s))
a687059c
LW
5953 s++;
5954 if (*s == ',')
5955 s++;
f0fcb552 5956 while (isDIGIT(*s))
a687059c
LW
5957 s++;
5958 if (*s != '}')
5959 return FALSE;
5960 return TRUE;
5961}
5962
a687059c
LW
5963
5964/*
fd181c75 5965 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5966 */
5967void
097eb12c 5968Perl_regdump(pTHX_ const regexp *r)
a687059c 5969{
35ff7856 5970#ifdef DEBUGGING
97aff369 5971 dVAR;
c445ea15 5972 SV * const sv = sv_newmortal();
a687059c 5973
4f639d21 5974 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5975
5976 /* Header fields of interest. */
c277df42 5977 if (r->anchored_substr)
7b0972df 5978 PerlIO_printf(Perl_debug_log,
a0288114 5979 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5980 PL_colors[0],
7b0972df 5981 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5982 SvPVX_const(r->anchored_substr),
3280af22 5983 PL_colors[1],
c277df42 5984 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5985 (IV)r->anchored_offset);
33b8afdf
JH
5986 else if (r->anchored_utf8)
5987 PerlIO_printf(Perl_debug_log,
a0288114 5988 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5989 PL_colors[0],
5990 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5991 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5992 PL_colors[1],
5993 SvTAIL(r->anchored_utf8) ? "$" : "",
5994 (IV)r->anchored_offset);
c277df42 5995 if (r->float_substr)
7b0972df 5996 PerlIO_printf(Perl_debug_log,
a0288114 5997 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5998 PL_colors[0],
b81d288d 5999 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 6000 SvPVX_const(r->float_substr),
3280af22 6001 PL_colors[1],
c277df42 6002 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 6003 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
6004 else if (r->float_utf8)
6005 PerlIO_printf(Perl_debug_log,
a0288114 6006 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
6007 PL_colors[0],
6008 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 6009 SvPVX_const(r->float_utf8),
33b8afdf
JH
6010 PL_colors[1],
6011 SvTAIL(r->float_utf8) ? "$" : "",
6012 (IV)r->float_min_offset, (UV)r->float_max_offset);
6013 if (r->check_substr || r->check_utf8)
b81d288d
AB
6014 PerlIO_printf(Perl_debug_log,
6015 r->check_substr == r->float_substr
33b8afdf 6016 && r->check_utf8 == r->float_utf8
c277df42
IZ
6017 ? "(checking floating" : "(checking anchored");
6018 if (r->reganch & ROPT_NOSCAN)
6019 PerlIO_printf(Perl_debug_log, " noscan");
6020 if (r->reganch & ROPT_CHECK_ALL)
6021 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6022 if (r->check_substr || r->check_utf8)
c277df42
IZ
6023 PerlIO_printf(Perl_debug_log, ") ");
6024
46fc3d4c 6025 if (r->regstclass) {
32fc9b6a 6026 regprop(r, sv, r->regstclass);
3f7c398e 6027 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6028 }
774d564b 6029 if (r->reganch & ROPT_ANCH) {
6030 PerlIO_printf(Perl_debug_log, "anchored");
6031 if (r->reganch & ROPT_ANCH_BOL)
6032 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6033 if (r->reganch & ROPT_ANCH_MBOL)
6034 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6035 if (r->reganch & ROPT_ANCH_SBOL)
6036 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6037 if (r->reganch & ROPT_ANCH_GPOS)
6038 PerlIO_printf(Perl_debug_log, "(GPOS)");
6039 PerlIO_putc(Perl_debug_log, ' ');
6040 }
c277df42
IZ
6041 if (r->reganch & ROPT_GPOS_SEEN)
6042 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6043 if (r->reganch & ROPT_SKIP)
760ac839 6044 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6045 if (r->reganch & ROPT_IMPLICIT)
760ac839 6046 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6047 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6048 if (r->reganch & ROPT_EVAL_SEEN)
6049 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6050 PerlIO_printf(Perl_debug_log, "\n");
fac92740 6051 if (r->offsets) {
e4584336 6052 const U32 len = r->offsets[0];
a3621e74
YO
6053 GET_RE_DEBUG_FLAGS_DECL;
6054 DEBUG_OFFSETS_r({
1df70142 6055 U32 i;
e4584336
RB
6056 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
6057 for (i = 1; i <= len; i++)
6058 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
6059 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
6060 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 6061 });
fac92740 6062 }
65e66c80 6063#else
96a5add6 6064 PERL_UNUSED_CONTEXT;
65e66c80 6065 PERL_UNUSED_ARG(r);
17c3b450 6066#endif /* DEBUGGING */
a687059c
LW
6067}
6068
6069/*
a0d0e21e
LW
6070- regprop - printable representation of opcode
6071*/
46fc3d4c 6072void
32fc9b6a 6073Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6074{
35ff7856 6075#ifdef DEBUGGING
97aff369 6076 dVAR;
9b155405 6077 register int k;
a0d0e21e 6078
54dc92de 6079 sv_setpvn(sv, "", 0);
9b155405 6080 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6081 /* It would be nice to FAIL() here, but this may be called from
6082 regexec.c, and it would be hard to supply pRExC_state. */
6083 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6084 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6085
3dab1dad 6086 k = PL_regkind[OP(o)];
9b155405 6087
2a782b5b 6088 if (k == EXACT) {
396482e1 6089 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
6090 /* Using is_utf8_string() is a crude hack but it may
6091 * be the best for now since we have no flag "this EXACTish
6092 * node was UTF-8" --jhi */
1df70142 6093 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 6094 const char * const s = do_utf8 ?
c728cb41
JH
6095 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6096 UNI_DISPLAY_REGEX) :
2a782b5b 6097 STRING(o);
e1ec3a88 6098 const int len = do_utf8 ?
2a782b5b
JH
6099 strlen(s) :
6100 STR_LEN(o);
6101 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6102 PL_colors[0],
6103 len, s,
6104 PL_colors[1]);
bb263b4e 6105 } else if (k == TRIE) {
3dab1dad
YO
6106 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6107 /* print the details of the trie in dumpuntil instead, as
4f639d21 6108 * prog->data isn't available here */
a3621e74 6109 } else if (k == CURLY) {
cb434fcc 6110 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6111 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6112 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6113 }
2c2d71f5
JH
6114 else if (k == WHILEM && o->flags) /* Ordinal/of */
6115 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6116 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6117 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6118 else if (k == LOGICAL)
04ebc1ab 6119 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6120 else if (k == ANYOF) {
6121 int i, rangestart = -1;
2d03de9c 6122 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6123
6124 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6125 static const char * const anyofs[] = {
653099ff
GS
6126 "\\w",
6127 "\\W",
6128 "\\s",
6129 "\\S",
6130 "\\d",
6131 "\\D",
6132 "[:alnum:]",
6133 "[:^alnum:]",
6134 "[:alpha:]",
6135 "[:^alpha:]",
6136 "[:ascii:]",
6137 "[:^ascii:]",
6138 "[:ctrl:]",
6139 "[:^ctrl:]",
6140 "[:graph:]",
6141 "[:^graph:]",
6142 "[:lower:]",
6143 "[:^lower:]",
6144 "[:print:]",
6145 "[:^print:]",
6146 "[:punct:]",
6147 "[:^punct:]",
6148 "[:upper:]",
aaa51d5e 6149 "[:^upper:]",
653099ff 6150 "[:xdigit:]",
aaa51d5e
JF
6151 "[:^xdigit:]",
6152 "[:space:]",
6153 "[:^space:]",
6154 "[:blank:]",
6155 "[:^blank:]"
653099ff
GS
6156 };
6157
19860706 6158 if (flags & ANYOF_LOCALE)
396482e1 6159 sv_catpvs(sv, "{loc}");
19860706 6160 if (flags & ANYOF_FOLD)
396482e1 6161 sv_catpvs(sv, "{i}");
653099ff 6162 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6163 if (flags & ANYOF_INVERT)
396482e1 6164 sv_catpvs(sv, "^");
ffc61ed2
JH
6165 for (i = 0; i <= 256; i++) {
6166 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6167 if (rangestart == -1)
6168 rangestart = i;
6169 } else if (rangestart != -1) {
6170 if (i <= rangestart + 3)
6171 for (; rangestart < i; rangestart++)
653099ff 6172 put_byte(sv, rangestart);
ffc61ed2
JH
6173 else {
6174 put_byte(sv, rangestart);
396482e1 6175 sv_catpvs(sv, "-");
ffc61ed2 6176 put_byte(sv, i - 1);
653099ff 6177 }
ffc61ed2 6178 rangestart = -1;
653099ff 6179 }
847a199f 6180 }
ffc61ed2
JH
6181
6182 if (o->flags & ANYOF_CLASS)
bb7a0f54 6183 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6184 if (ANYOF_CLASS_TEST(o,i))
6185 sv_catpv(sv, anyofs[i]);
6186
6187 if (flags & ANYOF_UNICODE)
396482e1 6188 sv_catpvs(sv, "{unicode}");
1aa99e6b 6189 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6190 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6191
6192 {
6193 SV *lv;
32fc9b6a 6194 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6195
ffc61ed2
JH
6196 if (lv) {
6197 if (sw) {
89ebb4a3 6198 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6199
ffc61ed2 6200 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6201 uvchr_to_utf8(s, i);
ffc61ed2 6202
3568d838 6203 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6204 if (rangestart == -1)
6205 rangestart = i;
6206 } else if (rangestart != -1) {
ffc61ed2
JH
6207 if (i <= rangestart + 3)
6208 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6209 const U8 * const e = uvchr_to_utf8(s,rangestart);
6210 U8 *p;
6211 for(p = s; p < e; p++)
ffc61ed2
JH
6212 put_byte(sv, *p);
6213 }
6214 else {
2d03de9c
AL
6215 const U8 *e = uvchr_to_utf8(s,rangestart);
6216 U8 *p;
6217 for (p = s; p < e; p++)
ffc61ed2 6218 put_byte(sv, *p);
396482e1 6219 sv_catpvs(sv, "-");
2d03de9c
AL
6220 e = uvchr_to_utf8(s, i-1);
6221 for (p = s; p < e; p++)
1df70142 6222 put_byte(sv, *p);
ffc61ed2
JH
6223 }
6224 rangestart = -1;
6225 }
19860706 6226 }
ffc61ed2 6227
396482e1 6228 sv_catpvs(sv, "..."); /* et cetera */
19860706 6229 }
fde631ed 6230
ffc61ed2 6231 {
2e0de35c 6232 char *s = savesvpv(lv);
c445ea15 6233 char * const origs = s;
b81d288d 6234
3dab1dad
YO
6235 while (*s && *s != '\n')
6236 s++;
b81d288d 6237
ffc61ed2 6238 if (*s == '\n') {
2d03de9c 6239 const char * const t = ++s;
ffc61ed2
JH
6240
6241 while (*s) {
6242 if (*s == '\n')
6243 *s = ' ';
6244 s++;
6245 }
6246 if (s[-1] == ' ')
6247 s[-1] = 0;
6248
6249 sv_catpv(sv, t);
fde631ed 6250 }
b81d288d 6251
ffc61ed2 6252 Safefree(origs);
fde631ed
JH
6253 }
6254 }
653099ff 6255 }
ffc61ed2 6256
653099ff
GS
6257 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6258 }
9b155405 6259 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 6260 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80 6261#else
96a5add6 6262 PERL_UNUSED_CONTEXT;
65e66c80
SP
6263 PERL_UNUSED_ARG(sv);
6264 PERL_UNUSED_ARG(o);
17c3b450 6265#endif /* DEBUGGING */
35ff7856 6266}
a687059c 6267
cad2e5aa
JH
6268SV *
6269Perl_re_intuit_string(pTHX_ regexp *prog)
6270{ /* Assume that RE_INTUIT is set */
97aff369 6271 dVAR;
a3621e74 6272 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6273 PERL_UNUSED_CONTEXT;
6274
a3621e74 6275 DEBUG_COMPILE_r(
cfd0369c 6276 {
2d03de9c 6277 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6278 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6279
6280 if (!PL_colorset) reginitcolors();
6281 PerlIO_printf(Perl_debug_log,
a0288114 6282 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6283 PL_colors[4],
6284 prog->check_substr ? "" : "utf8 ",
6285 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6286 s,
6287 PL_colors[1],
6288 (strlen(s) > 60 ? "..." : ""));
6289 } );
6290
33b8afdf 6291 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6292}
6293
2b69d0c2 6294void
864dbfa3 6295Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6296{
27da23d5 6297 dVAR;
9e55ce06 6298#ifdef DEBUGGING
c445ea15 6299 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
9e55ce06 6300#endif
fc32ee4a 6301 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6302
7821416a
IZ
6303 if (!r || (--r->refcnt > 0))
6304 return;
a3621e74 6305 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6306 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6307 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6308 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6309 const int len = SvCUR(dsv);
9e55ce06
JH
6310 if (!PL_colorset)
6311 reginitcolors();
6312 PerlIO_printf(Perl_debug_log,
a3621e74 6313 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6314 PL_colors[4],PL_colors[5],PL_colors[0],
6315 len, len, s,
6316 PL_colors[1],
6317 len > 60 ? "..." : "");
6318 });
cad2e5aa 6319
43c5f42d
NC
6320 /* gcov results gave these as non-null 100% of the time, so there's no
6321 optimisation in checking them before calling Safefree */
6322 Safefree(r->precomp);
6323 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6324 RX_MATCH_COPY_FREE(r);
f8c7b90f 6325#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6326 if (r->saved_copy)
6327 SvREFCNT_dec(r->saved_copy);
6328#endif
a193d654
GS
6329 if (r->substrs) {
6330 if (r->anchored_substr)
6331 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6332 if (r->anchored_utf8)
6333 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6334 if (r->float_substr)
6335 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6336 if (r->float_utf8)
6337 SvREFCNT_dec(r->float_utf8);
2779dcf1 6338 Safefree(r->substrs);
a193d654 6339 }
c277df42
IZ
6340 if (r->data) {
6341 int n = r->data->count;
f3548bdc
DM
6342 PAD* new_comppad = NULL;
6343 PAD* old_comppad;
4026c95a 6344 PADOFFSET refcnt;
dfad63ad 6345
c277df42 6346 while (--n >= 0) {
261faec3 6347 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6348 switch (r->data->what[n]) {
6349 case 's':
6350 SvREFCNT_dec((SV*)r->data->data[n]);
6351 break;
653099ff
GS
6352 case 'f':
6353 Safefree(r->data->data[n]);
6354 break;
dfad63ad
HS
6355 case 'p':
6356 new_comppad = (AV*)r->data->data[n];
6357 break;
c277df42 6358 case 'o':
dfad63ad 6359 if (new_comppad == NULL)
cea2e8a9 6360 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6361 PAD_SAVE_LOCAL(old_comppad,
6362 /* Watch out for global destruction's random ordering. */
c445ea15 6363 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6364 );
b34c0dd4 6365 OP_REFCNT_LOCK;
4026c95a
SH
6366 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6367 OP_REFCNT_UNLOCK;
6368 if (!refcnt)
9b978d73 6369 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6370
f3548bdc 6371 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6372 SvREFCNT_dec((SV*)new_comppad);
6373 new_comppad = NULL;
c277df42
IZ
6374 break;
6375 case 'n':
9e55ce06 6376 break;
a3621e74
YO
6377 case 't':
6378 {
c445ea15 6379 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6380 U32 refcount;
6381 OP_REFCNT_LOCK;
e27afef8 6382 refcount = --trie->refcount;
a3621e74
YO
6383 OP_REFCNT_UNLOCK;
6384 if ( !refcount ) {
43c5f42d 6385 Safefree(trie->charmap);
a3621e74
YO
6386 if (trie->widecharmap)
6387 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6388 Safefree(trie->states);
6389 Safefree(trie->trans);
3dab1dad
YO
6390 if (trie->bitmap)
6391 Safefree(trie->bitmap);
a3621e74
YO
6392#ifdef DEBUGGING
6393 if (trie->words)
6394 SvREFCNT_dec((SV*)trie->words);
6395 if (trie->revcharmap)
6396 SvREFCNT_dec((SV*)trie->revcharmap);
6397#endif
6398 Safefree(r->data->data[n]); /* do this last!!!! */
6399 }
6400 break;
6401 }
c277df42 6402 default:
830247a4 6403 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6404 }
6405 }
6406 Safefree(r->data->what);
6407 Safefree(r->data);
a0d0e21e
LW
6408 }
6409 Safefree(r->startp);
6410 Safefree(r->endp);
6411 Safefree(r);
a687059c 6412}
c277df42 6413
76234dfb 6414#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6415/*
6416 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6417 */
6418regnode *
864dbfa3 6419Perl_regnext(pTHX_ register regnode *p)
c277df42 6420{
97aff369 6421 dVAR;
c277df42
IZ
6422 register I32 offset;
6423
3280af22 6424 if (p == &PL_regdummy)
c277df42
IZ
6425 return(NULL);
6426
6427 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6428 if (offset == 0)
6429 return(NULL);
6430
c277df42 6431 return(p+offset);
c277df42 6432}
76234dfb 6433#endif
c277df42 6434
01f988be 6435STATIC void
cea2e8a9 6436S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6437{
6438 va_list args;
6439 STRLEN l1 = strlen(pat1);
6440 STRLEN l2 = strlen(pat2);
6441 char buf[512];
06bf62c7 6442 SV *msv;
73d840c0 6443 const char *message;
c277df42
IZ
6444
6445 if (l1 > 510)
6446 l1 = 510;
6447 if (l1 + l2 > 510)
6448 l2 = 510 - l1;
6449 Copy(pat1, buf, l1 , char);
6450 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6451 buf[l1 + l2] = '\n';
6452 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6453#ifdef I_STDARG
6454 /* ANSI variant takes additional second argument */
c277df42 6455 va_start(args, pat2);
8736538c
AS
6456#else
6457 va_start(args);
6458#endif
5a844595 6459 msv = vmess(buf, &args);
c277df42 6460 va_end(args);
cfd0369c 6461 message = SvPV_const(msv,l1);
c277df42
IZ
6462 if (l1 > 512)
6463 l1 = 512;
6464 Copy(message, buf, l1 , char);
197cf9b9 6465 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6466 Perl_croak(aTHX_ "%s", buf);
c277df42 6467}
a0ed51b3
LW
6468
6469/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6470
76234dfb 6471#ifndef PERL_IN_XSUB_RE
a0ed51b3 6472void
864dbfa3 6473Perl_save_re_context(pTHX)
b81d288d 6474{
97aff369 6475 dVAR;
1ade1aa1
NC
6476
6477 struct re_save_state *state;
6478
6479 SAVEVPTR(PL_curcop);
6480 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6481
6482 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6483 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6484 SSPUSHINT(SAVEt_RE_STATE);
6485
46ab3289 6486 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6487
a0ed51b3 6488 PL_reg_start_tmp = 0;
a0ed51b3 6489 PL_reg_start_tmpl = 0;
c445ea15 6490 PL_reg_oldsaved = NULL;
a5db57d6 6491 PL_reg_oldsavedlen = 0;
a5db57d6 6492 PL_reg_maxiter = 0;
a5db57d6 6493 PL_reg_leftiter = 0;
c445ea15 6494 PL_reg_poscache = NULL;
a5db57d6 6495 PL_reg_poscache_size = 0;
1ade1aa1
NC
6496#ifdef PERL_OLD_COPY_ON_WRITE
6497 PL_nrs = NULL;
6498#endif
ada6e8a9 6499
c445ea15
AL
6500 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6501 if (PL_curpm) {
6502 const REGEXP * const rx = PM_GETRE(PL_curpm);
6503 if (rx) {
1df70142 6504 U32 i;
ada6e8a9 6505 for (i = 1; i <= rx->nparens; i++) {
1df70142 6506 char digits[TYPE_CHARS(long)];
d9fad198 6507 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6508 GV *const *const gvp
6509 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6510
b37c2d43
AL
6511 if (gvp) {
6512 GV * const gv = *gvp;
6513 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6514 save_scalar(gv);
49f27e4b 6515 }
ada6e8a9
AMS
6516 }
6517 }
6518 }
a0ed51b3 6519}
76234dfb 6520#endif
51371543 6521
51371543 6522static void
acfe0abc 6523clear_re(pTHX_ void *r)
51371543 6524{
97aff369 6525 dVAR;
51371543
GS
6526 ReREFCNT_dec((regexp *)r);
6527}
ffbc6a93 6528
a28509cc
AL
6529#ifdef DEBUGGING
6530
6531STATIC void
6532S_put_byte(pTHX_ SV *sv, int c)
6533{
6534 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6535 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6536 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6537 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6538 else
6539 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6540}
6541
6542
3dab1dad
YO
6543#define CLEAR_OPTSTART \
6544 if (optstart) STMT_START { \
6545 PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart); \
6546 optstart=NULL; \
6547 } STMT_END
6548
6549#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6550
b5a2f8d8
NC
6551STATIC const regnode *
6552S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6553 const regnode *last, SV* sv, I32 l)
a28509cc 6554{
97aff369 6555 dVAR;
a28509cc 6556 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6557 register const regnode *next;
3dab1dad
YO
6558 const regnode *optstart= NULL;
6559 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6560
6561 while (op != END && (!last || node < last)) {
6562 /* While that wasn't END last time... */
6563
6564 NODE_ALIGN(node);
6565 op = OP(node);
6566 if (op == CLOSE)
6567 l--;
b5a2f8d8 6568 next = regnext((regnode *)node);
a28509cc 6569 /* Where, what. */
3dab1dad
YO
6570 if ( OP(node) == OPTIMIZED) {
6571 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_OPTIMISE))
6572 optstart= node;
6573 else
a28509cc 6574 goto after_print;
3dab1dad
YO
6575 } else
6576 CLEAR_OPTSTART;
32fc9b6a 6577 regprop(r, sv, node);
a28509cc
AL
6578 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6579 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6580
6581 if (OP(node) != OPTIMIZED) {
a28509cc
AL
6582 if (next == NULL) /* Next ptr. */
6583 PerlIO_printf(Perl_debug_log, "(0)");
6584 else
6585 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6586 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6587 }
6588
a28509cc
AL
6589 after_print:
6590 if (PL_regkind[(U8)op] == BRANCHJ) {
b5a2f8d8
NC
6591 register const regnode *nnode = (OP(next) == LONGJMP
6592 ? regnext((regnode *)next)
6593 : next);
a28509cc
AL
6594 if (last && nnode > last)
6595 nnode = last;
3dab1dad 6596 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a28509cc
AL
6597 }
6598 else if (PL_regkind[(U8)op] == BRANCH) {
3dab1dad 6599 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6600 }
6601 else if ( PL_regkind[(U8)op] == TRIE ) {
6602 const I32 n = ARG(node);
4f639d21 6603 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6604 const I32 arry_len = av_len(trie->words)+1;
6605 I32 word_idx;
6606 PerlIO_printf(Perl_debug_log,
3dab1dad 6607 "%*s[Start:%d Words:%d Chars:%d Unique:%d States:%"IVdf" Minlen:%d Maxlen:%d",
a28509cc
AL
6608 (int)(2*(l+3)),
6609 "",
3dab1dad
YO
6610 trie->startstate,
6611 TRIE_WORDCOUNT(trie),
6612 (int)TRIE_CHARCOUNT(trie),
a28509cc 6613 trie->uniquecharcount,
3dab1dad
YO
6614 (IV)TRIE_LASTSTATE(trie)-1,
6615 trie->minlen, trie->maxlen
6616 );
6617 if (trie->bitmap) {
6618 int i;
6619 int rangestart= -1;
6620 sv_setpvn(sv, "", 0);
6621 for (i = 0; i <= 256; i++) {
6622 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6623 if (rangestart == -1)
6624 rangestart = i;
6625 } else if (rangestart != -1) {
6626 if (i <= rangestart + 3)
6627 for (; rangestart < i; rangestart++)
6628 put_byte(sv, rangestart);
6629 else {
6630 put_byte(sv, rangestart);
6631 sv_catpvs(sv, "-");
6632 put_byte(sv, i - 1);
6633 }
6634 rangestart = -1;
6635 }
6636 }
6637 PerlIO_printf(Perl_debug_log, " Start-Class:%s]\n", SvPVX_const(sv));
6638 } else
6639 PerlIO_printf(Perl_debug_log, " No Start-Class]\n");
a28509cc
AL
6640
6641 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6642 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
6643 if (elem_ptr) {
6644 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6645 (int)(2*(l+4)), "",
6646 PL_colors[0],
cfd0369c 6647 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6648 PL_colors[1]
6649 );
6650 /*
6651 if (next == NULL)
6652 PerlIO_printf(Perl_debug_log, "(0)\n");
6653 else
6654 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6655 */
6656 }
6657
6658 }
6659
6660 node = NEXTOPER(node);
6661 node += regarglen[(U8)op];
6662
6663 }
6664 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 6665 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6666 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6667 }
6668 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
3dab1dad 6669 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6670 next, sv, l + 1);
6671 }
6672 else if ( op == PLUS || op == STAR) {
3dab1dad 6673 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
6674 }
6675 else if (op == ANYOF) {
6676 /* arglen 1 + class block */
6677 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6678 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6679 node = NEXTOPER(node);
6680 }
6681 else if (PL_regkind[(U8)op] == EXACT) {
6682 /* Literal string, where present. */
6683 node += NODE_SZ_STR(node) - 1;
6684 node = NEXTOPER(node);
6685 }
6686 else {
6687 node = NEXTOPER(node);
6688 node += regarglen[(U8)op];
6689 }
6690 if (op == CURLYX || op == OPEN)
6691 l++;
6692 else if (op == WHILEM)
6693 l--;
6694 }
3dab1dad 6695 CLEAR_OPTSTART;
a28509cc
AL
6696 return node;
6697}
6698
6699#endif /* DEBUGGING */
6700
241d1a3b
NC
6701/*
6702 * Local variables:
6703 * c-indentation-style: bsd
6704 * c-basic-offset: 4
6705 * indent-tabs-mode: t
6706 * End:
6707 *
37442d52
RGS
6708 * ex: set ts=8 sts=4 sw=4 noet:
6709 */