This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add the perlreguts manpage, by Yves Orton
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad
YO
125#ifdef DEBUGGING
126 char *lastparse;
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
27da23d5
JH
198static const scan_data_t zero_scan_data =
199 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
eb160463 230#define UTF (RExC_utf8 != 0)
e2509266
JH
231#define LOC ((RExC_flags & PMf_LOCALE) != 0)
232#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
ccb2c380 259#define FAIL(msg) STMT_START { \
bfed75c6 260 const char *ellipses = ""; \
ccb2c380
MP
261 IV len = RExC_end - RExC_precomp; \
262 \
263 if (!SIZE_ONLY) \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
268 ellipses = "..."; \
269 } \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
272} STMT_END
8615cb43 273
b45f050a 274/*
b45f050a
JF
275 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
276 */
ccb2c380 277#define Simple_vFAIL(m) STMT_START { \
a28509cc 278 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
279 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
280 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
281} STMT_END
b45f050a
JF
282
283/*
284 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
285 */
ccb2c380
MP
286#define vFAIL(m) STMT_START { \
287 if (!SIZE_ONLY) \
288 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 Simple_vFAIL(m); \
290} STMT_END
b45f050a
JF
291
292/*
293 * Like Simple_vFAIL(), but accepts two arguments.
294 */
ccb2c380 295#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 296 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
297 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
298 (int)offset, RExC_precomp, RExC_precomp + offset); \
299} STMT_END
b45f050a
JF
300
301/*
302 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
303 */
ccb2c380
MP
304#define vFAIL2(m,a1) STMT_START { \
305 if (!SIZE_ONLY) \
306 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
307 Simple_vFAIL2(m, a1); \
308} STMT_END
b45f050a
JF
309
310
311/*
312 * Like Simple_vFAIL(), but accepts three arguments.
313 */
ccb2c380 314#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 315 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
316 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
317 (int)offset, RExC_precomp, RExC_precomp + offset); \
318} STMT_END
b45f050a
JF
319
320/*
321 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
322 */
ccb2c380
MP
323#define vFAIL3(m,a1,a2) STMT_START { \
324 if (!SIZE_ONLY) \
325 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
326 Simple_vFAIL3(m, a1, a2); \
327} STMT_END
b45f050a
JF
328
329/*
330 * Like Simple_vFAIL(), but accepts four arguments.
331 */
ccb2c380 332#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 333 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
334 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
335 (int)offset, RExC_precomp, RExC_precomp + offset); \
336} STMT_END
b45f050a 337
ccb2c380 338#define vWARN(loc,m) STMT_START { \
a28509cc 339 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
340 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
341 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
342} STMT_END
343
344#define vWARNdep(loc,m) STMT_START { \
a28509cc 345 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
346 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
347 "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351
352#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 353 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
354 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
355 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 359 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
360 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
361 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
362} STMT_END
363
364#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 365 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
366 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
367 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
369
370#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 371 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
373 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
9d1d55b5 375
8615cb43 376
cd439c50 377/* Allow for side effects in s */
ccb2c380
MP
378#define REGC(c,s) STMT_START { \
379 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
380} STMT_END
cd439c50 381
fac92740
MJD
382/* Macros for recording node offsets. 20001227 mjd@plover.com
383 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
384 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
385 * Element 0 holds the number n.
386 */
387
388#define MJD_OFFSET_DEBUG(x)
a3621e74 389/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
390
391
392#define Set_Node_Offset_To_R(node,byte) STMT_START { \
393 if (! SIZE_ONLY) { \
394 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
395 __LINE__, (node), (byte))); \
396 if((node) < 0) { \
551405c4 397 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
398 } else { \
399 RExC_offsets[2*(node)-1] = (byte); \
400 } \
401 } \
402} STMT_END
403
404#define Set_Node_Offset(node,byte) \
405 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
406#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
407
408#define Set_Node_Length_To_R(node,len) STMT_START { \
409 if (! SIZE_ONLY) { \
410 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 411 __LINE__, (int)(node), (int)(len))); \
ccb2c380 412 if((node) < 0) { \
551405c4 413 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
414 } else { \
415 RExC_offsets[2*(node)] = (len); \
416 } \
417 } \
418} STMT_END
419
420#define Set_Node_Length(node,len) \
421 Set_Node_Length_To_R((node)-RExC_emit_start, len)
422#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
423#define Set_Node_Cur_Length(node) \
424 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
425
426/* Get offsets and lengths */
427#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
428#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
429
acfe0abc 430static void clear_re(pTHX_ void *r);
4327152a 431
653099ff
GS
432/* Mark that we cannot extend a found fixed substring at this point.
433 Updata the longest found anchored substring and the longest found
434 floating substrings if needed. */
435
4327152a 436STATIC void
097eb12c 437S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 438{
e1ec3a88
AL
439 const STRLEN l = CHR_SVLEN(data->last_found);
440 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 441
c277df42 442 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 443 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
444 if (*data->longest == data->longest_fixed) {
445 data->offset_fixed = l ? data->last_start_min : data->pos_min;
446 if (data->flags & SF_BEFORE_EOL)
b81d288d 447 data->flags
c277df42
IZ
448 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
449 else
450 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
451 }
452 else {
c277df42 453 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
454 data->offset_float_max = (l
455 ? data->last_start_max
c277df42 456 : data->pos_min + data->pos_delta);
9051bda5
HS
457 if ((U32)data->offset_float_max > (U32)I32_MAX)
458 data->offset_float_max = I32_MAX;
c277df42 459 if (data->flags & SF_BEFORE_EOL)
b81d288d 460 data->flags
c277df42
IZ
461 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
462 else
463 data->flags &= ~SF_FL_BEFORE_EOL;
464 }
465 }
466 SvCUR_set(data->last_found, 0);
0eda9292 467 {
a28509cc 468 SV * const sv = data->last_found;
097eb12c
AL
469 if (SvUTF8(sv) && SvMAGICAL(sv)) {
470 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
471 if (mg)
472 mg->mg_len = 0;
473 }
0eda9292 474 }
c277df42
IZ
475 data->last_end = -1;
476 data->flags &= ~SF_BEFORE_EOL;
477}
478
653099ff
GS
479/* Can match anything (initialization) */
480STATIC void
097eb12c 481S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 482{
653099ff 483 ANYOF_CLASS_ZERO(cl);
f8bef550 484 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 485 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
486 if (LOC)
487 cl->flags |= ANYOF_LOCALE;
488}
489
490/* Can match anything (initialization) */
491STATIC int
5f66b61c 492S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
493{
494 int value;
495
aaa51d5e 496 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
497 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
498 return 1;
1aa99e6b
IH
499 if (!(cl->flags & ANYOF_UNICODE_ALL))
500 return 0;
f8bef550
NC
501 if (!ANYOF_BITMAP_TESTALLSET(cl))
502 return 0;
653099ff
GS
503 return 1;
504}
505
506/* Can match anything (initialization) */
507STATIC void
097eb12c 508S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 509{
8ecf7187 510 Zero(cl, 1, struct regnode_charclass_class);
653099ff 511 cl->type = ANYOF;
830247a4 512 cl_anything(pRExC_state, cl);
653099ff
GS
513}
514
515STATIC void
097eb12c 516S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 517{
8ecf7187 518 Zero(cl, 1, struct regnode_charclass_class);
653099ff 519 cl->type = ANYOF;
830247a4 520 cl_anything(pRExC_state, cl);
653099ff
GS
521 if (LOC)
522 cl->flags |= ANYOF_LOCALE;
523}
524
525/* 'And' a given class with another one. Can create false positives */
526/* We assume that cl is not inverted */
527STATIC void
5f66b61c 528S_cl_and(struct regnode_charclass_class *cl,
a28509cc 529 const struct regnode_charclass_class *and_with)
653099ff 530{
653099ff
GS
531 if (!(and_with->flags & ANYOF_CLASS)
532 && !(cl->flags & ANYOF_CLASS)
533 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
534 && !(and_with->flags & ANYOF_FOLD)
535 && !(cl->flags & ANYOF_FOLD)) {
536 int i;
537
538 if (and_with->flags & ANYOF_INVERT)
539 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
540 cl->bitmap[i] &= ~and_with->bitmap[i];
541 else
542 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
543 cl->bitmap[i] &= and_with->bitmap[i];
544 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
545 if (!(and_with->flags & ANYOF_EOS))
546 cl->flags &= ~ANYOF_EOS;
1aa99e6b 547
14ebb1a2
JH
548 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
549 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
550 cl->flags &= ~ANYOF_UNICODE_ALL;
551 cl->flags |= ANYOF_UNICODE;
552 ARG_SET(cl, ARG(and_with));
553 }
14ebb1a2
JH
554 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
555 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 556 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
557 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
558 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 559 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
560}
561
562/* 'OR' a given class with another one. Can create false positives */
563/* We assume that cl is not inverted */
564STATIC void
097eb12c 565S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 566{
653099ff
GS
567 if (or_with->flags & ANYOF_INVERT) {
568 /* We do not use
569 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
570 * <= (B1 | !B2) | (CL1 | !CL2)
571 * which is wasteful if CL2 is small, but we ignore CL2:
572 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
573 * XXXX Can we handle case-fold? Unclear:
574 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
575 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
576 */
577 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
578 && !(or_with->flags & ANYOF_FOLD)
579 && !(cl->flags & ANYOF_FOLD) ) {
580 int i;
581
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] |= ~or_with->bitmap[i];
584 } /* XXXX: logic is complicated otherwise */
585 else {
830247a4 586 cl_anything(pRExC_state, cl);
653099ff
GS
587 }
588 } else {
589 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
590 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 591 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
592 || (cl->flags & ANYOF_FOLD)) ) {
593 int i;
594
595 /* OR char bitmap and class bitmap separately */
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= or_with->bitmap[i];
598 if (or_with->flags & ANYOF_CLASS) {
599 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
600 cl->classflags[i] |= or_with->classflags[i];
601 cl->flags |= ANYOF_CLASS;
602 }
603 }
604 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 605 cl_anything(pRExC_state, cl);
653099ff
GS
606 }
607 }
608 if (or_with->flags & ANYOF_EOS)
609 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
610
611 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
612 ARG(cl) != ARG(or_with)) {
613 cl->flags |= ANYOF_UNICODE_ALL;
614 cl->flags &= ~ANYOF_UNICODE;
615 }
616 if (or_with->flags & ANYOF_UNICODE_ALL) {
617 cl->flags |= ANYOF_UNICODE_ALL;
618 cl->flags &= ~ANYOF_UNICODE;
619 }
653099ff
GS
620}
621
5d1c421c 622/*
a3621e74 623
3dab1dad 624 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
625 startbranch: the first branch in the whole branch sequence
626 first : start branch of sequence of branch-exact nodes.
627 May be the same as startbranch
628 last : Thing following the last branch.
629 May be the same as tail.
630 tail : item following the branch sequence
631 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 632 depth : indent depth
a3621e74
YO
633
634Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
635
636A trie is an N'ary tree where the branches are determined by digital
637decomposition of the key. IE, at the root node you look up the 1st character and
638follow that branch repeat until you find the end of the branches. Nodes can be
639marked as "accepting" meaning they represent a complete word. Eg:
640
641 /he|she|his|hers/
642
643would convert into the following structure. Numbers represent states, letters
644following numbers represent valid transitions on the letter from that state, if
645the number is in square brackets it represents an accepting state, otherwise it
646will be in parenthesis.
647
648 +-h->+-e->[3]-+-r->(8)-+-s->[9]
649 | |
650 | (2)
651 | |
652 (1) +-i->(6)-+-s->[7]
653 |
654 +-s->(3)-+-h->(4)-+-e->[5]
655
656 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
657
658This shows that when matching against the string 'hers' we will begin at state 1
659read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
660then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
661is also accepting. Thus we know that we can match both 'he' and 'hers' with a
662single traverse. We store a mapping from accepting to state to which word was
663matched, and then when we have multiple possibilities we try to complete the
664rest of the regex in the order in which they occured in the alternation.
665
666The only prior NFA like behaviour that would be changed by the TRIE support is
667the silent ignoring of duplicate alternations which are of the form:
668
669 / (DUPE|DUPE) X? (?{ ... }) Y /x
670
671Thus EVAL blocks follwing a trie may be called a different number of times with
672and without the optimisation. With the optimisations dupes will be silently
673ignored. This inconsistant behaviour of EVAL type nodes is well established as
674the following demonstrates:
675
676 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
677
678which prints out 'word' three times, but
679
680 'words'=~/(word|word|word)(?{ print $1 })S/
681
682which doesnt print it out at all. This is due to other optimisations kicking in.
683
684Example of what happens on a structural level:
685
686The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
687
688 1: CURLYM[1] {1,32767}(18)
689 5: BRANCH(8)
690 6: EXACT <ac>(16)
691 8: BRANCH(11)
692 9: EXACT <ad>(16)
693 11: BRANCH(14)
694 12: EXACT <ab>(16)
695 16: SUCCEED(0)
696 17: NOTHING(18)
697 18: END(0)
698
699This would be optimizable with startbranch=5, first=5, last=16, tail=16
700and should turn into:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: TRIE(16)
704 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
705 <ac>
706 <ad>
707 <ab>
708 16: SUCCEED(0)
709 17: NOTHING(18)
710 18: END(0)
711
712Cases where tail != last would be like /(?foo|bar)baz/:
713
714 1: BRANCH(4)
715 2: EXACT <foo>(8)
716 4: BRANCH(7)
717 5: EXACT <bar>(8)
718 7: TAIL(8)
719 8: EXACT <baz>(10)
720 10: END(0)
721
722which would be optimizable with startbranch=1, first=1, last=7, tail=8
723and would end up looking like:
724
725 1: TRIE(8)
726 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
727 <foo>
728 <bar>
729 7: TAIL(8)
730 8: EXACT <baz>(10)
731 10: END(0)
732
3dab1dad
YO
733 d = uvuni_to_utf8_flags(d, uv, 0);
734
735is the recommended Unicode-aware way of saying
736
737 *(d++) = uv;
a3621e74
YO
738*/
739
3dab1dad
YO
740#define TRIE_STORE_REVCHAR \
741 STMT_START { \
742 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
743 av_push( TRIE_REVCHARMAP(trie), tmp ); \
744 } STMT_END
a3621e74
YO
745
746#define TRIE_READ_CHAR STMT_START { \
747 if ( UTF ) { \
748 if ( folder ) { \
749 if ( foldlen > 0 ) { \
750 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
751 foldlen -= len; \
752 scan += len; \
753 len = 0; \
754 } else { \
e1ec3a88 755 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
756 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
757 foldlen -= UNISKIP( uvc ); \
758 scan = foldbuf + UNISKIP( uvc ); \
759 } \
760 } else { \
e1ec3a88 761 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
762 } \
763 } else { \
764 uvc = (U32)*uc; \
765 len = 1; \
766 } \
767} STMT_END
768
769
770#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
771#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
772#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
773#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
774
775#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
776 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
777 TRIE_LIST_LEN( state ) *= 2; \
778 Renew( trie->states[ state ].trans.list, \
779 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
780 } \
781 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
782 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
783 TRIE_LIST_CUR( state )++; \
784} STMT_END
785
786#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 787 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
788 4, reg_trie_trans_le ); \
789 TRIE_LIST_CUR( state ) = 1; \
790 TRIE_LIST_LEN( state ) = 4; \
791} STMT_END
792
3dab1dad
YO
793#define TRIE_HANDLE_WORD(state) STMT_START { \
794 if ( !trie->states[ state ].wordnum ) { \
795 /* we havent inserted this word into the structure yet. */\
796 trie->states[ state ].wordnum = ++curword; \
797 DEBUG_r({ \
798 /* store the word for dumping */ \
799 SV* tmp; \
800 if (OP(noper) != NOTHING ) \
801 tmp=newSVpvn( STRING( noper ), STR_LEN( noper ) );\
802 else \
803 tmp=newSVpvn( "", 0 ); \
804 if ( UTF ) SvUTF8_on( tmp ); \
805 av_push( trie->words, tmp ); \
806 }); \
807 } else { \
808 NOOP; /* It's a dupe. So ignore it. */ \
809 } \
810} STMT_END
811
812#ifdef DEBUGGING
813/*
814 dump_trie(trie)
815 dump_trie_interim_list(trie,next_alloc)
816 dump_trie_interim_table(trie,next_alloc)
817
818 These routines dump out a trie in a somewhat readable format.
819 The _interim_ variants are used for debugging the interim
820 tables that are used to generate the final compressed
821 representation which is what dump_trie expects.
822
823 Part of the reason for their existance is to provide a form
824 of documentation as to how the different representations function.
825
826*/
827
828/*
829 dump_trie(trie)
830 Dumps the final compressed table form of the trie to Perl_debug_log.
831 Used for debugging make_trie().
832*/
833
834STATIC void
835S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
836{
837 U32 state;
838 GET_RE_DEBUG_FLAGS_DECL;
839
840 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
841 (int)depth * 2 + 2,"",
842 "Match","Base","Ofs" );
843
844 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
845 SV **tmp = av_fetch( trie->revcharmap, state, 0);
846 if ( tmp ) {
847 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
848 }
849 }
850 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
851 (int)depth * 2 + 2,"");
852
853 for( state = 0 ; state < trie->uniquecharcount ; state++ )
854 PerlIO_printf( Perl_debug_log, "-----");
855 PerlIO_printf( Perl_debug_log, "\n");
856
857 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
858 const U32 base = trie->states[ state ].trans.base;
859
860 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
861
862 if ( trie->states[ state ].wordnum ) {
863 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
864 } else {
865 PerlIO_printf( Perl_debug_log, "%6s", "" );
866 }
867
868 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
869
870 if ( base ) {
871 U32 ofs = 0;
872
873 while( ( base + ofs < trie->uniquecharcount ) ||
874 ( base + ofs - trie->uniquecharcount < trie->lasttrans
875 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
876 ofs++;
877
878 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
879
880 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
881 if ( ( base + ofs >= trie->uniquecharcount ) &&
882 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
883 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
884 {
885 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
886 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
887 } else {
888 PerlIO_printf( Perl_debug_log, "%4s "," ." );
889 }
890 }
891
892 PerlIO_printf( Perl_debug_log, "]");
893
894 }
895 PerlIO_printf( Perl_debug_log, "\n" );
896 }
897}
898/*
899 dump_trie_interim_list(trie,next_alloc)
900 Dumps a fully constructed but uncompressed trie in list form.
901 List tries normally only are used for construction when the number of
902 possible chars (trie->uniquecharcount) is very high.
903 Used for debugging make_trie().
904*/
905STATIC void
906S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
907{
908 U32 state;
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
913 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
918 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
928 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
929 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
930 SvPV_nolen_const( *tmp ),
931 TRIE_LIST_ITEM(state,charid).forid,
932 (UV)TRIE_LIST_ITEM(state,charid).newstate
933 );
934 }
935
936 }
937}
938
939/*
940 dump_trie_interim_table(trie,next_alloc)
941 Dumps a fully constructed but uncompressed trie in table form.
942 This is the normal DFA style state transition table, with a few
943 twists to facilitate compression later.
944 Used for debugging make_trie().
945*/
946STATIC void
947S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
948{
949 U32 state;
950 U16 charid;
951 GET_RE_DEBUG_FLAGS_DECL;
952
953 /*
954 print out the table precompression so that we can do a visual check
955 that they are identical.
956 */
957
958 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
959
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
961 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
962 if ( tmp ) {
963 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
964 }
965 }
966
967 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
968
969 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
970 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
971 }
972
973 PerlIO_printf( Perl_debug_log, "\n" );
974
975 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
976
977 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
978 (int)depth * 2 + 2,"",
979 (UV)TRIE_NODENUM( state ) );
980
981 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
982 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
983 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
984 }
985 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
986 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
987 } else {
988 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
989 trie->states[ TRIE_NODENUM( state ) ].wordnum );
990 }
991 }
992}
993
994#endif
995
996
997
998
999
a3621e74 1000STATIC I32
3dab1dad 1001S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1002{
27da23d5 1003 dVAR;
a3621e74
YO
1004 /* first pass, loop through and scan words */
1005 reg_trie_data *trie;
1006 regnode *cur;
9f7f3913 1007 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1008 STRLEN len = 0;
1009 UV uvc = 0;
1010 U16 curword = 0;
1011 U32 next_alloc = 0;
1012 /* we just use folder as a flag in utf8 */
e1ec3a88 1013 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1014 ? PL_fold
1015 : ( flags == EXACTFL
1016 ? PL_fold_locale
1017 : NULL
1018 )
1019 );
1020
e1ec3a88 1021 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1022 SV *re_trie_maxbuff;
3dab1dad
YO
1023#ifndef DEBUGGING
1024 /* these are only used during construction but are useful during
8e11feef
RGS
1025 * debugging so we store them in the struct when debugging.
1026 * Wordcount is actually superfluous in debugging as we have
1027 * (AV*)trie->words to use for it, but that's not available when
1028 * not debugging... We could make the macro use the AV during
1029 * debugging though...
1030 */
3dab1dad
YO
1031 U16 trie_wordcount=0;
1032 STRLEN trie_charcount=0;
1033 U32 trie_laststate=0;
1034 AV *trie_revcharmap;
1035#endif
a3621e74
YO
1036 GET_RE_DEBUG_FLAGS_DECL;
1037
a02a5408 1038 Newxz( trie, 1, reg_trie_data );
a3621e74 1039 trie->refcount = 1;
3dab1dad 1040 trie->startstate = 1;
a3621e74 1041 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1042 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1043 if (!(UTF && folder))
1044 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1045 DEBUG_r({
1046 trie->words = newAV();
a3621e74 1047 });
3dab1dad 1048 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1049
0111c4fd 1050 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1051 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1052 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1053 }
3dab1dad
YO
1054 DEBUG_OPTIMISE_r({
1055 PerlIO_printf( Perl_debug_log,
1056 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1057 (int)depth * 2 + 2, "",
1058 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1059 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1060 });
a3621e74
YO
1061 /* -- First loop and Setup --
1062
1063 We first traverse the branches and scan each word to determine if it
1064 contains widechars, and how many unique chars there are, this is
1065 important as we have to build a table with at least as many columns as we
1066 have unique chars.
1067
1068 We use an array of integers to represent the character codes 0..255
1069 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1070 native representation of the character value as the key and IV's for the
1071 coded index.
1072
1073 *TODO* If we keep track of how many times each character is used we can
1074 remap the columns so that the table compression later on is more
1075 efficient in terms of memory by ensuring most common value is in the
1076 middle and the least common are on the outside. IMO this would be better
1077 than a most to least common mapping as theres a decent chance the most
1078 common letter will share a node with the least common, meaning the node
1079 will not be compressable. With a middle is most common approach the worst
1080 case is when we have the least common nodes twice.
1081
1082 */
1083
a3621e74 1084 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1085 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1086 const U8 *uc = (U8*)STRING( noper );
a28509cc 1087 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1088 STRLEN foldlen = 0;
1089 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1090 const U8 *scan = (U8*)NULL;
3dab1dad 1091 STRLEN chars=0;
a3621e74 1092
3dab1dad
YO
1093 TRIE_WORDCOUNT(trie)++;
1094 if (OP(noper) == NOTHING) {
1095 trie->minlen= 0;
1096 continue;
1097 }
1098 if (trie->bitmap) {
1099 TRIE_BITMAP_SET(trie,*uc);
1100 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1101 }
a3621e74 1102 for ( ; uc < e ; uc += len ) {
3dab1dad 1103 TRIE_CHARCOUNT(trie)++;
a3621e74 1104 TRIE_READ_CHAR;
3dab1dad 1105 chars++;
a3621e74
YO
1106 if ( uvc < 256 ) {
1107 if ( !trie->charmap[ uvc ] ) {
1108 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1109 if ( folder )
1110 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1111 TRIE_STORE_REVCHAR;
a3621e74
YO
1112 }
1113 } else {
1114 SV** svpp;
1115 if ( !trie->widecharmap )
1116 trie->widecharmap = newHV();
1117
1118 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1119
1120 if ( !svpp )
e4584336 1121 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1122
1123 if ( !SvTRUE( *svpp ) ) {
1124 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1125 TRIE_STORE_REVCHAR;
a3621e74
YO
1126 }
1127 }
1128 }
3dab1dad
YO
1129 if( cur == first ) {
1130 trie->minlen=chars;
1131 trie->maxlen=chars;
1132 } else if (chars < trie->minlen) {
1133 trie->minlen=chars;
1134 } else if (chars > trie->maxlen) {
1135 trie->maxlen=chars;
1136 }
1137
a3621e74
YO
1138 } /* end first pass */
1139 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1140 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1141 (int)depth * 2 + 2,"",
1142 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1143 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
a3621e74
YO
1144 );
1145
1146
1147 /*
1148 We now know what we are dealing with in terms of unique chars and
1149 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1150 representation using a flat table will take. If it's over a reasonable
1151 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1152 conservative but potentially much slower representation using an array
1153 of lists.
1154
1155 At the end we convert both representations into the same compressed
1156 form that will be used in regexec.c for matching with. The latter
1157 is a form that cannot be used to construct with but has memory
1158 properties similar to the list form and access properties similar
1159 to the table form making it both suitable for fast searches and
1160 small enough that its feasable to store for the duration of a program.
1161
1162 See the comment in the code where the compressed table is produced
1163 inplace from the flat tabe representation for an explanation of how
1164 the compression works.
1165
1166 */
1167
1168
3dab1dad 1169 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1170 /*
1171 Second Pass -- Array Of Lists Representation
1172
1173 Each state will be represented by a list of charid:state records
1174 (reg_trie_trans_le) the first such element holds the CUR and LEN
1175 points of the allocated array. (See defines above).
1176
1177 We build the initial structure using the lists, and then convert
1178 it into the compressed table form which allows faster lookups
1179 (but cant be modified once converted).
a3621e74
YO
1180 */
1181
a3621e74
YO
1182 STRLEN transcount = 1;
1183
3dab1dad 1184 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1185 TRIE_LIST_NEW(1);
1186 next_alloc = 2;
1187
1188 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1189
c445ea15
AL
1190 regnode * const noper = NEXTOPER( cur );
1191 U8 *uc = (U8*)STRING( noper );
1192 const U8 * const e = uc + STR_LEN( noper );
1193 U32 state = 1; /* required init */
1194 U16 charid = 0; /* sanity init */
1195 U8 *scan = (U8*)NULL; /* sanity init */
1196 STRLEN foldlen = 0; /* required init */
1197 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1198
3dab1dad 1199 if (OP(noper) != NOTHING) {
c445ea15
AL
1200 for ( ; uc < e ; uc += len ) {
1201
1202 TRIE_READ_CHAR;
1203
1204 if ( uvc < 256 ) {
1205 charid = trie->charmap[ uvc ];
1206 } else {
1207 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1208 if ( !svpp ) {
1209 charid = 0;
1210 } else {
1211 charid=(U16)SvIV( *svpp );
1212 }
1213 }
1214 if ( charid ) {
a3621e74 1215
c445ea15
AL
1216 U16 check;
1217 U32 newstate = 0;
a3621e74 1218
c445ea15
AL
1219 charid--;
1220 if ( !trie->states[ state ].trans.list ) {
1221 TRIE_LIST_NEW( state );
1222 }
1223 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1224 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1225 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1226 break;
1227 }
1228 }
1229 if ( ! newstate ) {
1230 newstate = next_alloc++;
1231 TRIE_LIST_PUSH( state, charid, newstate );
1232 transcount++;
1233 }
1234 state = newstate;
1235 } else {
1236 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1237 }
c445ea15
AL
1238 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1239 }
c445ea15 1240 }
3dab1dad 1241 TRIE_HANDLE_WORD(state);
a3621e74
YO
1242
1243 } /* end second pass */
1244
3dab1dad 1245 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1246 Renew( trie->states, next_alloc, reg_trie_state );
1247
3dab1dad
YO
1248 /* and now dump it out before we compress it */
1249 DEBUG_TRIE_COMPILE_MORE_r(
1250 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1251 );
a3621e74 1252
a02a5408 1253 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1254 {
1255 U32 state;
a3621e74
YO
1256 U32 tp = 0;
1257 U32 zp = 0;
1258
1259
1260 for( state=1 ; state < next_alloc ; state ++ ) {
1261 U32 base=0;
1262
1263 /*
1264 DEBUG_TRIE_COMPILE_MORE_r(
1265 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1266 );
1267 */
1268
1269 if (trie->states[state].trans.list) {
1270 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1271 U16 maxid=minid;
a28509cc 1272 U16 idx;
a3621e74
YO
1273
1274 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1275 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1276 if ( forid < minid ) {
1277 minid=forid;
1278 } else if ( forid > maxid ) {
1279 maxid=forid;
1280 }
a3621e74
YO
1281 }
1282 if ( transcount < tp + maxid - minid + 1) {
1283 transcount *= 2;
1284 Renew( trie->trans, transcount, reg_trie_trans );
1285 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1286 }
1287 base = trie->uniquecharcount + tp - minid;
1288 if ( maxid == minid ) {
1289 U32 set = 0;
1290 for ( ; zp < tp ; zp++ ) {
1291 if ( ! trie->trans[ zp ].next ) {
1292 base = trie->uniquecharcount + zp - minid;
1293 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1294 trie->trans[ zp ].check = state;
1295 set = 1;
1296 break;
1297 }
1298 }
1299 if ( !set ) {
1300 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1301 trie->trans[ tp ].check = state;
1302 tp++;
1303 zp = tp;
1304 }
1305 } else {
1306 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1307 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1308 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1309 trie->trans[ tid ].check = state;
1310 }
1311 tp += ( maxid - minid + 1 );
1312 }
1313 Safefree(trie->states[ state ].trans.list);
1314 }
1315 /*
1316 DEBUG_TRIE_COMPILE_MORE_r(
1317 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1318 );
1319 */
1320 trie->states[ state ].trans.base=base;
1321 }
cc601c31 1322 trie->lasttrans = tp + 1;
a3621e74
YO
1323 }
1324 } else {
1325 /*
1326 Second Pass -- Flat Table Representation.
1327
1328 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1329 We know that we will need Charcount+1 trans at most to store the data
1330 (one row per char at worst case) So we preallocate both structures
1331 assuming worst case.
1332
1333 We then construct the trie using only the .next slots of the entry
1334 structs.
1335
1336 We use the .check field of the first entry of the node temporarily to
1337 make compression both faster and easier by keeping track of how many non
1338 zero fields are in the node.
1339
1340 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1341 transition.
1342
1343 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1344 number representing the first entry of the node, and state as a
1345 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1346 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1347 are 2 entrys per node. eg:
1348
1349 A B A B
1350 1. 2 4 1. 3 7
1351 2. 0 3 3. 0 5
1352 3. 0 0 5. 0 0
1353 4. 0 0 7. 0 0
1354
1355 The table is internally in the right hand, idx form. However as we also
1356 have to deal with the states array which is indexed by nodenum we have to
1357 use TRIE_NODENUM() to convert.
1358
1359 */
1360
3dab1dad
YO
1361
1362 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1363 reg_trie_trans );
3dab1dad 1364 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1365 next_alloc = trie->uniquecharcount + 1;
1366
3dab1dad 1367
a3621e74
YO
1368 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1369
c445ea15 1370 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1371 const U8 *uc = (U8*)STRING( noper );
1372 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1373
1374 U32 state = 1; /* required init */
1375
1376 U16 charid = 0; /* sanity init */
1377 U32 accept_state = 0; /* sanity init */
1378 U8 *scan = (U8*)NULL; /* sanity init */
1379
1380 STRLEN foldlen = 0; /* required init */
1381 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1382
3dab1dad 1383 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1384 for ( ; uc < e ; uc += len ) {
1385
1386 TRIE_READ_CHAR;
1387
1388 if ( uvc < 256 ) {
1389 charid = trie->charmap[ uvc ];
1390 } else {
c445ea15
AL
1391 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1392 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1393 }
1394 if ( charid ) {
1395 charid--;
1396 if ( !trie->trans[ state + charid ].next ) {
1397 trie->trans[ state + charid ].next = next_alloc;
1398 trie->trans[ state ].check++;
1399 next_alloc += trie->uniquecharcount;
1400 }
1401 state = trie->trans[ state + charid ].next;
1402 } else {
e4584336 1403 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1404 }
1405 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1406 }
a3621e74 1407 }
3dab1dad
YO
1408 accept_state = TRIE_NODENUM( state );
1409 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1410
1411 } /* end second pass */
1412
3dab1dad
YO
1413 /* and now dump it out before we compress it */
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 dump_trie_interim_table(trie,next_alloc,depth+1)
1416 );
a3621e74 1417
a3621e74
YO
1418 {
1419 /*
1420 * Inplace compress the table.*
1421
1422 For sparse data sets the table constructed by the trie algorithm will
1423 be mostly 0/FAIL transitions or to put it another way mostly empty.
1424 (Note that leaf nodes will not contain any transitions.)
1425
1426 This algorithm compresses the tables by eliminating most such
1427 transitions, at the cost of a modest bit of extra work during lookup:
1428
1429 - Each states[] entry contains a .base field which indicates the
1430 index in the state[] array wheres its transition data is stored.
1431
1432 - If .base is 0 there are no valid transitions from that node.
1433
1434 - If .base is nonzero then charid is added to it to find an entry in
1435 the trans array.
1436
1437 -If trans[states[state].base+charid].check!=state then the
1438 transition is taken to be a 0/Fail transition. Thus if there are fail
1439 transitions at the front of the node then the .base offset will point
1440 somewhere inside the previous nodes data (or maybe even into a node
1441 even earlier), but the .check field determines if the transition is
1442 valid.
1443
1444 The following process inplace converts the table to the compressed
1445 table: We first do not compress the root node 1,and mark its all its
1446 .check pointers as 1 and set its .base pointer as 1 as well. This
1447 allows to do a DFA construction from the compressed table later, and
1448 ensures that any .base pointers we calculate later are greater than
1449 0.
1450
1451 - We set 'pos' to indicate the first entry of the second node.
1452
1453 - We then iterate over the columns of the node, finding the first and
1454 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1455 and set the .check pointers accordingly, and advance pos
1456 appropriately and repreat for the next node. Note that when we copy
1457 the next pointers we have to convert them from the original
1458 NODEIDX form to NODENUM form as the former is not valid post
1459 compression.
1460
1461 - If a node has no transitions used we mark its base as 0 and do not
1462 advance the pos pointer.
1463
1464 - If a node only has one transition we use a second pointer into the
1465 structure to fill in allocated fail transitions from other states.
1466 This pointer is independent of the main pointer and scans forward
1467 looking for null transitions that are allocated to a state. When it
1468 finds one it writes the single transition into the "hole". If the
1469 pointer doesnt find one the single transition is appeneded as normal.
1470
1471 - Once compressed we can Renew/realloc the structures to release the
1472 excess space.
1473
1474 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1475 specifically Fig 3.47 and the associated pseudocode.
1476
1477 demq
1478 */
a3b680e6 1479 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1480 U32 state, charid;
a3621e74 1481 U32 pos = 0, zp=0;
3dab1dad 1482 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1483
1484 for ( state = 1 ; state < laststate ; state++ ) {
1485 U8 flag = 0;
a28509cc
AL
1486 const U32 stateidx = TRIE_NODEIDX( state );
1487 const U32 o_used = trie->trans[ stateidx ].check;
1488 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1489 trie->trans[ stateidx ].check = 0;
1490
1491 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1492 if ( flag || trie->trans[ stateidx + charid ].next ) {
1493 if ( trie->trans[ stateidx + charid ].next ) {
1494 if (o_used == 1) {
1495 for ( ; zp < pos ; zp++ ) {
1496 if ( ! trie->trans[ zp ].next ) {
1497 break;
1498 }
1499 }
1500 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1501 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1502 trie->trans[ zp ].check = state;
1503 if ( ++zp > pos ) pos = zp;
1504 break;
1505 }
1506 used--;
1507 }
1508 if ( !flag ) {
1509 flag = 1;
1510 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1511 }
1512 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1513 trie->trans[ pos ].check = state;
1514 pos++;
1515 }
1516 }
1517 }
cc601c31 1518 trie->lasttrans = pos + 1;
a3621e74
YO
1519 Renew( trie->states, laststate + 1, reg_trie_state);
1520 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1521 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1522 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1523 (int)depth * 2 + 2,"",
1524 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1525 (IV)next_alloc,
1526 (IV)pos,
a3621e74
YO
1527 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1528 );
1529
1530 } /* end table compress */
1531 }
cc601c31
YO
1532 /* resize the trans array to remove unused space */
1533 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1534
3dab1dad
YO
1535 /* and now dump out the compressed format */
1536 DEBUG_TRIE_COMPILE_r(
1537 dump_trie(trie,depth+1)
1538 );
1539
1540 { /* Modify the program and insert the new TRIE node*/
1541 regnode *convert;
1542 U8 nodetype =(U8)(flags & 0xFF);
1543 char *str=NULL;
a3621e74 1544 /*
3dab1dad
YO
1545 This means we convert either the first branch or the first Exact,
1546 depending on whether the thing following (in 'last') is a branch
1547 or not and whther first is the startbranch (ie is it a sub part of
1548 the alternation or is it the whole thing.)
1549 Assuming its a sub part we conver the EXACT otherwise we convert
1550 the whole branch sequence, including the first.
a3621e74 1551 */
3dab1dad
YO
1552 /* Find the node we are going to overwrite */
1553 if ( first == startbranch && OP( last ) != BRANCH ) {
1554 convert = first;
a3621e74 1555 } else {
3dab1dad
YO
1556 convert = NEXTOPER( first );
1557 NEXT_OFF( first ) = (U16)(last - first);
a3621e74
YO
1558 }
1559
3dab1dad
YO
1560 /* But first we check to see if there is a common prefix we can
1561 split out as an EXACT and put in front of the TRIE node. */
1562 trie->startstate= 1;
1563 if ( trie->bitmap && !trie->widecharmap ) {
1564 U32 state;
1565 DEBUG_OPTIMISE_r(
8e11feef
RGS
1566 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1567 (int)depth * 2 + 2, "",
1568 TRIE_LASTSTATE(trie))
1569 );
1570 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1571 U32 ofs = 0;
8e11feef
RGS
1572 I32 idx = -1;
1573 U32 count = 0;
1574 const U32 base = trie->states[ state ].trans.base;
a3621e74 1575
3dab1dad 1576 if ( trie->states[state].wordnum )
8e11feef 1577 count = 1;
a3621e74 1578
8e11feef 1579 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1580 if ( ( base + ofs >= trie->uniquecharcount ) &&
1581 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1582 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1583 {
3dab1dad 1584 if ( ++count > 1 ) {
8e11feef
RGS
1585 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1586 const char *ch = SvPV_nolen_const( *tmp );
1587 if ( state == 1 ) break;
3dab1dad
YO
1588 if ( count == 2 ) {
1589 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1590 DEBUG_OPTIMISE_r(
8e11feef
RGS
1591 PerlIO_printf(Perl_debug_log,
1592 "%*sNew Start State=%"UVuf" Class: [",
1593 (int)depth * 2 + 2, "",
3dab1dad
YO
1594 state));
1595 if (idx>-1) {
8e11feef
RGS
1596 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1597 const char *ch = SvPV_nolen_const( *tmp );
1598
3dab1dad 1599 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1600 if ( folder )
1601 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1602 DEBUG_OPTIMISE_r(
8e11feef 1603 PerlIO_printf(Perl_debug_log, ch)
3dab1dad 1604 );
8e11feef
RGS
1605 }
1606 }
1607 TRIE_BITMAP_SET(trie,*ch);
1608 if ( folder )
1609 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1610 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1611 }
1612 idx = ofs;
1613 }
3dab1dad
YO
1614 }
1615 if ( count == 1 ) {
1616 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1617 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1618 DEBUG_OPTIMISE_r(
8e11feef
RGS
1619 PerlIO_printf( Perl_debug_log,
1620 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1621 (int)depth * 2 + 2, "",
1622 state, idx, ch)
3dab1dad
YO
1623 );
1624 if ( state==1 ) {
1625 OP( convert ) = nodetype;
1626 str=STRING(convert);
1627 STR_LEN(convert)=0;
1628 }
1629 *str++=*ch;
1630 STR_LEN(convert)++;
a3621e74 1631
8e11feef
RGS
1632 } else {
1633 if (state>1)
1634 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1635 break;
1636 }
1637 }
3dab1dad 1638 if (str) {
8e11feef 1639 regnode *n = convert+NODE_SZ_STR(convert);
3dab1dad 1640 NEXT_OFF(convert)= NODE_SZ_STR(convert);
8e11feef 1641 trie->startstate = state;
3dab1dad
YO
1642 trie->minlen-= (state-1);
1643 trie->maxlen-= (state-1);
8e11feef
RGS
1644 if (trie->maxlen) {
1645 convert = n;
1646 } else {
3dab1dad
YO
1647 NEXT_OFF(convert) = (U16)(tail - convert);
1648 }
1649 }
1650 }
1651 if ( trie->maxlen ) {
1652 OP( convert ) = TRIE;
8e11feef
RGS
1653 NEXT_OFF( convert ) = (U16)(tail - convert);
1654 ARG_SET( convert, data_slot );
a3621e74 1655
3dab1dad
YO
1656 /* store the type in the flags */
1657 convert->flags = nodetype;
1658 /* XXX We really should free up the resource in trie now, as we wont use them */
1659 }
a3621e74
YO
1660 /* needed for dumping*/
1661 DEBUG_r({
1662 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1663 /* We now need to mark all of the space originally used by the
1664 branches as optimized away. This keeps the dumpuntil from
1665 throwing a wobbly as it doesnt use regnext() to traverse the
1666 opcodes.
1667 */
1668 while( optimize < last ) {
1669 OP( optimize ) = OPTIMIZED;
1670 optimize++;
1671 }
1672 });
1673 } /* end node insert */
6e8b4190
YO
1674#ifndef DEBUGGING
1675 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1676#endif
a3621e74
YO
1677 return 1;
1678}
1679
a3621e74 1680/*
5d1c421c
JH
1681 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1682 * These need to be revisited when a newer toolchain becomes available.
1683 */
1684#if defined(__sparc64__) && defined(__GNUC__)
1685# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1686# undef SPARC64_GCC_WORKAROUND
1687# define SPARC64_GCC_WORKAROUND 1
1688# endif
1689#endif
1690
653099ff
GS
1691/* REx optimizer. Converts nodes into quickier variants "in place".
1692 Finds fixed substrings. */
1693
a0288114 1694/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1695 to the position after last scanned or to NULL. */
1696
76e3520e 1697STATIC I32
9a957fbc
AL
1698S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1699 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1700 /* scanp: Start here (read-write). */
1701 /* deltap: Write maxlen-minlen here. */
1702 /* last: Stop before this one. */
1703{
97aff369 1704 dVAR;
c277df42
IZ
1705 I32 min = 0, pars = 0, code;
1706 regnode *scan = *scanp, *next;
1707 I32 delta = 0;
1708 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1709 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1710 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1711 scan_data_t data_fake;
653099ff 1712 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1713 SV *re_trie_maxbuff = NULL;
1714
1715 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1716
c277df42 1717 while (scan && OP(scan) != END && scan < last) {
8e11feef 1718#ifdef DEBUGGING
3dab1dad 1719 int merged=0;
8e11feef 1720#endif
c277df42 1721 /* Peephole optimizer: */
a3621e74 1722 DEBUG_OPTIMISE_r({
c445ea15 1723 SV * const mysv=sv_newmortal();
32fc9b6a 1724 regprop(RExC_rx, mysv, scan);
3dab1dad
YO
1725 PerlIO_printf(Perl_debug_log, "%*s%4s~ %s (%d)\n",
1726 (int)depth*2, "",
1727 scan==*scanp ? "Peep" : "",
1728 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
a3621e74 1729 });
3dab1dad 1730 if (PL_regkind[OP(scan)] == EXACT) {
653099ff 1731 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1732 regnode *n = regnext(scan);
1733 U32 stringok = 1;
1734#ifdef DEBUGGING
1735 regnode *stop = scan;
b81d288d 1736#endif
cd439c50 1737 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1738 /* Skip NOTHING, merge EXACT*. */
1739 while (n &&
3dab1dad 1740 ( PL_regkind[OP(n)] == NOTHING ||
c277df42
IZ
1741 (stringok && (OP(n) == OP(scan))))
1742 && NEXT_OFF(n)
1743 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1744 if (OP(n) == TAIL || n > next)
1745 stringok = 0;
3dab1dad
YO
1746 if (PL_regkind[OP(n)] == NOTHING) {
1747 DEBUG_OPTIMISE_r({
1748 SV * const mysv=sv_newmortal();
1749 regprop(RExC_rx, mysv, n);
1750 PerlIO_printf(Perl_debug_log, "%*sskip: %s (%d)\n",
1751 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1752 });
c277df42
IZ
1753 NEXT_OFF(scan) += NEXT_OFF(n);
1754 next = n + NODE_STEP_REGNODE;
1755#ifdef DEBUGGING
1756 if (stringok)
1757 stop = n;
b81d288d 1758#endif
c277df42 1759 n = regnext(n);
a0ed51b3 1760 }
f49d4d0f 1761 else if (stringok) {
a3b680e6 1762 const int oldl = STR_LEN(scan);
c445ea15 1763 regnode * const nnext = regnext(n);
3dab1dad
YO
1764 DEBUG_OPTIMISE_r({
1765 SV * const mysv=sv_newmortal();
1766 regprop(RExC_rx, mysv, n);
1767 PerlIO_printf(Perl_debug_log, "%*s mrg: %s (%d)\n",
1768 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1769 merged++;
1770 });
b81d288d 1771 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1772 break;
1773 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1774 STR_LEN(scan) += STR_LEN(n);
1775 next = n + NODE_SZ_STR(n);
c277df42 1776 /* Now we can overwrite *n : */
f49d4d0f 1777 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1778#ifdef DEBUGGING
f49d4d0f 1779 stop = next - 1;
b81d288d 1780#endif
c277df42
IZ
1781 n = nnext;
1782 }
1783 }
61a36c01 1784
a3621e74 1785 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1786/*
1787 Two problematic code points in Unicode casefolding of EXACT nodes:
1788
1789 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1790 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1791
1792 which casefold to
1793
1794 Unicode UTF-8
1795
1796 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1797 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1798
1799 This means that in case-insensitive matching (or "loose matching",
1800 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1801 length of the above casefolded versions) can match a target string
1802 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1803 This would rather mess up the minimum length computation.
1804
1805 What we'll do is to look for the tail four bytes, and then peek
1806 at the preceding two bytes to see whether we need to decrease
1807 the minimum length by four (six minus two).
1808
1809 Thanks to the design of UTF-8, there cannot be false matches:
1810 A sequence of valid UTF-8 bytes cannot be a subsequence of
1811 another valid sequence of UTF-8 bytes.
1812
1813*/
c445ea15
AL
1814 char * const s0 = STRING(scan), *s, *t;
1815 char * const s1 = s0 + STR_LEN(scan) - 1;
1816 char * const s2 = s1 - 4;
d4c19fe8 1817 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1818 const char * const t1 = t0 + 3;
2af232bd 1819
61a36c01
JH
1820 for (s = s0 + 2;
1821 s < s2 && (t = ninstr(s, s1, t0, t1));
1822 s = t + 4) {
1823 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1824 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1825 min -= 4;
1826 }
1827 }
1828
c277df42
IZ
1829#ifdef DEBUGGING
1830 /* Allow dumping */
cd439c50 1831 n = scan + NODE_SZ_STR(scan);
c277df42 1832 while (n <= stop) {
3dab1dad 1833 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1834 OP(n) = OPTIMIZED;
1835 NEXT_OFF(n) = 0;
1836 }
1837 n++;
1838 }
653099ff 1839#endif
c277df42 1840 }
a3621e74
YO
1841
1842
1843
653099ff
GS
1844 /* Follow the next-chain of the current node and optimize
1845 away all the NOTHINGs from it. */
c277df42 1846 if (OP(scan) != CURLYX) {
a3b680e6 1847 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1848 ? I32_MAX
1849 /* I32 may be smaller than U16 on CRAYs! */
1850 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1851 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1852 int noff;
1853 regnode *n = scan;
b81d288d 1854
c277df42
IZ
1855 /* Skip NOTHING and LONGJMP. */
1856 while ((n = regnext(n))
3dab1dad 1857 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1858 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1859 && off + noff < max)
1860 off += noff;
1861 if (reg_off_by_arg[OP(scan)])
1862 ARG(scan) = off;
b81d288d 1863 else
c277df42
IZ
1864 NEXT_OFF(scan) = off;
1865 }
a3621e74 1866
3dab1dad
YO
1867 DEBUG_OPTIMISE_r({if (merged){
1868 SV * const mysv=sv_newmortal();
1869 regprop(RExC_rx, mysv, scan);
1870 PerlIO_printf(Perl_debug_log, "%*s res: %s (%d)\n",
1871 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
1872 }});
1873
653099ff
GS
1874 /* The principal pseudo-switch. Cannot be a switch, since we
1875 look into several different things. */
b81d288d 1876 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1877 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1878 next = regnext(scan);
1879 code = OP(scan);
a3621e74 1880 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1881
1882 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1883 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1884 struct regnode_charclass_class accum;
d4c19fe8 1885 regnode * const startbranch=scan;
c277df42 1886
653099ff 1887 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1888 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1889 if (flags & SCF_DO_STCLASS)
830247a4 1890 cl_init_zero(pRExC_state, &accum);
a3621e74 1891
c277df42 1892 while (OP(scan) == code) {
830247a4 1893 I32 deltanext, minnext, f = 0, fake;
653099ff 1894 struct regnode_charclass_class this_class;
c277df42
IZ
1895
1896 num++;
1897 data_fake.flags = 0;
b81d288d 1898 if (data) {
2c2d71f5 1899 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1900 data_fake.last_closep = data->last_closep;
1901 }
1902 else
1903 data_fake.last_closep = &fake;
c277df42
IZ
1904 next = regnext(scan);
1905 scan = NEXTOPER(scan);
1906 if (code != BRANCH)
1907 scan = NEXTOPER(scan);
653099ff 1908 if (flags & SCF_DO_STCLASS) {
830247a4 1909 cl_init(pRExC_state, &this_class);
653099ff
GS
1910 data_fake.start_class = &this_class;
1911 f = SCF_DO_STCLASS_AND;
b81d288d 1912 }
e1901655
IZ
1913 if (flags & SCF_WHILEM_VISITED_POS)
1914 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1915
653099ff 1916 /* we suppose the run is continuous, last=next...*/
830247a4 1917 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1918 next, &data_fake, f,depth+1);
b81d288d 1919 if (min1 > minnext)
c277df42
IZ
1920 min1 = minnext;
1921 if (max1 < minnext + deltanext)
1922 max1 = minnext + deltanext;
1923 if (deltanext == I32_MAX)
aca2d497 1924 is_inf = is_inf_internal = 1;
c277df42
IZ
1925 scan = next;
1926 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1927 pars++;
3dab1dad
YO
1928 if (data) {
1929 if (data_fake.flags & SF_HAS_EVAL)
c277df42 1930 data->flags |= SF_HAS_EVAL;
2c2d71f5 1931 data->whilem_c = data_fake.whilem_c;
3dab1dad 1932 }
653099ff 1933 if (flags & SCF_DO_STCLASS)
830247a4 1934 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1935 if (code == SUSPEND)
c277df42
IZ
1936 break;
1937 }
1938 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1939 min1 = 0;
1940 if (flags & SCF_DO_SUBSTR) {
1941 data->pos_min += min1;
1942 data->pos_delta += max1 - min1;
1943 if (max1 != min1 || is_inf)
1944 data->longest = &(data->longest_float);
1945 }
1946 min += min1;
1947 delta += max1 - min1;
653099ff 1948 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1949 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1950 if (min1) {
1951 cl_and(data->start_class, &and_with);
1952 flags &= ~SCF_DO_STCLASS;
1953 }
1954 }
1955 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1956 if (min1) {
1957 cl_and(data->start_class, &accum);
653099ff 1958 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1959 }
1960 else {
b81d288d 1961 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1962 * data->start_class */
1963 StructCopy(data->start_class, &and_with,
1964 struct regnode_charclass_class);
1965 flags &= ~SCF_DO_STCLASS_AND;
1966 StructCopy(&accum, data->start_class,
1967 struct regnode_charclass_class);
1968 flags |= SCF_DO_STCLASS_OR;
1969 data->start_class->flags |= ANYOF_EOS;
1970 }
653099ff 1971 }
a3621e74
YO
1972
1973 /* demq.
1974
1975 Assuming this was/is a branch we are dealing with: 'scan' now
1976 points at the item that follows the branch sequence, whatever
1977 it is. We now start at the beginning of the sequence and look
1978 for subsequences of
1979
1980 BRANCH->EXACT=>X
1981 BRANCH->EXACT=>X
1982
1983 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1984
1985 If we can find such a subseqence we need to turn the first
1986 element into a trie and then add the subsequent branch exact
1987 strings to the trie.
1988
1989 We have two cases
1990
1991 1. patterns where the whole set of branch can be converted to a trie,
1992
1993 2. patterns where only a subset of the alternations can be
1994 converted to a trie.
1995
1996 In case 1 we can replace the whole set with a single regop
1997 for the trie. In case 2 we need to keep the start and end
1998 branchs so
1999
2000 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2001 becomes BRANCH TRIE; BRANCH X;
2002
2003 Hypthetically when we know the regex isnt anchored we can
2004 turn a case 1 into a DFA and let it rip... Every time it finds a match
2005 it would just call its tail, no WHILEM/CURLY needed.
2006
2007 */
0111c4fd 2008 if (DO_TRIE) {
3dab1dad 2009 int made=0;
0111c4fd
RGS
2010 if (!re_trie_maxbuff) {
2011 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2012 if (!SvIOK(re_trie_maxbuff))
2013 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2014 }
a3621e74
YO
2015 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2016 regnode *cur;
2017 regnode *first = (regnode *)NULL;
2018 regnode *last = (regnode *)NULL;
2019 regnode *tail = scan;
2020 U8 optype = 0;
2021 U32 count=0;
2022
2023#ifdef DEBUGGING
c445ea15 2024 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2025#endif
2026 /* var tail is used because there may be a TAIL
2027 regop in the way. Ie, the exacts will point to the
2028 thing following the TAIL, but the last branch will
2029 point at the TAIL. So we advance tail. If we
2030 have nested (?:) we may have to move through several
2031 tails.
2032 */
2033
2034 while ( OP( tail ) == TAIL ) {
2035 /* this is the TAIL generated by (?:) */
2036 tail = regnext( tail );
2037 }
2038
3dab1dad 2039
a3621e74 2040 DEBUG_OPTIMISE_r({
32fc9b6a 2041 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2042 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2043 (int)depth * 2 + 2, "",
2044 "Looking for TRIE'able sequences. Tail node is: ",
2045 SvPV_nolen_const( mysv )
a3621e74
YO
2046 );
2047 });
3dab1dad 2048
a3621e74
YO
2049 /*
2050
2051 step through the branches, cur represents each
2052 branch, noper is the first thing to be matched
2053 as part of that branch and noper_next is the
2054 regnext() of that node. if noper is an EXACT
2055 and noper_next is the same as scan (our current
2056 position in the regex) then the EXACT branch is
2057 a possible optimization target. Once we have
2058 two or more consequetive such branches we can
2059 create a trie of the EXACT's contents and stich
2060 it in place. If the sequence represents all of
2061 the branches we eliminate the whole thing and
2062 replace it with a single TRIE. If it is a
2063 subsequence then we need to stitch it in. This
2064 means the first branch has to remain, and needs
2065 to be repointed at the item on the branch chain
2066 following the last branch optimized. This could
2067 be either a BRANCH, in which case the
2068 subsequence is internal, or it could be the
2069 item following the branch sequence in which
2070 case the subsequence is at the end.
2071
2072 */
2073
2074 /* dont use tail as the end marker for this traverse */
2075 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2076 regnode * const noper = NEXTOPER( cur );
2077 regnode * const noper_next = regnext( noper );
a3621e74 2078
a3621e74 2079 DEBUG_OPTIMISE_r({
32fc9b6a 2080 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2081 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2082 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2083
32fc9b6a 2084 regprop(RExC_rx, mysv, noper);
a3621e74 2085 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2086 SvPV_nolen_const(mysv));
a3621e74
YO
2087
2088 if ( noper_next ) {
32fc9b6a 2089 regprop(RExC_rx, mysv, noper_next );
a3621e74 2090 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2091 SvPV_nolen_const(mysv));
a3621e74 2092 }
3dab1dad
YO
2093 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2094 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2095 });
3dab1dad
YO
2096 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2097 : PL_regkind[ OP( noper ) ] == EXACT )
2098 || OP(noper) == NOTHING )
a3621e74
YO
2099 && noper_next == tail && count<U16_MAX)
2100 {
2101 count++;
3dab1dad
YO
2102 if ( !first || optype == NOTHING ) {
2103 if (!first) first = cur;
a3621e74
YO
2104 optype = OP( noper );
2105 } else {
a3621e74 2106 last = cur;
a3621e74
YO
2107 }
2108 } else {
2109 if ( last ) {
3dab1dad 2110 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2111 }
3dab1dad 2112 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2113 && noper_next == tail )
2114 {
2115 count = 1;
2116 first = cur;
2117 optype = OP( noper );
2118 } else {
2119 count = 0;
2120 first = NULL;
2121 optype = 0;
2122 }
2123 last = NULL;
2124 }
2125 }
2126 DEBUG_OPTIMISE_r({
32fc9b6a 2127 regprop(RExC_rx, mysv, cur);
a3621e74 2128 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2129 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2130 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2131
2132 });
2133 if ( last ) {
3dab1dad
YO
2134 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2135#ifdef TRIE_STUDY_OPT
2136 if ( OP(first)!=TRIE && startbranch == first ) {
2137
a3621e74 2138 }
3dab1dad 2139#endif
a3621e74
YO
2140 }
2141 }
3dab1dad
YO
2142
2143 } /* do trie */
a0ed51b3 2144 }
a3621e74 2145 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2146 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2147 } else /* single branch is optimized. */
c277df42
IZ
2148 scan = NEXTOPER(scan);
2149 continue;
a0ed51b3
LW
2150 }
2151 else if (OP(scan) == EXACT) {
cd439c50 2152 I32 l = STR_LEN(scan);
c445ea15 2153 UV uc;
a0ed51b3 2154 if (UTF) {
a3b680e6 2155 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2156 l = utf8_length(s, s + l);
9041c2e3 2157 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2158 } else {
2159 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2160 }
2161 min += l;
c277df42 2162 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2163 /* The code below prefers earlier match for fixed
2164 offset, later match for variable offset. */
2165 if (data->last_end == -1) { /* Update the start info. */
2166 data->last_start_min = data->pos_min;
2167 data->last_start_max = is_inf
b81d288d 2168 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2169 }
cd439c50 2170 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2171 if (UTF)
2172 SvUTF8_on(data->last_found);
0eda9292 2173 {
9a957fbc 2174 SV * const sv = data->last_found;
a28509cc 2175 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2176 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2177 if (mg && mg->mg_len >= 0)
5e43f467
JH
2178 mg->mg_len += utf8_length((U8*)STRING(scan),
2179 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2180 }
c277df42
IZ
2181 data->last_end = data->pos_min + l;
2182 data->pos_min += l; /* As in the first entry. */
2183 data->flags &= ~SF_BEFORE_EOL;
2184 }
653099ff
GS
2185 if (flags & SCF_DO_STCLASS_AND) {
2186 /* Check whether it is compatible with what we know already! */
2187 int compat = 1;
2188
1aa99e6b 2189 if (uc >= 0x100 ||
516a5887 2190 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2191 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2192 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2193 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2194 )
653099ff
GS
2195 compat = 0;
2196 ANYOF_CLASS_ZERO(data->start_class);
2197 ANYOF_BITMAP_ZERO(data->start_class);
2198 if (compat)
1aa99e6b 2199 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2200 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2201 if (uc < 0x100)
2202 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2203 }
2204 else if (flags & SCF_DO_STCLASS_OR) {
2205 /* false positive possible if the class is case-folded */
1aa99e6b 2206 if (uc < 0x100)
9b877dbb
IH
2207 ANYOF_BITMAP_SET(data->start_class, uc);
2208 else
2209 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2210 data->start_class->flags &= ~ANYOF_EOS;
2211 cl_and(data->start_class, &and_with);
2212 }
2213 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2214 }
3dab1dad 2215 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2216 I32 l = STR_LEN(scan);
1aa99e6b 2217 UV uc = *((U8*)STRING(scan));
653099ff
GS
2218
2219 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2220 if (flags & SCF_DO_SUBSTR) {
2221 assert(data);
830247a4 2222 scan_commit(pRExC_state, data);
ecaa9b9c 2223 }
a0ed51b3 2224 if (UTF) {
6136c704 2225 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2226 l = utf8_length(s, s + l);
9041c2e3 2227 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2228 }
2229 min += l;
ecaa9b9c 2230 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2231 data->pos_min += l;
653099ff
GS
2232 if (flags & SCF_DO_STCLASS_AND) {
2233 /* Check whether it is compatible with what we know already! */
2234 int compat = 1;
2235
1aa99e6b 2236 if (uc >= 0x100 ||
516a5887 2237 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2238 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2239 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2240 compat = 0;
2241 ANYOF_CLASS_ZERO(data->start_class);
2242 ANYOF_BITMAP_ZERO(data->start_class);
2243 if (compat) {
1aa99e6b 2244 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2245 data->start_class->flags &= ~ANYOF_EOS;
2246 data->start_class->flags |= ANYOF_FOLD;
2247 if (OP(scan) == EXACTFL)
2248 data->start_class->flags |= ANYOF_LOCALE;
2249 }
2250 }
2251 else if (flags & SCF_DO_STCLASS_OR) {
2252 if (data->start_class->flags & ANYOF_FOLD) {
2253 /* false positive possible if the class is case-folded.
2254 Assume that the locale settings are the same... */
1aa99e6b
IH
2255 if (uc < 0x100)
2256 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2257 data->start_class->flags &= ~ANYOF_EOS;
2258 }
2259 cl_and(data->start_class, &and_with);
2260 }
2261 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2262 }
3dab1dad
YO
2263#ifdef TRIE_STUDY_OPT
2264 else if (OP(scan) == TRIE) {
2265 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2266 min += trie->minlen;
2267 flags &= ~SCF_DO_STCLASS; /* xxx */
2268 if (flags & SCF_DO_SUBSTR) {
2269 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2270 data->pos_min += trie->minlen;
2271 data->pos_delta+= (trie->maxlen-trie->minlen);
2272 }
2273 }
2274#endif
bfed75c6 2275 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2276 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2277 I32 f = flags, pos_before = 0;
d4c19fe8 2278 regnode * const oscan = scan;
653099ff
GS
2279 struct regnode_charclass_class this_class;
2280 struct regnode_charclass_class *oclass = NULL;
727f22e3 2281 I32 next_is_eval = 0;
653099ff 2282
3dab1dad 2283 switch (PL_regkind[OP(scan)]) {
653099ff 2284 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2285 scan = NEXTOPER(scan);
2286 goto finish;
2287 case PLUS:
653099ff 2288 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2289 next = NEXTOPER(scan);
653099ff 2290 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2291 mincount = 1;
2292 maxcount = REG_INFTY;
c277df42
IZ
2293 next = regnext(scan);
2294 scan = NEXTOPER(scan);
2295 goto do_curly;
2296 }
2297 }
2298 if (flags & SCF_DO_SUBSTR)
2299 data->pos_min++;
2300 min++;
2301 /* Fall through. */
2302 case STAR:
653099ff
GS
2303 if (flags & SCF_DO_STCLASS) {
2304 mincount = 0;
b81d288d 2305 maxcount = REG_INFTY;
653099ff
GS
2306 next = regnext(scan);
2307 scan = NEXTOPER(scan);
2308 goto do_curly;
2309 }
b81d288d 2310 is_inf = is_inf_internal = 1;
c277df42
IZ
2311 scan = regnext(scan);
2312 if (flags & SCF_DO_SUBSTR) {
830247a4 2313 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2314 data->longest = &(data->longest_float);
2315 }
2316 goto optimize_curly_tail;
2317 case CURLY:
b81d288d 2318 mincount = ARG1(scan);
c277df42
IZ
2319 maxcount = ARG2(scan);
2320 next = regnext(scan);
cb434fcc
IZ
2321 if (OP(scan) == CURLYX) {
2322 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2323 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2324 }
c277df42 2325 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2326 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2327 do_curly:
2328 if (flags & SCF_DO_SUBSTR) {
830247a4 2329 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2330 pos_before = data->pos_min;
2331 }
2332 if (data) {
2333 fl = data->flags;
2334 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2335 if (is_inf)
2336 data->flags |= SF_IS_INF;
2337 }
653099ff 2338 if (flags & SCF_DO_STCLASS) {
830247a4 2339 cl_init(pRExC_state, &this_class);
653099ff
GS
2340 oclass = data->start_class;
2341 data->start_class = &this_class;
2342 f |= SCF_DO_STCLASS_AND;
2343 f &= ~SCF_DO_STCLASS_OR;
2344 }
e1901655
IZ
2345 /* These are the cases when once a subexpression
2346 fails at a particular position, it cannot succeed
2347 even after backtracking at the enclosing scope.
b81d288d 2348
e1901655
IZ
2349 XXXX what if minimal match and we are at the
2350 initial run of {n,m}? */
2351 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2352 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2353
c277df42 2354 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2355 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2356 (mincount == 0
2357 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2358
2359 if (flags & SCF_DO_STCLASS)
2360 data->start_class = oclass;
2361 if (mincount == 0 || minnext == 0) {
2362 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2363 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2364 }
2365 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2366 /* Switch to OR mode: cache the old value of
653099ff
GS
2367 * data->start_class */
2368 StructCopy(data->start_class, &and_with,
2369 struct regnode_charclass_class);
2370 flags &= ~SCF_DO_STCLASS_AND;
2371 StructCopy(&this_class, data->start_class,
2372 struct regnode_charclass_class);
2373 flags |= SCF_DO_STCLASS_OR;
2374 data->start_class->flags |= ANYOF_EOS;
2375 }
2376 } else { /* Non-zero len */
2377 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2378 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2379 cl_and(data->start_class, &and_with);
2380 }
2381 else if (flags & SCF_DO_STCLASS_AND)
2382 cl_and(data->start_class, &this_class);
2383 flags &= ~SCF_DO_STCLASS;
2384 }
c277df42
IZ
2385 if (!scan) /* It was not CURLYX, but CURLY. */
2386 scan = next;
041457d9
DM
2387 if ( /* ? quantifier ok, except for (?{ ... }) */
2388 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2389 && (minnext == 0) && (deltanext == 0)
99799961 2390 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2391 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2392 && ckWARN(WARN_REGEXP))
b45f050a 2393 {
830247a4 2394 vWARN(RExC_parse,
b45f050a
JF
2395 "Quantifier unexpected on zero-length expression");
2396 }
2397
c277df42 2398 min += minnext * mincount;
b81d288d 2399 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2400 && (minnext + deltanext) > 0)
2401 || deltanext == I32_MAX);
aca2d497 2402 is_inf |= is_inf_internal;
c277df42
IZ
2403 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2404
2405 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2406 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2407 && data->flags & SF_IN_PAR
2408 && !(data->flags & SF_HAS_EVAL)
2409 && !deltanext && minnext == 1 ) {
2410 /* Try to optimize to CURLYN. */
2411 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2412 regnode * const nxt1 = nxt;
497b47a8
JH
2413#ifdef DEBUGGING
2414 regnode *nxt2;
2415#endif
c277df42
IZ
2416
2417 /* Skip open. */
2418 nxt = regnext(nxt);
bfed75c6 2419 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2420 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2421 && STR_LEN(nxt) == 1))
c277df42 2422 goto nogo;
497b47a8 2423#ifdef DEBUGGING
c277df42 2424 nxt2 = nxt;
497b47a8 2425#endif
c277df42 2426 nxt = regnext(nxt);
b81d288d 2427 if (OP(nxt) != CLOSE)
c277df42
IZ
2428 goto nogo;
2429 /* Now we know that nxt2 is the only contents: */
eb160463 2430 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2431 OP(oscan) = CURLYN;
2432 OP(nxt1) = NOTHING; /* was OPEN. */
2433#ifdef DEBUGGING
2434 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2435 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2436 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2437 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2438 OP(nxt + 1) = OPTIMIZED; /* was count. */
2439 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2440#endif
c277df42 2441 }
c277df42
IZ
2442 nogo:
2443
2444 /* Try optimization CURLYX => CURLYM. */
b81d288d 2445 if ( OP(oscan) == CURLYX && data
c277df42 2446 && !(data->flags & SF_HAS_PAR)
c277df42 2447 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2448 && !deltanext /* atom is fixed width */
2449 && minnext != 0 /* CURLYM can't handle zero width */
2450 ) {
c277df42
IZ
2451 /* XXXX How to optimize if data == 0? */
2452 /* Optimize to a simpler form. */
2453 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2454 regnode *nxt2;
2455
2456 OP(oscan) = CURLYM;
2457 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2458 && (OP(nxt2) != WHILEM))
c277df42
IZ
2459 nxt = nxt2;
2460 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2461 /* Need to optimize away parenths. */
2462 if (data->flags & SF_IN_PAR) {
2463 /* Set the parenth number. */
2464 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2465
b81d288d 2466 if (OP(nxt) != CLOSE)
b45f050a 2467 FAIL("Panic opt close");
eb160463 2468 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2469 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2470 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2471#ifdef DEBUGGING
2472 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2473 OP(nxt + 1) = OPTIMIZED; /* was count. */
2474 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2475 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2476#endif
c277df42
IZ
2477#if 0
2478 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2479 regnode *nnxt = regnext(nxt1);
b81d288d 2480
c277df42
IZ
2481 if (nnxt == nxt) {
2482 if (reg_off_by_arg[OP(nxt1)])
2483 ARG_SET(nxt1, nxt2 - nxt1);
2484 else if (nxt2 - nxt1 < U16_MAX)
2485 NEXT_OFF(nxt1) = nxt2 - nxt1;
2486 else
2487 OP(nxt) = NOTHING; /* Cannot beautify */
2488 }
2489 nxt1 = nnxt;
2490 }
2491#endif
2492 /* Optimize again: */
b81d288d 2493 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2494 NULL, 0,depth+1);
a0ed51b3
LW
2495 }
2496 else
c277df42 2497 oscan->flags = 0;
c277df42 2498 }
e1901655
IZ
2499 else if ((OP(oscan) == CURLYX)
2500 && (flags & SCF_WHILEM_VISITED_POS)
2501 /* See the comment on a similar expression above.
2502 However, this time it not a subexpression
2503 we care about, but the expression itself. */
2504 && (maxcount == REG_INFTY)
2505 && data && ++data->whilem_c < 16) {
2506 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2507 /* Find WHILEM (as in regexec.c) */
2508 regnode *nxt = oscan + NEXT_OFF(oscan);
2509
2510 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2511 nxt += ARG(nxt);
eb160463
GS
2512 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2513 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2514 }
b81d288d 2515 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2516 pars++;
2517 if (flags & SCF_DO_SUBSTR) {
c445ea15 2518 SV *last_str = NULL;
c277df42
IZ
2519 int counted = mincount != 0;
2520
2521 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2522#if defined(SPARC64_GCC_WORKAROUND)
2523 I32 b = 0;
2524 STRLEN l = 0;
cfd0369c 2525 const char *s = NULL;
5d1c421c
JH
2526 I32 old = 0;
2527
2528 if (pos_before >= data->last_start_min)
2529 b = pos_before;
2530 else
2531 b = data->last_start_min;
2532
2533 l = 0;
cfd0369c 2534 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2535 old = b - data->last_start_min;
2536
2537#else
b81d288d 2538 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2539 ? pos_before : data->last_start_min;
2540 STRLEN l;
d4c19fe8 2541 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2542 I32 old = b - data->last_start_min;
5d1c421c 2543#endif
a0ed51b3
LW
2544
2545 if (UTF)
2546 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2547
a0ed51b3 2548 l -= old;
c277df42 2549 /* Get the added string: */
79cb57f6 2550 last_str = newSVpvn(s + old, l);
0e933229
IH
2551 if (UTF)
2552 SvUTF8_on(last_str);
c277df42
IZ
2553 if (deltanext == 0 && pos_before == b) {
2554 /* What was added is a constant string */
2555 if (mincount > 1) {
2556 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2557 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2558 SvPVX_const(last_str), l, mincount - 1);
b162af07 2559 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2560 /* Add additional parts. */
b81d288d 2561 SvCUR_set(data->last_found,
c277df42
IZ
2562 SvCUR(data->last_found) - l);
2563 sv_catsv(data->last_found, last_str);
0eda9292
JH
2564 {
2565 SV * sv = data->last_found;
2566 MAGIC *mg =
2567 SvUTF8(sv) && SvMAGICAL(sv) ?
2568 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2569 if (mg && mg->mg_len >= 0)
2570 mg->mg_len += CHR_SVLEN(last_str);
2571 }
c277df42
IZ
2572 data->last_end += l * (mincount - 1);
2573 }
2a8d9689
HS
2574 } else {
2575 /* start offset must point into the last copy */
2576 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2577 data->last_start_max += is_inf ? I32_MAX
2578 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2579 }
2580 }
2581 /* It is counted once already... */
2582 data->pos_min += minnext * (mincount - counted);
2583 data->pos_delta += - counted * deltanext +
2584 (minnext + deltanext) * maxcount - minnext * mincount;
2585 if (mincount != maxcount) {
653099ff
GS
2586 /* Cannot extend fixed substrings found inside
2587 the group. */
830247a4 2588 scan_commit(pRExC_state,data);
c277df42 2589 if (mincount && last_str) {
d4c19fe8
AL
2590 SV * const sv = data->last_found;
2591 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2592 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2593
2594 if (mg)
2595 mg->mg_len = -1;
2596 sv_setsv(sv, last_str);
c277df42 2597 data->last_end = data->pos_min;
b81d288d 2598 data->last_start_min =
a0ed51b3 2599 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2600 data->last_start_max = is_inf
2601 ? I32_MAX
c277df42 2602 : data->pos_min + data->pos_delta
a0ed51b3 2603 - CHR_SVLEN(last_str);
c277df42
IZ
2604 }
2605 data->longest = &(data->longest_float);
2606 }
aca2d497 2607 SvREFCNT_dec(last_str);
c277df42 2608 }
405ff068 2609 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2610 data->flags |= SF_HAS_EVAL;
2611 optimize_curly_tail:
c277df42 2612 if (OP(oscan) != CURLYX) {
3dab1dad 2613 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2614 && NEXT_OFF(next))
2615 NEXT_OFF(oscan) += NEXT_OFF(next);
2616 }
c277df42 2617 continue;
653099ff 2618 default: /* REF and CLUMP only? */
c277df42 2619 if (flags & SCF_DO_SUBSTR) {
830247a4 2620 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2621 data->longest = &(data->longest_float);
2622 }
aca2d497 2623 is_inf = is_inf_internal = 1;
653099ff 2624 if (flags & SCF_DO_STCLASS_OR)
830247a4 2625 cl_anything(pRExC_state, data->start_class);
653099ff 2626 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2627 break;
2628 }
a0ed51b3 2629 }
bfed75c6 2630 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2631 int value = 0;
653099ff 2632
c277df42 2633 if (flags & SCF_DO_SUBSTR) {
830247a4 2634 scan_commit(pRExC_state,data);
c277df42
IZ
2635 data->pos_min++;
2636 }
2637 min++;
653099ff
GS
2638 if (flags & SCF_DO_STCLASS) {
2639 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2640
2641 /* Some of the logic below assumes that switching
2642 locale on will only add false positives. */
3dab1dad 2643 switch (PL_regkind[OP(scan)]) {
653099ff 2644 case SANY:
653099ff
GS
2645 default:
2646 do_default:
2647 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2648 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2649 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2650 break;
2651 case REG_ANY:
2652 if (OP(scan) == SANY)
2653 goto do_default;
2654 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2655 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2656 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2657 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2658 }
2659 if (flags & SCF_DO_STCLASS_AND || !value)
2660 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2661 break;
2662 case ANYOF:
2663 if (flags & SCF_DO_STCLASS_AND)
2664 cl_and(data->start_class,
2665 (struct regnode_charclass_class*)scan);
2666 else
830247a4 2667 cl_or(pRExC_state, data->start_class,
653099ff
GS
2668 (struct regnode_charclass_class*)scan);
2669 break;
2670 case ALNUM:
2671 if (flags & SCF_DO_STCLASS_AND) {
2672 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2673 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2674 for (value = 0; value < 256; value++)
2675 if (!isALNUM(value))
2676 ANYOF_BITMAP_CLEAR(data->start_class, value);
2677 }
2678 }
2679 else {
2680 if (data->start_class->flags & ANYOF_LOCALE)
2681 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2682 else {
2683 for (value = 0; value < 256; value++)
2684 if (isALNUM(value))
b81d288d 2685 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2686 }
2687 }
2688 break;
2689 case ALNUML:
2690 if (flags & SCF_DO_STCLASS_AND) {
2691 if (data->start_class->flags & ANYOF_LOCALE)
2692 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2693 }
2694 else {
2695 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2696 data->start_class->flags |= ANYOF_LOCALE;
2697 }
2698 break;
2699 case NALNUM:
2700 if (flags & SCF_DO_STCLASS_AND) {
2701 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2702 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2703 for (value = 0; value < 256; value++)
2704 if (isALNUM(value))
2705 ANYOF_BITMAP_CLEAR(data->start_class, value);
2706 }
2707 }
2708 else {
2709 if (data->start_class->flags & ANYOF_LOCALE)
2710 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2711 else {
2712 for (value = 0; value < 256; value++)
2713 if (!isALNUM(value))
b81d288d 2714 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2715 }
2716 }
2717 break;
2718 case NALNUML:
2719 if (flags & SCF_DO_STCLASS_AND) {
2720 if (data->start_class->flags & ANYOF_LOCALE)
2721 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2722 }
2723 else {
2724 data->start_class->flags |= ANYOF_LOCALE;
2725 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2726 }
2727 break;
2728 case SPACE:
2729 if (flags & SCF_DO_STCLASS_AND) {
2730 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2731 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2732 for (value = 0; value < 256; value++)
2733 if (!isSPACE(value))
2734 ANYOF_BITMAP_CLEAR(data->start_class, value);
2735 }
2736 }
2737 else {
2738 if (data->start_class->flags & ANYOF_LOCALE)
2739 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2740 else {
2741 for (value = 0; value < 256; value++)
2742 if (isSPACE(value))
b81d288d 2743 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2744 }
2745 }
2746 break;
2747 case SPACEL:
2748 if (flags & SCF_DO_STCLASS_AND) {
2749 if (data->start_class->flags & ANYOF_LOCALE)
2750 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2751 }
2752 else {
2753 data->start_class->flags |= ANYOF_LOCALE;
2754 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2755 }
2756 break;
2757 case NSPACE:
2758 if (flags & SCF_DO_STCLASS_AND) {
2759 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2760 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2761 for (value = 0; value < 256; value++)
2762 if (isSPACE(value))
2763 ANYOF_BITMAP_CLEAR(data->start_class, value);
2764 }
2765 }
2766 else {
2767 if (data->start_class->flags & ANYOF_LOCALE)
2768 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2769 else {
2770 for (value = 0; value < 256; value++)
2771 if (!isSPACE(value))
b81d288d 2772 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2773 }
2774 }
2775 break;
2776 case NSPACEL:
2777 if (flags & SCF_DO_STCLASS_AND) {
2778 if (data->start_class->flags & ANYOF_LOCALE) {
2779 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2780 for (value = 0; value < 256; value++)
2781 if (!isSPACE(value))
2782 ANYOF_BITMAP_CLEAR(data->start_class, value);
2783 }
2784 }
2785 else {
2786 data->start_class->flags |= ANYOF_LOCALE;
2787 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2788 }
2789 break;
2790 case DIGIT:
2791 if (flags & SCF_DO_STCLASS_AND) {
2792 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2793 for (value = 0; value < 256; value++)
2794 if (!isDIGIT(value))
2795 ANYOF_BITMAP_CLEAR(data->start_class, value);
2796 }
2797 else {
2798 if (data->start_class->flags & ANYOF_LOCALE)
2799 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2800 else {
2801 for (value = 0; value < 256; value++)
2802 if (isDIGIT(value))
b81d288d 2803 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2804 }
2805 }
2806 break;
2807 case NDIGIT:
2808 if (flags & SCF_DO_STCLASS_AND) {
2809 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2810 for (value = 0; value < 256; value++)
2811 if (isDIGIT(value))
2812 ANYOF_BITMAP_CLEAR(data->start_class, value);
2813 }
2814 else {
2815 if (data->start_class->flags & ANYOF_LOCALE)
2816 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2817 else {
2818 for (value = 0; value < 256; value++)
2819 if (!isDIGIT(value))
b81d288d 2820 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2821 }
2822 }
2823 break;
2824 }
2825 if (flags & SCF_DO_STCLASS_OR)
2826 cl_and(data->start_class, &and_with);
2827 flags &= ~SCF_DO_STCLASS;
2828 }
a0ed51b3 2829 }
3dab1dad 2830 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2831 data->flags |= (OP(scan) == MEOL
2832 ? SF_BEFORE_MEOL
2833 : SF_BEFORE_SEOL);
a0ed51b3 2834 }
3dab1dad 2835 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
2836 /* Lookbehind, or need to calculate parens/evals/stclass: */
2837 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2838 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2839 /* Lookahead/lookbehind */
cb434fcc 2840 I32 deltanext, minnext, fake = 0;
c277df42 2841 regnode *nscan;
653099ff
GS
2842 struct regnode_charclass_class intrnl;
2843 int f = 0;
c277df42
IZ
2844
2845 data_fake.flags = 0;
b81d288d 2846 if (data) {
2c2d71f5 2847 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2848 data_fake.last_closep = data->last_closep;
2849 }
2850 else
2851 data_fake.last_closep = &fake;
653099ff
GS
2852 if ( flags & SCF_DO_STCLASS && !scan->flags
2853 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2854 cl_init(pRExC_state, &intrnl);
653099ff 2855 data_fake.start_class = &intrnl;
e1901655 2856 f |= SCF_DO_STCLASS_AND;
653099ff 2857 }
e1901655
IZ
2858 if (flags & SCF_WHILEM_VISITED_POS)
2859 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2860 next = regnext(scan);
2861 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2862 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2863 if (scan->flags) {
2864 if (deltanext) {
9baa0206 2865 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2866 }
2867 else if (minnext > U8_MAX) {
9baa0206 2868 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2869 }
eb160463 2870 scan->flags = (U8)minnext;
c277df42
IZ
2871 }
2872 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2873 pars++;
405ff068 2874 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2875 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2876 if (data)
2877 data->whilem_c = data_fake.whilem_c;
e1901655 2878 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2879 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2880
2881 cl_and(data->start_class, &intrnl);
2882 if (was)
2883 data->start_class->flags |= ANYOF_EOS;
2884 }
a0ed51b3
LW
2885 }
2886 else if (OP(scan) == OPEN) {
c277df42 2887 pars++;
a0ed51b3 2888 }
cb434fcc 2889 else if (OP(scan) == CLOSE) {
eb160463 2890 if ((I32)ARG(scan) == is_par) {
cb434fcc 2891 next = regnext(scan);
c277df42 2892
cb434fcc
IZ
2893 if ( next && (OP(next) != WHILEM) && next < last)
2894 is_par = 0; /* Disable optimization */
2895 }
2896 if (data)
2897 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2898 }
2899 else if (OP(scan) == EVAL) {
c277df42
IZ
2900 if (data)
2901 data->flags |= SF_HAS_EVAL;
2902 }
96776eda 2903 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2904 if (flags & SCF_DO_SUBSTR) {
830247a4 2905 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2906 data->longest = &(data->longest_float);
2907 }
2908 is_inf = is_inf_internal = 1;
653099ff 2909 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2910 cl_anything(pRExC_state, data->start_class);
96776eda 2911 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2912 }
c277df42
IZ
2913 /* Else: zero-length, ignore. */
2914 scan = regnext(scan);
2915 }
2916
2917 finish:
2918 *scanp = scan;
aca2d497 2919 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2920 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2921 data->pos_delta = I32_MAX - data->pos_min;
2922 if (is_par > U8_MAX)
2923 is_par = 0;
2924 if (is_par && pars==1 && data) {
2925 data->flags |= SF_IN_PAR;
2926 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2927 }
2928 else if (pars && data) {
c277df42
IZ
2929 data->flags |= SF_HAS_PAR;
2930 data->flags &= ~SF_IN_PAR;
2931 }
653099ff
GS
2932 if (flags & SCF_DO_STCLASS_OR)
2933 cl_and(data->start_class, &and_with);
c277df42
IZ
2934 return min;
2935}
2936
76e3520e 2937STATIC I32
5f66b61c 2938S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2939{
830247a4 2940 if (RExC_rx->data) {
b81d288d
AB
2941 Renewc(RExC_rx->data,
2942 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2943 char, struct reg_data);
830247a4
IZ
2944 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2945 RExC_rx->data->count += n;
a0ed51b3
LW
2946 }
2947 else {
a02a5408 2948 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2949 char, struct reg_data);
a02a5408 2950 Newx(RExC_rx->data->what, n, U8);
830247a4 2951 RExC_rx->data->count = n;
c277df42 2952 }
830247a4
IZ
2953 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2954 return RExC_rx->data->count - n;
c277df42
IZ
2955}
2956
76234dfb 2957#ifndef PERL_IN_XSUB_RE
d88dccdf 2958void
864dbfa3 2959Perl_reginitcolors(pTHX)
d88dccdf 2960{
97aff369 2961 dVAR;
1df70142 2962 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2963 if (s) {
1df70142
AL
2964 char *t = savepv(s);
2965 int i = 0;
2966 PL_colors[0] = t;
d88dccdf 2967 while (++i < 6) {
1df70142
AL
2968 t = strchr(t, '\t');
2969 if (t) {
2970 *t = '\0';
2971 PL_colors[i] = ++t;
d88dccdf
IZ
2972 }
2973 else
1df70142 2974 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2975 }
2976 } else {
1df70142 2977 int i = 0;
b81d288d 2978 while (i < 6)
06b5626a 2979 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2980 }
2981 PL_colorset = 1;
2982}
76234dfb 2983#endif
8615cb43 2984
a687059c 2985/*
e50aee73 2986 - pregcomp - compile a regular expression into internal code
a687059c
LW
2987 *
2988 * We can't allocate space until we know how big the compiled form will be,
2989 * but we can't compile it (and thus know how big it is) until we've got a
2990 * place to put the code. So we cheat: we compile it twice, once with code
2991 * generation turned off and size counting turned on, and once "for real".
2992 * This also means that we don't allocate space until we are sure that the
2993 * thing really will compile successfully, and we never have to move the
2994 * code and thus invalidate pointers into it. (Note that it has to be in
2995 * one piece because free() must be able to free it all.) [NB: not true in perl]
2996 *
2997 * Beware that the optimization-preparation code in here knows about some
2998 * of the structure of the compiled regexp. [I'll say.]
2999 */
3000regexp *
864dbfa3 3001Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3002{
97aff369 3003 dVAR;
a0d0e21e 3004 register regexp *r;
c277df42 3005 regnode *scan;
c277df42 3006 regnode *first;
a0d0e21e 3007 I32 flags;
a0d0e21e
LW
3008 I32 minlen = 0;
3009 I32 sawplus = 0;
3010 I32 sawopen = 0;
2c2d71f5 3011 scan_data_t data;
830247a4
IZ
3012 RExC_state_t RExC_state;
3013 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 3014
a3621e74
YO
3015 GET_RE_DEBUG_FLAGS_DECL;
3016
a0d0e21e 3017 if (exp == NULL)
c277df42 3018 FAIL("NULL regexp argument");
a0d0e21e 3019
a5961de5 3020 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3021
5cfc7842 3022 RExC_precomp = exp;
a3621e74
YO
3023 DEBUG_r(if (!PL_colorset) reginitcolors());
3024 DEBUG_COMPILE_r({
3025 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3026 PL_colors[4],PL_colors[5],PL_colors[0],
3027 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3028 });
e2509266 3029 RExC_flags = pm->op_pmflags;
830247a4 3030 RExC_sawback = 0;
bbce6d69 3031
830247a4
IZ
3032 RExC_seen = 0;
3033 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3034 RExC_seen_evals = 0;
3035 RExC_extralen = 0;
c277df42 3036
bbce6d69 3037 /* First pass: determine size, legality. */
830247a4 3038 RExC_parse = exp;
fac92740 3039 RExC_start = exp;
830247a4
IZ
3040 RExC_end = xend;
3041 RExC_naughty = 0;
3042 RExC_npar = 1;
3043 RExC_size = 0L;
3044 RExC_emit = &PL_regdummy;
3045 RExC_whilem_seen = 0;
85ddcde9
JH
3046#if 0 /* REGC() is (currently) a NOP at the first pass.
3047 * Clever compilers notice this and complain. --jhi */
830247a4 3048 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3049#endif
3dab1dad
YO
3050 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3051 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3052 RExC_precomp = NULL;
a0d0e21e
LW
3053 return(NULL);
3054 }
3dab1dad
YO
3055 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3056 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3057 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
c277df42 3058
c277df42
IZ
3059 /* Small enough for pointer-storage convention?
3060 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3061 if (RExC_size >= 0x10000L && RExC_extralen)
3062 RExC_size += RExC_extralen;
c277df42 3063 else
830247a4
IZ
3064 RExC_extralen = 0;
3065 if (RExC_whilem_seen > 15)
3066 RExC_whilem_seen = 15;
a0d0e21e 3067
bbce6d69 3068 /* Allocate space and initialize. */
a02a5408 3069 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3070 char, regexp);
a0d0e21e 3071 if (r == NULL)
b45f050a
JF
3072 FAIL("Regexp out of space");
3073
0f79a09d
GS
3074#ifdef DEBUGGING
3075 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3076 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3077#endif
c277df42 3078 r->refcnt = 1;
bbce6d69 3079 r->prelen = xend - exp;
5cfc7842 3080 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3081 r->subbeg = NULL;
f8c7b90f 3082#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3083 r->saved_copy = NULL;
ed252734 3084#endif
cf93c79d 3085 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3086 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3087 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3088
3089 r->substrs = 0; /* Useful during FAIL. */
3090 r->startp = 0; /* Useful during FAIL. */
3091 r->endp = 0; /* Useful during FAIL. */
3092
a02a5408 3093 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3094 if (r->offsets) {
2af232bd 3095 r->offsets[0] = RExC_size;
fac92740 3096 }
a3621e74 3097 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3098 "%s %"UVuf" bytes for offset annotations.\n",
3099 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3100 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3101
830247a4 3102 RExC_rx = r;
bbce6d69 3103
3104 /* Second pass: emit code. */
e2509266 3105 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3106 RExC_parse = exp;
3107 RExC_end = xend;
3108 RExC_naughty = 0;
3109 RExC_npar = 1;
fac92740 3110 RExC_emit_start = r->program;
830247a4 3111 RExC_emit = r->program;
2cd61cdb 3112 /* Store the count of eval-groups for security checks: */
eb160463 3113 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3114 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3115 r->data = 0;
3dab1dad 3116 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e
LW
3117 return(NULL);
3118
a3621e74 3119
a0d0e21e 3120 /* Dig out information for optimizations. */
cf93c79d 3121 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3122 pm->op_pmflags = RExC_flags;
a0ed51b3 3123 if (UTF)
5ff6fc6d 3124 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3125 r->regstclass = NULL;
830247a4 3126 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3127 r->reganch |= ROPT_NAUGHTY;
c277df42 3128 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
3129
3130 /* XXXX To minimize changes to RE engine we always allocate
3131 3-units-long substrs field. */
a02a5408 3132 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 3133
2c2d71f5 3134 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 3135 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3136 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3137 I32 fake;
c5254dd6 3138 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
3139 struct regnode_charclass_class ch_class;
3140 int stclass_flag;
cb434fcc 3141 I32 last_close = 0;
a0d0e21e
LW
3142
3143 first = scan;
c277df42 3144 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3145 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3146 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
3147 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3148 (OP(first) == PLUS) ||
3149 (OP(first) == MINMOD) ||
653099ff 3150 /* An {n,m} with n>0 */
3dab1dad 3151 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
3152 if (OP(first) == PLUS)
3153 sawplus = 1;
3154 else
3dab1dad 3155 first += regarglen[OP(first)];
a0d0e21e 3156 first = NEXTOPER(first);
a687059c
LW
3157 }
3158
a0d0e21e
LW
3159 /* Starting-point info. */
3160 again:
3dab1dad 3161 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3162 if (OP(first) == EXACT)
6f207bd3 3163 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3164 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3165 r->regstclass = first;
3166 }
bfed75c6 3167 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3168 r->regstclass = first;
3dab1dad
YO
3169 else if (PL_regkind[OP(first)] == BOUND ||
3170 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3171 r->regstclass = first;
3dab1dad 3172 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3173 r->reganch |= (OP(first) == MBOL
3174 ? ROPT_ANCH_MBOL
3175 : (OP(first) == SBOL
3176 ? ROPT_ANCH_SBOL
3177 : ROPT_ANCH_BOL));
a0d0e21e 3178 first = NEXTOPER(first);
774d564b 3179 goto again;
3180 }
3181 else if (OP(first) == GPOS) {
3182 r->reganch |= ROPT_ANCH_GPOS;
3183 first = NEXTOPER(first);
3184 goto again;
a0d0e21e 3185 }
e09294f4 3186 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3187 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3188 !(r->reganch & ROPT_ANCH) )
3189 {
3190 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3191 const int type =
3192 (OP(NEXTOPER(first)) == REG_ANY)
3193 ? ROPT_ANCH_MBOL
3194 : ROPT_ANCH_SBOL;
cad2e5aa 3195 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3196 first = NEXTOPER(first);
774d564b 3197 goto again;
a0d0e21e 3198 }
b81d288d 3199 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3200 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3201 /* x+ must match at the 1st pos of run of x's */
3202 r->reganch |= ROPT_SKIP;
a0d0e21e 3203
c277df42 3204 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3205 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3206 (IV)(first - scan + 1)));
a0d0e21e
LW
3207 /*
3208 * If there's something expensive in the r.e., find the
3209 * longest literal string that must appear and make it the
3210 * regmust. Resolve ties in favor of later strings, since
3211 * the regstart check works with the beginning of the r.e.
3212 * and avoiding duplication strengthens checking. Not a
3213 * strong reason, but sufficient in the absence of others.
3214 * [Now we resolve ties in favor of the earlier string if
c277df42 3215 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3216 * earlier string may buy us something the later one won't.]
3217 */
a0d0e21e 3218 minlen = 0;
a687059c 3219
396482e1
GA
3220 data.longest_fixed = newSVpvs("");
3221 data.longest_float = newSVpvs("");
3222 data.last_found = newSVpvs("");
c277df42
IZ
3223 data.longest = &(data.longest_fixed);
3224 first = scan;
653099ff 3225 if (!r->regstclass) {
830247a4 3226 cl_init(pRExC_state, &ch_class);
653099ff
GS
3227 data.start_class = &ch_class;
3228 stclass_flag = SCF_DO_STCLASS_AND;
3229 } else /* XXXX Check for BOUND? */
3230 stclass_flag = 0;
cb434fcc 3231 data.last_closep = &last_close;
653099ff 3232
830247a4 3233 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3234 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3235 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3236 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3237 && !RExC_seen_zerolen
3238 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3239 r->reganch |= ROPT_CHECK_ALL;
830247a4 3240 scan_commit(pRExC_state, &data);
c277df42
IZ
3241 SvREFCNT_dec(data.last_found);
3242
a0ed51b3 3243 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3244 if (longest_float_length
c277df42
IZ
3245 || (data.flags & SF_FL_BEFORE_EOL
3246 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3247 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3248 int t;
3249
a0ed51b3 3250 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3251 && data.offset_fixed == data.offset_float_min
3252 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3253 goto remove_float; /* As in (a)+. */
3254
33b8afdf
JH
3255 if (SvUTF8(data.longest_float)) {
3256 r->float_utf8 = data.longest_float;
c445ea15 3257 r->float_substr = NULL;
33b8afdf
JH
3258 } else {
3259 r->float_substr = data.longest_float;
c445ea15 3260 r->float_utf8 = NULL;
33b8afdf 3261 }
c277df42
IZ
3262 r->float_min_offset = data.offset_float_min;
3263 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3264 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3265 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3266 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3267 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3268 }
3269 else {
aca2d497 3270 remove_float:
c445ea15 3271 r->float_substr = r->float_utf8 = NULL;
c277df42 3272 SvREFCNT_dec(data.longest_float);
c5254dd6 3273 longest_float_length = 0;
a0d0e21e 3274 }
c277df42 3275
a0ed51b3 3276 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3277 if (longest_fixed_length
c277df42
IZ
3278 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3279 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3280 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3281 int t;
3282
33b8afdf
JH
3283 if (SvUTF8(data.longest_fixed)) {
3284 r->anchored_utf8 = data.longest_fixed;
c445ea15 3285 r->anchored_substr = NULL;
33b8afdf
JH
3286 } else {
3287 r->anchored_substr = data.longest_fixed;
c445ea15 3288 r->anchored_utf8 = NULL;
33b8afdf 3289 }
c277df42 3290 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3291 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3292 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3293 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3294 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3295 }
3296 else {
c445ea15 3297 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3298 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3299 longest_fixed_length = 0;
a0d0e21e 3300 }
b81d288d 3301 if (r->regstclass
ffc61ed2 3302 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3303 r->regstclass = NULL;
33b8afdf
JH
3304 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3305 && stclass_flag
653099ff 3306 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3307 && !cl_is_anything(data.start_class))
3308 {
1df70142 3309 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3310
a02a5408 3311 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3312 struct regnode_charclass_class);
3313 StructCopy(data.start_class,
830247a4 3314 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3315 struct regnode_charclass_class);
830247a4 3316 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3317 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3318 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3319 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3320 PerlIO_printf(Perl_debug_log,
a0288114 3321 "synthetic stclass \"%s\".\n",
3f7c398e 3322 SvPVX_const(sv));});
653099ff 3323 }
c277df42
IZ
3324
3325 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3326 if (longest_fixed_length > longest_float_length) {
c277df42 3327 r->check_substr = r->anchored_substr;
33b8afdf 3328 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3329 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3330 if (r->reganch & ROPT_ANCH_SINGLE)
3331 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3332 }
3333 else {
c277df42 3334 r->check_substr = r->float_substr;
33b8afdf 3335 r->check_utf8 = r->float_utf8;
c277df42
IZ
3336 r->check_offset_min = data.offset_float_min;
3337 r->check_offset_max = data.offset_float_max;
a0d0e21e 3338 }
30382c73
IZ
3339 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3340 This should be changed ASAP! */
33b8afdf 3341 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3342 r->reganch |= RE_USE_INTUIT;
33b8afdf 3343 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3344 r->reganch |= RE_INTUIT_TAIL;
3345 }
a0ed51b3
LW
3346 }
3347 else {
c277df42
IZ
3348 /* Several toplevels. Best we can is to set minlen. */
3349 I32 fake;
653099ff 3350 struct regnode_charclass_class ch_class;
cb434fcc 3351 I32 last_close = 0;
c277df42 3352
a3621e74 3353 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3354 scan = r->program + 1;
830247a4 3355 cl_init(pRExC_state, &ch_class);
653099ff 3356 data.start_class = &ch_class;
cb434fcc 3357 data.last_closep = &last_close;
a3621e74 3358 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3359 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3360 = r->float_substr = r->float_utf8 = NULL;
653099ff 3361 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3362 && !cl_is_anything(data.start_class))
3363 {
1df70142 3364 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3365
a02a5408 3366 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3367 struct regnode_charclass_class);
3368 StructCopy(data.start_class,
830247a4 3369 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3370 struct regnode_charclass_class);
830247a4 3371 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3372 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3373 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3374 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3375 PerlIO_printf(Perl_debug_log,
a0288114 3376 "synthetic stclass \"%s\".\n",
3f7c398e 3377 SvPVX_const(sv));});
653099ff 3378 }
a0d0e21e
LW
3379 }
3380
a0d0e21e 3381 r->minlen = minlen;
b81d288d 3382 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3383 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3384 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3385 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3386 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3387 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3388 if (RExC_seen & REG_SEEN_CANY)
3389 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3390 Newxz(r->startp, RExC_npar, I32);
3391 Newxz(r->endp, RExC_npar, I32);
3dab1dad
YO
3392 DEBUG_COMPILE_r({
3393 if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
3394 PerlIO_printf(Perl_debug_log,"Final program:\n");
3395 regdump(r);
3396 });
a0d0e21e 3397 return(r);
a687059c
LW
3398}
3399
3dab1dad
YO
3400
3401#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3402 int rem=(int)(RExC_end - RExC_parse); \
3403 int cut; \
3404 int num; \
3405 int iscut=0; \
3406 if (rem>10) { \
3407 rem=10; \
3408 iscut=1; \
3409 } \
3410 cut=10-rem; \
3411 if (RExC_lastparse!=RExC_parse) \
3412 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3413 rem, RExC_parse, \
3414 cut + 4, \
3415 iscut ? "..." : "<" \
3416 ); \
3417 else \
3418 PerlIO_printf(Perl_debug_log,"%16s",""); \
3419 \
3420 if (SIZE_ONLY) \
3421 num=RExC_size; \
3422 else \
3423 num=REG_NODE_NUM(RExC_emit); \
3424 if (RExC_lastnum!=num) \
3425 PerlIO_printf(Perl_debug_log,"%4d",num); \
3426 else \
3427 PerlIO_printf(Perl_debug_log,"%4s",""); \
3428 PerlIO_printf(Perl_debug_log,"%*s%-4s", \
8e11feef 3429 (int)(10+(depth*2)), "", \
3dab1dad
YO
3430 (funcname) \
3431 ); \
3432 RExC_lastnum=num; \
3433 RExC_lastparse=RExC_parse; \
3434})
3435
3436#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3437 DEBUG_PARSE_MSG((funcname)); \
3438 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3439})
a687059c
LW
3440/*
3441 - reg - regular expression, i.e. main body or parenthesized thing
3442 *
3443 * Caller must absorb opening parenthesis.
3444 *
3445 * Combining parenthesis handling with the base level of regular expression
3446 * is a trifle forced, but the need to tie the tails of the branches to what
3447 * follows makes it hard to avoid.
3448 */
3dab1dad
YO
3449#define REGTAIL(x,y,z) regtail(x,y,z,depth+1)
3450
76e3520e 3451STATIC regnode *
3dab1dad 3452S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3453 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3454{
27da23d5 3455 dVAR;
c277df42
IZ
3456 register regnode *ret; /* Will be the head of the group. */
3457 register regnode *br;
3458 register regnode *lastbr;
cbbf8932 3459 register regnode *ender = NULL;
a0d0e21e 3460 register I32 parno = 0;
cbbf8932
AL
3461 I32 flags;
3462 const I32 oregflags = RExC_flags;
6136c704
AL
3463 bool have_branch = 0;
3464 bool is_open = 0;
9d1d55b5
JP
3465
3466 /* for (?g), (?gc), and (?o) warnings; warning
3467 about (?c) will warn about (?g) -- japhy */
3468
6136c704
AL
3469#define WASTED_O 0x01
3470#define WASTED_G 0x02
3471#define WASTED_C 0x04
3472#define WASTED_GC (0x02|0x04)
cbbf8932 3473 I32 wastedflags = 0x00;
9d1d55b5 3474
fac92740 3475 char * parse_start = RExC_parse; /* MJD */
a28509cc 3476 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3477
3dab1dad
YO
3478 GET_RE_DEBUG_FLAGS_DECL;
3479 DEBUG_PARSE("reg ");
3480
3481
821b33a5 3482 *flagp = 0; /* Tentatively. */
a0d0e21e 3483
9d1d55b5 3484
a0d0e21e
LW
3485 /* Make an OPEN node, if parenthesized. */
3486 if (paren) {
fac92740 3487 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3488 U32 posflags = 0, negflags = 0;
3489 U32 *flagsp = &posflags;
6136c704 3490 bool is_logical = 0;
a28509cc 3491 const char * const seqstart = RExC_parse;
ca9dfc88 3492
830247a4
IZ
3493 RExC_parse++;
3494 paren = *RExC_parse++;
c277df42 3495 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3496 switch (paren) {
fac92740 3497 case '<': /* (?<...) */
830247a4 3498 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3499 if (*RExC_parse == '!')
c277df42 3500 paren = ',';
b81d288d 3501 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3502 goto unknown;
830247a4 3503 RExC_parse++;
fac92740
MJD
3504 case '=': /* (?=...) */
3505 case '!': /* (?!...) */
830247a4 3506 RExC_seen_zerolen++;
fac92740
MJD
3507 case ':': /* (?:...) */
3508 case '>': /* (?>...) */
a0d0e21e 3509 break;
fac92740
MJD
3510 case '$': /* (?$...) */
3511 case '@': /* (?@...) */
8615cb43 3512 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3513 break;
fac92740 3514 case '#': /* (?#...) */
830247a4
IZ
3515 while (*RExC_parse && *RExC_parse != ')')
3516 RExC_parse++;
3517 if (*RExC_parse != ')')
c277df42 3518 FAIL("Sequence (?#... not terminated");
830247a4 3519 nextchar(pRExC_state);
a0d0e21e
LW
3520 *flagp = TRYAGAIN;
3521 return NULL;
fac92740 3522 case 'p': /* (?p...) */
9014280d 3523 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3524 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3525 /* FALL THROUGH*/
fac92740 3526 case '?': /* (??...) */
6136c704 3527 is_logical = 1;
438a3801
YST
3528 if (*RExC_parse != '{')
3529 goto unknown;
830247a4 3530 paren = *RExC_parse++;
0f5d15d6 3531 /* FALL THROUGH */
fac92740 3532 case '{': /* (?{...}) */
c277df42 3533 {
c277df42
IZ
3534 I32 count = 1, n = 0;
3535 char c;
830247a4 3536 char *s = RExC_parse;
c277df42 3537
830247a4
IZ
3538 RExC_seen_zerolen++;
3539 RExC_seen |= REG_SEEN_EVAL;
3540 while (count && (c = *RExC_parse)) {
6136c704
AL
3541 if (c == '\\') {
3542 if (RExC_parse[1])
3543 RExC_parse++;
3544 }
b81d288d 3545 else if (c == '{')
c277df42 3546 count++;
b81d288d 3547 else if (c == '}')
c277df42 3548 count--;
830247a4 3549 RExC_parse++;
c277df42 3550 }
6136c704 3551 if (*RExC_parse != ')') {
b81d288d 3552 RExC_parse = s;
b45f050a
JF
3553 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3554 }
c277df42 3555 if (!SIZE_ONLY) {
f3548bdc 3556 PAD *pad;
6136c704
AL
3557 OP_4tree *sop, *rop;
3558 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3559
569233ed
SB
3560 ENTER;
3561 Perl_save_re_context(aTHX);
f3548bdc 3562 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3563 sop->op_private |= OPpREFCOUNTED;
3564 /* re_dup will OpREFCNT_inc */
3565 OpREFCNT_set(sop, 1);
569233ed 3566 LEAVE;
c277df42 3567
830247a4
IZ
3568 n = add_data(pRExC_state, 3, "nop");
3569 RExC_rx->data->data[n] = (void*)rop;
3570 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3571 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3572 SvREFCNT_dec(sv);
a0ed51b3 3573 }
e24b16f9 3574 else { /* First pass */
830247a4 3575 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3576 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3577 /* No compiled RE interpolated, has runtime
3578 components ===> unsafe. */
3579 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3580 if (PL_tainting && PL_tainted)
cc6b7395 3581 FAIL("Eval-group in insecure regular expression");
54df2634 3582#if PERL_VERSION > 8
923e4eb5 3583 if (IN_PERL_COMPILETIME)
b5c19bd7 3584 PL_cv_has_eval = 1;
54df2634 3585#endif
c277df42 3586 }
b5c19bd7 3587
830247a4 3588 nextchar(pRExC_state);
6136c704 3589 if (is_logical) {
830247a4 3590 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3591 if (!SIZE_ONLY)
3592 ret->flags = 2;
3dab1dad 3593 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3594 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3595 return ret;
3596 }
ccb2c380
MP
3597 ret = reganode(pRExC_state, EVAL, n);
3598 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3599 Set_Node_Offset(ret, parse_start);
3600 return ret;
c277df42 3601 }
fac92740 3602 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3603 {
fac92740 3604 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3605 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3606 || RExC_parse[1] == '<'
830247a4 3607 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3608 I32 flag;
3609
830247a4 3610 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3611 if (!SIZE_ONLY)
3612 ret->flags = 1;
3dab1dad 3613 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3614 goto insert_if;
b81d288d 3615 }
a0ed51b3 3616 }
830247a4 3617 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3618 /* (?(1)...) */
6136c704 3619 char c;
830247a4 3620 parno = atoi(RExC_parse++);
c277df42 3621
830247a4
IZ
3622 while (isDIGIT(*RExC_parse))
3623 RExC_parse++;
fac92740 3624 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3625
830247a4 3626 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3627 vFAIL("Switch condition not recognized");
c277df42 3628 insert_if:
3dab1dad
YO
3629 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3630 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3631 if (br == NULL)
830247a4 3632 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3633 else
3dab1dad 3634 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3635 c = *nextchar(pRExC_state);
d1b80229
IZ
3636 if (flags&HASWIDTH)
3637 *flagp |= HASWIDTH;
c277df42 3638 if (c == '|') {
830247a4 3639 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3640 regbranch(pRExC_state, &flags, 1,depth+1);
3641 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3642 if (flags&HASWIDTH)
3643 *flagp |= HASWIDTH;
830247a4 3644 c = *nextchar(pRExC_state);
a0ed51b3
LW
3645 }
3646 else
c277df42
IZ
3647 lastbr = NULL;
3648 if (c != ')')
8615cb43 3649 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3650 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3651 REGTAIL(pRExC_state, br, ender);
c277df42 3652 if (lastbr) {
3dab1dad
YO
3653 REGTAIL(pRExC_state, lastbr, ender);
3654 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3655 }
3656 else
3dab1dad 3657 REGTAIL(pRExC_state, ret, ender);
c277df42 3658 return ret;
a0ed51b3
LW
3659 }
3660 else {
830247a4 3661 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3662 }
3663 }
1b1626e4 3664 case 0:
830247a4 3665 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3666 vFAIL("Sequence (? incomplete");
1b1626e4 3667 break;
a0d0e21e 3668 default:
830247a4 3669 --RExC_parse;
fac92740 3670 parse_flags: /* (?i) */
830247a4 3671 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3672 /* (?g), (?gc) and (?o) are useless here
3673 and must be globally applied -- japhy */
3674
3675 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3676 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3677 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3678 if (! (wastedflags & wflagbit) ) {
3679 wastedflags |= wflagbit;
3680 vWARN5(
3681 RExC_parse + 1,
3682 "Useless (%s%c) - %suse /%c modifier",
3683 flagsp == &negflags ? "?-" : "?",
3684 *RExC_parse,
3685 flagsp == &negflags ? "don't " : "",
3686 *RExC_parse
3687 );
3688 }
3689 }
3690 }
3691 else if (*RExC_parse == 'c') {
3692 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3693 if (! (wastedflags & WASTED_C) ) {
3694 wastedflags |= WASTED_GC;
9d1d55b5
JP
3695 vWARN3(
3696 RExC_parse + 1,
3697 "Useless (%sc) - %suse /gc modifier",
3698 flagsp == &negflags ? "?-" : "?",
3699 flagsp == &negflags ? "don't " : ""
3700 );
3701 }
3702 }
3703 }
3704 else { pmflag(flagsp, *RExC_parse); }
3705
830247a4 3706 ++RExC_parse;
ca9dfc88 3707 }
830247a4 3708 if (*RExC_parse == '-') {
ca9dfc88 3709 flagsp = &negflags;
9d1d55b5 3710 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3711 ++RExC_parse;
ca9dfc88 3712 goto parse_flags;
48c036b1 3713 }
e2509266
JH
3714 RExC_flags |= posflags;
3715 RExC_flags &= ~negflags;
830247a4
IZ
3716 if (*RExC_parse == ':') {
3717 RExC_parse++;
ca9dfc88
IZ
3718 paren = ':';
3719 break;
3720 }
c277df42 3721 unknown:
830247a4
IZ
3722 if (*RExC_parse != ')') {
3723 RExC_parse++;
3724 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3725 }
830247a4 3726 nextchar(pRExC_state);
a0d0e21e
LW
3727 *flagp = TRYAGAIN;
3728 return NULL;
3729 }
3730 }
fac92740 3731 else { /* (...) */
830247a4
IZ
3732 parno = RExC_npar;
3733 RExC_npar++;
3734 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3735 Set_Node_Length(ret, 1); /* MJD */
3736 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3737 is_open = 1;
a0d0e21e 3738 }
a0ed51b3 3739 }
fac92740 3740 else /* ! paren */
a0d0e21e
LW
3741 ret = NULL;
3742
3743 /* Pick up the branches, linking them together. */
fac92740 3744 parse_start = RExC_parse; /* MJD */
3dab1dad 3745 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 3746 /* branch_len = (paren != 0); */
2af232bd 3747
a0d0e21e
LW
3748 if (br == NULL)
3749 return(NULL);
830247a4
IZ
3750 if (*RExC_parse == '|') {
3751 if (!SIZE_ONLY && RExC_extralen) {
3752 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3753 }
fac92740 3754 else { /* MJD */
830247a4 3755 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3756 Set_Node_Length(br, paren != 0);
3757 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3758 }
c277df42
IZ
3759 have_branch = 1;
3760 if (SIZE_ONLY)
830247a4 3761 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3762 }
3763 else if (paren == ':') {
c277df42
IZ
3764 *flagp |= flags&SIMPLE;
3765 }
6136c704 3766 if (is_open) { /* Starts with OPEN. */
3dab1dad 3767 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3768 }
3769 else if (paren != '?') /* Not Conditional */
a0d0e21e 3770 ret = br;
32a0ca98 3771 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3772 lastbr = br;
830247a4
IZ
3773 while (*RExC_parse == '|') {
3774 if (!SIZE_ONLY && RExC_extralen) {
3775 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 3776 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3777 }
3778 if (SIZE_ONLY)
830247a4
IZ
3779 RExC_extralen += 2; /* Account for LONGJMP. */
3780 nextchar(pRExC_state);
3dab1dad 3781 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 3782
a687059c 3783 if (br == NULL)
a0d0e21e 3784 return(NULL);
3dab1dad 3785 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3786 lastbr = br;
821b33a5
IZ
3787 if (flags&HASWIDTH)
3788 *flagp |= HASWIDTH;
a687059c 3789 *flagp |= flags&SPSTART;
a0d0e21e
LW
3790 }
3791
c277df42
IZ
3792 if (have_branch || paren != ':') {
3793 /* Make a closing node, and hook it on the end. */
3794 switch (paren) {
3795 case ':':
830247a4 3796 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3797 break;
3798 case 1:
830247a4 3799 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3800 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3801 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3802 break;
3803 case '<':
c277df42
IZ
3804 case ',':
3805 case '=':
3806 case '!':
c277df42 3807 *flagp &= ~HASWIDTH;
821b33a5
IZ
3808 /* FALL THROUGH */
3809 case '>':
830247a4 3810 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3811 break;
3812 case 0:
830247a4 3813 ender = reg_node(pRExC_state, END);
c277df42
IZ
3814 break;
3815 }
3dab1dad 3816 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 3817
9674d46a 3818 if (have_branch && !SIZE_ONLY) {
c277df42 3819 /* Hook the tails of the branches to the closing node. */
3dab1dad 3820 U8 exact= PSEUDO;
9674d46a
AL
3821 for (br = ret; br; br = regnext(br)) {
3822 const U8 op = PL_regkind[OP(br)];
3dab1dad 3823 U8 exact_ret;
9674d46a 3824 if (op == BRANCH) {
3dab1dad 3825 exact_ret=regtail_study(pRExC_state, NEXTOPER(br), ender,depth+1);
9674d46a
AL
3826 }
3827 else if (op == BRANCHJ) {
3dab1dad 3828 exact_ret=regtail_study(pRExC_state, NEXTOPER(NEXTOPER(br)), ender,depth+1);
9674d46a 3829 }
3dab1dad
YO
3830 if ( exact == PSEUDO )
3831 exact= exact_ret;
3832 else if ( exact != exact_ret )
3833 exact= 0;
c277df42
IZ
3834 }
3835 }
a0d0e21e 3836 }
c277df42
IZ
3837
3838 {
e1ec3a88
AL
3839 const char *p;
3840 static const char parens[] = "=!<,>";
c277df42
IZ
3841
3842 if (paren && (p = strchr(parens, paren))) {
eb160463 3843 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3844 int flag = (p - parens) > 1;
3845
3846 if (paren == '>')
3847 node = SUSPEND, flag = 0;
830247a4 3848 reginsert(pRExC_state, node,ret);
45948336
EP
3849 Set_Node_Cur_Length(ret);
3850 Set_Node_Offset(ret, parse_start + 1);
c277df42 3851 ret->flags = flag;
3dab1dad 3852 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3853 }
a0d0e21e
LW
3854 }
3855
3856 /* Check for proper termination. */
ce3e6498 3857 if (paren) {
e2509266 3858 RExC_flags = oregflags;
830247a4
IZ
3859 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3860 RExC_parse = oregcomp_parse;
380a0633 3861 vFAIL("Unmatched (");
ce3e6498 3862 }
a0ed51b3 3863 }
830247a4
IZ
3864 else if (!paren && RExC_parse < RExC_end) {
3865 if (*RExC_parse == ')') {
3866 RExC_parse++;
380a0633 3867 vFAIL("Unmatched )");
a0ed51b3
LW
3868 }
3869 else
b45f050a 3870 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3871 /* NOTREACHED */
3872 }
a687059c 3873
a0d0e21e 3874 return(ret);
a687059c
LW
3875}
3876
3877/*
3878 - regbranch - one alternative of an | operator
3879 *
3880 * Implements the concatenation operator.
3881 */
76e3520e 3882STATIC regnode *
3dab1dad 3883S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 3884{
97aff369 3885 dVAR;
c277df42
IZ
3886 register regnode *ret;
3887 register regnode *chain = NULL;
3888 register regnode *latest;
3889 I32 flags = 0, c = 0;
3dab1dad
YO
3890 GET_RE_DEBUG_FLAGS_DECL;
3891 DEBUG_PARSE("brnc");
b81d288d 3892 if (first)
c277df42
IZ
3893 ret = NULL;
3894 else {
b81d288d 3895 if (!SIZE_ONLY && RExC_extralen)
830247a4 3896 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3897 else {
830247a4 3898 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3899 Set_Node_Length(ret, 1);
3900 }
c277df42
IZ
3901 }
3902
b81d288d 3903 if (!first && SIZE_ONLY)
830247a4 3904 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3905
c277df42 3906 *flagp = WORST; /* Tentatively. */
a0d0e21e 3907
830247a4
IZ
3908 RExC_parse--;
3909 nextchar(pRExC_state);
3910 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3911 flags &= ~TRYAGAIN;
3dab1dad 3912 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
3913 if (latest == NULL) {
3914 if (flags & TRYAGAIN)
3915 continue;
3916 return(NULL);
a0ed51b3
LW
3917 }
3918 else if (ret == NULL)
c277df42 3919 ret = latest;
a0d0e21e 3920 *flagp |= flags&HASWIDTH;
c277df42 3921 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3922 *flagp |= flags&SPSTART;
3923 else {
830247a4 3924 RExC_naughty++;
3dab1dad 3925 REGTAIL(pRExC_state, chain, latest);
a687059c 3926 }
a0d0e21e 3927 chain = latest;
c277df42
IZ
3928 c++;
3929 }
3930 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3931 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3932 if (ret == NULL)
3933 ret = chain;
3934 }
3935 if (c == 1) {
3936 *flagp |= flags&SIMPLE;
a0d0e21e 3937 }
a687059c 3938
d4c19fe8 3939 return ret;
a687059c
LW
3940}
3941
3942/*
3943 - regpiece - something followed by possible [*+?]
3944 *
3945 * Note that the branching code sequences used for ? and the general cases
3946 * of * and + are somewhat optimized: they use the same NOTHING node as
3947 * both the endmarker for their branch list and the body of the last branch.
3948 * It might seem that this node could be dispensed with entirely, but the
3949 * endmarker role is not redundant.
3950 */
76e3520e 3951STATIC regnode *
3dab1dad 3952S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 3953{
97aff369 3954 dVAR;
c277df42 3955 register regnode *ret;
a0d0e21e
LW
3956 register char op;
3957 register char *next;
3958 I32 flags;
1df70142 3959 const char * const origparse = RExC_parse;
a0d0e21e 3960 I32 min;
c277df42 3961 I32 max = REG_INFTY;
fac92740 3962 char *parse_start;
3dab1dad
YO
3963 GET_RE_DEBUG_FLAGS_DECL;
3964 DEBUG_PARSE("piec");
a0d0e21e 3965
3dab1dad 3966 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
3967 if (ret == NULL) {
3968 if (flags & TRYAGAIN)
3969 *flagp |= TRYAGAIN;
3970 return(NULL);
3971 }
3972
830247a4 3973 op = *RExC_parse;
a0d0e21e 3974
830247a4 3975 if (op == '{' && regcurly(RExC_parse)) {
3dab1dad 3976 const char *maxpos = NULL;
fac92740 3977 parse_start = RExC_parse; /* MJD */
830247a4 3978 next = RExC_parse + 1;
a0d0e21e
LW
3979 while (isDIGIT(*next) || *next == ',') {
3980 if (*next == ',') {
3981 if (maxpos)
3982 break;
3983 else
3984 maxpos = next;
a687059c 3985 }
a0d0e21e
LW
3986 next++;
3987 }
3988 if (*next == '}') { /* got one */
3989 if (!maxpos)
3990 maxpos = next;
830247a4
IZ
3991 RExC_parse++;
3992 min = atoi(RExC_parse);
a0d0e21e
LW
3993 if (*maxpos == ',')
3994 maxpos++;
3995 else
830247a4 3996 maxpos = RExC_parse;
a0d0e21e
LW
3997 max = atoi(maxpos);
3998 if (!max && *maxpos != '0')
c277df42
IZ
3999 max = REG_INFTY; /* meaning "infinity" */
4000 else if (max >= REG_INFTY)
8615cb43 4001 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4002 RExC_parse = next;
4003 nextchar(pRExC_state);
a0d0e21e
LW
4004
4005 do_curly:
4006 if ((flags&SIMPLE)) {
830247a4
IZ
4007 RExC_naughty += 2 + RExC_naughty / 2;
4008 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4009 Set_Node_Offset(ret, parse_start+1); /* MJD */
4010 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4011 }
4012 else {
3dab1dad 4013 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4014
4015 w->flags = 0;
3dab1dad 4016 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4017 if (!SIZE_ONLY && RExC_extralen) {
4018 reginsert(pRExC_state, LONGJMP,ret);
4019 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4020 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4021 }
830247a4 4022 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4023 /* MJD hk */
4024 Set_Node_Offset(ret, parse_start+1);
2af232bd 4025 Set_Node_Length(ret,
fac92740 4026 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4027
830247a4 4028 if (!SIZE_ONLY && RExC_extralen)
c277df42 4029 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4030 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4031 if (SIZE_ONLY)
830247a4
IZ
4032 RExC_whilem_seen++, RExC_extralen += 3;
4033 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4034 }
c277df42 4035 ret->flags = 0;
a0d0e21e
LW
4036
4037 if (min > 0)
821b33a5
IZ
4038 *flagp = WORST;
4039 if (max > 0)
4040 *flagp |= HASWIDTH;
a0d0e21e 4041 if (max && max < min)
8615cb43 4042 vFAIL("Can't do {n,m} with n > m");
c277df42 4043 if (!SIZE_ONLY) {
eb160463
GS
4044 ARG1_SET(ret, (U16)min);
4045 ARG2_SET(ret, (U16)max);
a687059c 4046 }
a687059c 4047
a0d0e21e 4048 goto nest_check;
a687059c 4049 }
a0d0e21e 4050 }
a687059c 4051
a0d0e21e
LW
4052 if (!ISMULT1(op)) {
4053 *flagp = flags;
a687059c 4054 return(ret);
a0d0e21e 4055 }
bb20fd44 4056
c277df42 4057#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4058
4059 /* if this is reinstated, don't forget to put this back into perldiag:
4060
4061 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4062
4063 (F) The part of the regexp subject to either the * or + quantifier
4064 could match an empty string. The {#} shows in the regular
4065 expression about where the problem was discovered.
4066
4067 */
4068
bb20fd44 4069 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4070 vFAIL("Regexp *+ operand could be empty");
b81d288d 4071#endif
bb20fd44 4072
fac92740 4073 parse_start = RExC_parse;
830247a4 4074 nextchar(pRExC_state);
a0d0e21e 4075
821b33a5 4076 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4077
4078 if (op == '*' && (flags&SIMPLE)) {
830247a4 4079 reginsert(pRExC_state, STAR, ret);
c277df42 4080 ret->flags = 0;
830247a4 4081 RExC_naughty += 4;
a0d0e21e
LW
4082 }
4083 else if (op == '*') {
4084 min = 0;
4085 goto do_curly;
a0ed51b3
LW
4086 }
4087 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4088 reginsert(pRExC_state, PLUS, ret);
c277df42 4089 ret->flags = 0;
830247a4 4090 RExC_naughty += 3;
a0d0e21e
LW
4091 }
4092 else if (op == '+') {
4093 min = 1;
4094 goto do_curly;
a0ed51b3
LW
4095 }
4096 else if (op == '?') {
a0d0e21e
LW
4097 min = 0; max = 1;
4098 goto do_curly;
4099 }
4100 nest_check:
041457d9 4101 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4102 vWARN3(RExC_parse,
b45f050a 4103 "%.*s matches null string many times",
afd78fd5 4104 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4105 origparse);
a0d0e21e
LW
4106 }
4107
830247a4
IZ
4108 if (*RExC_parse == '?') {
4109 nextchar(pRExC_state);
4110 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4111 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4112 }
830247a4
IZ
4113 if (ISMULT2(RExC_parse)) {
4114 RExC_parse++;
b45f050a
JF
4115 vFAIL("Nested quantifiers");
4116 }
a0d0e21e
LW
4117
4118 return(ret);
a687059c
LW
4119}
4120
4121/*
4122 - regatom - the lowest level
4123 *
4124 * Optimization: gobbles an entire sequence of ordinary characters so that
4125 * it can turn them into a single node, which is smaller to store and
4126 * faster to run. Backslashed characters are exceptions, each becoming a
4127 * separate node; the code is simpler that way and it's not worth fixing.
4128 *
7f6f358c
YO
4129 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4130 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4131 */
76e3520e 4132STATIC regnode *
3dab1dad 4133S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4134{
97aff369 4135 dVAR;
cbbf8932 4136 register regnode *ret = NULL;
a0d0e21e 4137 I32 flags;
45948336 4138 char *parse_start = RExC_parse;
3dab1dad
YO
4139 GET_RE_DEBUG_FLAGS_DECL;
4140 DEBUG_PARSE("atom");
a0d0e21e
LW
4141 *flagp = WORST; /* Tentatively. */
4142
4143tryagain:
830247a4 4144 switch (*RExC_parse) {
a0d0e21e 4145 case '^':
830247a4
IZ
4146 RExC_seen_zerolen++;
4147 nextchar(pRExC_state);
e2509266 4148 if (RExC_flags & PMf_MULTILINE)
830247a4 4149 ret = reg_node(pRExC_state, MBOL);
e2509266 4150 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4151 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4152 else
830247a4 4153 ret = reg_node(pRExC_state, BOL);
fac92740 4154 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4155 break;
4156 case '$':
830247a4 4157 nextchar(pRExC_state);
b81d288d 4158 if (*RExC_parse)
830247a4 4159 RExC_seen_zerolen++;
e2509266 4160 if (RExC_flags & PMf_MULTILINE)
830247a4 4161 ret = reg_node(pRExC_state, MEOL);
e2509266 4162 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4163 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4164 else
830247a4 4165 ret = reg_node(pRExC_state, EOL);
fac92740 4166 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4167 break;
4168 case '.':
830247a4 4169 nextchar(pRExC_state);
e2509266 4170 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4171 ret = reg_node(pRExC_state, SANY);
4172 else
4173 ret = reg_node(pRExC_state, REG_ANY);
4174 *flagp |= HASWIDTH|SIMPLE;
830247a4 4175 RExC_naughty++;
fac92740 4176 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4177 break;
4178 case '[':
b45f050a 4179 {
3dab1dad
YO
4180 char * const oregcomp_parse = ++RExC_parse;
4181 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4182 if (*RExC_parse != ']') {
4183 RExC_parse = oregcomp_parse;
b45f050a
JF
4184 vFAIL("Unmatched [");
4185 }
830247a4 4186 nextchar(pRExC_state);
a0d0e21e 4187 *flagp |= HASWIDTH|SIMPLE;
fac92740 4188 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4189 break;
b45f050a 4190 }
a0d0e21e 4191 case '(':
830247a4 4192 nextchar(pRExC_state);
3dab1dad 4193 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4194 if (ret == NULL) {
bf93d4cc 4195 if (flags & TRYAGAIN) {
830247a4 4196 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4197 /* Make parent create an empty node if needed. */
4198 *flagp |= TRYAGAIN;
4199 return(NULL);
4200 }
a0d0e21e 4201 goto tryagain;
bf93d4cc 4202 }
a0d0e21e
LW
4203 return(NULL);
4204 }
c277df42 4205 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4206 break;
4207 case '|':
4208 case ')':
4209 if (flags & TRYAGAIN) {
4210 *flagp |= TRYAGAIN;
4211 return NULL;
4212 }
b45f050a 4213 vFAIL("Internal urp");
a0d0e21e
LW
4214 /* Supposed to be caught earlier. */
4215 break;
85afd4ae 4216 case '{':
830247a4
IZ
4217 if (!regcurly(RExC_parse)) {
4218 RExC_parse++;
85afd4ae
CS
4219 goto defchar;
4220 }
4221 /* FALL THROUGH */
a0d0e21e
LW
4222 case '?':
4223 case '+':
4224 case '*':
830247a4 4225 RExC_parse++;
b45f050a 4226 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4227 break;
4228 case '\\':
830247a4 4229 switch (*++RExC_parse) {
a0d0e21e 4230 case 'A':
830247a4
IZ
4231 RExC_seen_zerolen++;
4232 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4233 *flagp |= SIMPLE;
830247a4 4234 nextchar(pRExC_state);
fac92740 4235 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4236 break;
4237 case 'G':
830247a4
IZ
4238 ret = reg_node(pRExC_state, GPOS);
4239 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4240 *flagp |= SIMPLE;
830247a4 4241 nextchar(pRExC_state);
fac92740 4242 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4243 break;
4244 case 'Z':
830247a4 4245 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4246 *flagp |= SIMPLE;
a1917ab9 4247 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4248 nextchar(pRExC_state);
a0d0e21e 4249 break;
b85d18e9 4250 case 'z':
830247a4 4251 ret = reg_node(pRExC_state, EOS);
b85d18e9 4252 *flagp |= SIMPLE;
830247a4
IZ
4253 RExC_seen_zerolen++; /* Do not optimize RE away */
4254 nextchar(pRExC_state);
fac92740 4255 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4256 break;
4a2d328f 4257 case 'C':
f33976b4
DB
4258 ret = reg_node(pRExC_state, CANY);
4259 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4260 *flagp |= HASWIDTH|SIMPLE;
830247a4 4261 nextchar(pRExC_state);
fac92740 4262 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4263 break;
4264 case 'X':
830247a4 4265 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4266 *flagp |= HASWIDTH;
830247a4 4267 nextchar(pRExC_state);
fac92740 4268 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4269 break;
a0d0e21e 4270 case 'w':
eb160463 4271 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4272 *flagp |= HASWIDTH|SIMPLE;
830247a4 4273 nextchar(pRExC_state);
fac92740 4274 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4275 break;
4276 case 'W':
eb160463 4277 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4278 *flagp |= HASWIDTH|SIMPLE;
830247a4 4279 nextchar(pRExC_state);
fac92740 4280 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4281 break;
4282 case 'b':
830247a4
IZ
4283 RExC_seen_zerolen++;
4284 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4285 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4286 *flagp |= SIMPLE;
830247a4 4287 nextchar(pRExC_state);
fac92740 4288 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4289 break;
4290 case 'B':
830247a4
IZ
4291 RExC_seen_zerolen++;
4292 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4293 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4294 *flagp |= SIMPLE;
830247a4 4295 nextchar(pRExC_state);
fac92740 4296 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4297 break;
4298 case 's':
eb160463 4299 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4300 *flagp |= HASWIDTH|SIMPLE;
830247a4 4301 nextchar(pRExC_state);
fac92740 4302 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4303 break;
4304 case 'S':
eb160463 4305 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4306 *flagp |= HASWIDTH|SIMPLE;
830247a4 4307 nextchar(pRExC_state);
fac92740 4308 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4309 break;
4310 case 'd':
ffc61ed2 4311 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4312 *flagp |= HASWIDTH|SIMPLE;
830247a4 4313 nextchar(pRExC_state);
fac92740 4314 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4315 break;
4316 case 'D':
ffc61ed2 4317 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4318 *flagp |= HASWIDTH|SIMPLE;
830247a4 4319 nextchar(pRExC_state);
fac92740 4320 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4321 break;
a14b48bc
LW
4322 case 'p':
4323 case 'P':
3568d838 4324 {
3dab1dad 4325 char* const oldregxend = RExC_end;
ccb2c380 4326 char* parse_start = RExC_parse - 2;
a14b48bc 4327
830247a4 4328 if (RExC_parse[1] == '{') {
3568d838 4329 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4330 RExC_end = strchr(RExC_parse, '}');
4331 if (!RExC_end) {
3dab1dad 4332 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4333 RExC_parse += 2;
4334 RExC_end = oldregxend;
0da60cf5 4335 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4336 }
830247a4 4337 RExC_end++;
a14b48bc 4338 }
af6f566e 4339 else {
830247a4 4340 RExC_end = RExC_parse + 2;
af6f566e
HS
4341 if (RExC_end > oldregxend)
4342 RExC_end = oldregxend;
4343 }
830247a4 4344 RExC_parse--;
a14b48bc 4345
3dab1dad 4346 ret = regclass(pRExC_state,depth+1);
a14b48bc 4347
830247a4
IZ
4348 RExC_end = oldregxend;
4349 RExC_parse--;
ccb2c380
MP
4350
4351 Set_Node_Offset(ret, parse_start + 2);
4352 Set_Node_Cur_Length(ret);
830247a4 4353 nextchar(pRExC_state);
a14b48bc
LW
4354 *flagp |= HASWIDTH|SIMPLE;
4355 }
4356 break;
a0d0e21e
LW
4357 case 'n':
4358 case 'r':
4359 case 't':
4360 case 'f':
4361 case 'e':
4362 case 'a':
4363 case 'x':
4364 case 'c':
4365 case '0':
4366 goto defchar;
4367 case '1': case '2': case '3': case '4':
4368 case '5': case '6': case '7': case '8': case '9':
4369 {
1df70142 4370 const I32 num = atoi(RExC_parse);
a0d0e21e 4371
830247a4 4372 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4373 goto defchar;
4374 else {
3dab1dad 4375 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4376 while (isDIGIT(*RExC_parse))
4377 RExC_parse++;
b45f050a 4378
eb160463 4379 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4380 vFAIL("Reference to nonexistent group");
830247a4 4381 RExC_sawback = 1;
eb160463
GS
4382 ret = reganode(pRExC_state,
4383 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4384 num);
a0d0e21e 4385 *flagp |= HASWIDTH;
2af232bd 4386
fac92740 4387 /* override incorrect value set in reganode MJD */
2af232bd 4388 Set_Node_Offset(ret, parse_start+1);
fac92740 4389 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4390 RExC_parse--;
4391 nextchar(pRExC_state);
a0d0e21e
LW
4392 }
4393 }
4394 break;
4395 case '\0':
830247a4 4396 if (RExC_parse >= RExC_end)
b45f050a 4397 FAIL("Trailing \\");
a0d0e21e
LW
4398 /* FALL THROUGH */
4399 default:
a0288114 4400 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4401 back into the quick-grab loop below */
45948336 4402 parse_start--;
a0d0e21e
LW
4403 goto defchar;
4404 }
4405 break;
4633a7c4
LW
4406
4407 case '#':
e2509266 4408 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4409 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4410 RExC_parse++;
830247a4 4411 if (RExC_parse < RExC_end)
4633a7c4
LW
4412 goto tryagain;
4413 }
4414 /* FALL THROUGH */
4415
a0d0e21e 4416 default: {
ba210ebe 4417 register STRLEN len;
58ae7d3f 4418 register UV ender;
a0d0e21e 4419 register char *p;
3dab1dad 4420 char *s;
80aecb99 4421 STRLEN foldlen;
89ebb4a3 4422 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4423
4424 parse_start = RExC_parse - 1;
a0d0e21e 4425
830247a4 4426 RExC_parse++;
a0d0e21e
LW
4427
4428 defchar:
58ae7d3f 4429 ender = 0;
eb160463
GS
4430 ret = reg_node(pRExC_state,
4431 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4432 s = STRING(ret);
830247a4
IZ
4433 for (len = 0, p = RExC_parse - 1;
4434 len < 127 && p < RExC_end;
a0d0e21e
LW
4435 len++)
4436 {
3dab1dad 4437 char * const oldp = p;
5b5a24f7 4438
e2509266 4439 if (RExC_flags & PMf_EXTENDED)
830247a4 4440 p = regwhite(p, RExC_end);
a0d0e21e
LW
4441 switch (*p) {
4442 case '^':
4443 case '$':
4444 case '.':
4445 case '[':
4446 case '(':
4447 case ')':
4448 case '|':
4449 goto loopdone;
4450 case '\\':
4451 switch (*++p) {
4452 case 'A':
1ed8eac0
JF
4453 case 'C':
4454 case 'X':
a0d0e21e
LW
4455 case 'G':
4456 case 'Z':
b85d18e9 4457 case 'z':
a0d0e21e
LW
4458 case 'w':
4459 case 'W':
4460 case 'b':
4461 case 'B':
4462 case 's':
4463 case 'S':
4464 case 'd':
4465 case 'D':
a14b48bc
LW
4466 case 'p':
4467 case 'P':
a0d0e21e
LW
4468 --p;
4469 goto loopdone;
4470 case 'n':
4471 ender = '\n';
4472 p++;
a687059c 4473 break;
a0d0e21e
LW
4474 case 'r':
4475 ender = '\r';
4476 p++;
a687059c 4477 break;
a0d0e21e
LW
4478 case 't':
4479 ender = '\t';
4480 p++;
a687059c 4481 break;
a0d0e21e
LW
4482 case 'f':
4483 ender = '\f';
4484 p++;
a687059c 4485 break;
a0d0e21e 4486 case 'e':
c7f1f016 4487 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4488 p++;
a687059c 4489 break;
a0d0e21e 4490 case 'a':
c7f1f016 4491 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4492 p++;
a687059c 4493 break;
a0d0e21e 4494 case 'x':
a0ed51b3 4495 if (*++p == '{') {
1df70142 4496 char* const e = strchr(p, '}');
b81d288d 4497
b45f050a 4498 if (!e) {
830247a4 4499 RExC_parse = p + 1;
b45f050a
JF
4500 vFAIL("Missing right brace on \\x{}");
4501 }
de5f0749 4502 else {
a4c04bdc
NC
4503 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4504 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4505 STRLEN numlen = e - p - 1;
53305cf1 4506 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4507 if (ender > 0xff)
4508 RExC_utf8 = 1;
a0ed51b3
LW
4509 p = e + 1;
4510 }
a0ed51b3
LW
4511 }
4512 else {
a4c04bdc 4513 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4514 STRLEN numlen = 2;
53305cf1 4515 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4516 p += numlen;
4517 }
a687059c 4518 break;
a0d0e21e
LW
4519 case 'c':
4520 p++;
bbce6d69 4521 ender = UCHARAT(p++);
4522 ender = toCTRL(ender);
a687059c 4523 break;
a0d0e21e
LW
4524 case '0': case '1': case '2': case '3':case '4':
4525 case '5': case '6': case '7': case '8':case '9':
4526 if (*p == '0' ||
830247a4 4527 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4528 I32 flags = 0;
1df70142 4529 STRLEN numlen = 3;
53305cf1 4530 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4531 p += numlen;
4532 }
4533 else {
4534 --p;
4535 goto loopdone;
a687059c
LW
4536 }
4537 break;
a0d0e21e 4538 case '\0':
830247a4 4539 if (p >= RExC_end)
b45f050a 4540 FAIL("Trailing \\");
a687059c 4541 /* FALL THROUGH */
a0d0e21e 4542 default:
041457d9 4543 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4544 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4545 goto normal_default;
a0d0e21e
LW
4546 }
4547 break;
a687059c 4548 default:
a0ed51b3 4549 normal_default:
fd400ab9 4550 if (UTF8_IS_START(*p) && UTF) {
1df70142 4551 STRLEN numlen;
5e12f4fb 4552 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4553 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4554 p += numlen;
4555 }
4556 else
4557 ender = *p++;
a0d0e21e 4558 break;
a687059c 4559 }
e2509266 4560 if (RExC_flags & PMf_EXTENDED)
830247a4 4561 p = regwhite(p, RExC_end);
60a8b682
JH
4562 if (UTF && FOLD) {
4563 /* Prime the casefolded buffer. */
ac7e0132 4564 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4565 }
a0d0e21e
LW
4566 if (ISMULT2(p)) { /* Back off on ?+*. */
4567 if (len)
4568 p = oldp;
16ea2a2e 4569 else if (UTF) {
80aecb99 4570 if (FOLD) {
60a8b682 4571 /* Emit all the Unicode characters. */
1df70142 4572 STRLEN numlen;
80aecb99
JH
4573 for (foldbuf = tmpbuf;
4574 foldlen;
4575 foldlen -= numlen) {
4576 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4577 if (numlen > 0) {
71207a34 4578 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4579 s += unilen;
4580 len += unilen;
4581 /* In EBCDIC the numlen
4582 * and unilen can differ. */
9dc45d57 4583 foldbuf += numlen;
47654450
JH
4584 if (numlen >= foldlen)
4585 break;
9dc45d57
JH
4586 }
4587 else
4588 break; /* "Can't happen." */
80aecb99
JH
4589 }
4590 }
4591 else {
71207a34 4592 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4593 if (unilen > 0) {
0ebc6274
JH
4594 s += unilen;
4595 len += unilen;
9dc45d57 4596 }
80aecb99 4597 }
a0ed51b3 4598 }
a0d0e21e
LW
4599 else {
4600 len++;
eb160463 4601 REGC((char)ender, s++);
a0d0e21e
LW
4602 }
4603 break;
a687059c 4604 }
16ea2a2e 4605 if (UTF) {
80aecb99 4606 if (FOLD) {
60a8b682 4607 /* Emit all the Unicode characters. */
1df70142 4608 STRLEN numlen;
80aecb99
JH
4609 for (foldbuf = tmpbuf;
4610 foldlen;
4611 foldlen -= numlen) {
4612 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4613 if (numlen > 0) {
71207a34 4614 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4615 len += unilen;
4616 s += unilen;
4617 /* In EBCDIC the numlen
4618 * and unilen can differ. */
9dc45d57 4619 foldbuf += numlen;
47654450
JH
4620 if (numlen >= foldlen)
4621 break;
9dc45d57
JH
4622 }
4623 else
4624 break;
80aecb99
JH
4625 }
4626 }
4627 else {
71207a34 4628 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4629 if (unilen > 0) {
0ebc6274
JH
4630 s += unilen;
4631 len += unilen;
9dc45d57 4632 }
80aecb99
JH
4633 }
4634 len--;
a0ed51b3
LW
4635 }
4636 else
eb160463 4637 REGC((char)ender, s++);
a0d0e21e
LW
4638 }
4639 loopdone:
830247a4 4640 RExC_parse = p - 1;
fac92740 4641 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4642 nextchar(pRExC_state);
793db0cb
JH
4643 {
4644 /* len is STRLEN which is unsigned, need to copy to signed */
4645 IV iv = len;
4646 if (iv < 0)
4647 vFAIL("Internal disaster");
4648 }
a0d0e21e
LW
4649 if (len > 0)
4650 *flagp |= HASWIDTH;
090f7165 4651 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4652 *flagp |= SIMPLE;
3dab1dad 4653
cd439c50 4654 if (SIZE_ONLY)
830247a4 4655 RExC_size += STR_SZ(len);
3dab1dad
YO
4656 else {
4657 STR_LEN(ret) = len;
830247a4 4658 RExC_emit += STR_SZ(len);
a687059c 4659 }
3dab1dad 4660 }
a0d0e21e
LW
4661 break;
4662 }
a687059c 4663
60a8b682
JH
4664 /* If the encoding pragma is in effect recode the text of
4665 * any EXACT-kind nodes. */
3dab1dad
YO
4666 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4667 const STRLEN oldlen = STR_LEN(ret);
4668 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
4669
4670 if (RExC_utf8)
4671 SvUTF8_on(sv);
4672 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4673 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4674 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4675
4676 if (SvUTF8(sv))
4677 RExC_utf8 = 1;
4678 if (!SIZE_ONLY) {
a3621e74
YO
4679 GET_RE_DEBUG_FLAGS_DECL;
4680 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4681 (int)oldlen, STRING(ret),
4682 (int)newlen, s));
4683 Copy(s, STRING(ret), newlen, char);
4684 STR_LEN(ret) += newlen - oldlen;
4685 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4686 } else
4687 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4688 }
a72c7584
JH
4689 }
4690
a0d0e21e 4691 return(ret);
a687059c
LW
4692}
4693
873ef191 4694STATIC char *
5f66b61c 4695S_regwhite(char *p, const char *e)
5b5a24f7
CS
4696{
4697 while (p < e) {
4698 if (isSPACE(*p))
4699 ++p;
4700 else if (*p == '#') {
4701 do {
4702 p++;
4703 } while (p < e && *p != '\n');
4704 }
4705 else
4706 break;
4707 }
4708 return p;
4709}
4710
b8c5462f
JH
4711/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4712 Character classes ([:foo:]) can also be negated ([:^foo:]).
4713 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4714 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4715 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4716
4717#define POSIXCC_DONE(c) ((c) == ':')
4718#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4719#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4720
b8c5462f 4721STATIC I32
830247a4 4722S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4723{
97aff369 4724 dVAR;
936ed897 4725 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4726
830247a4 4727 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4728 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4729 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4730 const char c = UCHARAT(RExC_parse);
097eb12c 4731 char* const s = RExC_parse++;
b81d288d 4732
9a86a77b 4733 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4734 RExC_parse++;
4735 if (RExC_parse == RExC_end)
620e46c5 4736 /* Grandfather lone [:, [=, [. */
830247a4 4737 RExC_parse = s;
620e46c5 4738 else {
3dab1dad 4739 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
4740 assert(*t == c);
4741
9a86a77b 4742 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 4743 const char *posixcc = s + 1;
830247a4 4744 RExC_parse++; /* skip over the ending ] */
3dab1dad 4745
b8c5462f 4746 if (*s == ':') {
1df70142
AL
4747 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4748 const I32 skip = t - posixcc;
80916619
NC
4749
4750 /* Initially switch on the length of the name. */
4751 switch (skip) {
4752 case 4:
3dab1dad
YO
4753 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
4754 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 4755 break;
80916619
NC
4756 case 5:
4757 /* Names all of length 5. */
4758 /* alnum alpha ascii blank cntrl digit graph lower
4759 print punct space upper */
4760 /* Offset 4 gives the best switch position. */
4761 switch (posixcc[4]) {
4762 case 'a':
3dab1dad
YO
4763 if (memEQ(posixcc, "alph", 4)) /* alpha */
4764 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
4765 break;
4766 case 'e':
3dab1dad
YO
4767 if (memEQ(posixcc, "spac", 4)) /* space */
4768 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
4769 break;
4770 case 'h':
3dab1dad
YO
4771 if (memEQ(posixcc, "grap", 4)) /* graph */
4772 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
4773 break;
4774 case 'i':
3dab1dad
YO
4775 if (memEQ(posixcc, "asci", 4)) /* ascii */
4776 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
4777 break;
4778 case 'k':
3dab1dad
YO
4779 if (memEQ(posixcc, "blan", 4)) /* blank */
4780 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
4781 break;
4782 case 'l':
3dab1dad
YO
4783 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
4784 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
4785 break;
4786 case 'm':
3dab1dad
YO
4787 if (memEQ(posixcc, "alnu", 4)) /* alnum */
4788 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
4789 break;
4790 case 'r':
3dab1dad
YO
4791 if (memEQ(posixcc, "lowe", 4)) /* lower */
4792 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4793 else if (memEQ(posixcc, "uppe", 4)) /* upper */
4794 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
4795 break;
4796 case 't':
3dab1dad
YO
4797 if (memEQ(posixcc, "digi", 4)) /* digit */
4798 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4799 else if (memEQ(posixcc, "prin", 4)) /* print */
4800 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4801 else if (memEQ(posixcc, "punc", 4)) /* punct */
4802 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 4803 break;
b8c5462f
JH
4804 }
4805 break;
80916619 4806 case 6:
3dab1dad
YO
4807 if (memEQ(posixcc, "xdigit", 6))
4808 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4809 break;
4810 }
80916619
NC
4811
4812 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4813 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4814 t - s - 1, s + 1);
80916619
NC
4815 assert (posixcc[skip] == ':');
4816 assert (posixcc[skip+1] == ']');
b45f050a 4817 } else if (!SIZE_ONLY) {
b8c5462f 4818 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4819
830247a4 4820 /* adjust RExC_parse so the warning shows after
b45f050a 4821 the class closes */
9a86a77b 4822 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4823 RExC_parse++;
b45f050a
JF
4824 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4825 }
b8c5462f
JH
4826 } else {
4827 /* Maternal grandfather:
4828 * "[:" ending in ":" but not in ":]" */
830247a4 4829 RExC_parse = s;
767d463e 4830 }
620e46c5
JH
4831 }
4832 }
4833
b8c5462f
JH
4834 return namedclass;
4835}
4836
4837STATIC void
830247a4 4838S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4839{
97aff369 4840 dVAR;
3dab1dad 4841 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4842 const char *s = RExC_parse;
4843 const char c = *s++;
b8c5462f 4844
3dab1dad 4845 while (isALNUM(*s))
b8c5462f
JH
4846 s++;
4847 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4848 if (ckWARN(WARN_REGEXP))
4849 vWARN3(s+2,
4850 "POSIX syntax [%c %c] belongs inside character classes",
4851 c, c);
b45f050a
JF
4852
4853 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4854 if (POSIXCC_NOTYET(c)) {
830247a4 4855 /* adjust RExC_parse so the error shows after
b45f050a 4856 the class closes */
9a86a77b 4857 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 4858 NOOP;
b45f050a
JF
4859 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4860 }
b8c5462f
JH
4861 }
4862 }
620e46c5
JH
4863}
4864
7f6f358c
YO
4865
4866/*
4867 parse a class specification and produce either an ANYOF node that
4868 matches the pattern. If the pattern matches a single char only and
4869 that char is < 256 then we produce an EXACT node instead.
4870*/
76e3520e 4871STATIC regnode *
3dab1dad 4872S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 4873{
97aff369 4874 dVAR;
fa656879 4875 register UV value = 0;
9a86a77b 4876 register UV nextvalue;
3568d838 4877 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4878 register IV range = 0;
c277df42 4879 register regnode *ret;
ba210ebe 4880 STRLEN numlen;
ffc61ed2 4881 IV namedclass;
cbbf8932 4882 char *rangebegin = NULL;
936ed897 4883 bool need_class = 0;
c445ea15 4884 SV *listsv = NULL;
ffc61ed2 4885 UV n;
9e55ce06 4886 bool optimize_invert = TRUE;
cbbf8932 4887 AV* unicode_alternate = NULL;
1b2d223b
JH
4888#ifdef EBCDIC
4889 UV literal_endpoint = 0;
4890#endif
7f6f358c 4891 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 4892
3dab1dad 4893 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 4894 case we need to change the emitted regop to an EXACT. */
3dab1dad
YO
4895 GET_RE_DEBUG_FLAGS_DECL;
4896 DEBUG_PARSE("clas");
7f6f358c
YO
4897
4898 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
4899 ret = reganode(pRExC_state, ANYOF, 0);
4900
4901 if (!SIZE_ONLY)
4902 ANYOF_FLAGS(ret) = 0;
4903
9a86a77b 4904 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4905 RExC_naughty++;
4906 RExC_parse++;
4907 if (!SIZE_ONLY)
4908 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4909 }
a0d0e21e 4910
73060fc4 4911 if (SIZE_ONLY) {
830247a4 4912 RExC_size += ANYOF_SKIP;
73060fc4
JH
4913 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4914 }
936ed897 4915 else {
830247a4 4916 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4917 if (FOLD)
4918 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4919 if (LOC)
4920 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4921 ANYOF_BITMAP_ZERO(ret);
396482e1 4922 listsv = newSVpvs("# comment\n");
a0d0e21e 4923 }
b8c5462f 4924
9a86a77b
JH
4925 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4926
b938889d 4927 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4928 checkposixcc(pRExC_state);
b8c5462f 4929
f064b6ad
HS
4930 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4931 if (UCHARAT(RExC_parse) == ']')
4932 goto charclassloop;
ffc61ed2 4933
9a86a77b 4934 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4935
4936 charclassloop:
4937
4938 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4939
73b437c8 4940 if (!range)
830247a4 4941 rangebegin = RExC_parse;
ffc61ed2 4942 if (UTF) {
5e12f4fb 4943 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 4944 RExC_end - RExC_parse,
9f7f3913 4945 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4946 RExC_parse += numlen;
4947 }
4948 else
4949 value = UCHARAT(RExC_parse++);
7f6f358c 4950
9a86a77b
JH
4951 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4952 if (value == '[' && POSIXCC(nextvalue))
830247a4 4953 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4954 else if (value == '\\') {
ffc61ed2 4955 if (UTF) {
5e12f4fb 4956 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 4957 RExC_end - RExC_parse,
9f7f3913 4958 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4959 RExC_parse += numlen;
4960 }
4961 else
4962 value = UCHARAT(RExC_parse++);
470c3474 4963 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4964 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4965 * be a problem later if we want switch on Unicode.
4966 * A similar issue a little bit later when switching on
4967 * namedclass. --jhi */
ffc61ed2 4968 switch ((I32)value) {
b8c5462f
JH
4969 case 'w': namedclass = ANYOF_ALNUM; break;
4970 case 'W': namedclass = ANYOF_NALNUM; break;
4971 case 's': namedclass = ANYOF_SPACE; break;
4972 case 'S': namedclass = ANYOF_NSPACE; break;
4973 case 'd': namedclass = ANYOF_DIGIT; break;
4974 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4975 case 'p':
4976 case 'P':
3dab1dad
YO
4977 {
4978 char *e;
af6f566e 4979 if (RExC_parse >= RExC_end)
2a4859cd 4980 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4981 if (*RExC_parse == '{') {
1df70142 4982 const U8 c = (U8)value;
ffc61ed2
JH
4983 e = strchr(RExC_parse++, '}');
4984 if (!e)
0da60cf5 4985 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4986 while (isSPACE(UCHARAT(RExC_parse)))
4987 RExC_parse++;
4988 if (e == RExC_parse)
0da60cf5 4989 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4990 n = e - RExC_parse;
ab13f0c7
JH
4991 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4992 n--;
ffc61ed2
JH
4993 }
4994 else {
4995 e = RExC_parse;
4996 n = 1;
4997 }
4998 if (!SIZE_ONLY) {
ab13f0c7
JH
4999 if (UCHARAT(RExC_parse) == '^') {
5000 RExC_parse++;
5001 n--;
5002 value = value == 'p' ? 'P' : 'p'; /* toggle */
5003 while (isSPACE(UCHARAT(RExC_parse))) {
5004 RExC_parse++;
5005 n--;
5006 }
5007 }
097eb12c
AL
5008 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5009 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5010 }
5011 RExC_parse = e + 1;
5012 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5013 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5014 }
f81125e2 5015 break;
b8c5462f
JH
5016 case 'n': value = '\n'; break;
5017 case 'r': value = '\r'; break;
5018 case 't': value = '\t'; break;
5019 case 'f': value = '\f'; break;
5020 case 'b': value = '\b'; break;
c7f1f016
NIS
5021 case 'e': value = ASCII_TO_NATIVE('\033');break;
5022 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5023 case 'x':
ffc61ed2 5024 if (*RExC_parse == '{') {
a4c04bdc
NC
5025 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5026 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5027 char * const e = strchr(RExC_parse++, '}');
b81d288d 5028 if (!e)
ffc61ed2 5029 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5030
5031 numlen = e - RExC_parse;
5032 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5033 RExC_parse = e + 1;
5034 }
5035 else {
a4c04bdc 5036 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5037 numlen = 2;
5038 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5039 RExC_parse += numlen;
5040 }
b8c5462f
JH
5041 break;
5042 case 'c':
830247a4 5043 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5044 value = toCTRL(value);
5045 break;
5046 case '0': case '1': case '2': case '3': case '4':
5047 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5048 {
5049 I32 flags = 0;
5050 numlen = 3;
5051 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5052 RExC_parse += numlen;
b8c5462f 5053 break;
53305cf1 5054 }
1028017a 5055 default:
041457d9 5056 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5057 vWARN2(RExC_parse,
5058 "Unrecognized escape \\%c in character class passed through",
5059 (int)value);
1028017a 5060 break;
b8c5462f 5061 }
ffc61ed2 5062 } /* end of \blah */
1b2d223b
JH
5063#ifdef EBCDIC
5064 else
5065 literal_endpoint++;
5066#endif
ffc61ed2
JH
5067
5068 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5069
5070 if (!SIZE_ONLY && !need_class)
936ed897 5071 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5072
936ed897 5073 need_class = 1;
ffc61ed2
JH
5074
5075 /* a bad range like a-\d, a-[:digit:] ? */
5076 if (range) {
73b437c8 5077 if (!SIZE_ONLY) {
afd78fd5 5078 if (ckWARN(WARN_REGEXP)) {
097eb12c 5079 const int w =
afd78fd5
JH
5080 RExC_parse >= rangebegin ?
5081 RExC_parse - rangebegin : 0;
830247a4 5082 vWARN4(RExC_parse,
b45f050a 5083 "False [] range \"%*.*s\"",
097eb12c 5084 w, w, rangebegin);
afd78fd5 5085 }
3568d838
JH
5086 if (prevvalue < 256) {
5087 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5088 ANYOF_BITMAP_SET(ret, '-');
5089 }
5090 else {
5091 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5092 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5093 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5094 }
b8c5462f 5095 }
ffc61ed2
JH
5096
5097 range = 0; /* this was not a true range */
73b437c8 5098 }
ffc61ed2 5099
73b437c8 5100 if (!SIZE_ONLY) {
c49a72a9
NC
5101 const char *what = NULL;
5102 char yesno = 0;
5103
3568d838
JH
5104 if (namedclass > OOB_NAMEDCLASS)
5105 optimize_invert = FALSE;
e2962f66
JH
5106 /* Possible truncation here but in some 64-bit environments
5107 * the compiler gets heartburn about switch on 64-bit values.
5108 * A similar issue a little earlier when switching on value.
98f323fa 5109 * --jhi */
e2962f66 5110 switch ((I32)namedclass) {
73b437c8
JH
5111 case ANYOF_ALNUM:
5112 if (LOC)
936ed897 5113 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5114 else {
5115 for (value = 0; value < 256; value++)
5116 if (isALNUM(value))
936ed897 5117 ANYOF_BITMAP_SET(ret, value);
73b437c8 5118 }
c49a72a9
NC
5119 yesno = '+';
5120 what = "Word";
73b437c8
JH
5121 break;
5122 case ANYOF_NALNUM:
5123 if (LOC)
936ed897 5124 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5125 else {
5126 for (value = 0; value < 256; value++)
5127 if (!isALNUM(value))
936ed897 5128 ANYOF_BITMAP_SET(ret, value);
73b437c8 5129 }
c49a72a9
NC
5130 yesno = '!';
5131 what = "Word";
73b437c8 5132 break;
ffc61ed2 5133 case ANYOF_ALNUMC:
73b437c8 5134 if (LOC)
ffc61ed2 5135 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5136 else {
5137 for (value = 0; value < 256; value++)
ffc61ed2 5138 if (isALNUMC(value))
936ed897 5139 ANYOF_BITMAP_SET(ret, value);
73b437c8 5140 }
c49a72a9
NC
5141 yesno = '+';
5142 what = "Alnum";
73b437c8
JH
5143 break;
5144 case ANYOF_NALNUMC:
5145 if (LOC)
936ed897 5146 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5147 else {
5148 for (value = 0; value < 256; value++)
5149 if (!isALNUMC(value))
936ed897 5150 ANYOF_BITMAP_SET(ret, value);
73b437c8 5151 }
c49a72a9
NC
5152 yesno = '!';
5153 what = "Alnum";
73b437c8
JH
5154 break;
5155 case ANYOF_ALPHA:
5156 if (LOC)
936ed897 5157 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5158 else {
5159 for (value = 0; value < 256; value++)
5160 if (isALPHA(value))
936ed897 5161 ANYOF_BITMAP_SET(ret, value);
73b437c8 5162 }
c49a72a9
NC
5163 yesno = '+';
5164 what = "Alpha";
73b437c8
JH
5165 break;
5166 case ANYOF_NALPHA:
5167 if (LOC)
936ed897 5168 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5169 else {
5170 for (value = 0; value < 256; value++)
5171 if (!isALPHA(value))
936ed897 5172 ANYOF_BITMAP_SET(ret, value);
73b437c8 5173 }
c49a72a9
NC
5174 yesno = '!';
5175 what = "Alpha";
73b437c8
JH
5176 break;
5177 case ANYOF_ASCII:
5178 if (LOC)
936ed897 5179 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5180 else {
c7f1f016 5181#ifndef EBCDIC
1ba5c669
JH
5182 for (value = 0; value < 128; value++)
5183 ANYOF_BITMAP_SET(ret, value);
5184#else /* EBCDIC */
ffbc6a93 5185 for (value = 0; value < 256; value++) {
3a3c4447
JH
5186 if (isASCII(value))
5187 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5188 }
1ba5c669 5189#endif /* EBCDIC */
73b437c8 5190 }
c49a72a9
NC
5191 yesno = '+';
5192 what = "ASCII";
73b437c8
JH
5193 break;
5194 case ANYOF_NASCII:
5195 if (LOC)
936ed897 5196 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5197 else {
c7f1f016 5198#ifndef EBCDIC
1ba5c669
JH
5199 for (value = 128; value < 256; value++)
5200 ANYOF_BITMAP_SET(ret, value);
5201#else /* EBCDIC */
ffbc6a93 5202 for (value = 0; value < 256; value++) {
3a3c4447
JH
5203 if (!isASCII(value))
5204 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5205 }
1ba5c669 5206#endif /* EBCDIC */
73b437c8 5207 }
c49a72a9
NC
5208 yesno = '!';
5209 what = "ASCII";
73b437c8 5210 break;
aaa51d5e
JF
5211 case ANYOF_BLANK:
5212 if (LOC)
5213 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5214 else {
5215 for (value = 0; value < 256; value++)
5216 if (isBLANK(value))
5217 ANYOF_BITMAP_SET(ret, value);
5218 }
c49a72a9
NC
5219 yesno = '+';
5220 what = "Blank";
aaa51d5e
JF
5221 break;
5222 case ANYOF_NBLANK:
5223 if (LOC)
5224 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5225 else {
5226 for (value = 0; value < 256; value++)
5227 if (!isBLANK(value))
5228 ANYOF_BITMAP_SET(ret, value);
5229 }
c49a72a9
NC
5230 yesno = '!';
5231 what = "Blank";
aaa51d5e 5232 break;
73b437c8
JH
5233 case ANYOF_CNTRL:
5234 if (LOC)
936ed897 5235 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5236 else {
5237 for (value = 0; value < 256; value++)
5238 if (isCNTRL(value))
936ed897 5239 ANYOF_BITMAP_SET(ret, value);
73b437c8 5240 }
c49a72a9
NC
5241 yesno = '+';
5242 what = "Cntrl";
73b437c8
JH
5243 break;
5244 case ANYOF_NCNTRL:
5245 if (LOC)
936ed897 5246 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5247 else {
5248 for (value = 0; value < 256; value++)
5249 if (!isCNTRL(value))
936ed897 5250 ANYOF_BITMAP_SET(ret, value);
73b437c8 5251 }
c49a72a9
NC
5252 yesno = '!';
5253 what = "Cntrl";
ffc61ed2
JH
5254 break;
5255 case ANYOF_DIGIT:
5256 if (LOC)
5257 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5258 else {
5259 /* consecutive digits assumed */
5260 for (value = '0'; value <= '9'; value++)
5261 ANYOF_BITMAP_SET(ret, value);
5262 }
c49a72a9
NC
5263 yesno = '+';
5264 what = "Digit";
ffc61ed2
JH
5265 break;
5266 case ANYOF_NDIGIT:
5267 if (LOC)
5268 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5269 else {
5270 /* consecutive digits assumed */
5271 for (value = 0; value < '0'; value++)
5272 ANYOF_BITMAP_SET(ret, value);
5273 for (value = '9' + 1; value < 256; value++)
5274 ANYOF_BITMAP_SET(ret, value);
5275 }
c49a72a9
NC
5276 yesno = '!';
5277 what = "Digit";
73b437c8
JH
5278 break;
5279 case ANYOF_GRAPH:
5280 if (LOC)
936ed897 5281 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5282 else {
5283 for (value = 0; value < 256; value++)
5284 if (isGRAPH(value))
936ed897 5285 ANYOF_BITMAP_SET(ret, value);
73b437c8 5286 }
c49a72a9
NC
5287 yesno = '+';
5288 what = "Graph";
73b437c8
JH
5289 break;
5290 case ANYOF_NGRAPH:
5291 if (LOC)
936ed897 5292 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5293 else {
5294 for (value = 0; value < 256; value++)
5295 if (!isGRAPH(value))
936ed897 5296 ANYOF_BITMAP_SET(ret, value);
73b437c8 5297 }
c49a72a9
NC
5298 yesno = '!';
5299 what = "Graph";
73b437c8
JH
5300 break;
5301 case ANYOF_LOWER:
5302 if (LOC)
936ed897 5303 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5304 else {
5305 for (value = 0; value < 256; value++)
5306 if (isLOWER(value))
936ed897 5307 ANYOF_BITMAP_SET(ret, value);
73b437c8 5308 }
c49a72a9
NC
5309 yesno = '+';
5310 what = "Lower";
73b437c8
JH
5311 break;
5312 case ANYOF_NLOWER:
5313 if (LOC)
936ed897 5314 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5315 else {
5316 for (value = 0; value < 256; value++)
5317 if (!isLOWER(value))
936ed897 5318 ANYOF_BITMAP_SET(ret, value);
73b437c8 5319 }
c49a72a9
NC
5320 yesno = '!';
5321 what = "Lower";
73b437c8
JH
5322 break;
5323 case ANYOF_PRINT:
5324 if (LOC)
936ed897 5325 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5326 else {
5327 for (value = 0; value < 256; value++)
5328 if (isPRINT(value))
936ed897 5329 ANYOF_BITMAP_SET(ret, value);
73b437c8 5330 }
c49a72a9
NC
5331 yesno = '+';
5332 what = "Print";
73b437c8
JH
5333 break;
5334 case ANYOF_NPRINT:
5335 if (LOC)
936ed897 5336 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5337 else {
5338 for (value = 0; value < 256; value++)
5339 if (!isPRINT(value))
936ed897 5340 ANYOF_BITMAP_SET(ret, value);
73b437c8 5341 }
c49a72a9
NC
5342 yesno = '!';
5343 what = "Print";
73b437c8 5344 break;
aaa51d5e
JF
5345 case ANYOF_PSXSPC:
5346 if (LOC)
5347 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5348 else {
5349 for (value = 0; value < 256; value++)
5350 if (isPSXSPC(value))
5351 ANYOF_BITMAP_SET(ret, value);
5352 }
c49a72a9
NC
5353 yesno = '+';
5354 what = "Space";
aaa51d5e
JF
5355 break;
5356 case ANYOF_NPSXSPC:
5357 if (LOC)
5358 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5359 else {
5360 for (value = 0; value < 256; value++)
5361 if (!isPSXSPC(value))
5362 ANYOF_BITMAP_SET(ret, value);
5363 }
c49a72a9
NC
5364 yesno = '!';
5365 what = "Space";
aaa51d5e 5366 break;
73b437c8
JH
5367 case ANYOF_PUNCT:
5368 if (LOC)
936ed897 5369 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5370 else {
5371 for (value = 0; value < 256; value++)
5372 if (isPUNCT(value))
936ed897 5373 ANYOF_BITMAP_SET(ret, value);
73b437c8 5374 }
c49a72a9
NC
5375 yesno = '+';
5376 what = "Punct";
73b437c8
JH
5377 break;
5378 case ANYOF_NPUNCT:
5379 if (LOC)
936ed897 5380 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5381 else {
5382 for (value = 0; value < 256; value++)
5383 if (!isPUNCT(value))
936ed897 5384 ANYOF_BITMAP_SET(ret, value);
73b437c8 5385 }
c49a72a9
NC
5386 yesno = '!';
5387 what = "Punct";
ffc61ed2
JH
5388 break;
5389 case ANYOF_SPACE:
5390 if (LOC)
5391 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5392 else {
5393 for (value = 0; value < 256; value++)
5394 if (isSPACE(value))
5395 ANYOF_BITMAP_SET(ret, value);
5396 }
c49a72a9
NC
5397 yesno = '+';
5398 what = "SpacePerl";
ffc61ed2
JH
5399 break;
5400 case ANYOF_NSPACE:
5401 if (LOC)
5402 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5403 else {
5404 for (value = 0; value < 256; value++)
5405 if (!isSPACE(value))
5406 ANYOF_BITMAP_SET(ret, value);
5407 }
c49a72a9
NC
5408 yesno = '!';
5409 what = "SpacePerl";
73b437c8
JH
5410 break;
5411 case ANYOF_UPPER:
5412 if (LOC)
936ed897 5413 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5414 else {
5415 for (value = 0; value < 256; value++)
5416 if (isUPPER(value))
936ed897 5417 ANYOF_BITMAP_SET(ret, value);
73b437c8 5418 }
c49a72a9
NC
5419 yesno = '+';
5420 what = "Upper";
73b437c8
JH
5421 break;
5422 case ANYOF_NUPPER:
5423 if (LOC)
936ed897 5424 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5425 else {
5426 for (value = 0; value < 256; value++)
5427 if (!isUPPER(value))
936ed897 5428 ANYOF_BITMAP_SET(ret, value);
73b437c8 5429 }
c49a72a9
NC
5430 yesno = '!';
5431 what = "Upper";
73b437c8
JH
5432 break;
5433 case ANYOF_XDIGIT:
5434 if (LOC)
936ed897 5435 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5436 else {
5437 for (value = 0; value < 256; value++)
5438 if (isXDIGIT(value))
936ed897 5439 ANYOF_BITMAP_SET(ret, value);
73b437c8 5440 }
c49a72a9
NC
5441 yesno = '+';
5442 what = "XDigit";
73b437c8
JH
5443 break;
5444 case ANYOF_NXDIGIT:
5445 if (LOC)
936ed897 5446 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5447 else {
5448 for (value = 0; value < 256; value++)
5449 if (!isXDIGIT(value))
936ed897 5450 ANYOF_BITMAP_SET(ret, value);
73b437c8 5451 }
c49a72a9
NC
5452 yesno = '!';
5453 what = "XDigit";
73b437c8 5454 break;
f81125e2
JP
5455 case ANYOF_MAX:
5456 /* this is to handle \p and \P */
5457 break;
73b437c8 5458 default:
b45f050a 5459 vFAIL("Invalid [::] class");
73b437c8 5460 break;
b8c5462f 5461 }
c49a72a9
NC
5462 if (what) {
5463 /* Strings such as "+utf8::isWord\n" */
5464 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5465 }
b8c5462f 5466 if (LOC)
936ed897 5467 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5468 continue;
a0d0e21e 5469 }
ffc61ed2
JH
5470 } /* end of namedclass \blah */
5471
a0d0e21e 5472 if (range) {
eb160463 5473 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5474 const int w = RExC_parse - rangebegin;
5475 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5476 range = 0; /* not a valid range */
73b437c8 5477 }
a0d0e21e
LW
5478 }
5479 else {
3568d838 5480 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5481 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5482 RExC_parse[1] != ']') {
5483 RExC_parse++;
ffc61ed2
JH
5484
5485 /* a bad range like \w-, [:word:]- ? */
5486 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5487 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5488 const int w =
afd78fd5
JH
5489 RExC_parse >= rangebegin ?
5490 RExC_parse - rangebegin : 0;
830247a4 5491 vWARN4(RExC_parse,
b45f050a 5492 "False [] range \"%*.*s\"",
097eb12c 5493 w, w, rangebegin);
afd78fd5 5494 }
73b437c8 5495 if (!SIZE_ONLY)
936ed897 5496 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5497 } else
ffc61ed2
JH
5498 range = 1; /* yeah, it's a range! */
5499 continue; /* but do it the next time */
a0d0e21e 5500 }
a687059c 5501 }
ffc61ed2 5502
93733859 5503 /* now is the next time */
7f6f358c 5504 stored += (value - prevvalue + 1);
ae5c130c 5505 if (!SIZE_ONLY) {
3568d838 5506 if (prevvalue < 256) {
1df70142 5507 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5508 IV i;
3568d838 5509#ifdef EBCDIC
1b2d223b
JH
5510 /* In EBCDIC [\x89-\x91] should include
5511 * the \x8e but [i-j] should not. */
5512 if (literal_endpoint == 2 &&
5513 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5514 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5515 {
3568d838
JH
5516 if (isLOWER(prevvalue)) {
5517 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5518 if (isLOWER(i))
5519 ANYOF_BITMAP_SET(ret, i);
5520 } else {
3568d838 5521 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5522 if (isUPPER(i))
5523 ANYOF_BITMAP_SET(ret, i);
5524 }
8ada0baa 5525 }
ffc61ed2 5526 else
8ada0baa 5527#endif
a5961de5
JH
5528 for (i = prevvalue; i <= ceilvalue; i++)
5529 ANYOF_BITMAP_SET(ret, i);
3568d838 5530 }
a5961de5 5531 if (value > 255 || UTF) {
1df70142
AL
5532 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5533 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5534
ffc61ed2 5535 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5536 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5537 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5538 prevnatvalue, natvalue);
5539 }
5540 else if (prevnatvalue == natvalue) {
5541 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5542 if (FOLD) {
89ebb4a3 5543 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5544 STRLEN foldlen;
1df70142 5545 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5546
c840d2a2
JH
5547 /* If folding and foldable and a single
5548 * character, insert also the folded version
5549 * to the charclass. */
9e55ce06 5550 if (f != value) {
eb160463 5551 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5552 Perl_sv_catpvf(aTHX_ listsv,
5553 "%04"UVxf"\n", f);
5554 else {
5555 /* Any multicharacter foldings
5556 * require the following transform:
5557 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5558 * where E folds into "pq" and F folds
5559 * into "rst", all other characters
5560 * fold to single characters. We save
5561 * away these multicharacter foldings,
5562 * to be later saved as part of the
5563 * additional "s" data. */
5564 SV *sv;
5565
5566 if (!unicode_alternate)
5567 unicode_alternate = newAV();
5568 sv = newSVpvn((char*)foldbuf, foldlen);
5569 SvUTF8_on(sv);
5570 av_push(unicode_alternate, sv);
5571 }
5572 }
254ba52a 5573
60a8b682
JH
5574 /* If folding and the value is one of the Greek
5575 * sigmas insert a few more sigmas to make the
5576 * folding rules of the sigmas to work right.
5577 * Note that not all the possible combinations
5578 * are handled here: some of them are handled
9e55ce06
JH
5579 * by the standard folding rules, and some of
5580 * them (literal or EXACTF cases) are handled
5581 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5582 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5583 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5584 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5585 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5586 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5587 }
5588 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5589 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5590 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5591 }
5592 }
ffc61ed2 5593 }
1b2d223b
JH
5594#ifdef EBCDIC
5595 literal_endpoint = 0;
5596#endif
8ada0baa 5597 }
ffc61ed2
JH
5598
5599 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5600 }
ffc61ed2 5601
936ed897 5602 if (need_class) {
4f66b38d 5603 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5604 if (SIZE_ONLY)
830247a4 5605 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5606 else
830247a4 5607 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5608 }
ffc61ed2 5609
7f6f358c
YO
5610
5611 if (SIZE_ONLY)
5612 return ret;
5613 /****** !SIZE_ONLY AFTER HERE *********/
5614
5615 if( stored == 1 && value < 256
5616 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5617 ) {
5618 /* optimize single char class to an EXACT node
5619 but *only* when its not a UTF/high char */
5620 RExC_emit = orig_emit;
5621 ret = reg_node(pRExC_state,
5622 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5623 *STRING(ret)= (char)value;
5624 STR_LEN(ret)= 1;
5625 RExC_emit += STR_SZ(1);
5626 return ret;
5627 }
ae5c130c 5628 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5629 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5630 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5631 ) {
a0ed51b3 5632 for (value = 0; value < 256; ++value) {
936ed897 5633 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5634 UV fold = PL_fold[value];
ffc61ed2
JH
5635
5636 if (fold != value)
5637 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5638 }
5639 }
936ed897 5640 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5641 }
ffc61ed2 5642
ae5c130c 5643 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5644 if (optimize_invert &&
ffc61ed2
JH
5645 /* If the only flag is inversion. */
5646 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5647 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5648 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5649 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5650 }
7f6f358c 5651 {
097eb12c 5652 AV * const av = newAV();
ffc61ed2 5653 SV *rv;
9e55ce06 5654 /* The 0th element stores the character class description
6a0407ee 5655 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5656 * to initialize the appropriate swash (which gets stored in
5657 * the 1st element), and also useful for dumping the regnode.
5658 * The 2nd element stores the multicharacter foldings,
6a0407ee 5659 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5660 av_store(av, 0, listsv);
5661 av_store(av, 1, NULL);
9e55ce06 5662 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5663 rv = newRV_noinc((SV*)av);
19860706 5664 n = add_data(pRExC_state, 1, "s");
830247a4 5665 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5666 ARG_SET(ret, n);
a0ed51b3 5667 }
a0ed51b3
LW
5668 return ret;
5669}
5670
76e3520e 5671STATIC char*
830247a4 5672S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5673{
097eb12c 5674 char* const retval = RExC_parse++;
a0d0e21e 5675
4633a7c4 5676 for (;;) {
830247a4
IZ
5677 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5678 RExC_parse[2] == '#') {
e994fd66
AE
5679 while (*RExC_parse != ')') {
5680 if (RExC_parse == RExC_end)
5681 FAIL("Sequence (?#... not terminated");
830247a4 5682 RExC_parse++;
e994fd66 5683 }
830247a4 5684 RExC_parse++;
4633a7c4
LW
5685 continue;
5686 }
e2509266 5687 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5688 if (isSPACE(*RExC_parse)) {
5689 RExC_parse++;
748a9306
LW
5690 continue;
5691 }
830247a4 5692 else if (*RExC_parse == '#') {
e994fd66
AE
5693 while (RExC_parse < RExC_end)
5694 if (*RExC_parse++ == '\n') break;
748a9306
LW
5695 continue;
5696 }
748a9306 5697 }
4633a7c4 5698 return retval;
a0d0e21e 5699 }
a687059c
LW
5700}
5701
5702/*
c277df42 5703- reg_node - emit a node
a0d0e21e 5704*/
76e3520e 5705STATIC regnode * /* Location. */
830247a4 5706S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5707{
97aff369 5708 dVAR;
c277df42 5709 register regnode *ptr;
504618e9 5710 regnode * const ret = RExC_emit;
a687059c 5711
c277df42 5712 if (SIZE_ONLY) {
830247a4
IZ
5713 SIZE_ALIGN(RExC_size);
5714 RExC_size += 1;
a0d0e21e
LW
5715 return(ret);
5716 }
c277df42 5717 NODE_ALIGN_FILL(ret);
a0d0e21e 5718 ptr = ret;
c277df42 5719 FILL_ADVANCE_NODE(ptr, op);
fac92740 5720 if (RExC_offsets) { /* MJD */
ccb2c380 5721 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5722 "reg_node", __LINE__,
5723 reg_name[op],
5724 RExC_emit - RExC_emit_start > RExC_offsets[0]
5725 ? "Overwriting end of array!\n" : "OK",
5726 RExC_emit - RExC_emit_start,
5727 RExC_parse - RExC_start,
5728 RExC_offsets[0]));
ccb2c380 5729 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5730 }
5731
830247a4 5732 RExC_emit = ptr;
a687059c 5733
a0d0e21e 5734 return(ret);
a687059c
LW
5735}
5736
5737/*
a0d0e21e
LW
5738- reganode - emit a node with an argument
5739*/
76e3520e 5740STATIC regnode * /* Location. */
830247a4 5741S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5742{
97aff369 5743 dVAR;
c277df42 5744 register regnode *ptr;
504618e9 5745 regnode * const ret = RExC_emit;
fe14fcc3 5746
c277df42 5747 if (SIZE_ONLY) {
830247a4
IZ
5748 SIZE_ALIGN(RExC_size);
5749 RExC_size += 2;
a0d0e21e
LW
5750 return(ret);
5751 }
fe14fcc3 5752
c277df42 5753 NODE_ALIGN_FILL(ret);
a0d0e21e 5754 ptr = ret;
c277df42 5755 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5756 if (RExC_offsets) { /* MJD */
ccb2c380 5757 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5758 "reganode",
ccb2c380
MP
5759 __LINE__,
5760 reg_name[op],
fac92740
MJD
5761 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5762 "Overwriting end of array!\n" : "OK",
5763 RExC_emit - RExC_emit_start,
5764 RExC_parse - RExC_start,
5765 RExC_offsets[0]));
ccb2c380 5766 Set_Cur_Node_Offset;
fac92740
MJD
5767 }
5768
830247a4 5769 RExC_emit = ptr;
fe14fcc3 5770
a0d0e21e 5771 return(ret);
fe14fcc3
LW
5772}
5773
5774/*
cd439c50 5775- reguni - emit (if appropriate) a Unicode character
a0ed51b3 5776*/
71207a34
AL
5777STATIC STRLEN
5778S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 5779{
97aff369 5780 dVAR;
71207a34 5781 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5782}
5783
5784/*
a0d0e21e
LW
5785- reginsert - insert an operator in front of already-emitted operand
5786*
5787* Means relocating the operand.
5788*/
76e3520e 5789STATIC void
830247a4 5790S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5791{
97aff369 5792 dVAR;
c277df42
IZ
5793 register regnode *src;
5794 register regnode *dst;
5795 register regnode *place;
504618e9 5796 const int offset = regarglen[(U8)op];
b81d288d 5797
22c35a8c 5798/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5799
5800 if (SIZE_ONLY) {
830247a4 5801 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5802 return;
5803 }
a687059c 5804
830247a4
IZ
5805 src = RExC_emit;
5806 RExC_emit += NODE_STEP_REGNODE + offset;
5807 dst = RExC_emit;
fac92740 5808 while (src > opnd) {
c277df42 5809 StructCopy(--src, --dst, regnode);
fac92740 5810 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5811 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5812 "reg_insert",
ccb2c380
MP
5813 __LINE__,
5814 reg_name[op],
fac92740
MJD
5815 dst - RExC_emit_start > RExC_offsets[0]
5816 ? "Overwriting end of array!\n" : "OK",
5817 src - RExC_emit_start,
5818 dst - RExC_emit_start,
5819 RExC_offsets[0]));
ccb2c380
MP
5820 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5821 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5822 }
5823 }
5824
a0d0e21e
LW
5825
5826 place = opnd; /* Op node, where operand used to be. */
fac92740 5827 if (RExC_offsets) { /* MJD */
ccb2c380 5828 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5829 "reginsert",
ccb2c380
MP
5830 __LINE__,
5831 reg_name[op],
fac92740
MJD
5832 place - RExC_emit_start > RExC_offsets[0]
5833 ? "Overwriting end of array!\n" : "OK",
5834 place - RExC_emit_start,
5835 RExC_parse - RExC_start,
5836 RExC_offsets[0]));
ccb2c380 5837 Set_Node_Offset(place, RExC_parse);
45948336 5838 Set_Node_Length(place, 1);
fac92740 5839 }
c277df42
IZ
5840 src = NEXTOPER(place);
5841 FILL_ADVANCE_NODE(place, op);
5842 Zero(src, offset, regnode);
a687059c
LW
5843}
5844
5845/*
c277df42 5846- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 5847- SEE ALSO: regtail_study
a0d0e21e 5848*/
097eb12c 5849/* TODO: All three parms should be const */
76e3520e 5850STATIC void
3dab1dad 5851S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 5852{
97aff369 5853 dVAR;
c277df42 5854 register regnode *scan;
3dab1dad 5855 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 5856
c277df42 5857 if (SIZE_ONLY)
a0d0e21e
LW
5858 return;
5859
5860 /* Find last node. */
5861 scan = p;
5862 for (;;) {
504618e9 5863 regnode * const temp = regnext(scan);
3dab1dad
YO
5864 DEBUG_PARSE_r({
5865 SV * const mysv=sv_newmortal();
5866 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
5867 regprop(RExC_rx, mysv, scan);
5868 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
5869 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
5870 });
5871 if (temp == NULL)
5872 break;
5873 scan = temp;
5874 }
5875
5876 if (reg_off_by_arg[OP(scan)]) {
5877 ARG_SET(scan, val - scan);
5878 }
5879 else {
5880 NEXT_OFF(scan) = val - scan;
5881 }
5882}
5883
5884/*
5885- regtail_study - set the next-pointer at the end of a node chain of p to val.
5886- Look for optimizable sequences at the same time.
5887- currently only looks for EXACT chains.
5888*/
5889/* TODO: All four parms should be const */
5890STATIC U8
5891S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
5892{
5893 dVAR;
5894 register regnode *scan;
5895 U8 exact= PSEUDO;
5896 GET_RE_DEBUG_FLAGS_DECL;
5897
5898 if (SIZE_ONLY)
5899 return exact;
5900
5901 /* Find last node. */
5902
5903 scan = p;
5904 for (;;) {
5905 regnode * const temp = regnext(scan);
5906 if ( exact ) {
5907 switch (OP(scan)) {
5908 case EXACT:
5909 case EXACTF:
5910 case EXACTFL:
5911 if( exact == PSEUDO )
5912 exact= OP(scan);
5913 else if ( exact != OP(scan) )
5914 exact= 0;
5915 case NOTHING:
5916 break;
5917 default:
5918 exact= 0;
5919 }
5920 }
5921 DEBUG_PARSE_r({
5922 SV * const mysv=sv_newmortal();
5923 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
5924 regprop(RExC_rx, mysv, scan);
5925 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
5926 SvPV_nolen_const(mysv),
5927 reg_name[exact],
5928 REG_NODE_NUM(scan));
5929 });
a0d0e21e
LW
5930 if (temp == NULL)
5931 break;
5932 scan = temp;
5933 }
a687059c 5934
c277df42
IZ
5935 if (reg_off_by_arg[OP(scan)]) {
5936 ARG_SET(scan, val - scan);
a0ed51b3
LW
5937 }
5938 else {
c277df42
IZ
5939 NEXT_OFF(scan) = val - scan;
5940 }
3dab1dad
YO
5941
5942 return exact;
a687059c
LW
5943}
5944
5945/*
a687059c
LW
5946 - regcurly - a little FSA that accepts {\d+,?\d*}
5947 */
79072805 5948STATIC I32
5f66b61c 5949S_regcurly(register const char *s)
a687059c
LW
5950{
5951 if (*s++ != '{')
5952 return FALSE;
f0fcb552 5953 if (!isDIGIT(*s))
a687059c 5954 return FALSE;
f0fcb552 5955 while (isDIGIT(*s))
a687059c
LW
5956 s++;
5957 if (*s == ',')
5958 s++;
f0fcb552 5959 while (isDIGIT(*s))
a687059c
LW
5960 s++;
5961 if (*s != '}')
5962 return FALSE;
5963 return TRUE;
5964}
5965
a687059c
LW
5966
5967/*
fd181c75 5968 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5969 */
5970void
097eb12c 5971Perl_regdump(pTHX_ const regexp *r)
a687059c 5972{
35ff7856 5973#ifdef DEBUGGING
97aff369 5974 dVAR;
c445ea15 5975 SV * const sv = sv_newmortal();
a687059c 5976
4f639d21 5977 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5978
5979 /* Header fields of interest. */
c277df42 5980 if (r->anchored_substr)
7b0972df 5981 PerlIO_printf(Perl_debug_log,
a0288114 5982 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5983 PL_colors[0],
7b0972df 5984 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5985 SvPVX_const(r->anchored_substr),
3280af22 5986 PL_colors[1],
c277df42 5987 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5988 (IV)r->anchored_offset);
33b8afdf
JH
5989 else if (r->anchored_utf8)
5990 PerlIO_printf(Perl_debug_log,
a0288114 5991 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5992 PL_colors[0],
5993 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5994 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5995 PL_colors[1],
5996 SvTAIL(r->anchored_utf8) ? "$" : "",
5997 (IV)r->anchored_offset);
c277df42 5998 if (r->float_substr)
7b0972df 5999 PerlIO_printf(Perl_debug_log,
a0288114 6000 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 6001 PL_colors[0],
b81d288d 6002 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 6003 SvPVX_const(r->float_substr),
3280af22 6004 PL_colors[1],
c277df42 6005 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 6006 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
6007 else if (r->float_utf8)
6008 PerlIO_printf(Perl_debug_log,
a0288114 6009 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
6010 PL_colors[0],
6011 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 6012 SvPVX_const(r->float_utf8),
33b8afdf
JH
6013 PL_colors[1],
6014 SvTAIL(r->float_utf8) ? "$" : "",
6015 (IV)r->float_min_offset, (UV)r->float_max_offset);
6016 if (r->check_substr || r->check_utf8)
b81d288d
AB
6017 PerlIO_printf(Perl_debug_log,
6018 r->check_substr == r->float_substr
33b8afdf 6019 && r->check_utf8 == r->float_utf8
c277df42
IZ
6020 ? "(checking floating" : "(checking anchored");
6021 if (r->reganch & ROPT_NOSCAN)
6022 PerlIO_printf(Perl_debug_log, " noscan");
6023 if (r->reganch & ROPT_CHECK_ALL)
6024 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6025 if (r->check_substr || r->check_utf8)
c277df42
IZ
6026 PerlIO_printf(Perl_debug_log, ") ");
6027
46fc3d4c 6028 if (r->regstclass) {
32fc9b6a 6029 regprop(r, sv, r->regstclass);
3f7c398e 6030 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6031 }
774d564b 6032 if (r->reganch & ROPT_ANCH) {
6033 PerlIO_printf(Perl_debug_log, "anchored");
6034 if (r->reganch & ROPT_ANCH_BOL)
6035 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6036 if (r->reganch & ROPT_ANCH_MBOL)
6037 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6038 if (r->reganch & ROPT_ANCH_SBOL)
6039 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6040 if (r->reganch & ROPT_ANCH_GPOS)
6041 PerlIO_printf(Perl_debug_log, "(GPOS)");
6042 PerlIO_putc(Perl_debug_log, ' ');
6043 }
c277df42
IZ
6044 if (r->reganch & ROPT_GPOS_SEEN)
6045 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6046 if (r->reganch & ROPT_SKIP)
760ac839 6047 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6048 if (r->reganch & ROPT_IMPLICIT)
760ac839 6049 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6050 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6051 if (r->reganch & ROPT_EVAL_SEEN)
6052 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6053 PerlIO_printf(Perl_debug_log, "\n");
fac92740 6054 if (r->offsets) {
e4584336 6055 const U32 len = r->offsets[0];
a3621e74
YO
6056 GET_RE_DEBUG_FLAGS_DECL;
6057 DEBUG_OFFSETS_r({
1df70142 6058 U32 i;
e4584336
RB
6059 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
6060 for (i = 1; i <= len; i++)
6061 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
6062 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
6063 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 6064 });
fac92740 6065 }
65e66c80 6066#else
96a5add6 6067 PERL_UNUSED_CONTEXT;
65e66c80 6068 PERL_UNUSED_ARG(r);
17c3b450 6069#endif /* DEBUGGING */
a687059c
LW
6070}
6071
6072/*
a0d0e21e
LW
6073- regprop - printable representation of opcode
6074*/
46fc3d4c 6075void
32fc9b6a 6076Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6077{
35ff7856 6078#ifdef DEBUGGING
97aff369 6079 dVAR;
9b155405 6080 register int k;
a0d0e21e 6081
54dc92de 6082 sv_setpvn(sv, "", 0);
9b155405 6083 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6084 /* It would be nice to FAIL() here, but this may be called from
6085 regexec.c, and it would be hard to supply pRExC_state. */
6086 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6087 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6088
3dab1dad 6089 k = PL_regkind[OP(o)];
9b155405 6090
2a782b5b 6091 if (k == EXACT) {
396482e1 6092 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
6093 /* Using is_utf8_string() is a crude hack but it may
6094 * be the best for now since we have no flag "this EXACTish
6095 * node was UTF-8" --jhi */
1df70142 6096 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 6097 const char * const s = do_utf8 ?
c728cb41
JH
6098 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6099 UNI_DISPLAY_REGEX) :
2a782b5b 6100 STRING(o);
e1ec3a88 6101 const int len = do_utf8 ?
2a782b5b
JH
6102 strlen(s) :
6103 STR_LEN(o);
6104 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6105 PL_colors[0],
6106 len, s,
6107 PL_colors[1]);
bb263b4e 6108 } else if (k == TRIE) {
3dab1dad
YO
6109 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6110 /* print the details of the trie in dumpuntil instead, as
4f639d21 6111 * prog->data isn't available here */
a3621e74 6112 } else if (k == CURLY) {
cb434fcc 6113 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6114 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6115 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6116 }
2c2d71f5
JH
6117 else if (k == WHILEM && o->flags) /* Ordinal/of */
6118 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6119 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6120 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6121 else if (k == LOGICAL)
04ebc1ab 6122 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6123 else if (k == ANYOF) {
6124 int i, rangestart = -1;
2d03de9c 6125 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6126
6127 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6128 static const char * const anyofs[] = {
653099ff
GS
6129 "\\w",
6130 "\\W",
6131 "\\s",
6132 "\\S",
6133 "\\d",
6134 "\\D",
6135 "[:alnum:]",
6136 "[:^alnum:]",
6137 "[:alpha:]",
6138 "[:^alpha:]",
6139 "[:ascii:]",
6140 "[:^ascii:]",
6141 "[:ctrl:]",
6142 "[:^ctrl:]",
6143 "[:graph:]",
6144 "[:^graph:]",
6145 "[:lower:]",
6146 "[:^lower:]",
6147 "[:print:]",
6148 "[:^print:]",
6149 "[:punct:]",
6150 "[:^punct:]",
6151 "[:upper:]",
aaa51d5e 6152 "[:^upper:]",
653099ff 6153 "[:xdigit:]",
aaa51d5e
JF
6154 "[:^xdigit:]",
6155 "[:space:]",
6156 "[:^space:]",
6157 "[:blank:]",
6158 "[:^blank:]"
653099ff
GS
6159 };
6160
19860706 6161 if (flags & ANYOF_LOCALE)
396482e1 6162 sv_catpvs(sv, "{loc}");
19860706 6163 if (flags & ANYOF_FOLD)
396482e1 6164 sv_catpvs(sv, "{i}");
653099ff 6165 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6166 if (flags & ANYOF_INVERT)
396482e1 6167 sv_catpvs(sv, "^");
ffc61ed2
JH
6168 for (i = 0; i <= 256; i++) {
6169 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6170 if (rangestart == -1)
6171 rangestart = i;
6172 } else if (rangestart != -1) {
6173 if (i <= rangestart + 3)
6174 for (; rangestart < i; rangestart++)
653099ff 6175 put_byte(sv, rangestart);
ffc61ed2
JH
6176 else {
6177 put_byte(sv, rangestart);
396482e1 6178 sv_catpvs(sv, "-");
ffc61ed2 6179 put_byte(sv, i - 1);
653099ff 6180 }
ffc61ed2 6181 rangestart = -1;
653099ff 6182 }
847a199f 6183 }
ffc61ed2
JH
6184
6185 if (o->flags & ANYOF_CLASS)
bb7a0f54 6186 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6187 if (ANYOF_CLASS_TEST(o,i))
6188 sv_catpv(sv, anyofs[i]);
6189
6190 if (flags & ANYOF_UNICODE)
396482e1 6191 sv_catpvs(sv, "{unicode}");
1aa99e6b 6192 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6193 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6194
6195 {
6196 SV *lv;
32fc9b6a 6197 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6198
ffc61ed2
JH
6199 if (lv) {
6200 if (sw) {
89ebb4a3 6201 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6202
ffc61ed2 6203 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6204 uvchr_to_utf8(s, i);
ffc61ed2 6205
3568d838 6206 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6207 if (rangestart == -1)
6208 rangestart = i;
6209 } else if (rangestart != -1) {
ffc61ed2
JH
6210 if (i <= rangestart + 3)
6211 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6212 const U8 * const e = uvchr_to_utf8(s,rangestart);
6213 U8 *p;
6214 for(p = s; p < e; p++)
ffc61ed2
JH
6215 put_byte(sv, *p);
6216 }
6217 else {
2d03de9c
AL
6218 const U8 *e = uvchr_to_utf8(s,rangestart);
6219 U8 *p;
6220 for (p = s; p < e; p++)
ffc61ed2 6221 put_byte(sv, *p);
396482e1 6222 sv_catpvs(sv, "-");
2d03de9c
AL
6223 e = uvchr_to_utf8(s, i-1);
6224 for (p = s; p < e; p++)
1df70142 6225 put_byte(sv, *p);
ffc61ed2
JH
6226 }
6227 rangestart = -1;
6228 }
19860706 6229 }
ffc61ed2 6230
396482e1 6231 sv_catpvs(sv, "..."); /* et cetera */
19860706 6232 }
fde631ed 6233
ffc61ed2 6234 {
2e0de35c 6235 char *s = savesvpv(lv);
c445ea15 6236 char * const origs = s;
b81d288d 6237
3dab1dad
YO
6238 while (*s && *s != '\n')
6239 s++;
b81d288d 6240
ffc61ed2 6241 if (*s == '\n') {
2d03de9c 6242 const char * const t = ++s;
ffc61ed2
JH
6243
6244 while (*s) {
6245 if (*s == '\n')
6246 *s = ' ';
6247 s++;
6248 }
6249 if (s[-1] == ' ')
6250 s[-1] = 0;
6251
6252 sv_catpv(sv, t);
fde631ed 6253 }
b81d288d 6254
ffc61ed2 6255 Safefree(origs);
fde631ed
JH
6256 }
6257 }
653099ff 6258 }
ffc61ed2 6259
653099ff
GS
6260 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6261 }
9b155405 6262 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 6263 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80 6264#else
96a5add6 6265 PERL_UNUSED_CONTEXT;
65e66c80
SP
6266 PERL_UNUSED_ARG(sv);
6267 PERL_UNUSED_ARG(o);
17c3b450 6268#endif /* DEBUGGING */
35ff7856 6269}
a687059c 6270
cad2e5aa
JH
6271SV *
6272Perl_re_intuit_string(pTHX_ regexp *prog)
6273{ /* Assume that RE_INTUIT is set */
97aff369 6274 dVAR;
a3621e74 6275 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6276 PERL_UNUSED_CONTEXT;
6277
a3621e74 6278 DEBUG_COMPILE_r(
cfd0369c 6279 {
2d03de9c 6280 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6281 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6282
6283 if (!PL_colorset) reginitcolors();
6284 PerlIO_printf(Perl_debug_log,
a0288114 6285 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6286 PL_colors[4],
6287 prog->check_substr ? "" : "utf8 ",
6288 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6289 s,
6290 PL_colors[1],
6291 (strlen(s) > 60 ? "..." : ""));
6292 } );
6293
33b8afdf 6294 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6295}
6296
2b69d0c2 6297void
864dbfa3 6298Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6299{
27da23d5 6300 dVAR;
9e55ce06 6301#ifdef DEBUGGING
c445ea15 6302 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
9e55ce06 6303#endif
fc32ee4a 6304 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6305
7821416a
IZ
6306 if (!r || (--r->refcnt > 0))
6307 return;
a3621e74 6308 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6309 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6310 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6311 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6312 const int len = SvCUR(dsv);
9e55ce06
JH
6313 if (!PL_colorset)
6314 reginitcolors();
6315 PerlIO_printf(Perl_debug_log,
a3621e74 6316 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6317 PL_colors[4],PL_colors[5],PL_colors[0],
6318 len, len, s,
6319 PL_colors[1],
6320 len > 60 ? "..." : "");
6321 });
cad2e5aa 6322
43c5f42d
NC
6323 /* gcov results gave these as non-null 100% of the time, so there's no
6324 optimisation in checking them before calling Safefree */
6325 Safefree(r->precomp);
6326 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6327 RX_MATCH_COPY_FREE(r);
f8c7b90f 6328#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6329 if (r->saved_copy)
6330 SvREFCNT_dec(r->saved_copy);
6331#endif
a193d654
GS
6332 if (r->substrs) {
6333 if (r->anchored_substr)
6334 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6335 if (r->anchored_utf8)
6336 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6337 if (r->float_substr)
6338 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6339 if (r->float_utf8)
6340 SvREFCNT_dec(r->float_utf8);
2779dcf1 6341 Safefree(r->substrs);
a193d654 6342 }
c277df42
IZ
6343 if (r->data) {
6344 int n = r->data->count;
f3548bdc
DM
6345 PAD* new_comppad = NULL;
6346 PAD* old_comppad;
4026c95a 6347 PADOFFSET refcnt;
dfad63ad 6348
c277df42 6349 while (--n >= 0) {
261faec3 6350 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6351 switch (r->data->what[n]) {
6352 case 's':
6353 SvREFCNT_dec((SV*)r->data->data[n]);
6354 break;
653099ff
GS
6355 case 'f':
6356 Safefree(r->data->data[n]);
6357 break;
dfad63ad
HS
6358 case 'p':
6359 new_comppad = (AV*)r->data->data[n];
6360 break;
c277df42 6361 case 'o':
dfad63ad 6362 if (new_comppad == NULL)
cea2e8a9 6363 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6364 PAD_SAVE_LOCAL(old_comppad,
6365 /* Watch out for global destruction's random ordering. */
c445ea15 6366 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6367 );
b34c0dd4 6368 OP_REFCNT_LOCK;
4026c95a
SH
6369 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6370 OP_REFCNT_UNLOCK;
6371 if (!refcnt)
9b978d73 6372 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6373
f3548bdc 6374 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6375 SvREFCNT_dec((SV*)new_comppad);
6376 new_comppad = NULL;
c277df42
IZ
6377 break;
6378 case 'n':
9e55ce06 6379 break;
a3621e74
YO
6380 case 't':
6381 {
c445ea15 6382 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6383 U32 refcount;
6384 OP_REFCNT_LOCK;
e27afef8 6385 refcount = --trie->refcount;
a3621e74
YO
6386 OP_REFCNT_UNLOCK;
6387 if ( !refcount ) {
43c5f42d 6388 Safefree(trie->charmap);
a3621e74
YO
6389 if (trie->widecharmap)
6390 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6391 Safefree(trie->states);
6392 Safefree(trie->trans);
3dab1dad
YO
6393 if (trie->bitmap)
6394 Safefree(trie->bitmap);
a3621e74
YO
6395#ifdef DEBUGGING
6396 if (trie->words)
6397 SvREFCNT_dec((SV*)trie->words);
6398 if (trie->revcharmap)
6399 SvREFCNT_dec((SV*)trie->revcharmap);
6400#endif
6401 Safefree(r->data->data[n]); /* do this last!!!! */
6402 }
6403 break;
6404 }
c277df42 6405 default:
830247a4 6406 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6407 }
6408 }
6409 Safefree(r->data->what);
6410 Safefree(r->data);
a0d0e21e
LW
6411 }
6412 Safefree(r->startp);
6413 Safefree(r->endp);
6414 Safefree(r);
a687059c 6415}
c277df42 6416
76234dfb 6417#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6418/*
6419 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6420 */
6421regnode *
864dbfa3 6422Perl_regnext(pTHX_ register regnode *p)
c277df42 6423{
97aff369 6424 dVAR;
c277df42
IZ
6425 register I32 offset;
6426
3280af22 6427 if (p == &PL_regdummy)
c277df42
IZ
6428 return(NULL);
6429
6430 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6431 if (offset == 0)
6432 return(NULL);
6433
c277df42 6434 return(p+offset);
c277df42 6435}
76234dfb 6436#endif
c277df42 6437
01f988be 6438STATIC void
cea2e8a9 6439S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6440{
6441 va_list args;
6442 STRLEN l1 = strlen(pat1);
6443 STRLEN l2 = strlen(pat2);
6444 char buf[512];
06bf62c7 6445 SV *msv;
73d840c0 6446 const char *message;
c277df42
IZ
6447
6448 if (l1 > 510)
6449 l1 = 510;
6450 if (l1 + l2 > 510)
6451 l2 = 510 - l1;
6452 Copy(pat1, buf, l1 , char);
6453 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6454 buf[l1 + l2] = '\n';
6455 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6456#ifdef I_STDARG
6457 /* ANSI variant takes additional second argument */
c277df42 6458 va_start(args, pat2);
8736538c
AS
6459#else
6460 va_start(args);
6461#endif
5a844595 6462 msv = vmess(buf, &args);
c277df42 6463 va_end(args);
cfd0369c 6464 message = SvPV_const(msv,l1);
c277df42
IZ
6465 if (l1 > 512)
6466 l1 = 512;
6467 Copy(message, buf, l1 , char);
197cf9b9 6468 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6469 Perl_croak(aTHX_ "%s", buf);
c277df42 6470}
a0ed51b3
LW
6471
6472/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6473
76234dfb 6474#ifndef PERL_IN_XSUB_RE
a0ed51b3 6475void
864dbfa3 6476Perl_save_re_context(pTHX)
b81d288d 6477{
97aff369 6478 dVAR;
1ade1aa1
NC
6479
6480 struct re_save_state *state;
6481
6482 SAVEVPTR(PL_curcop);
6483 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6484
6485 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6486 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6487 SSPUSHINT(SAVEt_RE_STATE);
6488
46ab3289 6489 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6490
a0ed51b3 6491 PL_reg_start_tmp = 0;
a0ed51b3 6492 PL_reg_start_tmpl = 0;
c445ea15 6493 PL_reg_oldsaved = NULL;
a5db57d6 6494 PL_reg_oldsavedlen = 0;
a5db57d6 6495 PL_reg_maxiter = 0;
a5db57d6 6496 PL_reg_leftiter = 0;
c445ea15 6497 PL_reg_poscache = NULL;
a5db57d6 6498 PL_reg_poscache_size = 0;
1ade1aa1
NC
6499#ifdef PERL_OLD_COPY_ON_WRITE
6500 PL_nrs = NULL;
6501#endif
ada6e8a9 6502
c445ea15
AL
6503 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6504 if (PL_curpm) {
6505 const REGEXP * const rx = PM_GETRE(PL_curpm);
6506 if (rx) {
1df70142 6507 U32 i;
ada6e8a9 6508 for (i = 1; i <= rx->nparens; i++) {
1df70142 6509 char digits[TYPE_CHARS(long)];
d9fad198 6510 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6511 GV *const *const gvp
6512 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6513
b37c2d43
AL
6514 if (gvp) {
6515 GV * const gv = *gvp;
6516 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6517 save_scalar(gv);
49f27e4b 6518 }
ada6e8a9
AMS
6519 }
6520 }
6521 }
a0ed51b3 6522}
76234dfb 6523#endif
51371543 6524
51371543 6525static void
acfe0abc 6526clear_re(pTHX_ void *r)
51371543 6527{
97aff369 6528 dVAR;
51371543
GS
6529 ReREFCNT_dec((regexp *)r);
6530}
ffbc6a93 6531
a28509cc
AL
6532#ifdef DEBUGGING
6533
6534STATIC void
6535S_put_byte(pTHX_ SV *sv, int c)
6536{
6537 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6538 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6539 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6540 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6541 else
6542 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6543}
6544
6545
3dab1dad
YO
6546#define CLEAR_OPTSTART \
6547 if (optstart) STMT_START { \
6548 PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart); \
6549 optstart=NULL; \
6550 } STMT_END
6551
6552#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6553
b5a2f8d8
NC
6554STATIC const regnode *
6555S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6556 const regnode *last, SV* sv, I32 l)
a28509cc 6557{
97aff369 6558 dVAR;
a28509cc 6559 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6560 register const regnode *next;
3dab1dad
YO
6561 const regnode *optstart= NULL;
6562 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6563
6564 while (op != END && (!last || node < last)) {
6565 /* While that wasn't END last time... */
6566
6567 NODE_ALIGN(node);
6568 op = OP(node);
6569 if (op == CLOSE)
6570 l--;
b5a2f8d8 6571 next = regnext((regnode *)node);
a28509cc 6572 /* Where, what. */
8e11feef 6573 if (OP(node) == OPTIMIZED) {
3dab1dad 6574 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_OPTIMISE))
8e11feef 6575 optstart = node;
3dab1dad 6576 else
8e11feef 6577 goto after_print;
3dab1dad
YO
6578 } else
6579 CLEAR_OPTSTART;
32fc9b6a 6580 regprop(r, sv, node);
a28509cc
AL
6581 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6582 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6583
6584 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
6585 if (next == NULL) /* Next ptr. */
6586 PerlIO_printf(Perl_debug_log, "(0)");
6587 else
6588 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6589 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6590 }
6591
a28509cc
AL
6592 after_print:
6593 if (PL_regkind[(U8)op] == BRANCHJ) {
b5a2f8d8
NC
6594 register const regnode *nnode = (OP(next) == LONGJMP
6595 ? regnext((regnode *)next)
6596 : next);
a28509cc
AL
6597 if (last && nnode > last)
6598 nnode = last;
3dab1dad 6599 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a28509cc
AL
6600 }
6601 else if (PL_regkind[(U8)op] == BRANCH) {
3dab1dad 6602 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6603 }
6604 else if ( PL_regkind[(U8)op] == TRIE ) {
6605 const I32 n = ARG(node);
4f639d21 6606 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6607 const I32 arry_len = av_len(trie->words)+1;
6608 I32 word_idx;
6609 PerlIO_printf(Perl_debug_log,
8e11feef
RGS
6610 "%*s[Start:%"UVuf" Words:%d Chars:%d Unique:%d States:%"IVdf" Minlen:%d Maxlen:%d",
6611 (int)(2*(l+3)),
6612 "",
6613 trie->startstate,
6614 TRIE_WORDCOUNT(trie),
6615 (int)TRIE_CHARCOUNT(trie),
6616 trie->uniquecharcount,
6617 (IV)TRIE_LASTSTATE(trie)-1,
6618 trie->minlen, trie->maxlen
6619 );
3dab1dad
YO
6620 if (trie->bitmap) {
6621 int i;
6622 int rangestart= -1;
6623 sv_setpvn(sv, "", 0);
6624 for (i = 0; i <= 256; i++) {
6625 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6626 if (rangestart == -1)
6627 rangestart = i;
6628 } else if (rangestart != -1) {
6629 if (i <= rangestart + 3)
6630 for (; rangestart < i; rangestart++)
6631 put_byte(sv, rangestart);
6632 else {
6633 put_byte(sv, rangestart);
6634 sv_catpvs(sv, "-");
6635 put_byte(sv, i - 1);
6636 }
6637 rangestart = -1;
6638 }
6639 }
6640 PerlIO_printf(Perl_debug_log, " Start-Class:%s]\n", SvPVX_const(sv));
6641 } else
6642 PerlIO_printf(Perl_debug_log, " No Start-Class]\n");
a28509cc
AL
6643
6644 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6645 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
6646 if (elem_ptr) {
6647 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6648 (int)(2*(l+4)), "",
6649 PL_colors[0],
cfd0369c 6650 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6651 PL_colors[1]
6652 );
6653 /*
6654 if (next == NULL)
6655 PerlIO_printf(Perl_debug_log, "(0)\n");
6656 else
6657 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6658 */
6659 }
6660
6661 }
6662
6663 node = NEXTOPER(node);
6664 node += regarglen[(U8)op];
6665
6666 }
6667 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 6668 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6669 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6670 }
6671 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
3dab1dad 6672 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6673 next, sv, l + 1);
6674 }
6675 else if ( op == PLUS || op == STAR) {
3dab1dad 6676 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
6677 }
6678 else if (op == ANYOF) {
6679 /* arglen 1 + class block */
6680 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6681 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6682 node = NEXTOPER(node);
6683 }
6684 else if (PL_regkind[(U8)op] == EXACT) {
6685 /* Literal string, where present. */
6686 node += NODE_SZ_STR(node) - 1;
6687 node = NEXTOPER(node);
6688 }
6689 else {
6690 node = NEXTOPER(node);
6691 node += regarglen[(U8)op];
6692 }
6693 if (op == CURLYX || op == OPEN)
6694 l++;
6695 else if (op == WHILEM)
6696 l--;
6697 }
3dab1dad 6698 CLEAR_OPTSTART;
a28509cc
AL
6699 return node;
6700}
6701
6702#endif /* DEBUGGING */
6703
241d1a3b
NC
6704/*
6705 * Local variables:
6706 * c-indentation-style: bsd
6707 * c-basic-offset: 4
6708 * indent-tabs-mode: t
6709 * End:
6710 *
37442d52
RGS
6711 * ex: set ts=8 sts=4 sw=4 noet:
6712 */