This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
New authors
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
174#define TRIE_STCLASS
175#endif
2c2d71f5
JH
176/* Length of a variant. */
177
178typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
cb434fcc 195 I32 *last_closep;
653099ff 196 struct regnode_charclass_class *start_class;
2c2d71f5
JH
197} scan_data_t;
198
a687059c 199/*
e50aee73 200 * Forward declarations for pregcomp()'s friends.
a687059c 201 */
a0d0e21e 202
27da23d5
JH
203static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
205
206#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
207#define SF_BEFORE_SEOL 0x0001
208#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
209#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
09b7f37c
CB
212#ifdef NO_UNARY_PLUS
213# define SF_FIX_SHIFT_EOL (0+2)
214# define SF_FL_SHIFT_EOL (0+4)
215#else
216# define SF_FIX_SHIFT_EOL (+2)
217# define SF_FL_SHIFT_EOL (+4)
218#endif
c277df42
IZ
219
220#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
225#define SF_IS_INF 0x0040
226#define SF_HAS_PAR 0x0080
227#define SF_IN_PAR 0x0100
228#define SF_HAS_EVAL 0x0200
229#define SCF_DO_SUBSTR 0x0400
653099ff
GS
230#define SCF_DO_STCLASS_AND 0x0800
231#define SCF_DO_STCLASS_OR 0x1000
232#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 233#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 234
07be1b83
YO
235#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
07be1b83 393 * Position is 1 indexed.
fac92740
MJD
394 */
395
ccb2c380
MP
396#define Set_Node_Offset_To_R(node,byte) STMT_START { \
397 if (! SIZE_ONLY) { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 399 __LINE__, (node), (int)(byte))); \
ccb2c380 400 if((node) < 0) { \
551405c4 401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
402 } else { \
403 RExC_offsets[2*(node)-1] = (byte); \
404 } \
405 } \
406} STMT_END
407
408#define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
411
412#define Set_Node_Length_To_R(node,len) STMT_START { \
413 if (! SIZE_ONLY) { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 415 __LINE__, (int)(node), (int)(len))); \
ccb2c380 416 if((node) < 0) { \
551405c4 417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
418 } else { \
419 RExC_offsets[2*(node)] = (len); \
420 } \
421 } \
422} STMT_END
423
424#define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427#define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
429
430/* Get offsets and lengths */
431#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
433
07be1b83
YO
434#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
437} STMT_END
438
439
440#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441#define EXPERIMENTAL_INPLACESCAN
442#endif
443
acfe0abc 444static void clear_re(pTHX_ void *r);
4327152a 445
653099ff
GS
446/* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
449
4327152a 450STATIC void
097eb12c 451S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 452{
e1ec3a88
AL
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 455
c277df42 456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 457 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
b81d288d 461 data->flags
c277df42
IZ
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
463 else
464 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
465 }
466 else {
c277df42 467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
468 data->offset_float_max = (l
469 ? data->last_start_max
c277df42 470 : data->pos_min + data->pos_delta);
9051bda5
HS
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
c277df42 473 if (data->flags & SF_BEFORE_EOL)
b81d288d 474 data->flags
c277df42
IZ
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
476 else
477 data->flags &= ~SF_FL_BEFORE_EOL;
478 }
479 }
480 SvCUR_set(data->last_found, 0);
0eda9292 481 {
a28509cc 482 SV * const sv = data->last_found;
097eb12c
AL
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
485 if (mg)
486 mg->mg_len = 0;
487 }
0eda9292 488 }
c277df42
IZ
489 data->last_end = -1;
490 data->flags &= ~SF_BEFORE_EOL;
491}
492
653099ff
GS
493/* Can match anything (initialization) */
494STATIC void
097eb12c 495S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 496{
653099ff 497 ANYOF_CLASS_ZERO(cl);
f8bef550 498 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
500 if (LOC)
501 cl->flags |= ANYOF_LOCALE;
502}
503
504/* Can match anything (initialization) */
505STATIC int
5f66b61c 506S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
507{
508 int value;
509
aaa51d5e 510 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
512 return 1;
1aa99e6b
IH
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
514 return 0;
10edeb5d 515 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 516 return 0;
653099ff
GS
517 return 1;
518}
519
520/* Can match anything (initialization) */
521STATIC void
097eb12c 522S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
8ecf7187 524 Zero(cl, 1, struct regnode_charclass_class);
653099ff 525 cl->type = ANYOF;
830247a4 526 cl_anything(pRExC_state, cl);
653099ff
GS
527}
528
529STATIC void
097eb12c 530S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 531{
8ecf7187 532 Zero(cl, 1, struct regnode_charclass_class);
653099ff 533 cl->type = ANYOF;
830247a4 534 cl_anything(pRExC_state, cl);
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* 'And' a given class with another one. Can create false positives */
540/* We assume that cl is not inverted */
541STATIC void
5f66b61c 542S_cl_and(struct regnode_charclass_class *cl,
a28509cc 543 const struct regnode_charclass_class *and_with)
653099ff 544{
653099ff
GS
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
550 int i;
551
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
555 else
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
1aa99e6b 561
14ebb1a2
JH
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
567 }
14ebb1a2
JH
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 570 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 573 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
574}
575
576/* 'OR' a given class with another one. Can create false positives */
577/* We assume that cl is not inverted */
578STATIC void
097eb12c 579S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 580{
653099ff
GS
581 if (or_with->flags & ANYOF_INVERT) {
582 /* We do not use
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
590 */
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
594 int i;
595
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
599 else {
830247a4 600 cl_anything(pRExC_state, cl);
653099ff
GS
601 }
602 } else {
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 605 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
606 || (cl->flags & ANYOF_FOLD)) ) {
607 int i;
608
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
616 }
617 }
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 619 cl_anything(pRExC_state, cl);
653099ff
GS
620 }
621 }
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
624
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
633 }
653099ff
GS
634}
635
5d1c421c 636/*
a3621e74 637
3dab1dad 638 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 646 depth : indent depth
a3621e74
YO
647
648Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
649
650A trie is an N'ary tree where the branches are determined by digital
651decomposition of the key. IE, at the root node you look up the 1st character and
652follow that branch repeat until you find the end of the branches. Nodes can be
653marked as "accepting" meaning they represent a complete word. Eg:
654
655 /he|she|his|hers/
656
657would convert into the following structure. Numbers represent states, letters
658following numbers represent valid transitions on the letter from that state, if
659the number is in square brackets it represents an accepting state, otherwise it
660will be in parenthesis.
661
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
663 | |
664 | (2)
665 | |
666 (1) +-i->(6)-+-s->[7]
667 |
668 +-s->(3)-+-h->(4)-+-e->[5]
669
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
671
672This shows that when matching against the string 'hers' we will begin at state 1
673read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676single traverse. We store a mapping from accepting to state to which word was
677matched, and then when we have multiple possibilities we try to complete the
678rest of the regex in the order in which they occured in the alternation.
679
680The only prior NFA like behaviour that would be changed by the TRIE support is
681the silent ignoring of duplicate alternations which are of the form:
682
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
684
685Thus EVAL blocks follwing a trie may be called a different number of times with
686and without the optimisation. With the optimisations dupes will be silently
687ignored. This inconsistant behaviour of EVAL type nodes is well established as
688the following demonstrates:
689
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
691
692which prints out 'word' three times, but
693
694 'words'=~/(word|word|word)(?{ print $1 })S/
695
696which doesnt print it out at all. This is due to other optimisations kicking in.
697
698Example of what happens on a structural level:
699
700The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: BRANCH(8)
704 6: EXACT <ac>(16)
705 8: BRANCH(11)
706 9: EXACT <ad>(16)
707 11: BRANCH(14)
708 12: EXACT <ab>(16)
709 16: SUCCEED(0)
710 17: NOTHING(18)
711 18: END(0)
712
713This would be optimizable with startbranch=5, first=5, last=16, tail=16
714and should turn into:
715
716 1: CURLYM[1] {1,32767}(18)
717 5: TRIE(16)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
719 <ac>
720 <ad>
721 <ab>
722 16: SUCCEED(0)
723 17: NOTHING(18)
724 18: END(0)
725
726Cases where tail != last would be like /(?foo|bar)baz/:
727
728 1: BRANCH(4)
729 2: EXACT <foo>(8)
730 4: BRANCH(7)
731 5: EXACT <bar>(8)
732 7: TAIL(8)
733 8: EXACT <baz>(10)
734 10: END(0)
735
736which would be optimizable with startbranch=1, first=1, last=7, tail=8
737and would end up looking like:
738
739 1: TRIE(8)
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
741 <foo>
742 <bar>
743 7: TAIL(8)
744 8: EXACT <baz>(10)
745 10: END(0)
746
3dab1dad
YO
747 d = uvuni_to_utf8_flags(d, uv, 0);
748
749is the recommended Unicode-aware way of saying
750
751 *(d++) = uv;
a3621e74
YO
752*/
753
3dab1dad
YO
754#define TRIE_STORE_REVCHAR \
755 STMT_START { \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
ab3bbdeb 757 if (UTF) SvUTF8_on(tmp); \
3dab1dad
YO
758 av_push( TRIE_REVCHARMAP(trie), tmp ); \
759 } STMT_END
a3621e74
YO
760
761#define TRIE_READ_CHAR STMT_START { \
07be1b83 762 wordlen++; \
a3621e74
YO
763 if ( UTF ) { \
764 if ( folder ) { \
765 if ( foldlen > 0 ) { \
766 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
767 foldlen -= len; \
768 scan += len; \
769 len = 0; \
770 } else { \
e1ec3a88 771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
772 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
773 foldlen -= UNISKIP( uvc ); \
774 scan = foldbuf + UNISKIP( uvc ); \
775 } \
776 } else { \
e1ec3a88 777 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
778 } \
779 } else { \
780 uvc = (U32)*uc; \
781 len = 1; \
782 } \
783} STMT_END
784
785
786#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
787#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
788#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
789#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790
791#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
792 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
793 TRIE_LIST_LEN( state ) *= 2; \
794 Renew( trie->states[ state ].trans.list, \
795 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 } \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
798 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
799 TRIE_LIST_CUR( state )++; \
800} STMT_END
801
802#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 803 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
804 4, reg_trie_trans_le ); \
805 TRIE_LIST_CUR( state ) = 1; \
806 TRIE_LIST_LEN( state ) = 4; \
807} STMT_END
808
07be1b83
YO
809#define TRIE_HANDLE_WORD(state) STMT_START { \
810 if ( !trie->states[ state ].wordnum ) { \
811 /* we haven't inserted this word into the structure yet. */ \
812 if (trie->wordlen) \
813 trie->wordlen[ curword ] = wordlen; \
3dab1dad
YO
814 trie->states[ state ].wordnum = ++curword; \
815 DEBUG_r({ \
816 /* store the word for dumping */ \
817 SV* tmp; \
07be1b83
YO
818 if (OP(noper) != NOTHING) \
819 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
3dab1dad 820 else \
07be1b83 821 tmp = newSVpvn( "", 0 ); \
3dab1dad
YO
822 if ( UTF ) SvUTF8_on( tmp ); \
823 av_push( trie->words, tmp ); \
824 }); \
825 } else { \
826 NOOP; /* It's a dupe. So ignore it. */ \
827 } \
828} STMT_END
829
830#ifdef DEBUGGING
07be1b83 831/*
3dab1dad
YO
832 dump_trie(trie)
833 dump_trie_interim_list(trie,next_alloc)
834 dump_trie_interim_table(trie,next_alloc)
835
836 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
837 The _interim_ variants are used for debugging the interim
838 tables that are used to generate the final compressed
839 representation which is what dump_trie expects.
840
3dab1dad
YO
841 Part of the reason for their existance is to provide a form
842 of documentation as to how the different representations function.
07be1b83
YO
843
844*/
3dab1dad
YO
845
846/*
847 dump_trie(trie)
848 Dumps the final compressed table form of the trie to Perl_debug_log.
849 Used for debugging make_trie().
850*/
851
852STATIC void
853S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
854{
855 U32 state;
ab3bbdeb
YO
856 SV *sv=sv_newmortal();
857 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
858 GET_RE_DEBUG_FLAGS_DECL;
859
ab3bbdeb 860
3dab1dad
YO
861 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
862 (int)depth * 2 + 2,"",
863 "Match","Base","Ofs" );
864
865 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 866 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 867 if ( tmp ) {
ab3bbdeb
YO
868 PerlIO_printf( Perl_debug_log, "%*s",
869 colwidth,
ddc5bc0f 870 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
871 PL_colors[0], PL_colors[1],
872 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
873 PERL_PV_ESCAPE_FIRSTCHAR
874 )
875 );
3dab1dad
YO
876 }
877 }
878 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
879 (int)depth * 2 + 2,"");
880
881 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 882 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
883 PerlIO_printf( Perl_debug_log, "\n");
884
885 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
be8e71aa 886 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
887
888 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
889
890 if ( trie->states[ state ].wordnum ) {
891 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
892 } else {
893 PerlIO_printf( Perl_debug_log, "%6s", "" );
894 }
895
896 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
897
898 if ( base ) {
899 U32 ofs = 0;
900
901 while( ( base + ofs < trie->uniquecharcount ) ||
902 ( base + ofs - trie->uniquecharcount < trie->lasttrans
903 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
904 ofs++;
905
906 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
907
908 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
909 if ( ( base + ofs >= trie->uniquecharcount ) &&
910 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
911 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
912 {
ab3bbdeb
YO
913 PerlIO_printf( Perl_debug_log, "%*"UVXf,
914 colwidth,
3dab1dad
YO
915 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
916 } else {
ab3bbdeb 917 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
918 }
919 }
920
921 PerlIO_printf( Perl_debug_log, "]");
922
923 }
924 PerlIO_printf( Perl_debug_log, "\n" );
925 }
926}
927/*
928 dump_trie_interim_list(trie,next_alloc)
929 Dumps a fully constructed but uncompressed trie in list form.
930 List tries normally only are used for construction when the number of
931 possible chars (trie->uniquecharcount) is very high.
932 Used for debugging make_trie().
933*/
934STATIC void
935S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
936{
937 U32 state;
ab3bbdeb
YO
938 SV *sv=sv_newmortal();
939 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
940 GET_RE_DEBUG_FLAGS_DECL;
941 /* print out the table precompression. */
ab3bbdeb
YO
942 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
943 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
944 "------:-----+-----------------\n" );
3dab1dad
YO
945
946 for( state=1 ; state < next_alloc ; state ++ ) {
947 U16 charid;
948
ab3bbdeb 949 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
950 (int)depth * 2 + 2,"", (UV)state );
951 if ( ! trie->states[ state ].wordnum ) {
952 PerlIO_printf( Perl_debug_log, "%5s| ","");
953 } else {
954 PerlIO_printf( Perl_debug_log, "W%4x| ",
955 trie->states[ state ].wordnum
956 );
957 }
958 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 959 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
960 if ( tmp ) {
961 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
962 colwidth,
ddc5bc0f 963 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
964 PL_colors[0], PL_colors[1],
965 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
966 PERL_PV_ESCAPE_FIRSTCHAR
967 ) ,
3dab1dad
YO
968 TRIE_LIST_ITEM(state,charid).forid,
969 (UV)TRIE_LIST_ITEM(state,charid).newstate
970 );
971 }
ab3bbdeb
YO
972 }
973 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
974 }
975}
976
977/*
978 dump_trie_interim_table(trie,next_alloc)
979 Dumps a fully constructed but uncompressed trie in table form.
980 This is the normal DFA style state transition table, with a few
981 twists to facilitate compression later.
982 Used for debugging make_trie().
983*/
984STATIC void
985S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
986{
987 U32 state;
988 U16 charid;
ab3bbdeb
YO
989 SV *sv=sv_newmortal();
990 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
991 GET_RE_DEBUG_FLAGS_DECL;
992
993 /*
994 print out the table precompression so that we can do a visual check
995 that they are identical.
996 */
997
998 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
999
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 1001 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 1002 if ( tmp ) {
ab3bbdeb
YO
1003 PerlIO_printf( Perl_debug_log, "%*s",
1004 colwidth,
ddc5bc0f 1005 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1006 PL_colors[0], PL_colors[1],
1007 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1008 PERL_PV_ESCAPE_FIRSTCHAR
1009 )
1010 );
3dab1dad
YO
1011 }
1012 }
1013
1014 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1015
1016 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1017 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1018 }
1019
1020 PerlIO_printf( Perl_debug_log, "\n" );
1021
1022 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1023
1024 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1025 (int)depth * 2 + 2,"",
1026 (UV)TRIE_NODENUM( state ) );
1027
1028 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1029 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1030 if (v)
1031 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1032 else
1033 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1034 }
1035 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1036 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1037 } else {
1038 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1039 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1040 }
1041 }
07be1b83 1042}
3dab1dad
YO
1043
1044#endif
1045
07be1b83
YO
1046#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1047 ( ( base + charid >= ucharcount \
1048 && base + charid < ubound \
1049 && state == trie->trans[ base - ucharcount + charid ].check \
1050 && trie->trans[ base - ucharcount + charid ].next ) \
1051 ? trie->trans[ base - ucharcount + charid ].next \
1052 : ( state==1 ? special : 0 ) \
1053 )
3dab1dad 1054
07be1b83
YO
1055STATIC void
1056S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1057{
1058/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1059
1060 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1061 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1062 ISBN 0-201-10088-6
1063
1064 We find the fail state for each state in the trie, this state is the longest proper
1065 suffix of the current states 'word' that is also a proper prefix of another word in our
1066 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1067 the DFA not to have to restart after its tried and failed a word at a given point, it
1068 simply continues as though it had been matching the other word in the first place.
1069 Consider
1070 'abcdgu'=~/abcdefg|cdgu/
1071 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1072 fail, which would bring use to the state representing 'd' in the second word where we would
1073 try 'g' and succeed, prodceding to match 'cdgu'.
1074 */
1075 /* add a fail transition */
1076 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1077 U32 *q;
be8e71aa
YO
1078 const U32 ucharcount = trie->uniquecharcount;
1079 const U32 numstates = trie->laststate;
1080 const U32 ubound = trie->lasttrans + ucharcount;
07be1b83
YO
1081 U32 q_read = 0;
1082 U32 q_write = 0;
1083 U32 charid;
1084 U32 base = trie->states[ 1 ].trans.base;
07be1b83
YO
1085 U32 *fail;
1086 reg_ac_data *aho;
1087 const U32 data_slot = add_data( pRExC_state, 1, "T" );
72f13be8 1088 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
1089#ifndef DEBUGGING
1090 PERL_UNUSED_ARG(depth);
1091#endif
72f13be8 1092
07be1b83
YO
1093
1094 ARG_SET( stclass, data_slot );
1095 Newxz( aho, 1, reg_ac_data );
1096 RExC_rx->data->data[ data_slot ] = (void*)aho;
1097 aho->trie=trie;
1098 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1099 (trie->laststate+1)*sizeof(reg_trie_state));
1100 Newxz( q, numstates, U32);
1101 Newxz( aho->fail, numstates, U32 );
283d8f99
YO
1102 aho->refcount = 1;
1103 fail = aho->fail;
07be1b83
YO
1104 fail[ 0 ] = fail[ 1 ] = 1;
1105
1106 for ( charid = 0; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1107 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1108 if ( newstate ) {
07be1b83
YO
1109 q[ q_write ] = newstate;
1110 /* set to point at the root */
1111 fail[ q[ q_write++ ] ]=1;
1112 }
1113 }
1114 while ( q_read < q_write) {
be8e71aa 1115 const U32 cur = q[ q_read++ % numstates ];
07be1b83
YO
1116 base = trie->states[ cur ].trans.base;
1117
1118 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1119 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1120 if (ch_state) {
07be1b83
YO
1121 U32 fail_state = cur;
1122 U32 fail_base;
1123 do {
1124 fail_state = fail[ fail_state ];
1125 fail_base = aho->states[ fail_state ].trans.base;
1126 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1127
1128 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1129 fail[ ch_state ] = fail_state;
1130 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1131 {
1132 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1133 }
1134 q[ q_write++ % numstates] = ch_state;
1135 }
1136 }
1137 }
1138
1139 DEBUG_TRIE_COMPILE_MORE_r({
1140 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1141 for( q_read=2; q_read<numstates; q_read++ ) {
1142 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1143 }
1144 PerlIO_printf(Perl_debug_log, "\n");
1145 });
1146 Safefree(q);
1147 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1148}
3dab1dad
YO
1149
1150
1151
a3621e74 1152STATIC I32
3dab1dad 1153S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1154{
27da23d5 1155 dVAR;
a3621e74
YO
1156 /* first pass, loop through and scan words */
1157 reg_trie_data *trie;
1158 regnode *cur;
9f7f3913 1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1160 STRLEN len = 0;
1161 UV uvc = 0;
1162 U16 curword = 0;
1163 U32 next_alloc = 0;
1164 /* we just use folder as a flag in utf8 */
e1ec3a88 1165 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1166 ? PL_fold
1167 : ( flags == EXACTFL
1168 ? PL_fold_locale
1169 : NULL
1170 )
1171 );
1172
e1ec3a88 1173 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1174 SV *re_trie_maxbuff;
3dab1dad
YO
1175#ifndef DEBUGGING
1176 /* these are only used during construction but are useful during
8e11feef
RGS
1177 * debugging so we store them in the struct when debugging.
1178 * Wordcount is actually superfluous in debugging as we have
1179 * (AV*)trie->words to use for it, but that's not available when
1180 * not debugging... We could make the macro use the AV during
1181 * debugging though...
1182 */
3dab1dad
YO
1183 U16 trie_wordcount=0;
1184 STRLEN trie_charcount=0;
07be1b83 1185 /*U32 trie_laststate=0;*/
3dab1dad
YO
1186 AV *trie_revcharmap;
1187#endif
a3621e74 1188 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1189#ifndef DEBUGGING
1190 PERL_UNUSED_ARG(depth);
1191#endif
a3621e74 1192
a02a5408 1193 Newxz( trie, 1, reg_trie_data );
a3621e74 1194 trie->refcount = 1;
3dab1dad 1195 trie->startstate = 1;
a3621e74 1196 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1197 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1198 if (!(UTF && folder))
1199 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1200 DEBUG_r({
1201 trie->words = newAV();
a3621e74 1202 });
3dab1dad 1203 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1204
0111c4fd 1205 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1206 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1207 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1208 }
3dab1dad
YO
1209 DEBUG_OPTIMISE_r({
1210 PerlIO_printf( Perl_debug_log,
1211 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1212 (int)depth * 2 + 2, "",
1213 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1214 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1215 });
a3621e74
YO
1216 /* -- First loop and Setup --
1217
1218 We first traverse the branches and scan each word to determine if it
1219 contains widechars, and how many unique chars there are, this is
1220 important as we have to build a table with at least as many columns as we
1221 have unique chars.
1222
1223 We use an array of integers to represent the character codes 0..255
1224 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1225 native representation of the character value as the key and IV's for the
1226 coded index.
1227
1228 *TODO* If we keep track of how many times each character is used we can
1229 remap the columns so that the table compression later on is more
1230 efficient in terms of memory by ensuring most common value is in the
1231 middle and the least common are on the outside. IMO this would be better
1232 than a most to least common mapping as theres a decent chance the most
1233 common letter will share a node with the least common, meaning the node
1234 will not be compressable. With a middle is most common approach the worst
1235 case is when we have the least common nodes twice.
1236
1237 */
1238
a3621e74 1239 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1240 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1241 const U8 *uc = (U8*)STRING( noper );
a28509cc 1242 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1243 STRLEN foldlen = 0;
1244 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1245 const U8 *scan = (U8*)NULL;
07be1b83 1246 U32 wordlen = 0; /* required init */
3dab1dad 1247 STRLEN chars=0;
a3621e74 1248
3dab1dad
YO
1249 TRIE_WORDCOUNT(trie)++;
1250 if (OP(noper) == NOTHING) {
1251 trie->minlen= 0;
1252 continue;
1253 }
1254 if (trie->bitmap) {
1255 TRIE_BITMAP_SET(trie,*uc);
1256 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1257 }
a3621e74 1258 for ( ; uc < e ; uc += len ) {
3dab1dad 1259 TRIE_CHARCOUNT(trie)++;
a3621e74 1260 TRIE_READ_CHAR;
3dab1dad 1261 chars++;
a3621e74
YO
1262 if ( uvc < 256 ) {
1263 if ( !trie->charmap[ uvc ] ) {
1264 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1265 if ( folder )
1266 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1267 TRIE_STORE_REVCHAR;
a3621e74
YO
1268 }
1269 } else {
1270 SV** svpp;
1271 if ( !trie->widecharmap )
1272 trie->widecharmap = newHV();
1273
1274 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1275
1276 if ( !svpp )
e4584336 1277 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1278
1279 if ( !SvTRUE( *svpp ) ) {
1280 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1281 TRIE_STORE_REVCHAR;
a3621e74
YO
1282 }
1283 }
1284 }
3dab1dad
YO
1285 if( cur == first ) {
1286 trie->minlen=chars;
1287 trie->maxlen=chars;
1288 } else if (chars < trie->minlen) {
1289 trie->minlen=chars;
1290 } else if (chars > trie->maxlen) {
1291 trie->maxlen=chars;
1292 }
1293
a3621e74
YO
1294 } /* end first pass */
1295 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1296 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1297 (int)depth * 2 + 2,"",
1298 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
be8e71aa
YO
1299 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1300 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1301 );
07be1b83 1302 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
a3621e74
YO
1303
1304 /*
1305 We now know what we are dealing with in terms of unique chars and
1306 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1307 representation using a flat table will take. If it's over a reasonable
1308 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1309 conservative but potentially much slower representation using an array
1310 of lists.
1311
1312 At the end we convert both representations into the same compressed
1313 form that will be used in regexec.c for matching with. The latter
1314 is a form that cannot be used to construct with but has memory
1315 properties similar to the list form and access properties similar
1316 to the table form making it both suitable for fast searches and
1317 small enough that its feasable to store for the duration of a program.
1318
1319 See the comment in the code where the compressed table is produced
1320 inplace from the flat tabe representation for an explanation of how
1321 the compression works.
1322
1323 */
1324
1325
3dab1dad 1326 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1327 /*
1328 Second Pass -- Array Of Lists Representation
1329
1330 Each state will be represented by a list of charid:state records
1331 (reg_trie_trans_le) the first such element holds the CUR and LEN
1332 points of the allocated array. (See defines above).
1333
1334 We build the initial structure using the lists, and then convert
1335 it into the compressed table form which allows faster lookups
1336 (but cant be modified once converted).
a3621e74
YO
1337 */
1338
a3621e74
YO
1339 STRLEN transcount = 1;
1340
3dab1dad 1341 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1342 TRIE_LIST_NEW(1);
1343 next_alloc = 2;
1344
1345 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1346
c445ea15
AL
1347 regnode * const noper = NEXTOPER( cur );
1348 U8 *uc = (U8*)STRING( noper );
1349 const U8 * const e = uc + STR_LEN( noper );
1350 U32 state = 1; /* required init */
1351 U16 charid = 0; /* sanity init */
1352 U8 *scan = (U8*)NULL; /* sanity init */
1353 STRLEN foldlen = 0; /* required init */
07be1b83 1354 U32 wordlen = 0; /* required init */
c445ea15
AL
1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1356
3dab1dad 1357 if (OP(noper) != NOTHING) {
c445ea15
AL
1358 for ( ; uc < e ; uc += len ) {
1359
1360 TRIE_READ_CHAR;
1361
1362 if ( uvc < 256 ) {
1363 charid = trie->charmap[ uvc ];
1364 } else {
1365 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1366 if ( !svpp ) {
1367 charid = 0;
1368 } else {
1369 charid=(U16)SvIV( *svpp );
1370 }
1371 }
1372 if ( charid ) {
a3621e74 1373
c445ea15
AL
1374 U16 check;
1375 U32 newstate = 0;
a3621e74 1376
c445ea15
AL
1377 charid--;
1378 if ( !trie->states[ state ].trans.list ) {
1379 TRIE_LIST_NEW( state );
1380 }
1381 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1382 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1383 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1384 break;
1385 }
1386 }
1387 if ( ! newstate ) {
1388 newstate = next_alloc++;
1389 TRIE_LIST_PUSH( state, charid, newstate );
1390 transcount++;
1391 }
1392 state = newstate;
1393 } else {
1394 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1395 }
c445ea15
AL
1396 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1397 }
c445ea15 1398 }
3dab1dad 1399 TRIE_HANDLE_WORD(state);
a3621e74
YO
1400
1401 } /* end second pass */
1402
3dab1dad 1403 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1404 Renew( trie->states, next_alloc, reg_trie_state );
1405
3dab1dad
YO
1406 /* and now dump it out before we compress it */
1407 DEBUG_TRIE_COMPILE_MORE_r(
1408 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1409 );
a3621e74 1410
a02a5408 1411 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1412 {
1413 U32 state;
a3621e74
YO
1414 U32 tp = 0;
1415 U32 zp = 0;
1416
1417
1418 for( state=1 ; state < next_alloc ; state ++ ) {
1419 U32 base=0;
1420
1421 /*
1422 DEBUG_TRIE_COMPILE_MORE_r(
1423 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1424 );
1425 */
1426
1427 if (trie->states[state].trans.list) {
1428 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1429 U16 maxid=minid;
a28509cc 1430 U16 idx;
a3621e74
YO
1431
1432 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1433 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1434 if ( forid < minid ) {
1435 minid=forid;
1436 } else if ( forid > maxid ) {
1437 maxid=forid;
1438 }
a3621e74
YO
1439 }
1440 if ( transcount < tp + maxid - minid + 1) {
1441 transcount *= 2;
1442 Renew( trie->trans, transcount, reg_trie_trans );
1443 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1444 }
1445 base = trie->uniquecharcount + tp - minid;
1446 if ( maxid == minid ) {
1447 U32 set = 0;
1448 for ( ; zp < tp ; zp++ ) {
1449 if ( ! trie->trans[ zp ].next ) {
1450 base = trie->uniquecharcount + zp - minid;
1451 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1452 trie->trans[ zp ].check = state;
1453 set = 1;
1454 break;
1455 }
1456 }
1457 if ( !set ) {
1458 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1459 trie->trans[ tp ].check = state;
1460 tp++;
1461 zp = tp;
1462 }
1463 } else {
1464 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1465 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1466 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1467 trie->trans[ tid ].check = state;
1468 }
1469 tp += ( maxid - minid + 1 );
1470 }
1471 Safefree(trie->states[ state ].trans.list);
1472 }
1473 /*
1474 DEBUG_TRIE_COMPILE_MORE_r(
1475 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1476 );
1477 */
1478 trie->states[ state ].trans.base=base;
1479 }
cc601c31 1480 trie->lasttrans = tp + 1;
a3621e74
YO
1481 }
1482 } else {
1483 /*
1484 Second Pass -- Flat Table Representation.
1485
1486 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1487 We know that we will need Charcount+1 trans at most to store the data
1488 (one row per char at worst case) So we preallocate both structures
1489 assuming worst case.
1490
1491 We then construct the trie using only the .next slots of the entry
1492 structs.
1493
1494 We use the .check field of the first entry of the node temporarily to
1495 make compression both faster and easier by keeping track of how many non
1496 zero fields are in the node.
1497
1498 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1499 transition.
1500
1501 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1502 number representing the first entry of the node, and state as a
1503 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1504 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1505 are 2 entrys per node. eg:
1506
1507 A B A B
1508 1. 2 4 1. 3 7
1509 2. 0 3 3. 0 5
1510 3. 0 0 5. 0 0
1511 4. 0 0 7. 0 0
1512
1513 The table is internally in the right hand, idx form. However as we also
1514 have to deal with the states array which is indexed by nodenum we have to
1515 use TRIE_NODENUM() to convert.
1516
1517 */
1518
3dab1dad
YO
1519
1520 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1521 reg_trie_trans );
3dab1dad 1522 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1523 next_alloc = trie->uniquecharcount + 1;
1524
3dab1dad 1525
a3621e74
YO
1526 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1527
c445ea15 1528 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1529 const U8 *uc = (U8*)STRING( noper );
1530 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1531
1532 U32 state = 1; /* required init */
1533
1534 U16 charid = 0; /* sanity init */
1535 U32 accept_state = 0; /* sanity init */
1536 U8 *scan = (U8*)NULL; /* sanity init */
1537
1538 STRLEN foldlen = 0; /* required init */
07be1b83 1539 U32 wordlen = 0; /* required init */
a3621e74
YO
1540 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1541
3dab1dad 1542 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1543 for ( ; uc < e ; uc += len ) {
1544
1545 TRIE_READ_CHAR;
1546
1547 if ( uvc < 256 ) {
1548 charid = trie->charmap[ uvc ];
1549 } else {
c445ea15
AL
1550 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1551 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1552 }
1553 if ( charid ) {
1554 charid--;
1555 if ( !trie->trans[ state + charid ].next ) {
1556 trie->trans[ state + charid ].next = next_alloc;
1557 trie->trans[ state ].check++;
1558 next_alloc += trie->uniquecharcount;
1559 }
1560 state = trie->trans[ state + charid ].next;
1561 } else {
e4584336 1562 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1563 }
1564 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1565 }
a3621e74 1566 }
3dab1dad
YO
1567 accept_state = TRIE_NODENUM( state );
1568 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1569
1570 } /* end second pass */
1571
3dab1dad
YO
1572 /* and now dump it out before we compress it */
1573 DEBUG_TRIE_COMPILE_MORE_r(
1574 dump_trie_interim_table(trie,next_alloc,depth+1)
1575 );
a3621e74 1576
a3621e74
YO
1577 {
1578 /*
1579 * Inplace compress the table.*
1580
1581 For sparse data sets the table constructed by the trie algorithm will
1582 be mostly 0/FAIL transitions or to put it another way mostly empty.
1583 (Note that leaf nodes will not contain any transitions.)
1584
1585 This algorithm compresses the tables by eliminating most such
1586 transitions, at the cost of a modest bit of extra work during lookup:
1587
1588 - Each states[] entry contains a .base field which indicates the
1589 index in the state[] array wheres its transition data is stored.
1590
1591 - If .base is 0 there are no valid transitions from that node.
1592
1593 - If .base is nonzero then charid is added to it to find an entry in
1594 the trans array.
1595
1596 -If trans[states[state].base+charid].check!=state then the
1597 transition is taken to be a 0/Fail transition. Thus if there are fail
1598 transitions at the front of the node then the .base offset will point
1599 somewhere inside the previous nodes data (or maybe even into a node
1600 even earlier), but the .check field determines if the transition is
1601 valid.
1602
1603 The following process inplace converts the table to the compressed
1604 table: We first do not compress the root node 1,and mark its all its
1605 .check pointers as 1 and set its .base pointer as 1 as well. This
1606 allows to do a DFA construction from the compressed table later, and
1607 ensures that any .base pointers we calculate later are greater than
1608 0.
1609
1610 - We set 'pos' to indicate the first entry of the second node.
1611
1612 - We then iterate over the columns of the node, finding the first and
1613 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1614 and set the .check pointers accordingly, and advance pos
1615 appropriately and repreat for the next node. Note that when we copy
1616 the next pointers we have to convert them from the original
1617 NODEIDX form to NODENUM form as the former is not valid post
1618 compression.
1619
1620 - If a node has no transitions used we mark its base as 0 and do not
1621 advance the pos pointer.
1622
1623 - If a node only has one transition we use a second pointer into the
1624 structure to fill in allocated fail transitions from other states.
1625 This pointer is independent of the main pointer and scans forward
1626 looking for null transitions that are allocated to a state. When it
1627 finds one it writes the single transition into the "hole". If the
1628 pointer doesnt find one the single transition is appeneded as normal.
1629
1630 - Once compressed we can Renew/realloc the structures to release the
1631 excess space.
1632
1633 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1634 specifically Fig 3.47 and the associated pseudocode.
1635
1636 demq
1637 */
a3b680e6 1638 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1639 U32 state, charid;
a3621e74 1640 U32 pos = 0, zp=0;
3dab1dad 1641 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1642
1643 for ( state = 1 ; state < laststate ; state++ ) {
1644 U8 flag = 0;
a28509cc
AL
1645 const U32 stateidx = TRIE_NODEIDX( state );
1646 const U32 o_used = trie->trans[ stateidx ].check;
1647 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1648 trie->trans[ stateidx ].check = 0;
1649
1650 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1651 if ( flag || trie->trans[ stateidx + charid ].next ) {
1652 if ( trie->trans[ stateidx + charid ].next ) {
1653 if (o_used == 1) {
1654 for ( ; zp < pos ; zp++ ) {
1655 if ( ! trie->trans[ zp ].next ) {
1656 break;
1657 }
1658 }
1659 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1660 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1661 trie->trans[ zp ].check = state;
1662 if ( ++zp > pos ) pos = zp;
1663 break;
1664 }
1665 used--;
1666 }
1667 if ( !flag ) {
1668 flag = 1;
1669 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1670 }
1671 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1672 trie->trans[ pos ].check = state;
1673 pos++;
1674 }
1675 }
1676 }
cc601c31 1677 trie->lasttrans = pos + 1;
a3621e74
YO
1678 Renew( trie->states, laststate + 1, reg_trie_state);
1679 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1680 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1681 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1682 (int)depth * 2 + 2,"",
1683 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1684 (IV)next_alloc,
1685 (IV)pos,
a3621e74
YO
1686 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1687 );
1688
1689 } /* end table compress */
1690 }
cc601c31
YO
1691 /* resize the trans array to remove unused space */
1692 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1693
3dab1dad
YO
1694 /* and now dump out the compressed format */
1695 DEBUG_TRIE_COMPILE_r(
1696 dump_trie(trie,depth+1)
1697 );
07be1b83 1698
3dab1dad
YO
1699 { /* Modify the program and insert the new TRIE node*/
1700 regnode *convert;
1701 U8 nodetype =(U8)(flags & 0xFF);
1702 char *str=NULL;
07be1b83
YO
1703#ifdef DEBUGGING
1704 U32 mjd_offset;
1705 U32 mjd_nodelen;
1706#endif
a3621e74 1707 /*
3dab1dad
YO
1708 This means we convert either the first branch or the first Exact,
1709 depending on whether the thing following (in 'last') is a branch
1710 or not and whther first is the startbranch (ie is it a sub part of
1711 the alternation or is it the whole thing.)
1712 Assuming its a sub part we conver the EXACT otherwise we convert
1713 the whole branch sequence, including the first.
a3621e74 1714 */
3dab1dad
YO
1715 /* Find the node we are going to overwrite */
1716 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1717 /* whole branch chain */
3dab1dad 1718 convert = first;
07be1b83
YO
1719 DEBUG_r({
1720 const regnode *nop = NEXTOPER( convert );
1721 mjd_offset= Node_Offset((nop));
1722 mjd_nodelen= Node_Length((nop));
1723 });
1724 } else {
1725 /* branch sub-chain */
3dab1dad
YO
1726 convert = NEXTOPER( first );
1727 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1728 DEBUG_r({
1729 mjd_offset= Node_Offset((convert));
1730 mjd_nodelen= Node_Length((convert));
1731 });
1732 }
1733 DEBUG_OPTIMISE_r(
1734 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1735 (int)depth * 2 + 2, "",
1736 mjd_offset,mjd_nodelen)
1737 );
a3621e74 1738
3dab1dad
YO
1739 /* But first we check to see if there is a common prefix we can
1740 split out as an EXACT and put in front of the TRIE node. */
1741 trie->startstate= 1;
1742 if ( trie->bitmap && !trie->widecharmap ) {
1743 U32 state;
1744 DEBUG_OPTIMISE_r(
8e11feef
RGS
1745 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1746 (int)depth * 2 + 2, "",
1747 TRIE_LASTSTATE(trie))
1748 );
1749 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1750 U32 ofs = 0;
8e11feef
RGS
1751 I32 idx = -1;
1752 U32 count = 0;
1753 const U32 base = trie->states[ state ].trans.base;
a3621e74 1754
3dab1dad 1755 if ( trie->states[state].wordnum )
8e11feef 1756 count = 1;
a3621e74 1757
8e11feef 1758 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1759 if ( ( base + ofs >= trie->uniquecharcount ) &&
1760 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1761 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1762 {
3dab1dad 1763 if ( ++count > 1 ) {
8e11feef 1764 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1765 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1766 if ( state == 1 ) break;
3dab1dad
YO
1767 if ( count == 2 ) {
1768 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1769 DEBUG_OPTIMISE_r(
8e11feef
RGS
1770 PerlIO_printf(Perl_debug_log,
1771 "%*sNew Start State=%"UVuf" Class: [",
1772 (int)depth * 2 + 2, "",
3dab1dad 1773 state));
be8e71aa
YO
1774 if (idx >= 0) {
1775 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1776 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1777
3dab1dad 1778 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1779 if ( folder )
1780 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1781 DEBUG_OPTIMISE_r(
07be1b83 1782 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1783 );
8e11feef
RGS
1784 }
1785 }
1786 TRIE_BITMAP_SET(trie,*ch);
1787 if ( folder )
1788 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1789 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1790 }
1791 idx = ofs;
1792 }
3dab1dad
YO
1793 }
1794 if ( count == 1 ) {
1795 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1796 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1797 DEBUG_OPTIMISE_r(
8e11feef
RGS
1798 PerlIO_printf( Perl_debug_log,
1799 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1800 (int)depth * 2 + 2, "",
1801 state, idx, ch)
3dab1dad
YO
1802 );
1803 if ( state==1 ) {
1804 OP( convert ) = nodetype;
1805 str=STRING(convert);
1806 STR_LEN(convert)=0;
1807 }
1808 *str++=*ch;
1809 STR_LEN(convert)++;
a3621e74 1810
8e11feef 1811 } else {
f9049ba1 1812#ifdef DEBUGGING
8e11feef
RGS
1813 if (state>1)
1814 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1815#endif
8e11feef
RGS
1816 break;
1817 }
1818 }
3dab1dad 1819 if (str) {
8e11feef 1820 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1821 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1822 trie->startstate = state;
07be1b83
YO
1823 trie->minlen -= (state - 1);
1824 trie->maxlen -= (state - 1);
1825 DEBUG_r({
1826 regnode *fix = convert;
1827 mjd_nodelen++;
1828 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1829 while( ++fix < n ) {
1830 Set_Node_Offset_Length(fix, 0, 0);
1831 }
1832 });
8e11feef
RGS
1833 if (trie->maxlen) {
1834 convert = n;
1835 } else {
3dab1dad
YO
1836 NEXT_OFF(convert) = (U16)(tail - convert);
1837 }
1838 }
1839 }
1840 if ( trie->maxlen ) {
1841 OP( convert ) = TRIE;
8e11feef
RGS
1842 NEXT_OFF( convert ) = (U16)(tail - convert);
1843 ARG_SET( convert, data_slot );
a3621e74 1844
3dab1dad
YO
1845 /* store the type in the flags */
1846 convert->flags = nodetype;
1847 /* XXX We really should free up the resource in trie now, as we wont use them */
1848 }
a3621e74
YO
1849 /* needed for dumping*/
1850 DEBUG_r({
1851 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
07be1b83
YO
1852 regnode *opt = convert;
1853 while (++opt<optimize) {
1854 Set_Node_Offset_Length(opt,0,0);
1855 }
a3621e74
YO
1856 /* We now need to mark all of the space originally used by the
1857 branches as optimized away. This keeps the dumpuntil from
1858 throwing a wobbly as it doesnt use regnext() to traverse the
1859 opcodes.
07be1b83 1860 We also "fix" the offsets
a3621e74
YO
1861 */
1862 while( optimize < last ) {
07be1b83 1863 mjd_nodelen += Node_Length((optimize));
a3621e74 1864 OP( optimize ) = OPTIMIZED;
07be1b83 1865 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1866 optimize++;
1867 }
07be1b83 1868 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1869 });
1870 } /* end node insert */
07be1b83 1871#ifndef DEBUGGING
6e8b4190 1872 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1873#endif
a3621e74
YO
1874 return 1;
1875}
1876
a3621e74 1877/*
5d1c421c
JH
1878 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1879 * These need to be revisited when a newer toolchain becomes available.
1880 */
1881#if defined(__sparc64__) && defined(__GNUC__)
1882# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1883# undef SPARC64_GCC_WORKAROUND
1884# define SPARC64_GCC_WORKAROUND 1
1885# endif
1886#endif
1887
07be1b83
YO
1888#define DEBUG_PEEP(str,scan,depth) \
1889 DEBUG_OPTIMISE_r({ \
1890 SV * const mysv=sv_newmortal(); \
1891 regnode *Next = regnext(scan); \
1892 regprop(RExC_rx, mysv, scan); \
1893 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1894 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1895 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1896 });
1897
1898#define JOIN_EXACT(scan,min,flags) \
1899 if (PL_regkind[OP(scan)] == EXACT) \
1900 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1901
be8e71aa 1902STATIC U32
07be1b83
YO
1903S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1904 /* Merge several consecutive EXACTish nodes into one. */
1905 regnode *n = regnext(scan);
1906 U32 stringok = 1;
1907 regnode *next = scan + NODE_SZ_STR(scan);
1908 U32 merged = 0;
1909 U32 stopnow = 0;
1910#ifdef DEBUGGING
1911 regnode *stop = scan;
72f13be8 1912 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 1913#else
d47053eb
RGS
1914 PERL_UNUSED_ARG(depth);
1915#endif
1916#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
1917 PERL_UNUSED_ARG(flags);
1918 PERL_UNUSED_ARG(val);
07be1b83 1919#endif
07be1b83
YO
1920 DEBUG_PEEP("join",scan,depth);
1921
1922 /* Skip NOTHING, merge EXACT*. */
1923 while (n &&
1924 ( PL_regkind[OP(n)] == NOTHING ||
1925 (stringok && (OP(n) == OP(scan))))
1926 && NEXT_OFF(n)
1927 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1928
1929 if (OP(n) == TAIL || n > next)
1930 stringok = 0;
1931 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
1932 DEBUG_PEEP("skip:",n,depth);
1933 NEXT_OFF(scan) += NEXT_OFF(n);
1934 next = n + NODE_STEP_REGNODE;
1935#ifdef DEBUGGING
1936 if (stringok)
1937 stop = n;
1938#endif
1939 n = regnext(n);
1940 }
1941 else if (stringok) {
1942 const int oldl = STR_LEN(scan);
1943 regnode * const nnext = regnext(n);
1944
1945 DEBUG_PEEP("merg",n,depth);
1946
1947 merged++;
1948 if (oldl + STR_LEN(n) > U8_MAX)
1949 break;
1950 NEXT_OFF(scan) += NEXT_OFF(n);
1951 STR_LEN(scan) += STR_LEN(n);
1952 next = n + NODE_SZ_STR(n);
1953 /* Now we can overwrite *n : */
1954 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1955#ifdef DEBUGGING
1956 stop = next - 1;
1957#endif
1958 n = nnext;
1959 if (stopnow) break;
1960 }
1961
d47053eb
RGS
1962#ifdef EXPERIMENTAL_INPLACESCAN
1963 if (flags && !NEXT_OFF(n)) {
1964 DEBUG_PEEP("atch", val, depth);
1965 if (reg_off_by_arg[OP(n)]) {
1966 ARG_SET(n, val - n);
1967 }
1968 else {
1969 NEXT_OFF(n) = val - n;
1970 }
1971 stopnow = 1;
1972 }
07be1b83
YO
1973#endif
1974 }
1975
1976 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1977 /*
1978 Two problematic code points in Unicode casefolding of EXACT nodes:
1979
1980 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1981 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1982
1983 which casefold to
1984
1985 Unicode UTF-8
1986
1987 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1988 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1989
1990 This means that in case-insensitive matching (or "loose matching",
1991 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1992 length of the above casefolded versions) can match a target string
1993 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1994 This would rather mess up the minimum length computation.
1995
1996 What we'll do is to look for the tail four bytes, and then peek
1997 at the preceding two bytes to see whether we need to decrease
1998 the minimum length by four (six minus two).
1999
2000 Thanks to the design of UTF-8, there cannot be false matches:
2001 A sequence of valid UTF-8 bytes cannot be a subsequence of
2002 another valid sequence of UTF-8 bytes.
2003
2004 */
2005 char * const s0 = STRING(scan), *s, *t;
2006 char * const s1 = s0 + STR_LEN(scan) - 1;
2007 char * const s2 = s1 - 4;
e294cc5d
JH
2008#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2009 const char t0[] = "\xaf\x49\xaf\x42";
2010#else
07be1b83 2011 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2012#endif
07be1b83
YO
2013 const char * const t1 = t0 + 3;
2014
2015 for (s = s0 + 2;
2016 s < s2 && (t = ninstr(s, s1, t0, t1));
2017 s = t + 4) {
e294cc5d
JH
2018#ifdef EBCDIC
2019 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2020 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2021#else
07be1b83
YO
2022 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2023 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2024#endif
07be1b83
YO
2025 *min -= 4;
2026 }
2027 }
2028
2029#ifdef DEBUGGING
2030 /* Allow dumping */
2031 n = scan + NODE_SZ_STR(scan);
2032 while (n <= stop) {
2033 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2034 OP(n) = OPTIMIZED;
2035 NEXT_OFF(n) = 0;
2036 }
2037 n++;
2038 }
2039#endif
2040 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2041 return stopnow;
2042}
2043
653099ff
GS
2044/* REx optimizer. Converts nodes into quickier variants "in place".
2045 Finds fixed substrings. */
2046
a0288114 2047/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2048 to the position after last scanned or to NULL. */
2049
07be1b83
YO
2050
2051
76e3520e 2052STATIC I32
9a957fbc
AL
2053S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2054 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2055 /* scanp: Start here (read-write). */
2056 /* deltap: Write maxlen-minlen here. */
2057 /* last: Stop before this one. */
2058{
97aff369 2059 dVAR;
c277df42
IZ
2060 I32 min = 0, pars = 0, code;
2061 regnode *scan = *scanp, *next;
2062 I32 delta = 0;
2063 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2064 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2065 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2066 scan_data_t data_fake;
653099ff 2067 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
2068 SV *re_trie_maxbuff = NULL;
2069
2070 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2071#ifdef DEBUGGING
2072 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2073#endif
b81d288d 2074
c277df42
IZ
2075 while (scan && OP(scan) != END && scan < last) {
2076 /* Peephole optimizer: */
07be1b83 2077 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2078
07be1b83 2079 JOIN_EXACT(scan,&min,0);
a3621e74 2080
653099ff
GS
2081 /* Follow the next-chain of the current node and optimize
2082 away all the NOTHINGs from it. */
c277df42 2083 if (OP(scan) != CURLYX) {
a3b680e6 2084 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2085 ? I32_MAX
2086 /* I32 may be smaller than U16 on CRAYs! */
2087 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2088 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2089 int noff;
2090 regnode *n = scan;
b81d288d 2091
c277df42
IZ
2092 /* Skip NOTHING and LONGJMP. */
2093 while ((n = regnext(n))
3dab1dad 2094 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2095 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2096 && off + noff < max)
2097 off += noff;
2098 if (reg_off_by_arg[OP(scan)])
2099 ARG(scan) = off;
b81d288d 2100 else
c277df42
IZ
2101 NEXT_OFF(scan) = off;
2102 }
a3621e74 2103
07be1b83 2104
3dab1dad 2105
653099ff
GS
2106 /* The principal pseudo-switch. Cannot be a switch, since we
2107 look into several different things. */
b81d288d 2108 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2109 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2110 next = regnext(scan);
2111 code = OP(scan);
a3621e74 2112 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2113
2114 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 2115 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2116 struct regnode_charclass_class accum;
d4c19fe8 2117 regnode * const startbranch=scan;
c277df42 2118
653099ff 2119 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2120 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2121 if (flags & SCF_DO_STCLASS)
830247a4 2122 cl_init_zero(pRExC_state, &accum);
a3621e74 2123
c277df42 2124 while (OP(scan) == code) {
830247a4 2125 I32 deltanext, minnext, f = 0, fake;
653099ff 2126 struct regnode_charclass_class this_class;
c277df42
IZ
2127
2128 num++;
2129 data_fake.flags = 0;
b81d288d 2130 if (data) {
2c2d71f5 2131 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2132 data_fake.last_closep = data->last_closep;
2133 }
2134 else
2135 data_fake.last_closep = &fake;
c277df42
IZ
2136 next = regnext(scan);
2137 scan = NEXTOPER(scan);
2138 if (code != BRANCH)
2139 scan = NEXTOPER(scan);
653099ff 2140 if (flags & SCF_DO_STCLASS) {
830247a4 2141 cl_init(pRExC_state, &this_class);
653099ff
GS
2142 data_fake.start_class = &this_class;
2143 f = SCF_DO_STCLASS_AND;
b81d288d 2144 }
e1901655
IZ
2145 if (flags & SCF_WHILEM_VISITED_POS)
2146 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2147
653099ff 2148 /* we suppose the run is continuous, last=next...*/
830247a4 2149 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2150 next, &data_fake, f,depth+1);
b81d288d 2151 if (min1 > minnext)
c277df42
IZ
2152 min1 = minnext;
2153 if (max1 < minnext + deltanext)
2154 max1 = minnext + deltanext;
2155 if (deltanext == I32_MAX)
aca2d497 2156 is_inf = is_inf_internal = 1;
c277df42
IZ
2157 scan = next;
2158 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2159 pars++;
3dab1dad
YO
2160 if (data) {
2161 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2162 data->flags |= SF_HAS_EVAL;
2c2d71f5 2163 data->whilem_c = data_fake.whilem_c;
3dab1dad 2164 }
653099ff 2165 if (flags & SCF_DO_STCLASS)
830247a4 2166 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2167 if (code == SUSPEND)
c277df42
IZ
2168 break;
2169 }
2170 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2171 min1 = 0;
2172 if (flags & SCF_DO_SUBSTR) {
2173 data->pos_min += min1;
2174 data->pos_delta += max1 - min1;
2175 if (max1 != min1 || is_inf)
2176 data->longest = &(data->longest_float);
2177 }
2178 min += min1;
2179 delta += max1 - min1;
653099ff 2180 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2181 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2182 if (min1) {
2183 cl_and(data->start_class, &and_with);
2184 flags &= ~SCF_DO_STCLASS;
2185 }
2186 }
2187 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2188 if (min1) {
2189 cl_and(data->start_class, &accum);
653099ff 2190 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2191 }
2192 else {
b81d288d 2193 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2194 * data->start_class */
2195 StructCopy(data->start_class, &and_with,
2196 struct regnode_charclass_class);
2197 flags &= ~SCF_DO_STCLASS_AND;
2198 StructCopy(&accum, data->start_class,
2199 struct regnode_charclass_class);
2200 flags |= SCF_DO_STCLASS_OR;
2201 data->start_class->flags |= ANYOF_EOS;
2202 }
653099ff 2203 }
a3621e74
YO
2204
2205 /* demq.
2206
2207 Assuming this was/is a branch we are dealing with: 'scan' now
2208 points at the item that follows the branch sequence, whatever
2209 it is. We now start at the beginning of the sequence and look
2210 for subsequences of
2211
2212 BRANCH->EXACT=>X
2213 BRANCH->EXACT=>X
2214
2215 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2216
2217 If we can find such a subseqence we need to turn the first
2218 element into a trie and then add the subsequent branch exact
2219 strings to the trie.
2220
2221 We have two cases
2222
2223 1. patterns where the whole set of branch can be converted to a trie,
2224
2225 2. patterns where only a subset of the alternations can be
2226 converted to a trie.
2227
2228 In case 1 we can replace the whole set with a single regop
2229 for the trie. In case 2 we need to keep the start and end
2230 branchs so
2231
2232 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2233 becomes BRANCH TRIE; BRANCH X;
2234
2235 Hypthetically when we know the regex isnt anchored we can
2236 turn a case 1 into a DFA and let it rip... Every time it finds a match
2237 it would just call its tail, no WHILEM/CURLY needed.
2238
2239 */
07be1b83 2240 if (PERL_ENABLE_TRIE_OPTIMISATION) {
3dab1dad 2241 int made=0;
0111c4fd
RGS
2242 if (!re_trie_maxbuff) {
2243 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2244 if (!SvIOK(re_trie_maxbuff))
2245 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2246 }
a3621e74
YO
2247 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2248 regnode *cur;
2249 regnode *first = (regnode *)NULL;
2250 regnode *last = (regnode *)NULL;
2251 regnode *tail = scan;
2252 U8 optype = 0;
2253 U32 count=0;
2254
2255#ifdef DEBUGGING
c445ea15 2256 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2257#endif
2258 /* var tail is used because there may be a TAIL
2259 regop in the way. Ie, the exacts will point to the
2260 thing following the TAIL, but the last branch will
2261 point at the TAIL. So we advance tail. If we
2262 have nested (?:) we may have to move through several
2263 tails.
2264 */
2265
2266 while ( OP( tail ) == TAIL ) {
2267 /* this is the TAIL generated by (?:) */
2268 tail = regnext( tail );
2269 }
2270
3dab1dad 2271
a3621e74 2272 DEBUG_OPTIMISE_r({
32fc9b6a 2273 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2274 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2275 (int)depth * 2 + 2, "",
2276 "Looking for TRIE'able sequences. Tail node is: ",
2277 SvPV_nolen_const( mysv )
a3621e74
YO
2278 );
2279 });
3dab1dad 2280
a3621e74
YO
2281 /*
2282
2283 step through the branches, cur represents each
2284 branch, noper is the first thing to be matched
2285 as part of that branch and noper_next is the
2286 regnext() of that node. if noper is an EXACT
2287 and noper_next is the same as scan (our current
2288 position in the regex) then the EXACT branch is
2289 a possible optimization target. Once we have
2290 two or more consequetive such branches we can
2291 create a trie of the EXACT's contents and stich
2292 it in place. If the sequence represents all of
2293 the branches we eliminate the whole thing and
2294 replace it with a single TRIE. If it is a
2295 subsequence then we need to stitch it in. This
2296 means the first branch has to remain, and needs
2297 to be repointed at the item on the branch chain
2298 following the last branch optimized. This could
2299 be either a BRANCH, in which case the
2300 subsequence is internal, or it could be the
2301 item following the branch sequence in which
2302 case the subsequence is at the end.
2303
2304 */
2305
2306 /* dont use tail as the end marker for this traverse */
2307 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2308 regnode * const noper = NEXTOPER( cur );
2309 regnode * const noper_next = regnext( noper );
a3621e74 2310
a3621e74 2311 DEBUG_OPTIMISE_r({
32fc9b6a 2312 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2313 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2314 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2315
32fc9b6a 2316 regprop(RExC_rx, mysv, noper);
a3621e74 2317 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2318 SvPV_nolen_const(mysv));
a3621e74
YO
2319
2320 if ( noper_next ) {
32fc9b6a 2321 regprop(RExC_rx, mysv, noper_next );
a3621e74 2322 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2323 SvPV_nolen_const(mysv));
a3621e74 2324 }
3dab1dad
YO
2325 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2326 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2327 });
3dab1dad
YO
2328 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2329 : PL_regkind[ OP( noper ) ] == EXACT )
2330 || OP(noper) == NOTHING )
a3621e74
YO
2331 && noper_next == tail && count<U16_MAX)
2332 {
2333 count++;
3dab1dad
YO
2334 if ( !first || optype == NOTHING ) {
2335 if (!first) first = cur;
a3621e74
YO
2336 optype = OP( noper );
2337 } else {
a3621e74 2338 last = cur;
a3621e74
YO
2339 }
2340 } else {
2341 if ( last ) {
3dab1dad 2342 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2343 }
3dab1dad 2344 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2345 && noper_next == tail )
2346 {
2347 count = 1;
2348 first = cur;
2349 optype = OP( noper );
2350 } else {
2351 count = 0;
2352 first = NULL;
2353 optype = 0;
2354 }
2355 last = NULL;
2356 }
2357 }
2358 DEBUG_OPTIMISE_r({
32fc9b6a 2359 regprop(RExC_rx, mysv, cur);
a3621e74 2360 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2361 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2362 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2363
2364 });
2365 if ( last ) {
3dab1dad
YO
2366 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2367#ifdef TRIE_STUDY_OPT
07be1b83
YO
2368 if ( made && startbranch == first ) {
2369 if ( OP(first)!=TRIE )
2370 flags |= SCF_EXACT_TRIE;
2371 else {
2372 regnode *chk=*scanp;
2373 while ( OP( chk ) == OPEN )
2374 chk = regnext( chk );
2375 if (chk==first)
2376 flags |= SCF_EXACT_TRIE;
2377 }
2378 }
3dab1dad 2379#endif
07be1b83 2380 }
a3621e74 2381 }
3dab1dad
YO
2382
2383 } /* do trie */
a0ed51b3 2384 }
a3621e74 2385 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2386 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2387 } else /* single branch is optimized. */
c277df42
IZ
2388 scan = NEXTOPER(scan);
2389 continue;
a0ed51b3
LW
2390 }
2391 else if (OP(scan) == EXACT) {
cd439c50 2392 I32 l = STR_LEN(scan);
c445ea15 2393 UV uc;
a0ed51b3 2394 if (UTF) {
a3b680e6 2395 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2396 l = utf8_length(s, s + l);
9041c2e3 2397 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2398 } else {
2399 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2400 }
2401 min += l;
c277df42 2402 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2403 /* The code below prefers earlier match for fixed
2404 offset, later match for variable offset. */
2405 if (data->last_end == -1) { /* Update the start info. */
2406 data->last_start_min = data->pos_min;
2407 data->last_start_max = is_inf
b81d288d 2408 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2409 }
cd439c50 2410 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2411 if (UTF)
2412 SvUTF8_on(data->last_found);
0eda9292 2413 {
9a957fbc 2414 SV * const sv = data->last_found;
a28509cc 2415 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2416 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2417 if (mg && mg->mg_len >= 0)
5e43f467
JH
2418 mg->mg_len += utf8_length((U8*)STRING(scan),
2419 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2420 }
c277df42
IZ
2421 data->last_end = data->pos_min + l;
2422 data->pos_min += l; /* As in the first entry. */
2423 data->flags &= ~SF_BEFORE_EOL;
2424 }
653099ff
GS
2425 if (flags & SCF_DO_STCLASS_AND) {
2426 /* Check whether it is compatible with what we know already! */
2427 int compat = 1;
2428
1aa99e6b 2429 if (uc >= 0x100 ||
516a5887 2430 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2431 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2432 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2433 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2434 )
653099ff
GS
2435 compat = 0;
2436 ANYOF_CLASS_ZERO(data->start_class);
2437 ANYOF_BITMAP_ZERO(data->start_class);
2438 if (compat)
1aa99e6b 2439 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2440 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2441 if (uc < 0x100)
2442 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2443 }
2444 else if (flags & SCF_DO_STCLASS_OR) {
2445 /* false positive possible if the class is case-folded */
1aa99e6b 2446 if (uc < 0x100)
9b877dbb
IH
2447 ANYOF_BITMAP_SET(data->start_class, uc);
2448 else
2449 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2450 data->start_class->flags &= ~ANYOF_EOS;
2451 cl_and(data->start_class, &and_with);
2452 }
2453 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2454 }
3dab1dad 2455 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2456 I32 l = STR_LEN(scan);
1aa99e6b 2457 UV uc = *((U8*)STRING(scan));
653099ff
GS
2458
2459 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2460 if (flags & SCF_DO_SUBSTR) {
2461 assert(data);
830247a4 2462 scan_commit(pRExC_state, data);
ecaa9b9c 2463 }
a0ed51b3 2464 if (UTF) {
6136c704 2465 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2466 l = utf8_length(s, s + l);
9041c2e3 2467 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2468 }
2469 min += l;
ecaa9b9c 2470 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2471 data->pos_min += l;
653099ff
GS
2472 if (flags & SCF_DO_STCLASS_AND) {
2473 /* Check whether it is compatible with what we know already! */
2474 int compat = 1;
2475
1aa99e6b 2476 if (uc >= 0x100 ||
516a5887 2477 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2478 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2479 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2480 compat = 0;
2481 ANYOF_CLASS_ZERO(data->start_class);
2482 ANYOF_BITMAP_ZERO(data->start_class);
2483 if (compat) {
1aa99e6b 2484 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2485 data->start_class->flags &= ~ANYOF_EOS;
2486 data->start_class->flags |= ANYOF_FOLD;
2487 if (OP(scan) == EXACTFL)
2488 data->start_class->flags |= ANYOF_LOCALE;
2489 }
2490 }
2491 else if (flags & SCF_DO_STCLASS_OR) {
2492 if (data->start_class->flags & ANYOF_FOLD) {
2493 /* false positive possible if the class is case-folded.
2494 Assume that the locale settings are the same... */
1aa99e6b
IH
2495 if (uc < 0x100)
2496 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2497 data->start_class->flags &= ~ANYOF_EOS;
2498 }
2499 cl_and(data->start_class, &and_with);
2500 }
2501 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2502 }
3dab1dad
YO
2503#ifdef TRIE_STUDY_OPT
2504 else if (OP(scan) == TRIE) {
10edeb5d 2505 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3dab1dad 2506 min += trie->minlen;
07be1b83 2507 delta += (trie->maxlen - trie->minlen);
3dab1dad
YO
2508 flags &= ~SCF_DO_STCLASS; /* xxx */
2509 if (flags & SCF_DO_SUBSTR) {
2510 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2511 data->pos_min += trie->minlen;
07be1b83
YO
2512 data->pos_delta += (trie->maxlen - trie->minlen);
2513 if (trie->maxlen != trie->minlen)
2514 data->longest = &(data->longest_float);
3dab1dad
YO
2515 }
2516 }
2517#endif
bfed75c6 2518 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2519 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2520 I32 f = flags, pos_before = 0;
d4c19fe8 2521 regnode * const oscan = scan;
653099ff
GS
2522 struct regnode_charclass_class this_class;
2523 struct regnode_charclass_class *oclass = NULL;
727f22e3 2524 I32 next_is_eval = 0;
653099ff 2525
3dab1dad 2526 switch (PL_regkind[OP(scan)]) {
653099ff 2527 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2528 scan = NEXTOPER(scan);
2529 goto finish;
2530 case PLUS:
653099ff 2531 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2532 next = NEXTOPER(scan);
653099ff 2533 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2534 mincount = 1;
2535 maxcount = REG_INFTY;
c277df42
IZ
2536 next = regnext(scan);
2537 scan = NEXTOPER(scan);
2538 goto do_curly;
2539 }
2540 }
2541 if (flags & SCF_DO_SUBSTR)
2542 data->pos_min++;
2543 min++;
2544 /* Fall through. */
2545 case STAR:
653099ff
GS
2546 if (flags & SCF_DO_STCLASS) {
2547 mincount = 0;
b81d288d 2548 maxcount = REG_INFTY;
653099ff
GS
2549 next = regnext(scan);
2550 scan = NEXTOPER(scan);
2551 goto do_curly;
2552 }
b81d288d 2553 is_inf = is_inf_internal = 1;
c277df42
IZ
2554 scan = regnext(scan);
2555 if (flags & SCF_DO_SUBSTR) {
830247a4 2556 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2557 data->longest = &(data->longest_float);
2558 }
2559 goto optimize_curly_tail;
2560 case CURLY:
b81d288d 2561 mincount = ARG1(scan);
c277df42
IZ
2562 maxcount = ARG2(scan);
2563 next = regnext(scan);
cb434fcc
IZ
2564 if (OP(scan) == CURLYX) {
2565 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2566 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2567 }
c277df42 2568 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2569 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2570 do_curly:
2571 if (flags & SCF_DO_SUBSTR) {
830247a4 2572 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2573 pos_before = data->pos_min;
2574 }
2575 if (data) {
2576 fl = data->flags;
2577 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2578 if (is_inf)
2579 data->flags |= SF_IS_INF;
2580 }
653099ff 2581 if (flags & SCF_DO_STCLASS) {
830247a4 2582 cl_init(pRExC_state, &this_class);
653099ff
GS
2583 oclass = data->start_class;
2584 data->start_class = &this_class;
2585 f |= SCF_DO_STCLASS_AND;
2586 f &= ~SCF_DO_STCLASS_OR;
2587 }
e1901655
IZ
2588 /* These are the cases when once a subexpression
2589 fails at a particular position, it cannot succeed
2590 even after backtracking at the enclosing scope.
b81d288d 2591
e1901655
IZ
2592 XXXX what if minimal match and we are at the
2593 initial run of {n,m}? */
2594 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2595 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2596
c277df42 2597 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2598 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2599 (mincount == 0
2600 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2601
2602 if (flags & SCF_DO_STCLASS)
2603 data->start_class = oclass;
2604 if (mincount == 0 || minnext == 0) {
2605 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2606 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2607 }
2608 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2609 /* Switch to OR mode: cache the old value of
653099ff
GS
2610 * data->start_class */
2611 StructCopy(data->start_class, &and_with,
2612 struct regnode_charclass_class);
2613 flags &= ~SCF_DO_STCLASS_AND;
2614 StructCopy(&this_class, data->start_class,
2615 struct regnode_charclass_class);
2616 flags |= SCF_DO_STCLASS_OR;
2617 data->start_class->flags |= ANYOF_EOS;
2618 }
2619 } else { /* Non-zero len */
2620 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2621 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2622 cl_and(data->start_class, &and_with);
2623 }
2624 else if (flags & SCF_DO_STCLASS_AND)
2625 cl_and(data->start_class, &this_class);
2626 flags &= ~SCF_DO_STCLASS;
2627 }
c277df42
IZ
2628 if (!scan) /* It was not CURLYX, but CURLY. */
2629 scan = next;
041457d9
DM
2630 if ( /* ? quantifier ok, except for (?{ ... }) */
2631 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2632 && (minnext == 0) && (deltanext == 0)
99799961 2633 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2634 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2635 && ckWARN(WARN_REGEXP))
b45f050a 2636 {
830247a4 2637 vWARN(RExC_parse,
b45f050a
JF
2638 "Quantifier unexpected on zero-length expression");
2639 }
2640
c277df42 2641 min += minnext * mincount;
b81d288d 2642 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2643 && (minnext + deltanext) > 0)
2644 || deltanext == I32_MAX);
aca2d497 2645 is_inf |= is_inf_internal;
c277df42
IZ
2646 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2647
2648 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2649 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2650 && data->flags & SF_IN_PAR
2651 && !(data->flags & SF_HAS_EVAL)
2652 && !deltanext && minnext == 1 ) {
2653 /* Try to optimize to CURLYN. */
2654 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2655 regnode * const nxt1 = nxt;
497b47a8
JH
2656#ifdef DEBUGGING
2657 regnode *nxt2;
2658#endif
c277df42
IZ
2659
2660 /* Skip open. */
2661 nxt = regnext(nxt);
bfed75c6 2662 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2663 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2664 && STR_LEN(nxt) == 1))
c277df42 2665 goto nogo;
497b47a8 2666#ifdef DEBUGGING
c277df42 2667 nxt2 = nxt;
497b47a8 2668#endif
c277df42 2669 nxt = regnext(nxt);
b81d288d 2670 if (OP(nxt) != CLOSE)
c277df42
IZ
2671 goto nogo;
2672 /* Now we know that nxt2 is the only contents: */
eb160463 2673 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2674 OP(oscan) = CURLYN;
2675 OP(nxt1) = NOTHING; /* was OPEN. */
2676#ifdef DEBUGGING
2677 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2678 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2679 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2680 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2681 OP(nxt + 1) = OPTIMIZED; /* was count. */
2682 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2683#endif
c277df42 2684 }
c277df42
IZ
2685 nogo:
2686
2687 /* Try optimization CURLYX => CURLYM. */
b81d288d 2688 if ( OP(oscan) == CURLYX && data
c277df42 2689 && !(data->flags & SF_HAS_PAR)
c277df42 2690 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2691 && !deltanext /* atom is fixed width */
2692 && minnext != 0 /* CURLYM can't handle zero width */
2693 ) {
c277df42
IZ
2694 /* XXXX How to optimize if data == 0? */
2695 /* Optimize to a simpler form. */
2696 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2697 regnode *nxt2;
2698
2699 OP(oscan) = CURLYM;
2700 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2701 && (OP(nxt2) != WHILEM))
c277df42
IZ
2702 nxt = nxt2;
2703 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2704 /* Need to optimize away parenths. */
2705 if (data->flags & SF_IN_PAR) {
2706 /* Set the parenth number. */
2707 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2708
b81d288d 2709 if (OP(nxt) != CLOSE)
b45f050a 2710 FAIL("Panic opt close");
eb160463 2711 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2712 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2713 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2714#ifdef DEBUGGING
2715 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2716 OP(nxt + 1) = OPTIMIZED; /* was count. */
2717 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2718 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2719#endif
c277df42
IZ
2720#if 0
2721 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2722 regnode *nnxt = regnext(nxt1);
b81d288d 2723
c277df42
IZ
2724 if (nnxt == nxt) {
2725 if (reg_off_by_arg[OP(nxt1)])
2726 ARG_SET(nxt1, nxt2 - nxt1);
2727 else if (nxt2 - nxt1 < U16_MAX)
2728 NEXT_OFF(nxt1) = nxt2 - nxt1;
2729 else
2730 OP(nxt) = NOTHING; /* Cannot beautify */
2731 }
2732 nxt1 = nnxt;
2733 }
2734#endif
2735 /* Optimize again: */
b81d288d 2736 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2737 NULL, 0,depth+1);
a0ed51b3
LW
2738 }
2739 else
c277df42 2740 oscan->flags = 0;
c277df42 2741 }
e1901655
IZ
2742 else if ((OP(oscan) == CURLYX)
2743 && (flags & SCF_WHILEM_VISITED_POS)
2744 /* See the comment on a similar expression above.
2745 However, this time it not a subexpression
2746 we care about, but the expression itself. */
2747 && (maxcount == REG_INFTY)
2748 && data && ++data->whilem_c < 16) {
2749 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2750 /* Find WHILEM (as in regexec.c) */
2751 regnode *nxt = oscan + NEXT_OFF(oscan);
2752
2753 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2754 nxt += ARG(nxt);
eb160463
GS
2755 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2756 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2757 }
b81d288d 2758 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2759 pars++;
2760 if (flags & SCF_DO_SUBSTR) {
c445ea15 2761 SV *last_str = NULL;
c277df42
IZ
2762 int counted = mincount != 0;
2763
2764 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2765#if defined(SPARC64_GCC_WORKAROUND)
2766 I32 b = 0;
2767 STRLEN l = 0;
cfd0369c 2768 const char *s = NULL;
5d1c421c
JH
2769 I32 old = 0;
2770
2771 if (pos_before >= data->last_start_min)
2772 b = pos_before;
2773 else
2774 b = data->last_start_min;
2775
2776 l = 0;
cfd0369c 2777 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2778 old = b - data->last_start_min;
2779
2780#else
b81d288d 2781 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2782 ? pos_before : data->last_start_min;
2783 STRLEN l;
d4c19fe8 2784 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2785 I32 old = b - data->last_start_min;
5d1c421c 2786#endif
a0ed51b3
LW
2787
2788 if (UTF)
2789 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2790
a0ed51b3 2791 l -= old;
c277df42 2792 /* Get the added string: */
79cb57f6 2793 last_str = newSVpvn(s + old, l);
0e933229
IH
2794 if (UTF)
2795 SvUTF8_on(last_str);
c277df42
IZ
2796 if (deltanext == 0 && pos_before == b) {
2797 /* What was added is a constant string */
2798 if (mincount > 1) {
2799 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2800 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2801 SvPVX_const(last_str), l, mincount - 1);
b162af07 2802 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2803 /* Add additional parts. */
b81d288d 2804 SvCUR_set(data->last_found,
c277df42
IZ
2805 SvCUR(data->last_found) - l);
2806 sv_catsv(data->last_found, last_str);
0eda9292
JH
2807 {
2808 SV * sv = data->last_found;
2809 MAGIC *mg =
2810 SvUTF8(sv) && SvMAGICAL(sv) ?
2811 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2812 if (mg && mg->mg_len >= 0)
2813 mg->mg_len += CHR_SVLEN(last_str);
2814 }
c277df42
IZ
2815 data->last_end += l * (mincount - 1);
2816 }
2a8d9689
HS
2817 } else {
2818 /* start offset must point into the last copy */
2819 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2820 data->last_start_max += is_inf ? I32_MAX
2821 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2822 }
2823 }
2824 /* It is counted once already... */
2825 data->pos_min += minnext * (mincount - counted);
2826 data->pos_delta += - counted * deltanext +
2827 (minnext + deltanext) * maxcount - minnext * mincount;
2828 if (mincount != maxcount) {
653099ff
GS
2829 /* Cannot extend fixed substrings found inside
2830 the group. */
830247a4 2831 scan_commit(pRExC_state,data);
c277df42 2832 if (mincount && last_str) {
d4c19fe8
AL
2833 SV * const sv = data->last_found;
2834 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2835 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2836
2837 if (mg)
2838 mg->mg_len = -1;
2839 sv_setsv(sv, last_str);
c277df42 2840 data->last_end = data->pos_min;
b81d288d 2841 data->last_start_min =
a0ed51b3 2842 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2843 data->last_start_max = is_inf
2844 ? I32_MAX
c277df42 2845 : data->pos_min + data->pos_delta
a0ed51b3 2846 - CHR_SVLEN(last_str);
c277df42
IZ
2847 }
2848 data->longest = &(data->longest_float);
2849 }
aca2d497 2850 SvREFCNT_dec(last_str);
c277df42 2851 }
405ff068 2852 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2853 data->flags |= SF_HAS_EVAL;
2854 optimize_curly_tail:
c277df42 2855 if (OP(oscan) != CURLYX) {
3dab1dad 2856 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2857 && NEXT_OFF(next))
2858 NEXT_OFF(oscan) += NEXT_OFF(next);
2859 }
c277df42 2860 continue;
653099ff 2861 default: /* REF and CLUMP only? */
c277df42 2862 if (flags & SCF_DO_SUBSTR) {
830247a4 2863 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2864 data->longest = &(data->longest_float);
2865 }
aca2d497 2866 is_inf = is_inf_internal = 1;
653099ff 2867 if (flags & SCF_DO_STCLASS_OR)
830247a4 2868 cl_anything(pRExC_state, data->start_class);
653099ff 2869 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2870 break;
2871 }
a0ed51b3 2872 }
bfed75c6 2873 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2874 int value = 0;
653099ff 2875
c277df42 2876 if (flags & SCF_DO_SUBSTR) {
830247a4 2877 scan_commit(pRExC_state,data);
c277df42
IZ
2878 data->pos_min++;
2879 }
2880 min++;
653099ff
GS
2881 if (flags & SCF_DO_STCLASS) {
2882 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2883
2884 /* Some of the logic below assumes that switching
2885 locale on will only add false positives. */
3dab1dad 2886 switch (PL_regkind[OP(scan)]) {
653099ff 2887 case SANY:
653099ff
GS
2888 default:
2889 do_default:
2890 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2891 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2892 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2893 break;
2894 case REG_ANY:
2895 if (OP(scan) == SANY)
2896 goto do_default;
2897 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2898 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2899 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2900 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2901 }
2902 if (flags & SCF_DO_STCLASS_AND || !value)
2903 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2904 break;
2905 case ANYOF:
2906 if (flags & SCF_DO_STCLASS_AND)
2907 cl_and(data->start_class,
2908 (struct regnode_charclass_class*)scan);
2909 else
830247a4 2910 cl_or(pRExC_state, data->start_class,
653099ff
GS
2911 (struct regnode_charclass_class*)scan);
2912 break;
2913 case ALNUM:
2914 if (flags & SCF_DO_STCLASS_AND) {
2915 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2916 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2917 for (value = 0; value < 256; value++)
2918 if (!isALNUM(value))
2919 ANYOF_BITMAP_CLEAR(data->start_class, value);
2920 }
2921 }
2922 else {
2923 if (data->start_class->flags & ANYOF_LOCALE)
2924 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2925 else {
2926 for (value = 0; value < 256; value++)
2927 if (isALNUM(value))
b81d288d 2928 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2929 }
2930 }
2931 break;
2932 case ALNUML:
2933 if (flags & SCF_DO_STCLASS_AND) {
2934 if (data->start_class->flags & ANYOF_LOCALE)
2935 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2936 }
2937 else {
2938 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2939 data->start_class->flags |= ANYOF_LOCALE;
2940 }
2941 break;
2942 case NALNUM:
2943 if (flags & SCF_DO_STCLASS_AND) {
2944 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2945 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2946 for (value = 0; value < 256; value++)
2947 if (isALNUM(value))
2948 ANYOF_BITMAP_CLEAR(data->start_class, value);
2949 }
2950 }
2951 else {
2952 if (data->start_class->flags & ANYOF_LOCALE)
2953 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2954 else {
2955 for (value = 0; value < 256; value++)
2956 if (!isALNUM(value))
b81d288d 2957 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2958 }
2959 }
2960 break;
2961 case NALNUML:
2962 if (flags & SCF_DO_STCLASS_AND) {
2963 if (data->start_class->flags & ANYOF_LOCALE)
2964 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2965 }
2966 else {
2967 data->start_class->flags |= ANYOF_LOCALE;
2968 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2969 }
2970 break;
2971 case SPACE:
2972 if (flags & SCF_DO_STCLASS_AND) {
2973 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2974 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2975 for (value = 0; value < 256; value++)
2976 if (!isSPACE(value))
2977 ANYOF_BITMAP_CLEAR(data->start_class, value);
2978 }
2979 }
2980 else {
2981 if (data->start_class->flags & ANYOF_LOCALE)
2982 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2983 else {
2984 for (value = 0; value < 256; value++)
2985 if (isSPACE(value))
b81d288d 2986 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2987 }
2988 }
2989 break;
2990 case SPACEL:
2991 if (flags & SCF_DO_STCLASS_AND) {
2992 if (data->start_class->flags & ANYOF_LOCALE)
2993 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2994 }
2995 else {
2996 data->start_class->flags |= ANYOF_LOCALE;
2997 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2998 }
2999 break;
3000 case NSPACE:
3001 if (flags & SCF_DO_STCLASS_AND) {
3002 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3003 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3004 for (value = 0; value < 256; value++)
3005 if (isSPACE(value))
3006 ANYOF_BITMAP_CLEAR(data->start_class, value);
3007 }
3008 }
3009 else {
3010 if (data->start_class->flags & ANYOF_LOCALE)
3011 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3012 else {
3013 for (value = 0; value < 256; value++)
3014 if (!isSPACE(value))
b81d288d 3015 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3016 }
3017 }
3018 break;
3019 case NSPACEL:
3020 if (flags & SCF_DO_STCLASS_AND) {
3021 if (data->start_class->flags & ANYOF_LOCALE) {
3022 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3023 for (value = 0; value < 256; value++)
3024 if (!isSPACE(value))
3025 ANYOF_BITMAP_CLEAR(data->start_class, value);
3026 }
3027 }
3028 else {
3029 data->start_class->flags |= ANYOF_LOCALE;
3030 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3031 }
3032 break;
3033 case DIGIT:
3034 if (flags & SCF_DO_STCLASS_AND) {
3035 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3036 for (value = 0; value < 256; value++)
3037 if (!isDIGIT(value))
3038 ANYOF_BITMAP_CLEAR(data->start_class, value);
3039 }
3040 else {
3041 if (data->start_class->flags & ANYOF_LOCALE)
3042 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3043 else {
3044 for (value = 0; value < 256; value++)
3045 if (isDIGIT(value))
b81d288d 3046 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3047 }
3048 }
3049 break;
3050 case NDIGIT:
3051 if (flags & SCF_DO_STCLASS_AND) {
3052 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3053 for (value = 0; value < 256; value++)
3054 if (isDIGIT(value))
3055 ANYOF_BITMAP_CLEAR(data->start_class, value);
3056 }
3057 else {
3058 if (data->start_class->flags & ANYOF_LOCALE)
3059 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3060 else {
3061 for (value = 0; value < 256; value++)
3062 if (!isDIGIT(value))
b81d288d 3063 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3064 }
3065 }
3066 break;
3067 }
3068 if (flags & SCF_DO_STCLASS_OR)
3069 cl_and(data->start_class, &and_with);
3070 flags &= ~SCF_DO_STCLASS;
3071 }
a0ed51b3 3072 }
3dab1dad 3073 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3074 data->flags |= (OP(scan) == MEOL
3075 ? SF_BEFORE_MEOL
3076 : SF_BEFORE_SEOL);
a0ed51b3 3077 }
3dab1dad 3078 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3079 /* Lookbehind, or need to calculate parens/evals/stclass: */
3080 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3081 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3082 /* Lookahead/lookbehind */
cb434fcc 3083 I32 deltanext, minnext, fake = 0;
c277df42 3084 regnode *nscan;
653099ff
GS
3085 struct regnode_charclass_class intrnl;
3086 int f = 0;
c277df42
IZ
3087
3088 data_fake.flags = 0;
b81d288d 3089 if (data) {
2c2d71f5 3090 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3091 data_fake.last_closep = data->last_closep;
3092 }
3093 else
3094 data_fake.last_closep = &fake;
653099ff
GS
3095 if ( flags & SCF_DO_STCLASS && !scan->flags
3096 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3097 cl_init(pRExC_state, &intrnl);
653099ff 3098 data_fake.start_class = &intrnl;
e1901655 3099 f |= SCF_DO_STCLASS_AND;
653099ff 3100 }
e1901655
IZ
3101 if (flags & SCF_WHILEM_VISITED_POS)
3102 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3103 next = regnext(scan);
3104 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3105 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3106 if (scan->flags) {
3107 if (deltanext) {
9baa0206 3108 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
3109 }
3110 else if (minnext > U8_MAX) {
9baa0206 3111 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3112 }
eb160463 3113 scan->flags = (U8)minnext;
c277df42 3114 }
be8e71aa
YO
3115 if (data) {
3116 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3117 pars++;
3118 if (data_fake.flags & SF_HAS_EVAL)
3119 data->flags |= SF_HAS_EVAL;
2c2d71f5 3120 data->whilem_c = data_fake.whilem_c;
be8e71aa 3121 }
e1901655 3122 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3123 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3124
3125 cl_and(data->start_class, &intrnl);
3126 if (was)
3127 data->start_class->flags |= ANYOF_EOS;
3128 }
a0ed51b3
LW
3129 }
3130 else if (OP(scan) == OPEN) {
c277df42 3131 pars++;
a0ed51b3 3132 }
cb434fcc 3133 else if (OP(scan) == CLOSE) {
eb160463 3134 if ((I32)ARG(scan) == is_par) {
cb434fcc 3135 next = regnext(scan);
c277df42 3136
cb434fcc
IZ
3137 if ( next && (OP(next) != WHILEM) && next < last)
3138 is_par = 0; /* Disable optimization */
3139 }
3140 if (data)
3141 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3142 }
3143 else if (OP(scan) == EVAL) {
c277df42
IZ
3144 if (data)
3145 data->flags |= SF_HAS_EVAL;
3146 }
96776eda 3147 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3148 if (flags & SCF_DO_SUBSTR) {
830247a4 3149 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3150 data->longest = &(data->longest_float);
3151 }
3152 is_inf = is_inf_internal = 1;
653099ff 3153 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3154 cl_anything(pRExC_state, data->start_class);
96776eda 3155 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3156 }
c277df42
IZ
3157 /* Else: zero-length, ignore. */
3158 scan = regnext(scan);
3159 }
3160
3161 finish:
3162 *scanp = scan;
aca2d497 3163 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3164 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
3165 data->pos_delta = I32_MAX - data->pos_min;
3166 if (is_par > U8_MAX)
3167 is_par = 0;
3168 if (is_par && pars==1 && data) {
3169 data->flags |= SF_IN_PAR;
3170 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3171 }
3172 else if (pars && data) {
c277df42
IZ
3173 data->flags |= SF_HAS_PAR;
3174 data->flags &= ~SF_IN_PAR;
3175 }
653099ff
GS
3176 if (flags & SCF_DO_STCLASS_OR)
3177 cl_and(data->start_class, &and_with);
07be1b83
YO
3178 if (flags & SCF_EXACT_TRIE)
3179 data->flags |= SCF_EXACT_TRIE;
c277df42
IZ
3180 return min;
3181}
3182
76e3520e 3183STATIC I32
5f66b61c 3184S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3185{
830247a4 3186 if (RExC_rx->data) {
b81d288d
AB
3187 Renewc(RExC_rx->data,
3188 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3189 char, struct reg_data);
830247a4
IZ
3190 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3191 RExC_rx->data->count += n;
a0ed51b3
LW
3192 }
3193 else {
a02a5408 3194 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3195 char, struct reg_data);
a02a5408 3196 Newx(RExC_rx->data->what, n, U8);
830247a4 3197 RExC_rx->data->count = n;
c277df42 3198 }
830247a4
IZ
3199 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3200 return RExC_rx->data->count - n;
c277df42
IZ
3201}
3202
76234dfb 3203#ifndef PERL_IN_XSUB_RE
d88dccdf 3204void
864dbfa3 3205Perl_reginitcolors(pTHX)
d88dccdf 3206{
97aff369 3207 dVAR;
1df70142 3208 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3209 if (s) {
1df70142
AL
3210 char *t = savepv(s);
3211 int i = 0;
3212 PL_colors[0] = t;
d88dccdf 3213 while (++i < 6) {
1df70142
AL
3214 t = strchr(t, '\t');
3215 if (t) {
3216 *t = '\0';
3217 PL_colors[i] = ++t;
d88dccdf
IZ
3218 }
3219 else
1df70142 3220 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3221 }
3222 } else {
1df70142 3223 int i = 0;
b81d288d 3224 while (i < 6)
06b5626a 3225 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3226 }
3227 PL_colorset = 1;
3228}
76234dfb 3229#endif
8615cb43 3230
07be1b83 3231
a687059c 3232/*
e50aee73 3233 - pregcomp - compile a regular expression into internal code
a687059c
LW
3234 *
3235 * We can't allocate space until we know how big the compiled form will be,
3236 * but we can't compile it (and thus know how big it is) until we've got a
3237 * place to put the code. So we cheat: we compile it twice, once with code
3238 * generation turned off and size counting turned on, and once "for real".
3239 * This also means that we don't allocate space until we are sure that the
3240 * thing really will compile successfully, and we never have to move the
3241 * code and thus invalidate pointers into it. (Note that it has to be in
3242 * one piece because free() must be able to free it all.) [NB: not true in perl]
3243 *
3244 * Beware that the optimization-preparation code in here knows about some
3245 * of the structure of the compiled regexp. [I'll say.]
3246 */
3247regexp *
864dbfa3 3248Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3249{
97aff369 3250 dVAR;
a0d0e21e 3251 register regexp *r;
c277df42 3252 regnode *scan;
c277df42 3253 regnode *first;
a0d0e21e 3254 I32 flags;
a0d0e21e
LW
3255 I32 minlen = 0;
3256 I32 sawplus = 0;
3257 I32 sawopen = 0;
2c2d71f5 3258 scan_data_t data;
830247a4 3259 RExC_state_t RExC_state;
be8e71aa 3260 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3261#ifdef TRIE_STUDY_OPT
3262 int restudied= 0;
3263 RExC_state_t copyRExC_state;
3264#endif
a0d0e21e 3265
a3621e74
YO
3266 GET_RE_DEBUG_FLAGS_DECL;
3267
a0d0e21e 3268 if (exp == NULL)
c277df42 3269 FAIL("NULL regexp argument");
a0d0e21e 3270
a5961de5 3271 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3272
5cfc7842 3273 RExC_precomp = exp;
a3621e74
YO
3274 DEBUG_r(if (!PL_colorset) reginitcolors());
3275 DEBUG_COMPILE_r({
ab3bbdeb
YO
3276 SV *dsv= sv_newmortal();
3277 RE_PV_QUOTED_DECL(s, RExC_utf8,
3278 dsv, RExC_precomp, (xend - exp), 60);
3279 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3280 PL_colors[4],PL_colors[5],s);
a5961de5 3281 });
e2509266 3282 RExC_flags = pm->op_pmflags;
830247a4 3283 RExC_sawback = 0;
bbce6d69 3284
830247a4
IZ
3285 RExC_seen = 0;
3286 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3287 RExC_seen_evals = 0;
3288 RExC_extralen = 0;
c277df42 3289
bbce6d69 3290 /* First pass: determine size, legality. */
830247a4 3291 RExC_parse = exp;
fac92740 3292 RExC_start = exp;
830247a4
IZ
3293 RExC_end = xend;
3294 RExC_naughty = 0;
3295 RExC_npar = 1;
3296 RExC_size = 0L;
3297 RExC_emit = &PL_regdummy;
3298 RExC_whilem_seen = 0;
85ddcde9
JH
3299#if 0 /* REGC() is (currently) a NOP at the first pass.
3300 * Clever compilers notice this and complain. --jhi */
830247a4 3301 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3302#endif
3dab1dad
YO
3303 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3304 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3305 RExC_precomp = NULL;
a0d0e21e
LW
3306 return(NULL);
3307 }
3dab1dad
YO
3308 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3309 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3310 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3311 DEBUG_PARSE_r({
3312 RExC_lastnum=0;
3313 RExC_lastparse=NULL;
3314 });
c277df42 3315
07be1b83 3316
c277df42
IZ
3317 /* Small enough for pointer-storage convention?
3318 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3319 if (RExC_size >= 0x10000L && RExC_extralen)
3320 RExC_size += RExC_extralen;
c277df42 3321 else
830247a4
IZ
3322 RExC_extralen = 0;
3323 if (RExC_whilem_seen > 15)
3324 RExC_whilem_seen = 15;
a0d0e21e 3325
bbce6d69 3326 /* Allocate space and initialize. */
a02a5408 3327 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3328 char, regexp);
a0d0e21e 3329 if (r == NULL)
b45f050a
JF
3330 FAIL("Regexp out of space");
3331
0f79a09d
GS
3332#ifdef DEBUGGING
3333 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3334 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3335#endif
c277df42 3336 r->refcnt = 1;
bbce6d69 3337 r->prelen = xend - exp;
5cfc7842 3338 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3339 r->subbeg = NULL;
f8c7b90f 3340#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3341 r->saved_copy = NULL;
ed252734 3342#endif
cf93c79d 3343 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3344 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3345 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3346
3347 r->substrs = 0; /* Useful during FAIL. */
3348 r->startp = 0; /* Useful during FAIL. */
3349 r->endp = 0; /* Useful during FAIL. */
3350
a02a5408 3351 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3352 if (r->offsets) {
2af232bd 3353 r->offsets[0] = RExC_size;
fac92740 3354 }
a3621e74 3355 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3356 "%s %"UVuf" bytes for offset annotations.\n",
3357 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3358 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3359
830247a4 3360 RExC_rx = r;
bbce6d69 3361
3362 /* Second pass: emit code. */
e2509266 3363 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3364 RExC_parse = exp;
3365 RExC_end = xend;
3366 RExC_naughty = 0;
3367 RExC_npar = 1;
fac92740 3368 RExC_emit_start = r->program;
830247a4 3369 RExC_emit = r->program;
2cd61cdb 3370 /* Store the count of eval-groups for security checks: */
eb160463 3371 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3372 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3373 r->data = 0;
3dab1dad 3374 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3375 return(NULL);
07be1b83
YO
3376 /* XXXX To minimize changes to RE engine we always allocate
3377 3-units-long substrs field. */
3378 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3379
07be1b83
YO
3380reStudy:
3381 Zero(r->substrs, 1, struct reg_substr_data);
3382 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3383
07be1b83
YO
3384#ifdef TRIE_STUDY_OPT
3385 if ( restudied ) {
3386 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3387 RExC_state=copyRExC_state;
3388 if (data.longest_fixed)
3389 SvREFCNT_dec(data.longest_fixed);
3390 if (data.longest_float)
3391 SvREFCNT_dec(data.longest_float);
3392 if (data.last_found)
3393 SvREFCNT_dec(data.last_found);
3394 } else {
3395 copyRExC_state=RExC_state;
3396 }
3397#endif
a0d0e21e 3398 /* Dig out information for optimizations. */
cf93c79d 3399 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3400 pm->op_pmflags = RExC_flags;
a0ed51b3 3401 if (UTF)
5ff6fc6d 3402 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3403 r->regstclass = NULL;
830247a4 3404 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3405 r->reganch |= ROPT_NAUGHTY;
c277df42 3406 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3407
653099ff 3408 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3409 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3410 I32 fake;
c5254dd6 3411 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3412 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3413 int stclass_flag;
07be1b83 3414 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3415
3416 first = scan;
c277df42 3417 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3418 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3419 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3420 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3421 /* for now we can't handle lookbehind IFMATCH*/
3422 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3423 (OP(first) == PLUS) ||
3424 (OP(first) == MINMOD) ||
653099ff 3425 /* An {n,m} with n>0 */
07be1b83
YO
3426 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3427 {
3428 DEBUG_PEEP("first:",first,0);
a0d0e21e
LW
3429 if (OP(first) == PLUS)
3430 sawplus = 1;
3431 else
3dab1dad 3432 first += regarglen[OP(first)];
07be1b83
YO
3433 if (OP(first) == IFMATCH) {
3434 first = NEXTOPER(first);
3435 first += EXTRA_STEP_2ARGS;
7c167cea 3436 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3437 first = NEXTOPER(first);
a687059c
LW
3438 }
3439
a0d0e21e
LW
3440 /* Starting-point info. */
3441 again:
07be1b83 3442 /* Ignore EXACT as we deal with it later. */
3dab1dad 3443 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3444 if (OP(first) == EXACT)
6f207bd3 3445 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3446 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3447 r->regstclass = first;
3448 }
07be1b83
YO
3449#ifdef TRIE_STCLASS
3450 else if (OP(first) == TRIE &&
3451 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3452 {
3453 /* this can happen only on restudy */
3454 struct regnode_1 *trie_op;
3455 Newxz(trie_op,1,struct regnode_1);
3456 StructCopy(first,trie_op,struct regnode_1);
3457 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3458 r->regstclass = (regnode *)trie_op;
3459 }
3460#endif
bfed75c6 3461 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3462 r->regstclass = first;
3dab1dad
YO
3463 else if (PL_regkind[OP(first)] == BOUND ||
3464 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3465 r->regstclass = first;
3dab1dad 3466 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3467 r->reganch |= (OP(first) == MBOL
3468 ? ROPT_ANCH_MBOL
3469 : (OP(first) == SBOL
3470 ? ROPT_ANCH_SBOL
3471 : ROPT_ANCH_BOL));
a0d0e21e 3472 first = NEXTOPER(first);
774d564b 3473 goto again;
3474 }
3475 else if (OP(first) == GPOS) {
3476 r->reganch |= ROPT_ANCH_GPOS;
3477 first = NEXTOPER(first);
3478 goto again;
a0d0e21e 3479 }
e09294f4 3480 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3481 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3482 !(r->reganch & ROPT_ANCH) )
3483 {
3484 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3485 const int type =
3486 (OP(NEXTOPER(first)) == REG_ANY)
3487 ? ROPT_ANCH_MBOL
3488 : ROPT_ANCH_SBOL;
cad2e5aa 3489 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3490 first = NEXTOPER(first);
774d564b 3491 goto again;
a0d0e21e 3492 }
b81d288d 3493 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3494 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3495 /* x+ must match at the 1st pos of run of x's */
3496 r->reganch |= ROPT_SKIP;
a0d0e21e 3497
c277df42 3498 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3499#ifdef TRIE_STUDY_OPT
3500 DEBUG_COMPILE_r(
3501 if (!restudied)
3502 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3503 (IV)(first - scan + 1))
3504 );
3505#else
3506 DEBUG_COMPILE_r(
3507 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3508 (IV)(first - scan + 1))
3509 );
3510#endif
3511
3512
a0d0e21e
LW
3513 /*
3514 * If there's something expensive in the r.e., find the
3515 * longest literal string that must appear and make it the
3516 * regmust. Resolve ties in favor of later strings, since
3517 * the regstart check works with the beginning of the r.e.
3518 * and avoiding duplication strengthens checking. Not a
3519 * strong reason, but sufficient in the absence of others.
3520 * [Now we resolve ties in favor of the earlier string if
c277df42 3521 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3522 * earlier string may buy us something the later one won't.]
3523 */
a0d0e21e 3524 minlen = 0;
a687059c 3525
396482e1
GA
3526 data.longest_fixed = newSVpvs("");
3527 data.longest_float = newSVpvs("");
3528 data.last_found = newSVpvs("");
c277df42
IZ
3529 data.longest = &(data.longest_fixed);
3530 first = scan;
653099ff 3531 if (!r->regstclass) {
830247a4 3532 cl_init(pRExC_state, &ch_class);
653099ff
GS
3533 data.start_class = &ch_class;
3534 stclass_flag = SCF_DO_STCLASS_AND;
3535 } else /* XXXX Check for BOUND? */
3536 stclass_flag = 0;
cb434fcc 3537 data.last_closep = &last_close;
653099ff 3538
830247a4 3539 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3540 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83
YO
3541
3542#ifdef TRIE_STUDY_OPT
3543 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3544 goto reStudy;
3545 }
3546#endif
3547
830247a4 3548 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3549 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3550 && !RExC_seen_zerolen
3551 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3552 r->reganch |= ROPT_CHECK_ALL;
830247a4 3553 scan_commit(pRExC_state, &data);
c277df42
IZ
3554 SvREFCNT_dec(data.last_found);
3555
a0ed51b3 3556 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3557 if (longest_float_length
c277df42
IZ
3558 || (data.flags & SF_FL_BEFORE_EOL
3559 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3560 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3561 int t;
3562
a0ed51b3 3563 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3564 && data.offset_fixed == data.offset_float_min
3565 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3566 goto remove_float; /* As in (a)+. */
3567
33b8afdf
JH
3568 if (SvUTF8(data.longest_float)) {
3569 r->float_utf8 = data.longest_float;
c445ea15 3570 r->float_substr = NULL;
33b8afdf
JH
3571 } else {
3572 r->float_substr = data.longest_float;
c445ea15 3573 r->float_utf8 = NULL;
33b8afdf 3574 }
c277df42
IZ
3575 r->float_min_offset = data.offset_float_min;
3576 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3577 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3578 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3579 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3580 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3581 }
3582 else {
aca2d497 3583 remove_float:
c445ea15 3584 r->float_substr = r->float_utf8 = NULL;
c277df42 3585 SvREFCNT_dec(data.longest_float);
c5254dd6 3586 longest_float_length = 0;
a0d0e21e 3587 }
c277df42 3588
a0ed51b3 3589 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3590 if (longest_fixed_length
c277df42
IZ
3591 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3592 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3593 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3594 int t;
3595
33b8afdf
JH
3596 if (SvUTF8(data.longest_fixed)) {
3597 r->anchored_utf8 = data.longest_fixed;
c445ea15 3598 r->anchored_substr = NULL;
33b8afdf
JH
3599 } else {
3600 r->anchored_substr = data.longest_fixed;
c445ea15 3601 r->anchored_utf8 = NULL;
33b8afdf 3602 }
c277df42 3603 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3604 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3605 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3606 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3607 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3608 }
3609 else {
c445ea15 3610 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3611 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3612 longest_fixed_length = 0;
a0d0e21e 3613 }
b81d288d 3614 if (r->regstclass
ffc61ed2 3615 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3616 r->regstclass = NULL;
33b8afdf
JH
3617 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3618 && stclass_flag
653099ff 3619 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3620 && !cl_is_anything(data.start_class))
3621 {
1df70142 3622 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3623
a02a5408 3624 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3625 struct regnode_charclass_class);
3626 StructCopy(data.start_class,
830247a4 3627 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3628 struct regnode_charclass_class);
830247a4 3629 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3630 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3631 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3632 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3633 PerlIO_printf(Perl_debug_log,
a0288114 3634 "synthetic stclass \"%s\".\n",
3f7c398e 3635 SvPVX_const(sv));});
653099ff 3636 }
c277df42
IZ
3637
3638 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3639 if (longest_fixed_length > longest_float_length) {
c277df42 3640 r->check_substr = r->anchored_substr;
33b8afdf 3641 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3642 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3643 if (r->reganch & ROPT_ANCH_SINGLE)
3644 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3645 }
3646 else {
c277df42 3647 r->check_substr = r->float_substr;
33b8afdf 3648 r->check_utf8 = r->float_utf8;
c277df42
IZ
3649 r->check_offset_min = data.offset_float_min;
3650 r->check_offset_max = data.offset_float_max;
a0d0e21e 3651 }
30382c73
IZ
3652 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3653 This should be changed ASAP! */
33b8afdf 3654 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3655 r->reganch |= RE_USE_INTUIT;
33b8afdf 3656 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3657 r->reganch |= RE_INTUIT_TAIL;
3658 }
a0ed51b3
LW
3659 }
3660 else {
c277df42
IZ
3661 /* Several toplevels. Best we can is to set minlen. */
3662 I32 fake;
653099ff 3663 struct regnode_charclass_class ch_class;
cb434fcc 3664 I32 last_close = 0;
c277df42 3665
a3621e74 3666 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3667
c277df42 3668 scan = r->program + 1;
830247a4 3669 cl_init(pRExC_state, &ch_class);
653099ff 3670 data.start_class = &ch_class;
cb434fcc 3671 data.last_closep = &last_close;
07be1b83
YO
3672
3673 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3674 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3675
3676#ifdef TRIE_STUDY_OPT
3677 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3678 goto reStudy;
3679 }
3680#endif
3681
33b8afdf 3682 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3683 = r->float_substr = r->float_utf8 = NULL;
653099ff 3684 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3685 && !cl_is_anything(data.start_class))
3686 {
1df70142 3687 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3688
a02a5408 3689 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3690 struct regnode_charclass_class);
3691 StructCopy(data.start_class,
830247a4 3692 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3693 struct regnode_charclass_class);
830247a4 3694 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3695 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3696 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3697 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3698 PerlIO_printf(Perl_debug_log,
a0288114 3699 "synthetic stclass \"%s\".\n",
3f7c398e 3700 SvPVX_const(sv));});
653099ff 3701 }
a0d0e21e
LW
3702 }
3703
a0d0e21e 3704 r->minlen = minlen;
b81d288d 3705 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3706 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3707 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3708 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3709 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3710 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3711 if (RExC_seen & REG_SEEN_CANY)
3712 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3713 Newxz(r->startp, RExC_npar, I32);
3714 Newxz(r->endp, RExC_npar, I32);
07be1b83 3715
f2278c82 3716 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
3717 DEBUG_DUMP_r({
3718 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
3719 regdump(r);
3720 });
8e9a8a48
YO
3721 DEBUG_OFFSETS_r(if (r->offsets) {
3722 const U32 len = r->offsets[0];
3723 U32 i;
3724 GET_RE_DEBUG_FLAGS_DECL;
3725 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3726 for (i = 1; i <= len; i++) {
3727 if (r->offsets[i*2-1] || r->offsets[i*2])
3728 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3729 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3730 }
3731 PerlIO_printf(Perl_debug_log, "\n");
3732 });
a0d0e21e 3733 return(r);
a687059c
LW
3734}
3735
3dab1dad
YO
3736
3737#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3738 int rem=(int)(RExC_end - RExC_parse); \
3739 int cut; \
3740 int num; \
3741 int iscut=0; \
3742 if (rem>10) { \
3743 rem=10; \
3744 iscut=1; \
3745 } \
3746 cut=10-rem; \
3747 if (RExC_lastparse!=RExC_parse) \
3748 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3749 rem, RExC_parse, \
3750 cut + 4, \
3751 iscut ? "..." : "<" \
3752 ); \
3753 else \
3754 PerlIO_printf(Perl_debug_log,"%16s",""); \
3755 \
3756 if (SIZE_ONLY) \
3757 num=RExC_size; \
3758 else \
3759 num=REG_NODE_NUM(RExC_emit); \
3760 if (RExC_lastnum!=num) \
be8e71aa 3761 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 3762 else \
be8e71aa
YO
3763 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3764 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3765 (int)((depth*2)), "", \
3dab1dad
YO
3766 (funcname) \
3767 ); \
3768 RExC_lastnum=num; \
3769 RExC_lastparse=RExC_parse; \
3770})
3771
07be1b83
YO
3772
3773
3dab1dad
YO
3774#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3775 DEBUG_PARSE_MSG((funcname)); \
3776 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3777})
a687059c
LW
3778/*
3779 - reg - regular expression, i.e. main body or parenthesized thing
3780 *
3781 * Caller must absorb opening parenthesis.
3782 *
3783 * Combining parenthesis handling with the base level of regular expression
3784 * is a trifle forced, but the need to tie the tails of the branches to what
3785 * follows makes it hard to avoid.
3786 */
07be1b83
YO
3787#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3788#ifdef DEBUGGING
3789#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3790#else
3791#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3792#endif
3dab1dad 3793
76e3520e 3794STATIC regnode *
3dab1dad 3795S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3796 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3797{
27da23d5 3798 dVAR;
c277df42
IZ
3799 register regnode *ret; /* Will be the head of the group. */
3800 register regnode *br;
3801 register regnode *lastbr;
cbbf8932 3802 register regnode *ender = NULL;
a0d0e21e 3803 register I32 parno = 0;
cbbf8932
AL
3804 I32 flags;
3805 const I32 oregflags = RExC_flags;
6136c704
AL
3806 bool have_branch = 0;
3807 bool is_open = 0;
9d1d55b5
JP
3808
3809 /* for (?g), (?gc), and (?o) warnings; warning
3810 about (?c) will warn about (?g) -- japhy */
3811
6136c704
AL
3812#define WASTED_O 0x01
3813#define WASTED_G 0x02
3814#define WASTED_C 0x04
3815#define WASTED_GC (0x02|0x04)
cbbf8932 3816 I32 wastedflags = 0x00;
9d1d55b5 3817
fac92740 3818 char * parse_start = RExC_parse; /* MJD */
a28509cc 3819 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3820
3dab1dad
YO
3821 GET_RE_DEBUG_FLAGS_DECL;
3822 DEBUG_PARSE("reg ");
3823
3824
821b33a5 3825 *flagp = 0; /* Tentatively. */
a0d0e21e 3826
9d1d55b5 3827
a0d0e21e
LW
3828 /* Make an OPEN node, if parenthesized. */
3829 if (paren) {
fac92740 3830 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3831 U32 posflags = 0, negflags = 0;
3832 U32 *flagsp = &posflags;
6136c704 3833 bool is_logical = 0;
a28509cc 3834 const char * const seqstart = RExC_parse;
ca9dfc88 3835
830247a4
IZ
3836 RExC_parse++;
3837 paren = *RExC_parse++;
c277df42 3838 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3839 switch (paren) {
fac92740 3840 case '<': /* (?<...) */
830247a4 3841 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3842 if (*RExC_parse == '!')
c277df42 3843 paren = ',';
b81d288d 3844 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3845 goto unknown;
830247a4 3846 RExC_parse++;
fac92740
MJD
3847 case '=': /* (?=...) */
3848 case '!': /* (?!...) */
830247a4 3849 RExC_seen_zerolen++;
fac92740
MJD
3850 case ':': /* (?:...) */
3851 case '>': /* (?>...) */
a0d0e21e 3852 break;
fac92740
MJD
3853 case '$': /* (?$...) */
3854 case '@': /* (?@...) */
8615cb43 3855 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3856 break;
fac92740 3857 case '#': /* (?#...) */
830247a4
IZ
3858 while (*RExC_parse && *RExC_parse != ')')
3859 RExC_parse++;
3860 if (*RExC_parse != ')')
c277df42 3861 FAIL("Sequence (?#... not terminated");
830247a4 3862 nextchar(pRExC_state);
a0d0e21e
LW
3863 *flagp = TRYAGAIN;
3864 return NULL;
fac92740 3865 case 'p': /* (?p...) */
9014280d 3866 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3867 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3868 /* FALL THROUGH*/
fac92740 3869 case '?': /* (??...) */
6136c704 3870 is_logical = 1;
438a3801
YST
3871 if (*RExC_parse != '{')
3872 goto unknown;
830247a4 3873 paren = *RExC_parse++;
0f5d15d6 3874 /* FALL THROUGH */
fac92740 3875 case '{': /* (?{...}) */
c277df42 3876 {
c277df42
IZ
3877 I32 count = 1, n = 0;
3878 char c;
830247a4 3879 char *s = RExC_parse;
c277df42 3880
830247a4
IZ
3881 RExC_seen_zerolen++;
3882 RExC_seen |= REG_SEEN_EVAL;
3883 while (count && (c = *RExC_parse)) {
6136c704
AL
3884 if (c == '\\') {
3885 if (RExC_parse[1])
3886 RExC_parse++;
3887 }
b81d288d 3888 else if (c == '{')
c277df42 3889 count++;
b81d288d 3890 else if (c == '}')
c277df42 3891 count--;
830247a4 3892 RExC_parse++;
c277df42 3893 }
6136c704 3894 if (*RExC_parse != ')') {
b81d288d 3895 RExC_parse = s;
b45f050a
JF
3896 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3897 }
c277df42 3898 if (!SIZE_ONLY) {
f3548bdc 3899 PAD *pad;
6136c704
AL
3900 OP_4tree *sop, *rop;
3901 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3902
569233ed
SB
3903 ENTER;
3904 Perl_save_re_context(aTHX);
f3548bdc 3905 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3906 sop->op_private |= OPpREFCOUNTED;
3907 /* re_dup will OpREFCNT_inc */
3908 OpREFCNT_set(sop, 1);
569233ed 3909 LEAVE;
c277df42 3910
830247a4
IZ
3911 n = add_data(pRExC_state, 3, "nop");
3912 RExC_rx->data->data[n] = (void*)rop;
3913 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3914 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3915 SvREFCNT_dec(sv);
a0ed51b3 3916 }
e24b16f9 3917 else { /* First pass */
830247a4 3918 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3919 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3920 /* No compiled RE interpolated, has runtime
3921 components ===> unsafe. */
3922 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3923 if (PL_tainting && PL_tainted)
cc6b7395 3924 FAIL("Eval-group in insecure regular expression");
54df2634 3925#if PERL_VERSION > 8
923e4eb5 3926 if (IN_PERL_COMPILETIME)
b5c19bd7 3927 PL_cv_has_eval = 1;
54df2634 3928#endif
c277df42 3929 }
b5c19bd7 3930
830247a4 3931 nextchar(pRExC_state);
6136c704 3932 if (is_logical) {
830247a4 3933 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3934 if (!SIZE_ONLY)
3935 ret->flags = 2;
3dab1dad 3936 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3937 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3938 return ret;
3939 }
ccb2c380
MP
3940 ret = reganode(pRExC_state, EVAL, n);
3941 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3942 Set_Node_Offset(ret, parse_start);
3943 return ret;
c277df42 3944 }
fac92740 3945 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3946 {
fac92740 3947 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3948 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3949 || RExC_parse[1] == '<'
830247a4 3950 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3951 I32 flag;
3952
830247a4 3953 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3954 if (!SIZE_ONLY)
3955 ret->flags = 1;
3dab1dad 3956 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3957 goto insert_if;
b81d288d 3958 }
a0ed51b3 3959 }
830247a4 3960 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3961 /* (?(1)...) */
6136c704 3962 char c;
830247a4 3963 parno = atoi(RExC_parse++);
c277df42 3964
830247a4
IZ
3965 while (isDIGIT(*RExC_parse))
3966 RExC_parse++;
fac92740 3967 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3968
830247a4 3969 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3970 vFAIL("Switch condition not recognized");
c277df42 3971 insert_if:
3dab1dad
YO
3972 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3973 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3974 if (br == NULL)
830247a4 3975 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3976 else
3dab1dad 3977 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3978 c = *nextchar(pRExC_state);
d1b80229
IZ
3979 if (flags&HASWIDTH)
3980 *flagp |= HASWIDTH;
c277df42 3981 if (c == '|') {
830247a4 3982 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3983 regbranch(pRExC_state, &flags, 1,depth+1);
3984 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3985 if (flags&HASWIDTH)
3986 *flagp |= HASWIDTH;
830247a4 3987 c = *nextchar(pRExC_state);
a0ed51b3
LW
3988 }
3989 else
c277df42
IZ
3990 lastbr = NULL;
3991 if (c != ')')
8615cb43 3992 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3993 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3994 REGTAIL(pRExC_state, br, ender);
c277df42 3995 if (lastbr) {
3dab1dad
YO
3996 REGTAIL(pRExC_state, lastbr, ender);
3997 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3998 }
3999 else
3dab1dad 4000 REGTAIL(pRExC_state, ret, ender);
c277df42 4001 return ret;
a0ed51b3
LW
4002 }
4003 else {
830247a4 4004 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4005 }
4006 }
1b1626e4 4007 case 0:
830247a4 4008 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4009 vFAIL("Sequence (? incomplete");
1b1626e4 4010 break;
a0d0e21e 4011 default:
830247a4 4012 --RExC_parse;
fac92740 4013 parse_flags: /* (?i) */
830247a4 4014 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4015 /* (?g), (?gc) and (?o) are useless here
4016 and must be globally applied -- japhy */
4017
4018 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4019 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4020 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4021 if (! (wastedflags & wflagbit) ) {
4022 wastedflags |= wflagbit;
4023 vWARN5(
4024 RExC_parse + 1,
4025 "Useless (%s%c) - %suse /%c modifier",
4026 flagsp == &negflags ? "?-" : "?",
4027 *RExC_parse,
4028 flagsp == &negflags ? "don't " : "",
4029 *RExC_parse
4030 );
4031 }
4032 }
4033 }
4034 else if (*RExC_parse == 'c') {
4035 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4036 if (! (wastedflags & WASTED_C) ) {
4037 wastedflags |= WASTED_GC;
9d1d55b5
JP
4038 vWARN3(
4039 RExC_parse + 1,
4040 "Useless (%sc) - %suse /gc modifier",
4041 flagsp == &negflags ? "?-" : "?",
4042 flagsp == &negflags ? "don't " : ""
4043 );
4044 }
4045 }
4046 }
4047 else { pmflag(flagsp, *RExC_parse); }
4048
830247a4 4049 ++RExC_parse;
ca9dfc88 4050 }
830247a4 4051 if (*RExC_parse == '-') {
ca9dfc88 4052 flagsp = &negflags;
9d1d55b5 4053 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4054 ++RExC_parse;
ca9dfc88 4055 goto parse_flags;
48c036b1 4056 }
e2509266
JH
4057 RExC_flags |= posflags;
4058 RExC_flags &= ~negflags;
830247a4
IZ
4059 if (*RExC_parse == ':') {
4060 RExC_parse++;
ca9dfc88
IZ
4061 paren = ':';
4062 break;
4063 }
c277df42 4064 unknown:
830247a4
IZ
4065 if (*RExC_parse != ')') {
4066 RExC_parse++;
4067 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4068 }
830247a4 4069 nextchar(pRExC_state);
a0d0e21e
LW
4070 *flagp = TRYAGAIN;
4071 return NULL;
4072 }
4073 }
fac92740 4074 else { /* (...) */
830247a4
IZ
4075 parno = RExC_npar;
4076 RExC_npar++;
4077 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4078 Set_Node_Length(ret, 1); /* MJD */
4079 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4080 is_open = 1;
a0d0e21e 4081 }
a0ed51b3 4082 }
fac92740 4083 else /* ! paren */
a0d0e21e
LW
4084 ret = NULL;
4085
4086 /* Pick up the branches, linking them together. */
fac92740 4087 parse_start = RExC_parse; /* MJD */
3dab1dad 4088 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4089 /* branch_len = (paren != 0); */
2af232bd 4090
a0d0e21e
LW
4091 if (br == NULL)
4092 return(NULL);
830247a4
IZ
4093 if (*RExC_parse == '|') {
4094 if (!SIZE_ONLY && RExC_extralen) {
4095 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4096 }
fac92740 4097 else { /* MJD */
830247a4 4098 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4099 Set_Node_Length(br, paren != 0);
4100 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4101 }
c277df42
IZ
4102 have_branch = 1;
4103 if (SIZE_ONLY)
830247a4 4104 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4105 }
4106 else if (paren == ':') {
c277df42
IZ
4107 *flagp |= flags&SIMPLE;
4108 }
6136c704 4109 if (is_open) { /* Starts with OPEN. */
3dab1dad 4110 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4111 }
4112 else if (paren != '?') /* Not Conditional */
a0d0e21e 4113 ret = br;
32a0ca98 4114 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4115 lastbr = br;
830247a4
IZ
4116 while (*RExC_parse == '|') {
4117 if (!SIZE_ONLY && RExC_extralen) {
4118 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4119 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4120 }
4121 if (SIZE_ONLY)
830247a4
IZ
4122 RExC_extralen += 2; /* Account for LONGJMP. */
4123 nextchar(pRExC_state);
3dab1dad 4124 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4125
a687059c 4126 if (br == NULL)
a0d0e21e 4127 return(NULL);
3dab1dad 4128 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4129 lastbr = br;
821b33a5
IZ
4130 if (flags&HASWIDTH)
4131 *flagp |= HASWIDTH;
a687059c 4132 *flagp |= flags&SPSTART;
a0d0e21e
LW
4133 }
4134
c277df42
IZ
4135 if (have_branch || paren != ':') {
4136 /* Make a closing node, and hook it on the end. */
4137 switch (paren) {
4138 case ':':
830247a4 4139 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4140 break;
4141 case 1:
830247a4 4142 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4143 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4144 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4145 break;
4146 case '<':
c277df42
IZ
4147 case ',':
4148 case '=':
4149 case '!':
c277df42 4150 *flagp &= ~HASWIDTH;
821b33a5
IZ
4151 /* FALL THROUGH */
4152 case '>':
830247a4 4153 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4154 break;
4155 case 0:
830247a4 4156 ender = reg_node(pRExC_state, END);
c277df42
IZ
4157 break;
4158 }
07be1b83 4159 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4160
9674d46a 4161 if (have_branch && !SIZE_ONLY) {
c277df42 4162 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4163 for (br = ret; br; br = regnext(br)) {
4164 const U8 op = PL_regkind[OP(br)];
4165 if (op == BRANCH) {
07be1b83 4166 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4167 }
4168 else if (op == BRANCHJ) {
07be1b83 4169 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4170 }
c277df42
IZ
4171 }
4172 }
a0d0e21e 4173 }
c277df42
IZ
4174
4175 {
e1ec3a88
AL
4176 const char *p;
4177 static const char parens[] = "=!<,>";
c277df42
IZ
4178
4179 if (paren && (p = strchr(parens, paren))) {
eb160463 4180 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4181 int flag = (p - parens) > 1;
4182
4183 if (paren == '>')
4184 node = SUSPEND, flag = 0;
830247a4 4185 reginsert(pRExC_state, node,ret);
45948336
EP
4186 Set_Node_Cur_Length(ret);
4187 Set_Node_Offset(ret, parse_start + 1);
c277df42 4188 ret->flags = flag;
07be1b83 4189 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4190 }
a0d0e21e
LW
4191 }
4192
4193 /* Check for proper termination. */
ce3e6498 4194 if (paren) {
e2509266 4195 RExC_flags = oregflags;
830247a4
IZ
4196 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4197 RExC_parse = oregcomp_parse;
380a0633 4198 vFAIL("Unmatched (");
ce3e6498 4199 }
a0ed51b3 4200 }
830247a4
IZ
4201 else if (!paren && RExC_parse < RExC_end) {
4202 if (*RExC_parse == ')') {
4203 RExC_parse++;
380a0633 4204 vFAIL("Unmatched )");
a0ed51b3
LW
4205 }
4206 else
b45f050a 4207 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4208 /* NOTREACHED */
4209 }
a687059c 4210
a0d0e21e 4211 return(ret);
a687059c
LW
4212}
4213
4214/*
4215 - regbranch - one alternative of an | operator
4216 *
4217 * Implements the concatenation operator.
4218 */
76e3520e 4219STATIC regnode *
3dab1dad 4220S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4221{
97aff369 4222 dVAR;
c277df42
IZ
4223 register regnode *ret;
4224 register regnode *chain = NULL;
4225 register regnode *latest;
4226 I32 flags = 0, c = 0;
3dab1dad
YO
4227 GET_RE_DEBUG_FLAGS_DECL;
4228 DEBUG_PARSE("brnc");
b81d288d 4229 if (first)
c277df42
IZ
4230 ret = NULL;
4231 else {
b81d288d 4232 if (!SIZE_ONLY && RExC_extralen)
830247a4 4233 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4234 else {
830247a4 4235 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4236 Set_Node_Length(ret, 1);
4237 }
c277df42
IZ
4238 }
4239
b81d288d 4240 if (!first && SIZE_ONLY)
830247a4 4241 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4242
c277df42 4243 *flagp = WORST; /* Tentatively. */
a0d0e21e 4244
830247a4
IZ
4245 RExC_parse--;
4246 nextchar(pRExC_state);
4247 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4248 flags &= ~TRYAGAIN;
3dab1dad 4249 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4250 if (latest == NULL) {
4251 if (flags & TRYAGAIN)
4252 continue;
4253 return(NULL);
a0ed51b3
LW
4254 }
4255 else if (ret == NULL)
c277df42 4256 ret = latest;
a0d0e21e 4257 *flagp |= flags&HASWIDTH;
c277df42 4258 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4259 *flagp |= flags&SPSTART;
4260 else {
830247a4 4261 RExC_naughty++;
3dab1dad 4262 REGTAIL(pRExC_state, chain, latest);
a687059c 4263 }
a0d0e21e 4264 chain = latest;
c277df42
IZ
4265 c++;
4266 }
4267 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4268 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4269 if (ret == NULL)
4270 ret = chain;
4271 }
4272 if (c == 1) {
4273 *flagp |= flags&SIMPLE;
a0d0e21e 4274 }
a687059c 4275
d4c19fe8 4276 return ret;
a687059c
LW
4277}
4278
4279/*
4280 - regpiece - something followed by possible [*+?]
4281 *
4282 * Note that the branching code sequences used for ? and the general cases
4283 * of * and + are somewhat optimized: they use the same NOTHING node as
4284 * both the endmarker for their branch list and the body of the last branch.
4285 * It might seem that this node could be dispensed with entirely, but the
4286 * endmarker role is not redundant.
4287 */
76e3520e 4288STATIC regnode *
3dab1dad 4289S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4290{
97aff369 4291 dVAR;
c277df42 4292 register regnode *ret;
a0d0e21e
LW
4293 register char op;
4294 register char *next;
4295 I32 flags;
1df70142 4296 const char * const origparse = RExC_parse;
a0d0e21e 4297 I32 min;
c277df42 4298 I32 max = REG_INFTY;
fac92740 4299 char *parse_start;
10edeb5d 4300 const char *maxpos = NULL;
3dab1dad
YO
4301 GET_RE_DEBUG_FLAGS_DECL;
4302 DEBUG_PARSE("piec");
a0d0e21e 4303
3dab1dad 4304 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4305 if (ret == NULL) {
4306 if (flags & TRYAGAIN)
4307 *flagp |= TRYAGAIN;
4308 return(NULL);
4309 }
4310
830247a4 4311 op = *RExC_parse;
a0d0e21e 4312
830247a4 4313 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 4314 maxpos = NULL;
fac92740 4315 parse_start = RExC_parse; /* MJD */
830247a4 4316 next = RExC_parse + 1;
a0d0e21e
LW
4317 while (isDIGIT(*next) || *next == ',') {
4318 if (*next == ',') {
4319 if (maxpos)
4320 break;
4321 else
4322 maxpos = next;
a687059c 4323 }
a0d0e21e
LW
4324 next++;
4325 }
4326 if (*next == '}') { /* got one */
4327 if (!maxpos)
4328 maxpos = next;
830247a4
IZ
4329 RExC_parse++;
4330 min = atoi(RExC_parse);
a0d0e21e
LW
4331 if (*maxpos == ',')
4332 maxpos++;
4333 else
830247a4 4334 maxpos = RExC_parse;
a0d0e21e
LW
4335 max = atoi(maxpos);
4336 if (!max && *maxpos != '0')
c277df42
IZ
4337 max = REG_INFTY; /* meaning "infinity" */
4338 else if (max >= REG_INFTY)
8615cb43 4339 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4340 RExC_parse = next;
4341 nextchar(pRExC_state);
a0d0e21e
LW
4342
4343 do_curly:
4344 if ((flags&SIMPLE)) {
830247a4
IZ
4345 RExC_naughty += 2 + RExC_naughty / 2;
4346 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4347 Set_Node_Offset(ret, parse_start+1); /* MJD */
4348 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4349 }
4350 else {
3dab1dad 4351 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4352
4353 w->flags = 0;
3dab1dad 4354 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4355 if (!SIZE_ONLY && RExC_extralen) {
4356 reginsert(pRExC_state, LONGJMP,ret);
4357 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4358 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4359 }
830247a4 4360 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4361 /* MJD hk */
4362 Set_Node_Offset(ret, parse_start+1);
2af232bd 4363 Set_Node_Length(ret,
fac92740 4364 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4365
830247a4 4366 if (!SIZE_ONLY && RExC_extralen)
c277df42 4367 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4368 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4369 if (SIZE_ONLY)
830247a4
IZ
4370 RExC_whilem_seen++, RExC_extralen += 3;
4371 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4372 }
c277df42 4373 ret->flags = 0;
a0d0e21e
LW
4374
4375 if (min > 0)
821b33a5
IZ
4376 *flagp = WORST;
4377 if (max > 0)
4378 *flagp |= HASWIDTH;
a0d0e21e 4379 if (max && max < min)
8615cb43 4380 vFAIL("Can't do {n,m} with n > m");
c277df42 4381 if (!SIZE_ONLY) {
eb160463
GS
4382 ARG1_SET(ret, (U16)min);
4383 ARG2_SET(ret, (U16)max);
a687059c 4384 }
a687059c 4385
a0d0e21e 4386 goto nest_check;
a687059c 4387 }
a0d0e21e 4388 }
a687059c 4389
a0d0e21e
LW
4390 if (!ISMULT1(op)) {
4391 *flagp = flags;
a687059c 4392 return(ret);
a0d0e21e 4393 }
bb20fd44 4394
c277df42 4395#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4396
4397 /* if this is reinstated, don't forget to put this back into perldiag:
4398
4399 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4400
4401 (F) The part of the regexp subject to either the * or + quantifier
4402 could match an empty string. The {#} shows in the regular
4403 expression about where the problem was discovered.
4404
4405 */
4406
bb20fd44 4407 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4408 vFAIL("Regexp *+ operand could be empty");
b81d288d 4409#endif
bb20fd44 4410
fac92740 4411 parse_start = RExC_parse;
830247a4 4412 nextchar(pRExC_state);
a0d0e21e 4413
821b33a5 4414 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4415
4416 if (op == '*' && (flags&SIMPLE)) {
830247a4 4417 reginsert(pRExC_state, STAR, ret);
c277df42 4418 ret->flags = 0;
830247a4 4419 RExC_naughty += 4;
a0d0e21e
LW
4420 }
4421 else if (op == '*') {
4422 min = 0;
4423 goto do_curly;
a0ed51b3
LW
4424 }
4425 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4426 reginsert(pRExC_state, PLUS, ret);
c277df42 4427 ret->flags = 0;
830247a4 4428 RExC_naughty += 3;
a0d0e21e
LW
4429 }
4430 else if (op == '+') {
4431 min = 1;
4432 goto do_curly;
a0ed51b3
LW
4433 }
4434 else if (op == '?') {
a0d0e21e
LW
4435 min = 0; max = 1;
4436 goto do_curly;
4437 }
4438 nest_check:
041457d9 4439 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4440 vWARN3(RExC_parse,
b45f050a 4441 "%.*s matches null string many times",
afd78fd5 4442 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4443 origparse);
a0d0e21e
LW
4444 }
4445
830247a4
IZ
4446 if (*RExC_parse == '?') {
4447 nextchar(pRExC_state);
4448 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4449 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4450 }
830247a4
IZ
4451 if (ISMULT2(RExC_parse)) {
4452 RExC_parse++;
b45f050a
JF
4453 vFAIL("Nested quantifiers");
4454 }
a0d0e21e
LW
4455
4456 return(ret);
a687059c
LW
4457}
4458
4459/*
4460 - regatom - the lowest level
4461 *
4462 * Optimization: gobbles an entire sequence of ordinary characters so that
4463 * it can turn them into a single node, which is smaller to store and
4464 * faster to run. Backslashed characters are exceptions, each becoming a
4465 * separate node; the code is simpler that way and it's not worth fixing.
4466 *
7f6f358c
YO
4467 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4468 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4469 */
76e3520e 4470STATIC regnode *
3dab1dad 4471S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4472{
97aff369 4473 dVAR;
cbbf8932 4474 register regnode *ret = NULL;
a0d0e21e 4475 I32 flags;
45948336 4476 char *parse_start = RExC_parse;
3dab1dad
YO
4477 GET_RE_DEBUG_FLAGS_DECL;
4478 DEBUG_PARSE("atom");
a0d0e21e
LW
4479 *flagp = WORST; /* Tentatively. */
4480
4481tryagain:
830247a4 4482 switch (*RExC_parse) {
a0d0e21e 4483 case '^':
830247a4
IZ
4484 RExC_seen_zerolen++;
4485 nextchar(pRExC_state);
e2509266 4486 if (RExC_flags & PMf_MULTILINE)
830247a4 4487 ret = reg_node(pRExC_state, MBOL);
e2509266 4488 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4489 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4490 else
830247a4 4491 ret = reg_node(pRExC_state, BOL);
fac92740 4492 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4493 break;
4494 case '$':
830247a4 4495 nextchar(pRExC_state);
b81d288d 4496 if (*RExC_parse)
830247a4 4497 RExC_seen_zerolen++;
e2509266 4498 if (RExC_flags & PMf_MULTILINE)
830247a4 4499 ret = reg_node(pRExC_state, MEOL);
e2509266 4500 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4501 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4502 else
830247a4 4503 ret = reg_node(pRExC_state, EOL);
fac92740 4504 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4505 break;
4506 case '.':
830247a4 4507 nextchar(pRExC_state);
e2509266 4508 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4509 ret = reg_node(pRExC_state, SANY);
4510 else
4511 ret = reg_node(pRExC_state, REG_ANY);
4512 *flagp |= HASWIDTH|SIMPLE;
830247a4 4513 RExC_naughty++;
fac92740 4514 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4515 break;
4516 case '[':
b45f050a 4517 {
3dab1dad
YO
4518 char * const oregcomp_parse = ++RExC_parse;
4519 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4520 if (*RExC_parse != ']') {
4521 RExC_parse = oregcomp_parse;
b45f050a
JF
4522 vFAIL("Unmatched [");
4523 }
830247a4 4524 nextchar(pRExC_state);
a0d0e21e 4525 *flagp |= HASWIDTH|SIMPLE;
fac92740 4526 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4527 break;
b45f050a 4528 }
a0d0e21e 4529 case '(':
830247a4 4530 nextchar(pRExC_state);
3dab1dad 4531 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4532 if (ret == NULL) {
bf93d4cc 4533 if (flags & TRYAGAIN) {
830247a4 4534 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4535 /* Make parent create an empty node if needed. */
4536 *flagp |= TRYAGAIN;
4537 return(NULL);
4538 }
a0d0e21e 4539 goto tryagain;
bf93d4cc 4540 }
a0d0e21e
LW
4541 return(NULL);
4542 }
c277df42 4543 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4544 break;
4545 case '|':
4546 case ')':
4547 if (flags & TRYAGAIN) {
4548 *flagp |= TRYAGAIN;
4549 return NULL;
4550 }
b45f050a 4551 vFAIL("Internal urp");
a0d0e21e
LW
4552 /* Supposed to be caught earlier. */
4553 break;
85afd4ae 4554 case '{':
830247a4
IZ
4555 if (!regcurly(RExC_parse)) {
4556 RExC_parse++;
85afd4ae
CS
4557 goto defchar;
4558 }
4559 /* FALL THROUGH */
a0d0e21e
LW
4560 case '?':
4561 case '+':
4562 case '*':
830247a4 4563 RExC_parse++;
b45f050a 4564 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4565 break;
4566 case '\\':
830247a4 4567 switch (*++RExC_parse) {
a0d0e21e 4568 case 'A':
830247a4
IZ
4569 RExC_seen_zerolen++;
4570 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4571 *flagp |= SIMPLE;
830247a4 4572 nextchar(pRExC_state);
fac92740 4573 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4574 break;
4575 case 'G':
830247a4
IZ
4576 ret = reg_node(pRExC_state, GPOS);
4577 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4578 *flagp |= SIMPLE;
830247a4 4579 nextchar(pRExC_state);
fac92740 4580 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4581 break;
4582 case 'Z':
830247a4 4583 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4584 *flagp |= SIMPLE;
a1917ab9 4585 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4586 nextchar(pRExC_state);
a0d0e21e 4587 break;
b85d18e9 4588 case 'z':
830247a4 4589 ret = reg_node(pRExC_state, EOS);
b85d18e9 4590 *flagp |= SIMPLE;
830247a4
IZ
4591 RExC_seen_zerolen++; /* Do not optimize RE away */
4592 nextchar(pRExC_state);
fac92740 4593 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4594 break;
4a2d328f 4595 case 'C':
f33976b4
DB
4596 ret = reg_node(pRExC_state, CANY);
4597 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4598 *flagp |= HASWIDTH|SIMPLE;
830247a4 4599 nextchar(pRExC_state);
fac92740 4600 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4601 break;
4602 case 'X':
830247a4 4603 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4604 *flagp |= HASWIDTH;
830247a4 4605 nextchar(pRExC_state);
fac92740 4606 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4607 break;
a0d0e21e 4608 case 'w':
eb160463 4609 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4610 *flagp |= HASWIDTH|SIMPLE;
830247a4 4611 nextchar(pRExC_state);
fac92740 4612 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4613 break;
4614 case 'W':
eb160463 4615 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4616 *flagp |= HASWIDTH|SIMPLE;
830247a4 4617 nextchar(pRExC_state);
fac92740 4618 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4619 break;
4620 case 'b':
830247a4
IZ
4621 RExC_seen_zerolen++;
4622 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4623 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4624 *flagp |= SIMPLE;
830247a4 4625 nextchar(pRExC_state);
fac92740 4626 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4627 break;
4628 case 'B':
830247a4
IZ
4629 RExC_seen_zerolen++;
4630 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4631 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4632 *flagp |= SIMPLE;
830247a4 4633 nextchar(pRExC_state);
fac92740 4634 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4635 break;
4636 case 's':
eb160463 4637 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4638 *flagp |= HASWIDTH|SIMPLE;
830247a4 4639 nextchar(pRExC_state);
fac92740 4640 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4641 break;
4642 case 'S':
eb160463 4643 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4644 *flagp |= HASWIDTH|SIMPLE;
830247a4 4645 nextchar(pRExC_state);
fac92740 4646 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4647 break;
4648 case 'd':
ffc61ed2 4649 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4650 *flagp |= HASWIDTH|SIMPLE;
830247a4 4651 nextchar(pRExC_state);
fac92740 4652 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4653 break;
4654 case 'D':
ffc61ed2 4655 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4656 *flagp |= HASWIDTH|SIMPLE;
830247a4 4657 nextchar(pRExC_state);
fac92740 4658 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4659 break;
a14b48bc
LW
4660 case 'p':
4661 case 'P':
3568d838 4662 {
3dab1dad 4663 char* const oldregxend = RExC_end;
ccb2c380 4664 char* parse_start = RExC_parse - 2;
a14b48bc 4665
830247a4 4666 if (RExC_parse[1] == '{') {
3568d838 4667 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4668 RExC_end = strchr(RExC_parse, '}');
4669 if (!RExC_end) {
3dab1dad 4670 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4671 RExC_parse += 2;
4672 RExC_end = oldregxend;
0da60cf5 4673 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4674 }
830247a4 4675 RExC_end++;
a14b48bc 4676 }
af6f566e 4677 else {
830247a4 4678 RExC_end = RExC_parse + 2;
af6f566e
HS
4679 if (RExC_end > oldregxend)
4680 RExC_end = oldregxend;
4681 }
830247a4 4682 RExC_parse--;
a14b48bc 4683
3dab1dad 4684 ret = regclass(pRExC_state,depth+1);
a14b48bc 4685
830247a4
IZ
4686 RExC_end = oldregxend;
4687 RExC_parse--;
ccb2c380
MP
4688
4689 Set_Node_Offset(ret, parse_start + 2);
4690 Set_Node_Cur_Length(ret);
830247a4 4691 nextchar(pRExC_state);
a14b48bc
LW
4692 *flagp |= HASWIDTH|SIMPLE;
4693 }
4694 break;
a0d0e21e
LW
4695 case 'n':
4696 case 'r':
4697 case 't':
4698 case 'f':
4699 case 'e':
4700 case 'a':
4701 case 'x':
4702 case 'c':
4703 case '0':
4704 goto defchar;
4705 case '1': case '2': case '3': case '4':
4706 case '5': case '6': case '7': case '8': case '9':
4707 {
1df70142 4708 const I32 num = atoi(RExC_parse);
a0d0e21e 4709
830247a4 4710 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4711 goto defchar;
4712 else {
3dab1dad 4713 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4714 while (isDIGIT(*RExC_parse))
4715 RExC_parse++;
b45f050a 4716
eb160463 4717 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4718 vFAIL("Reference to nonexistent group");
830247a4 4719 RExC_sawback = 1;
eb160463
GS
4720 ret = reganode(pRExC_state,
4721 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4722 num);
a0d0e21e 4723 *flagp |= HASWIDTH;
2af232bd 4724
fac92740 4725 /* override incorrect value set in reganode MJD */
2af232bd 4726 Set_Node_Offset(ret, parse_start+1);
fac92740 4727 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4728 RExC_parse--;
4729 nextchar(pRExC_state);
a0d0e21e
LW
4730 }
4731 }
4732 break;
4733 case '\0':
830247a4 4734 if (RExC_parse >= RExC_end)
b45f050a 4735 FAIL("Trailing \\");
a0d0e21e
LW
4736 /* FALL THROUGH */
4737 default:
a0288114 4738 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4739 back into the quick-grab loop below */
45948336 4740 parse_start--;
a0d0e21e
LW
4741 goto defchar;
4742 }
4743 break;
4633a7c4
LW
4744
4745 case '#':
e2509266 4746 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4747 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4748 RExC_parse++;
830247a4 4749 if (RExC_parse < RExC_end)
4633a7c4
LW
4750 goto tryagain;
4751 }
4752 /* FALL THROUGH */
4753
a0d0e21e 4754 default: {
ba210ebe 4755 register STRLEN len;
58ae7d3f 4756 register UV ender;
a0d0e21e 4757 register char *p;
3dab1dad 4758 char *s;
80aecb99 4759 STRLEN foldlen;
89ebb4a3 4760 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4761
4762 parse_start = RExC_parse - 1;
a0d0e21e 4763
830247a4 4764 RExC_parse++;
a0d0e21e
LW
4765
4766 defchar:
58ae7d3f 4767 ender = 0;
eb160463
GS
4768 ret = reg_node(pRExC_state,
4769 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4770 s = STRING(ret);
830247a4
IZ
4771 for (len = 0, p = RExC_parse - 1;
4772 len < 127 && p < RExC_end;
a0d0e21e
LW
4773 len++)
4774 {
3dab1dad 4775 char * const oldp = p;
5b5a24f7 4776
e2509266 4777 if (RExC_flags & PMf_EXTENDED)
830247a4 4778 p = regwhite(p, RExC_end);
a0d0e21e
LW
4779 switch (*p) {
4780 case '^':
4781 case '$':
4782 case '.':
4783 case '[':
4784 case '(':
4785 case ')':
4786 case '|':
4787 goto loopdone;
4788 case '\\':
4789 switch (*++p) {
4790 case 'A':
1ed8eac0
JF
4791 case 'C':
4792 case 'X':
a0d0e21e
LW
4793 case 'G':
4794 case 'Z':
b85d18e9 4795 case 'z':
a0d0e21e
LW
4796 case 'w':
4797 case 'W':
4798 case 'b':
4799 case 'B':
4800 case 's':
4801 case 'S':
4802 case 'd':
4803 case 'D':
a14b48bc
LW
4804 case 'p':
4805 case 'P':
a0d0e21e
LW
4806 --p;
4807 goto loopdone;
4808 case 'n':
4809 ender = '\n';
4810 p++;
a687059c 4811 break;
a0d0e21e
LW
4812 case 'r':
4813 ender = '\r';
4814 p++;
a687059c 4815 break;
a0d0e21e
LW
4816 case 't':
4817 ender = '\t';
4818 p++;
a687059c 4819 break;
a0d0e21e
LW
4820 case 'f':
4821 ender = '\f';
4822 p++;
a687059c 4823 break;
a0d0e21e 4824 case 'e':
c7f1f016 4825 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4826 p++;
a687059c 4827 break;
a0d0e21e 4828 case 'a':
c7f1f016 4829 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4830 p++;
a687059c 4831 break;
a0d0e21e 4832 case 'x':
a0ed51b3 4833 if (*++p == '{') {
1df70142 4834 char* const e = strchr(p, '}');
b81d288d 4835
b45f050a 4836 if (!e) {
830247a4 4837 RExC_parse = p + 1;
b45f050a
JF
4838 vFAIL("Missing right brace on \\x{}");
4839 }
de5f0749 4840 else {
a4c04bdc
NC
4841 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4842 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4843 STRLEN numlen = e - p - 1;
53305cf1 4844 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4845 if (ender > 0xff)
4846 RExC_utf8 = 1;
a0ed51b3
LW
4847 p = e + 1;
4848 }
a0ed51b3
LW
4849 }
4850 else {
a4c04bdc 4851 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4852 STRLEN numlen = 2;
53305cf1 4853 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4854 p += numlen;
4855 }
a687059c 4856 break;
a0d0e21e
LW
4857 case 'c':
4858 p++;
bbce6d69 4859 ender = UCHARAT(p++);
4860 ender = toCTRL(ender);
a687059c 4861 break;
a0d0e21e
LW
4862 case '0': case '1': case '2': case '3':case '4':
4863 case '5': case '6': case '7': case '8':case '9':
4864 if (*p == '0' ||
830247a4 4865 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4866 I32 flags = 0;
1df70142 4867 STRLEN numlen = 3;
53305cf1 4868 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4869 p += numlen;
4870 }
4871 else {
4872 --p;
4873 goto loopdone;
a687059c
LW
4874 }
4875 break;
a0d0e21e 4876 case '\0':
830247a4 4877 if (p >= RExC_end)
b45f050a 4878 FAIL("Trailing \\");
a687059c 4879 /* FALL THROUGH */
a0d0e21e 4880 default:
041457d9 4881 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4882 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4883 goto normal_default;
a0d0e21e
LW
4884 }
4885 break;
a687059c 4886 default:
a0ed51b3 4887 normal_default:
fd400ab9 4888 if (UTF8_IS_START(*p) && UTF) {
1df70142 4889 STRLEN numlen;
5e12f4fb 4890 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4891 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4892 p += numlen;
4893 }
4894 else
4895 ender = *p++;
a0d0e21e 4896 break;
a687059c 4897 }
e2509266 4898 if (RExC_flags & PMf_EXTENDED)
830247a4 4899 p = regwhite(p, RExC_end);
60a8b682
JH
4900 if (UTF && FOLD) {
4901 /* Prime the casefolded buffer. */
ac7e0132 4902 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4903 }
a0d0e21e
LW
4904 if (ISMULT2(p)) { /* Back off on ?+*. */
4905 if (len)
4906 p = oldp;
16ea2a2e 4907 else if (UTF) {
80aecb99 4908 if (FOLD) {
60a8b682 4909 /* Emit all the Unicode characters. */
1df70142 4910 STRLEN numlen;
80aecb99
JH
4911 for (foldbuf = tmpbuf;
4912 foldlen;
4913 foldlen -= numlen) {
4914 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4915 if (numlen > 0) {
71207a34 4916 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4917 s += unilen;
4918 len += unilen;
4919 /* In EBCDIC the numlen
4920 * and unilen can differ. */
9dc45d57 4921 foldbuf += numlen;
47654450
JH
4922 if (numlen >= foldlen)
4923 break;
9dc45d57
JH
4924 }
4925 else
4926 break; /* "Can't happen." */
80aecb99
JH
4927 }
4928 }
4929 else {
71207a34 4930 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4931 if (unilen > 0) {
0ebc6274
JH
4932 s += unilen;
4933 len += unilen;
9dc45d57 4934 }
80aecb99 4935 }
a0ed51b3 4936 }
a0d0e21e
LW
4937 else {
4938 len++;
eb160463 4939 REGC((char)ender, s++);
a0d0e21e
LW
4940 }
4941 break;
a687059c 4942 }
16ea2a2e 4943 if (UTF) {
80aecb99 4944 if (FOLD) {
60a8b682 4945 /* Emit all the Unicode characters. */
1df70142 4946 STRLEN numlen;
80aecb99
JH
4947 for (foldbuf = tmpbuf;
4948 foldlen;
4949 foldlen -= numlen) {
4950 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4951 if (numlen > 0) {
71207a34 4952 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4953 len += unilen;
4954 s += unilen;
4955 /* In EBCDIC the numlen
4956 * and unilen can differ. */
9dc45d57 4957 foldbuf += numlen;
47654450
JH
4958 if (numlen >= foldlen)
4959 break;
9dc45d57
JH
4960 }
4961 else
4962 break;
80aecb99
JH
4963 }
4964 }
4965 else {
71207a34 4966 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4967 if (unilen > 0) {
0ebc6274
JH
4968 s += unilen;
4969 len += unilen;
9dc45d57 4970 }
80aecb99
JH
4971 }
4972 len--;
a0ed51b3
LW
4973 }
4974 else
eb160463 4975 REGC((char)ender, s++);
a0d0e21e
LW
4976 }
4977 loopdone:
830247a4 4978 RExC_parse = p - 1;
fac92740 4979 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4980 nextchar(pRExC_state);
793db0cb
JH
4981 {
4982 /* len is STRLEN which is unsigned, need to copy to signed */
4983 IV iv = len;
4984 if (iv < 0)
4985 vFAIL("Internal disaster");
4986 }
a0d0e21e
LW
4987 if (len > 0)
4988 *flagp |= HASWIDTH;
090f7165 4989 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4990 *flagp |= SIMPLE;
3dab1dad 4991
cd439c50 4992 if (SIZE_ONLY)
830247a4 4993 RExC_size += STR_SZ(len);
3dab1dad
YO
4994 else {
4995 STR_LEN(ret) = len;
830247a4 4996 RExC_emit += STR_SZ(len);
07be1b83 4997 }
3dab1dad 4998 }
a0d0e21e
LW
4999 break;
5000 }
a687059c 5001
60a8b682
JH
5002 /* If the encoding pragma is in effect recode the text of
5003 * any EXACT-kind nodes. */
3dab1dad
YO
5004 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5005 const STRLEN oldlen = STR_LEN(ret);
5006 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
5007
5008 if (RExC_utf8)
5009 SvUTF8_on(sv);
5010 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5011 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5012 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5013
5014 if (SvUTF8(sv))
5015 RExC_utf8 = 1;
5016 if (!SIZE_ONLY) {
a3621e74
YO
5017 GET_RE_DEBUG_FLAGS_DECL;
5018 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5019 (int)oldlen, STRING(ret),
5020 (int)newlen, s));
5021 Copy(s, STRING(ret), newlen, char);
5022 STR_LEN(ret) += newlen - oldlen;
5023 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5024 } else
5025 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5026 }
a72c7584
JH
5027 }
5028
a0d0e21e 5029 return(ret);
a687059c
LW
5030}
5031
873ef191 5032STATIC char *
5f66b61c 5033S_regwhite(char *p, const char *e)
5b5a24f7
CS
5034{
5035 while (p < e) {
5036 if (isSPACE(*p))
5037 ++p;
5038 else if (*p == '#') {
5039 do {
5040 p++;
5041 } while (p < e && *p != '\n');
5042 }
5043 else
5044 break;
5045 }
5046 return p;
5047}
5048
b8c5462f
JH
5049/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5050 Character classes ([:foo:]) can also be negated ([:^foo:]).
5051 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5052 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5053 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5054
5055#define POSIXCC_DONE(c) ((c) == ':')
5056#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5057#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5058
b8c5462f 5059STATIC I32
830247a4 5060S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5061{
97aff369 5062 dVAR;
936ed897 5063 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5064
830247a4 5065 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5066 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5067 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5068 const char c = UCHARAT(RExC_parse);
097eb12c 5069 char* const s = RExC_parse++;
b81d288d 5070
9a86a77b 5071 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5072 RExC_parse++;
5073 if (RExC_parse == RExC_end)
620e46c5 5074 /* Grandfather lone [:, [=, [. */
830247a4 5075 RExC_parse = s;
620e46c5 5076 else {
3dab1dad 5077 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5078 assert(*t == c);
5079
9a86a77b 5080 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5081 const char *posixcc = s + 1;
830247a4 5082 RExC_parse++; /* skip over the ending ] */
3dab1dad 5083
b8c5462f 5084 if (*s == ':') {
1df70142
AL
5085 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5086 const I32 skip = t - posixcc;
80916619
NC
5087
5088 /* Initially switch on the length of the name. */
5089 switch (skip) {
5090 case 4:
3dab1dad
YO
5091 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5092 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5093 break;
80916619
NC
5094 case 5:
5095 /* Names all of length 5. */
5096 /* alnum alpha ascii blank cntrl digit graph lower
5097 print punct space upper */
5098 /* Offset 4 gives the best switch position. */
5099 switch (posixcc[4]) {
5100 case 'a':
3dab1dad
YO
5101 if (memEQ(posixcc, "alph", 4)) /* alpha */
5102 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5103 break;
5104 case 'e':
3dab1dad
YO
5105 if (memEQ(posixcc, "spac", 4)) /* space */
5106 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5107 break;
5108 case 'h':
3dab1dad
YO
5109 if (memEQ(posixcc, "grap", 4)) /* graph */
5110 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5111 break;
5112 case 'i':
3dab1dad
YO
5113 if (memEQ(posixcc, "asci", 4)) /* ascii */
5114 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5115 break;
5116 case 'k':
3dab1dad
YO
5117 if (memEQ(posixcc, "blan", 4)) /* blank */
5118 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5119 break;
5120 case 'l':
3dab1dad
YO
5121 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5122 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5123 break;
5124 case 'm':
3dab1dad
YO
5125 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5126 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5127 break;
5128 case 'r':
3dab1dad
YO
5129 if (memEQ(posixcc, "lowe", 4)) /* lower */
5130 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5131 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5132 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5133 break;
5134 case 't':
3dab1dad
YO
5135 if (memEQ(posixcc, "digi", 4)) /* digit */
5136 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5137 else if (memEQ(posixcc, "prin", 4)) /* print */
5138 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5139 else if (memEQ(posixcc, "punc", 4)) /* punct */
5140 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5141 break;
b8c5462f
JH
5142 }
5143 break;
80916619 5144 case 6:
3dab1dad
YO
5145 if (memEQ(posixcc, "xdigit", 6))
5146 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5147 break;
5148 }
80916619
NC
5149
5150 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5151 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5152 t - s - 1, s + 1);
80916619
NC
5153 assert (posixcc[skip] == ':');
5154 assert (posixcc[skip+1] == ']');
b45f050a 5155 } else if (!SIZE_ONLY) {
b8c5462f 5156 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5157
830247a4 5158 /* adjust RExC_parse so the warning shows after
b45f050a 5159 the class closes */
9a86a77b 5160 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5161 RExC_parse++;
b45f050a
JF
5162 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5163 }
b8c5462f
JH
5164 } else {
5165 /* Maternal grandfather:
5166 * "[:" ending in ":" but not in ":]" */
830247a4 5167 RExC_parse = s;
767d463e 5168 }
620e46c5
JH
5169 }
5170 }
5171
b8c5462f
JH
5172 return namedclass;
5173}
5174
5175STATIC void
830247a4 5176S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5177{
97aff369 5178 dVAR;
3dab1dad 5179 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5180 const char *s = RExC_parse;
5181 const char c = *s++;
b8c5462f 5182
3dab1dad 5183 while (isALNUM(*s))
b8c5462f
JH
5184 s++;
5185 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5186 if (ckWARN(WARN_REGEXP))
5187 vWARN3(s+2,
5188 "POSIX syntax [%c %c] belongs inside character classes",
5189 c, c);
b45f050a
JF
5190
5191 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5192 if (POSIXCC_NOTYET(c)) {
830247a4 5193 /* adjust RExC_parse so the error shows after
b45f050a 5194 the class closes */
9a86a77b 5195 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5196 NOOP;
b45f050a
JF
5197 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5198 }
b8c5462f
JH
5199 }
5200 }
620e46c5
JH
5201}
5202
7f6f358c
YO
5203
5204/*
5205 parse a class specification and produce either an ANYOF node that
5206 matches the pattern. If the pattern matches a single char only and
5207 that char is < 256 then we produce an EXACT node instead.
5208*/
76e3520e 5209STATIC regnode *
3dab1dad 5210S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5211{
97aff369 5212 dVAR;
07be1b83 5213 register UV value;
9a86a77b 5214 register UV nextvalue;
3568d838 5215 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5216 register IV range = 0;
c277df42 5217 register regnode *ret;
ba210ebe 5218 STRLEN numlen;
ffc61ed2 5219 IV namedclass;
cbbf8932 5220 char *rangebegin = NULL;
936ed897 5221 bool need_class = 0;
c445ea15 5222 SV *listsv = NULL;
ffc61ed2 5223 UV n;
9e55ce06 5224 bool optimize_invert = TRUE;
cbbf8932 5225 AV* unicode_alternate = NULL;
1b2d223b
JH
5226#ifdef EBCDIC
5227 UV literal_endpoint = 0;
5228#endif
7f6f358c 5229 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5230
3dab1dad 5231 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5232 case we need to change the emitted regop to an EXACT. */
07be1b83 5233 const char * orig_parse = RExC_parse;
72f13be8 5234 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
5235#ifndef DEBUGGING
5236 PERL_UNUSED_ARG(depth);
5237#endif
72f13be8 5238
3dab1dad 5239 DEBUG_PARSE("clas");
7f6f358c
YO
5240
5241 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5242 ret = reganode(pRExC_state, ANYOF, 0);
5243
5244 if (!SIZE_ONLY)
5245 ANYOF_FLAGS(ret) = 0;
5246
9a86a77b 5247 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5248 RExC_naughty++;
5249 RExC_parse++;
5250 if (!SIZE_ONLY)
5251 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5252 }
a0d0e21e 5253
73060fc4 5254 if (SIZE_ONLY) {
830247a4 5255 RExC_size += ANYOF_SKIP;
73060fc4
JH
5256 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5257 }
936ed897 5258 else {
830247a4 5259 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5260 if (FOLD)
5261 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5262 if (LOC)
5263 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5264 ANYOF_BITMAP_ZERO(ret);
396482e1 5265 listsv = newSVpvs("# comment\n");
a0d0e21e 5266 }
b8c5462f 5267
9a86a77b
JH
5268 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5269
b938889d 5270 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5271 checkposixcc(pRExC_state);
b8c5462f 5272
f064b6ad
HS
5273 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5274 if (UCHARAT(RExC_parse) == ']')
5275 goto charclassloop;
ffc61ed2 5276
9a86a77b 5277 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5278
5279 charclassloop:
5280
5281 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5282
73b437c8 5283 if (!range)
830247a4 5284 rangebegin = RExC_parse;
ffc61ed2 5285 if (UTF) {
5e12f4fb 5286 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5287 RExC_end - RExC_parse,
9f7f3913 5288 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5289 RExC_parse += numlen;
5290 }
5291 else
5292 value = UCHARAT(RExC_parse++);
7f6f358c 5293
9a86a77b
JH
5294 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5295 if (value == '[' && POSIXCC(nextvalue))
830247a4 5296 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5297 else if (value == '\\') {
ffc61ed2 5298 if (UTF) {
5e12f4fb 5299 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5300 RExC_end - RExC_parse,
9f7f3913 5301 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5302 RExC_parse += numlen;
5303 }
5304 else
5305 value = UCHARAT(RExC_parse++);
470c3474 5306 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5307 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5308 * be a problem later if we want switch on Unicode.
5309 * A similar issue a little bit later when switching on
5310 * namedclass. --jhi */
ffc61ed2 5311 switch ((I32)value) {
b8c5462f
JH
5312 case 'w': namedclass = ANYOF_ALNUM; break;
5313 case 'W': namedclass = ANYOF_NALNUM; break;
5314 case 's': namedclass = ANYOF_SPACE; break;
5315 case 'S': namedclass = ANYOF_NSPACE; break;
5316 case 'd': namedclass = ANYOF_DIGIT; break;
5317 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5318 case 'p':
5319 case 'P':
3dab1dad
YO
5320 {
5321 char *e;
af6f566e 5322 if (RExC_parse >= RExC_end)
2a4859cd 5323 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5324 if (*RExC_parse == '{') {
1df70142 5325 const U8 c = (U8)value;
ffc61ed2
JH
5326 e = strchr(RExC_parse++, '}');
5327 if (!e)
0da60cf5 5328 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5329 while (isSPACE(UCHARAT(RExC_parse)))
5330 RExC_parse++;
5331 if (e == RExC_parse)
0da60cf5 5332 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5333 n = e - RExC_parse;
ab13f0c7
JH
5334 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5335 n--;
ffc61ed2
JH
5336 }
5337 else {
5338 e = RExC_parse;
5339 n = 1;
5340 }
5341 if (!SIZE_ONLY) {
ab13f0c7
JH
5342 if (UCHARAT(RExC_parse) == '^') {
5343 RExC_parse++;
5344 n--;
5345 value = value == 'p' ? 'P' : 'p'; /* toggle */
5346 while (isSPACE(UCHARAT(RExC_parse))) {
5347 RExC_parse++;
5348 n--;
5349 }
5350 }
097eb12c
AL
5351 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5352 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5353 }
5354 RExC_parse = e + 1;
5355 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5356 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5357 }
f81125e2 5358 break;
b8c5462f
JH
5359 case 'n': value = '\n'; break;
5360 case 'r': value = '\r'; break;
5361 case 't': value = '\t'; break;
5362 case 'f': value = '\f'; break;
5363 case 'b': value = '\b'; break;
c7f1f016
NIS
5364 case 'e': value = ASCII_TO_NATIVE('\033');break;
5365 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5366 case 'x':
ffc61ed2 5367 if (*RExC_parse == '{') {
a4c04bdc
NC
5368 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5369 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5370 char * const e = strchr(RExC_parse++, '}');
b81d288d 5371 if (!e)
ffc61ed2 5372 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5373
5374 numlen = e - RExC_parse;
5375 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5376 RExC_parse = e + 1;
5377 }
5378 else {
a4c04bdc 5379 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5380 numlen = 2;
5381 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5382 RExC_parse += numlen;
5383 }
b8c5462f
JH
5384 break;
5385 case 'c':
830247a4 5386 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5387 value = toCTRL(value);
5388 break;
5389 case '0': case '1': case '2': case '3': case '4':
5390 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5391 {
5392 I32 flags = 0;
5393 numlen = 3;
5394 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5395 RExC_parse += numlen;
b8c5462f 5396 break;
53305cf1 5397 }
1028017a 5398 default:
041457d9 5399 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5400 vWARN2(RExC_parse,
5401 "Unrecognized escape \\%c in character class passed through",
5402 (int)value);
1028017a 5403 break;
b8c5462f 5404 }
ffc61ed2 5405 } /* end of \blah */
1b2d223b
JH
5406#ifdef EBCDIC
5407 else
5408 literal_endpoint++;
5409#endif
ffc61ed2
JH
5410
5411 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5412
5413 if (!SIZE_ONLY && !need_class)
936ed897 5414 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5415
936ed897 5416 need_class = 1;
ffc61ed2
JH
5417
5418 /* a bad range like a-\d, a-[:digit:] ? */
5419 if (range) {
73b437c8 5420 if (!SIZE_ONLY) {
afd78fd5 5421 if (ckWARN(WARN_REGEXP)) {
097eb12c 5422 const int w =
afd78fd5
JH
5423 RExC_parse >= rangebegin ?
5424 RExC_parse - rangebegin : 0;
830247a4 5425 vWARN4(RExC_parse,
b45f050a 5426 "False [] range \"%*.*s\"",
097eb12c 5427 w, w, rangebegin);
afd78fd5 5428 }
3568d838
JH
5429 if (prevvalue < 256) {
5430 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5431 ANYOF_BITMAP_SET(ret, '-');
5432 }
5433 else {
5434 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5435 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5436 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5437 }
b8c5462f 5438 }
ffc61ed2
JH
5439
5440 range = 0; /* this was not a true range */
73b437c8 5441 }
ffc61ed2 5442
73b437c8 5443 if (!SIZE_ONLY) {
c49a72a9
NC
5444 const char *what = NULL;
5445 char yesno = 0;
5446
3568d838
JH
5447 if (namedclass > OOB_NAMEDCLASS)
5448 optimize_invert = FALSE;
e2962f66
JH
5449 /* Possible truncation here but in some 64-bit environments
5450 * the compiler gets heartburn about switch on 64-bit values.
5451 * A similar issue a little earlier when switching on value.
98f323fa 5452 * --jhi */
e2962f66 5453 switch ((I32)namedclass) {
73b437c8
JH
5454 case ANYOF_ALNUM:
5455 if (LOC)
936ed897 5456 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5457 else {
5458 for (value = 0; value < 256; value++)
5459 if (isALNUM(value))
936ed897 5460 ANYOF_BITMAP_SET(ret, value);
73b437c8 5461 }
c49a72a9
NC
5462 yesno = '+';
5463 what = "Word";
73b437c8
JH
5464 break;
5465 case ANYOF_NALNUM:
5466 if (LOC)
936ed897 5467 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5468 else {
5469 for (value = 0; value < 256; value++)
5470 if (!isALNUM(value))
936ed897 5471 ANYOF_BITMAP_SET(ret, value);
73b437c8 5472 }
c49a72a9
NC
5473 yesno = '!';
5474 what = "Word";
73b437c8 5475 break;
ffc61ed2 5476 case ANYOF_ALNUMC:
73b437c8 5477 if (LOC)
ffc61ed2 5478 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5479 else {
5480 for (value = 0; value < 256; value++)
ffc61ed2 5481 if (isALNUMC(value))
936ed897 5482 ANYOF_BITMAP_SET(ret, value);
73b437c8 5483 }
c49a72a9
NC
5484 yesno = '+';
5485 what = "Alnum";
73b437c8
JH
5486 break;
5487 case ANYOF_NALNUMC:
5488 if (LOC)
936ed897 5489 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5490 else {
5491 for (value = 0; value < 256; value++)
5492 if (!isALNUMC(value))
936ed897 5493 ANYOF_BITMAP_SET(ret, value);
73b437c8 5494 }
c49a72a9
NC
5495 yesno = '!';
5496 what = "Alnum";
73b437c8
JH
5497 break;
5498 case ANYOF_ALPHA:
5499 if (LOC)
936ed897 5500 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5501 else {
5502 for (value = 0; value < 256; value++)
5503 if (isALPHA(value))
936ed897 5504 ANYOF_BITMAP_SET(ret, value);
73b437c8 5505 }
c49a72a9
NC
5506 yesno = '+';
5507 what = "Alpha";
73b437c8
JH
5508 break;
5509 case ANYOF_NALPHA:
5510 if (LOC)
936ed897 5511 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5512 else {
5513 for (value = 0; value < 256; value++)
5514 if (!isALPHA(value))
936ed897 5515 ANYOF_BITMAP_SET(ret, value);
73b437c8 5516 }
c49a72a9
NC
5517 yesno = '!';
5518 what = "Alpha";
73b437c8
JH
5519 break;
5520 case ANYOF_ASCII:
5521 if (LOC)
936ed897 5522 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5523 else {
c7f1f016 5524#ifndef EBCDIC
1ba5c669
JH
5525 for (value = 0; value < 128; value++)
5526 ANYOF_BITMAP_SET(ret, value);
5527#else /* EBCDIC */
ffbc6a93 5528 for (value = 0; value < 256; value++) {
3a3c4447
JH
5529 if (isASCII(value))
5530 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5531 }
1ba5c669 5532#endif /* EBCDIC */
73b437c8 5533 }
c49a72a9
NC
5534 yesno = '+';
5535 what = "ASCII";
73b437c8
JH
5536 break;
5537 case ANYOF_NASCII:
5538 if (LOC)
936ed897 5539 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5540 else {
c7f1f016 5541#ifndef EBCDIC
1ba5c669
JH
5542 for (value = 128; value < 256; value++)
5543 ANYOF_BITMAP_SET(ret, value);
5544#else /* EBCDIC */
ffbc6a93 5545 for (value = 0; value < 256; value++) {
3a3c4447
JH
5546 if (!isASCII(value))
5547 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5548 }
1ba5c669 5549#endif /* EBCDIC */
73b437c8 5550 }
c49a72a9
NC
5551 yesno = '!';
5552 what = "ASCII";
73b437c8 5553 break;
aaa51d5e
JF
5554 case ANYOF_BLANK:
5555 if (LOC)
5556 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5557 else {
5558 for (value = 0; value < 256; value++)
5559 if (isBLANK(value))
5560 ANYOF_BITMAP_SET(ret, value);
5561 }
c49a72a9
NC
5562 yesno = '+';
5563 what = "Blank";
aaa51d5e
JF
5564 break;
5565 case ANYOF_NBLANK:
5566 if (LOC)
5567 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5568 else {
5569 for (value = 0; value < 256; value++)
5570 if (!isBLANK(value))
5571 ANYOF_BITMAP_SET(ret, value);
5572 }
c49a72a9
NC
5573 yesno = '!';
5574 what = "Blank";
aaa51d5e 5575 break;
73b437c8
JH
5576 case ANYOF_CNTRL:
5577 if (LOC)
936ed897 5578 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5579 else {
5580 for (value = 0; value < 256; value++)
5581 if (isCNTRL(value))
936ed897 5582 ANYOF_BITMAP_SET(ret, value);
73b437c8 5583 }
c49a72a9
NC
5584 yesno = '+';
5585 what = "Cntrl";
73b437c8
JH
5586 break;
5587 case ANYOF_NCNTRL:
5588 if (LOC)
936ed897 5589 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5590 else {
5591 for (value = 0; value < 256; value++)
5592 if (!isCNTRL(value))
936ed897 5593 ANYOF_BITMAP_SET(ret, value);
73b437c8 5594 }
c49a72a9
NC
5595 yesno = '!';
5596 what = "Cntrl";
ffc61ed2
JH
5597 break;
5598 case ANYOF_DIGIT:
5599 if (LOC)
5600 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5601 else {
5602 /* consecutive digits assumed */
5603 for (value = '0'; value <= '9'; value++)
5604 ANYOF_BITMAP_SET(ret, value);
5605 }
c49a72a9
NC
5606 yesno = '+';
5607 what = "Digit";
ffc61ed2
JH
5608 break;
5609 case ANYOF_NDIGIT:
5610 if (LOC)
5611 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5612 else {
5613 /* consecutive digits assumed */
5614 for (value = 0; value < '0'; value++)
5615 ANYOF_BITMAP_SET(ret, value);
5616 for (value = '9' + 1; value < 256; value++)
5617 ANYOF_BITMAP_SET(ret, value);
5618 }
c49a72a9
NC
5619 yesno = '!';
5620 what = "Digit";
73b437c8
JH
5621 break;
5622 case ANYOF_GRAPH:
5623 if (LOC)
936ed897 5624 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5625 else {
5626 for (value = 0; value < 256; value++)
5627 if (isGRAPH(value))
936ed897 5628 ANYOF_BITMAP_SET(ret, value);
73b437c8 5629 }
c49a72a9
NC
5630 yesno = '+';
5631 what = "Graph";
73b437c8
JH
5632 break;
5633 case ANYOF_NGRAPH:
5634 if (LOC)
936ed897 5635 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5636 else {
5637 for (value = 0; value < 256; value++)
5638 if (!isGRAPH(value))
936ed897 5639 ANYOF_BITMAP_SET(ret, value);
73b437c8 5640 }
c49a72a9
NC
5641 yesno = '!';
5642 what = "Graph";
73b437c8
JH
5643 break;
5644 case ANYOF_LOWER:
5645 if (LOC)
936ed897 5646 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5647 else {
5648 for (value = 0; value < 256; value++)
5649 if (isLOWER(value))
936ed897 5650 ANYOF_BITMAP_SET(ret, value);
73b437c8 5651 }
c49a72a9
NC
5652 yesno = '+';
5653 what = "Lower";
73b437c8
JH
5654 break;
5655 case ANYOF_NLOWER:
5656 if (LOC)
936ed897 5657 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5658 else {
5659 for (value = 0; value < 256; value++)
5660 if (!isLOWER(value))
936ed897 5661 ANYOF_BITMAP_SET(ret, value);
73b437c8 5662 }
c49a72a9
NC
5663 yesno = '!';
5664 what = "Lower";
73b437c8
JH
5665 break;
5666 case ANYOF_PRINT:
5667 if (LOC)
936ed897 5668 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5669 else {
5670 for (value = 0; value < 256; value++)
5671 if (isPRINT(value))
936ed897 5672 ANYOF_BITMAP_SET(ret, value);
73b437c8 5673 }
c49a72a9
NC
5674 yesno = '+';
5675 what = "Print";
73b437c8
JH
5676 break;
5677 case ANYOF_NPRINT:
5678 if (LOC)
936ed897 5679 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5680 else {
5681 for (value = 0; value < 256; value++)
5682 if (!isPRINT(value))
936ed897 5683 ANYOF_BITMAP_SET(ret, value);
73b437c8 5684 }
c49a72a9
NC
5685 yesno = '!';
5686 what = "Print";
73b437c8 5687 break;
aaa51d5e
JF
5688 case ANYOF_PSXSPC:
5689 if (LOC)
5690 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5691 else {
5692 for (value = 0; value < 256; value++)
5693 if (isPSXSPC(value))
5694 ANYOF_BITMAP_SET(ret, value);
5695 }
c49a72a9
NC
5696 yesno = '+';
5697 what = "Space";
aaa51d5e
JF
5698 break;
5699 case ANYOF_NPSXSPC:
5700 if (LOC)
5701 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5702 else {
5703 for (value = 0; value < 256; value++)
5704 if (!isPSXSPC(value))
5705 ANYOF_BITMAP_SET(ret, value);
5706 }
c49a72a9
NC
5707 yesno = '!';
5708 what = "Space";
aaa51d5e 5709 break;
73b437c8
JH
5710 case ANYOF_PUNCT:
5711 if (LOC)
936ed897 5712 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5713 else {
5714 for (value = 0; value < 256; value++)
5715 if (isPUNCT(value))
936ed897 5716 ANYOF_BITMAP_SET(ret, value);
73b437c8 5717 }
c49a72a9
NC
5718 yesno = '+';
5719 what = "Punct";
73b437c8
JH
5720 break;
5721 case ANYOF_NPUNCT:
5722 if (LOC)
936ed897 5723 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5724 else {
5725 for (value = 0; value < 256; value++)
5726 if (!isPUNCT(value))
936ed897 5727 ANYOF_BITMAP_SET(ret, value);
73b437c8 5728 }
c49a72a9
NC
5729 yesno = '!';
5730 what = "Punct";
ffc61ed2
JH
5731 break;
5732 case ANYOF_SPACE:
5733 if (LOC)
5734 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5735 else {
5736 for (value = 0; value < 256; value++)
5737 if (isSPACE(value))
5738 ANYOF_BITMAP_SET(ret, value);
5739 }
c49a72a9
NC
5740 yesno = '+';
5741 what = "SpacePerl";
ffc61ed2
JH
5742 break;
5743 case ANYOF_NSPACE:
5744 if (LOC)
5745 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5746 else {
5747 for (value = 0; value < 256; value++)
5748 if (!isSPACE(value))
5749 ANYOF_BITMAP_SET(ret, value);
5750 }
c49a72a9
NC
5751 yesno = '!';
5752 what = "SpacePerl";
73b437c8
JH
5753 break;
5754 case ANYOF_UPPER:
5755 if (LOC)
936ed897 5756 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5757 else {
5758 for (value = 0; value < 256; value++)
5759 if (isUPPER(value))
936ed897 5760 ANYOF_BITMAP_SET(ret, value);
73b437c8 5761 }
c49a72a9
NC
5762 yesno = '+';
5763 what = "Upper";
73b437c8
JH
5764 break;
5765 case ANYOF_NUPPER:
5766 if (LOC)
936ed897 5767 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5768 else {
5769 for (value = 0; value < 256; value++)
5770 if (!isUPPER(value))
936ed897 5771 ANYOF_BITMAP_SET(ret, value);
73b437c8 5772 }
c49a72a9
NC
5773 yesno = '!';
5774 what = "Upper";
73b437c8
JH
5775 break;
5776 case ANYOF_XDIGIT:
5777 if (LOC)
936ed897 5778 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5779 else {
5780 for (value = 0; value < 256; value++)
5781 if (isXDIGIT(value))
936ed897 5782 ANYOF_BITMAP_SET(ret, value);
73b437c8 5783 }
c49a72a9
NC
5784 yesno = '+';
5785 what = "XDigit";
73b437c8
JH
5786 break;
5787 case ANYOF_NXDIGIT:
5788 if (LOC)
936ed897 5789 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5790 else {
5791 for (value = 0; value < 256; value++)
5792 if (!isXDIGIT(value))
936ed897 5793 ANYOF_BITMAP_SET(ret, value);
73b437c8 5794 }
c49a72a9
NC
5795 yesno = '!';
5796 what = "XDigit";
73b437c8 5797 break;
f81125e2
JP
5798 case ANYOF_MAX:
5799 /* this is to handle \p and \P */
5800 break;
73b437c8 5801 default:
b45f050a 5802 vFAIL("Invalid [::] class");
73b437c8 5803 break;
b8c5462f 5804 }
c49a72a9
NC
5805 if (what) {
5806 /* Strings such as "+utf8::isWord\n" */
5807 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5808 }
b8c5462f 5809 if (LOC)
936ed897 5810 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5811 continue;
a0d0e21e 5812 }
ffc61ed2
JH
5813 } /* end of namedclass \blah */
5814
a0d0e21e 5815 if (range) {
eb160463 5816 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5817 const int w = RExC_parse - rangebegin;
5818 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5819 range = 0; /* not a valid range */
73b437c8 5820 }
a0d0e21e
LW
5821 }
5822 else {
3568d838 5823 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5824 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5825 RExC_parse[1] != ']') {
5826 RExC_parse++;
ffc61ed2
JH
5827
5828 /* a bad range like \w-, [:word:]- ? */
5829 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5830 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5831 const int w =
afd78fd5
JH
5832 RExC_parse >= rangebegin ?
5833 RExC_parse - rangebegin : 0;
830247a4 5834 vWARN4(RExC_parse,
b45f050a 5835 "False [] range \"%*.*s\"",
097eb12c 5836 w, w, rangebegin);
afd78fd5 5837 }
73b437c8 5838 if (!SIZE_ONLY)
936ed897 5839 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5840 } else
ffc61ed2
JH
5841 range = 1; /* yeah, it's a range! */
5842 continue; /* but do it the next time */
a0d0e21e 5843 }
a687059c 5844 }
ffc61ed2 5845
93733859 5846 /* now is the next time */
07be1b83 5847 /*stored += (value - prevvalue + 1);*/
ae5c130c 5848 if (!SIZE_ONLY) {
3568d838 5849 if (prevvalue < 256) {
1df70142 5850 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5851 IV i;
3568d838 5852#ifdef EBCDIC
1b2d223b
JH
5853 /* In EBCDIC [\x89-\x91] should include
5854 * the \x8e but [i-j] should not. */
5855 if (literal_endpoint == 2 &&
5856 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5857 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5858 {
3568d838
JH
5859 if (isLOWER(prevvalue)) {
5860 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5861 if (isLOWER(i))
5862 ANYOF_BITMAP_SET(ret, i);
5863 } else {
3568d838 5864 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5865 if (isUPPER(i))
5866 ANYOF_BITMAP_SET(ret, i);
5867 }
8ada0baa 5868 }
ffc61ed2 5869 else
8ada0baa 5870#endif
07be1b83
YO
5871 for (i = prevvalue; i <= ceilvalue; i++) {
5872 if (!ANYOF_BITMAP_TEST(ret,i)) {
5873 stored++;
5874 ANYOF_BITMAP_SET(ret, i);
5875 }
5876 }
3568d838 5877 }
a5961de5 5878 if (value > 255 || UTF) {
1df70142
AL
5879 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5880 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 5881 stored+=2; /* can't optimize this class */
ffc61ed2 5882 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5883 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5884 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5885 prevnatvalue, natvalue);
5886 }
5887 else if (prevnatvalue == natvalue) {
5888 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5889 if (FOLD) {
89ebb4a3 5890 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5891 STRLEN foldlen;
1df70142 5892 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5893
e294cc5d
JH
5894#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
5895 if (RExC_precomp[0] == ':' &&
5896 RExC_precomp[1] == '[' &&
5897 (f == 0xDF || f == 0x92)) {
5898 f = NATIVE_TO_UNI(f);
5899 }
5900#endif
c840d2a2
JH
5901 /* If folding and foldable and a single
5902 * character, insert also the folded version
5903 * to the charclass. */
9e55ce06 5904 if (f != value) {
e294cc5d
JH
5905#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
5906 if ((RExC_precomp[0] == ':' &&
5907 RExC_precomp[1] == '[' &&
5908 (f == 0xA2 &&
5909 (value == 0xFB05 || value == 0xFB06))) ?
5910 foldlen == ((STRLEN)UNISKIP(f) - 1) :
5911 foldlen == (STRLEN)UNISKIP(f) )
5912#else
eb160463 5913 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 5914#endif
9e55ce06
JH
5915 Perl_sv_catpvf(aTHX_ listsv,
5916 "%04"UVxf"\n", f);
5917 else {
5918 /* Any multicharacter foldings
5919 * require the following transform:
5920 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5921 * where E folds into "pq" and F folds
5922 * into "rst", all other characters
5923 * fold to single characters. We save
5924 * away these multicharacter foldings,
5925 * to be later saved as part of the
5926 * additional "s" data. */
5927 SV *sv;
5928
5929 if (!unicode_alternate)
5930 unicode_alternate = newAV();
5931 sv = newSVpvn((char*)foldbuf, foldlen);
5932 SvUTF8_on(sv);
5933 av_push(unicode_alternate, sv);
5934 }
5935 }
254ba52a 5936
60a8b682
JH
5937 /* If folding and the value is one of the Greek
5938 * sigmas insert a few more sigmas to make the
5939 * folding rules of the sigmas to work right.
5940 * Note that not all the possible combinations
5941 * are handled here: some of them are handled
9e55ce06
JH
5942 * by the standard folding rules, and some of
5943 * them (literal or EXACTF cases) are handled
5944 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5945 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5946 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5947 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5948 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5949 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5950 }
5951 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5952 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5953 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5954 }
5955 }
ffc61ed2 5956 }
1b2d223b
JH
5957#ifdef EBCDIC
5958 literal_endpoint = 0;
5959#endif
8ada0baa 5960 }
ffc61ed2
JH
5961
5962 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5963 }
ffc61ed2 5964
936ed897 5965 if (need_class) {
4f66b38d 5966 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5967 if (SIZE_ONLY)
830247a4 5968 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5969 else
830247a4 5970 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5971 }
ffc61ed2 5972
7f6f358c
YO
5973
5974 if (SIZE_ONLY)
5975 return ret;
5976 /****** !SIZE_ONLY AFTER HERE *********/
5977
5978 if( stored == 1 && value < 256
5979 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5980 ) {
5981 /* optimize single char class to an EXACT node
5982 but *only* when its not a UTF/high char */
07be1b83
YO
5983 const char * cur_parse= RExC_parse;
5984 RExC_emit = (regnode *)orig_emit;
5985 RExC_parse = (char *)orig_parse;
7f6f358c
YO
5986 ret = reg_node(pRExC_state,
5987 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 5988 RExC_parse = (char *)cur_parse;
7f6f358c
YO
5989 *STRING(ret)= (char)value;
5990 STR_LEN(ret)= 1;
5991 RExC_emit += STR_SZ(1);
5992 return ret;
5993 }
ae5c130c 5994 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5995 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5996 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5997 ) {
a0ed51b3 5998 for (value = 0; value < 256; ++value) {
936ed897 5999 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 6000 UV fold = PL_fold[value];
ffc61ed2
JH
6001
6002 if (fold != value)
6003 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
6004 }
6005 }
936ed897 6006 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 6007 }
ffc61ed2 6008
ae5c130c 6009 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 6010 if (optimize_invert &&
ffc61ed2
JH
6011 /* If the only flag is inversion. */
6012 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 6013 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 6014 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 6015 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 6016 }
7f6f358c 6017 {
097eb12c 6018 AV * const av = newAV();
ffc61ed2 6019 SV *rv;
9e55ce06 6020 /* The 0th element stores the character class description
6a0407ee 6021 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
6022 * to initialize the appropriate swash (which gets stored in
6023 * the 1st element), and also useful for dumping the regnode.
6024 * The 2nd element stores the multicharacter foldings,
6a0407ee 6025 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6026 av_store(av, 0, listsv);
6027 av_store(av, 1, NULL);
9e55ce06 6028 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6029 rv = newRV_noinc((SV*)av);
19860706 6030 n = add_data(pRExC_state, 1, "s");
830247a4 6031 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6032 ARG_SET(ret, n);
a0ed51b3 6033 }
a0ed51b3
LW
6034 return ret;
6035}
6036
76e3520e 6037STATIC char*
830247a4 6038S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6039{
097eb12c 6040 char* const retval = RExC_parse++;
a0d0e21e 6041
4633a7c4 6042 for (;;) {
830247a4
IZ
6043 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6044 RExC_parse[2] == '#') {
e994fd66
AE
6045 while (*RExC_parse != ')') {
6046 if (RExC_parse == RExC_end)
6047 FAIL("Sequence (?#... not terminated");
830247a4 6048 RExC_parse++;
e994fd66 6049 }
830247a4 6050 RExC_parse++;
4633a7c4
LW
6051 continue;
6052 }
e2509266 6053 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6054 if (isSPACE(*RExC_parse)) {
6055 RExC_parse++;
748a9306
LW
6056 continue;
6057 }
830247a4 6058 else if (*RExC_parse == '#') {
e994fd66
AE
6059 while (RExC_parse < RExC_end)
6060 if (*RExC_parse++ == '\n') break;
748a9306
LW
6061 continue;
6062 }
748a9306 6063 }
4633a7c4 6064 return retval;
a0d0e21e 6065 }
a687059c
LW
6066}
6067
6068/*
c277df42 6069- reg_node - emit a node
a0d0e21e 6070*/
76e3520e 6071STATIC regnode * /* Location. */
830247a4 6072S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6073{
97aff369 6074 dVAR;
c277df42 6075 register regnode *ptr;
504618e9 6076 regnode * const ret = RExC_emit;
07be1b83 6077 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6078
c277df42 6079 if (SIZE_ONLY) {
830247a4
IZ
6080 SIZE_ALIGN(RExC_size);
6081 RExC_size += 1;
a0d0e21e
LW
6082 return(ret);
6083 }
c277df42 6084 NODE_ALIGN_FILL(ret);
a0d0e21e 6085 ptr = ret;
c277df42 6086 FILL_ADVANCE_NODE(ptr, op);
fac92740 6087 if (RExC_offsets) { /* MJD */
07be1b83 6088 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6089 "reg_node", __LINE__,
6090 reg_name[op],
07be1b83
YO
6091 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6092 ? "Overwriting end of array!\n" : "OK",
6093 (UV)(RExC_emit - RExC_emit_start),
6094 (UV)(RExC_parse - RExC_start),
6095 (UV)RExC_offsets[0]));
ccb2c380 6096 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6097 }
07be1b83 6098
830247a4 6099 RExC_emit = ptr;
a687059c 6100
a0d0e21e 6101 return(ret);
a687059c
LW
6102}
6103
6104/*
a0d0e21e
LW
6105- reganode - emit a node with an argument
6106*/
76e3520e 6107STATIC regnode * /* Location. */
830247a4 6108S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6109{
97aff369 6110 dVAR;
c277df42 6111 register regnode *ptr;
504618e9 6112 regnode * const ret = RExC_emit;
07be1b83 6113 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6114
c277df42 6115 if (SIZE_ONLY) {
830247a4
IZ
6116 SIZE_ALIGN(RExC_size);
6117 RExC_size += 2;
a0d0e21e
LW
6118 return(ret);
6119 }
fe14fcc3 6120
c277df42 6121 NODE_ALIGN_FILL(ret);
a0d0e21e 6122 ptr = ret;
c277df42 6123 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6124 if (RExC_offsets) { /* MJD */
07be1b83 6125 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6126 "reganode",
ccb2c380
MP
6127 __LINE__,
6128 reg_name[op],
07be1b83 6129 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6130 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6131 (UV)(RExC_emit - RExC_emit_start),
6132 (UV)(RExC_parse - RExC_start),
6133 (UV)RExC_offsets[0]));
ccb2c380 6134 Set_Cur_Node_Offset;
fac92740
MJD
6135 }
6136
830247a4 6137 RExC_emit = ptr;
fe14fcc3 6138
a0d0e21e 6139 return(ret);
fe14fcc3
LW
6140}
6141
6142/*
cd439c50 6143- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6144*/
71207a34
AL
6145STATIC STRLEN
6146S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6147{
97aff369 6148 dVAR;
71207a34 6149 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6150}
6151
6152/*
a0d0e21e
LW
6153- reginsert - insert an operator in front of already-emitted operand
6154*
6155* Means relocating the operand.
6156*/
76e3520e 6157STATIC void
830247a4 6158S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6159{
97aff369 6160 dVAR;
c277df42
IZ
6161 register regnode *src;
6162 register regnode *dst;
6163 register regnode *place;
504618e9 6164 const int offset = regarglen[(U8)op];
07be1b83 6165 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6166/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6167
6168 if (SIZE_ONLY) {
830247a4 6169 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6170 return;
6171 }
a687059c 6172
830247a4
IZ
6173 src = RExC_emit;
6174 RExC_emit += NODE_STEP_REGNODE + offset;
6175 dst = RExC_emit;
fac92740 6176 while (src > opnd) {
c277df42 6177 StructCopy(--src, --dst, regnode);
fac92740 6178 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6179 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6180 "reg_insert",
ccb2c380
MP
6181 __LINE__,
6182 reg_name[op],
07be1b83
YO
6183 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6184 ? "Overwriting end of array!\n" : "OK",
6185 (UV)(src - RExC_emit_start),
6186 (UV)(dst - RExC_emit_start),
6187 (UV)RExC_offsets[0]));
ccb2c380
MP
6188 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6189 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6190 }
6191 }
6192
a0d0e21e
LW
6193
6194 place = opnd; /* Op node, where operand used to be. */
fac92740 6195 if (RExC_offsets) { /* MJD */
07be1b83 6196 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6197 "reginsert",
ccb2c380
MP
6198 __LINE__,
6199 reg_name[op],
07be1b83 6200 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6201 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6202 (UV)(place - RExC_emit_start),
6203 (UV)(RExC_parse - RExC_start),
fac92740 6204 RExC_offsets[0]));
ccb2c380 6205 Set_Node_Offset(place, RExC_parse);
45948336 6206 Set_Node_Length(place, 1);
fac92740 6207 }
c277df42
IZ
6208 src = NEXTOPER(place);
6209 FILL_ADVANCE_NODE(place, op);
6210 Zero(src, offset, regnode);
a687059c
LW
6211}
6212
6213/*
c277df42 6214- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6215- SEE ALSO: regtail_study
a0d0e21e 6216*/
097eb12c 6217/* TODO: All three parms should be const */
76e3520e 6218STATIC void
3dab1dad 6219S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6220{
97aff369 6221 dVAR;
c277df42 6222 register regnode *scan;
72f13be8 6223 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
6224#ifndef DEBUGGING
6225 PERL_UNUSED_ARG(depth);
6226#endif
a0d0e21e 6227
c277df42 6228 if (SIZE_ONLY)
a0d0e21e
LW
6229 return;
6230
6231 /* Find last node. */
6232 scan = p;
6233 for (;;) {
504618e9 6234 regnode * const temp = regnext(scan);
3dab1dad
YO
6235 DEBUG_PARSE_r({
6236 SV * const mysv=sv_newmortal();
6237 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6238 regprop(RExC_rx, mysv, scan);
6239 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6240 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6241 });
6242 if (temp == NULL)
6243 break;
6244 scan = temp;
6245 }
6246
6247 if (reg_off_by_arg[OP(scan)]) {
6248 ARG_SET(scan, val - scan);
6249 }
6250 else {
6251 NEXT_OFF(scan) = val - scan;
6252 }
6253}
6254
07be1b83 6255#ifdef DEBUGGING
3dab1dad
YO
6256/*
6257- regtail_study - set the next-pointer at the end of a node chain of p to val.
6258- Look for optimizable sequences at the same time.
6259- currently only looks for EXACT chains.
07be1b83
YO
6260
6261This is expermental code. The idea is to use this routine to perform
6262in place optimizations on branches and groups as they are constructed,
6263with the long term intention of removing optimization from study_chunk so
6264that it is purely analytical.
6265
6266Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6267to control which is which.
6268
3dab1dad
YO
6269*/
6270/* TODO: All four parms should be const */
07be1b83 6271
3dab1dad
YO
6272STATIC U8
6273S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6274{
6275 dVAR;
6276 register regnode *scan;
07be1b83
YO
6277 U8 exact = PSEUDO;
6278#ifdef EXPERIMENTAL_INPLACESCAN
6279 I32 min = 0;
6280#endif
6281
3dab1dad
YO
6282 GET_RE_DEBUG_FLAGS_DECL;
6283
07be1b83 6284
3dab1dad
YO
6285 if (SIZE_ONLY)
6286 return exact;
6287
6288 /* Find last node. */
6289
6290 scan = p;
6291 for (;;) {
6292 regnode * const temp = regnext(scan);
07be1b83
YO
6293#ifdef EXPERIMENTAL_INPLACESCAN
6294 if (PL_regkind[OP(scan)] == EXACT)
6295 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6296 return EXACT;
6297#endif
3dab1dad
YO
6298 if ( exact ) {
6299 switch (OP(scan)) {
6300 case EXACT:
6301 case EXACTF:
6302 case EXACTFL:
6303 if( exact == PSEUDO )
6304 exact= OP(scan);
07be1b83
YO
6305 else if ( exact != OP(scan) )
6306 exact= 0;
3dab1dad
YO
6307 case NOTHING:
6308 break;
6309 default:
6310 exact= 0;
6311 }
6312 }
6313 DEBUG_PARSE_r({
6314 SV * const mysv=sv_newmortal();
6315 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6316 regprop(RExC_rx, mysv, scan);
6317 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6318 SvPV_nolen_const(mysv),
6319 reg_name[exact],
6320 REG_NODE_NUM(scan));
6321 });
a0d0e21e
LW
6322 if (temp == NULL)
6323 break;
6324 scan = temp;
6325 }
07be1b83
YO
6326 DEBUG_PARSE_r({
6327 SV * const mysv_val=sv_newmortal();
6328 DEBUG_PARSE_MSG("");
6329 regprop(RExC_rx, mysv_val, val);
6330 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6331 SvPV_nolen_const(mysv_val),
6332 REG_NODE_NUM(val),
6333 val - scan
6334 );
6335 });
c277df42
IZ
6336 if (reg_off_by_arg[OP(scan)]) {
6337 ARG_SET(scan, val - scan);
a0ed51b3
LW
6338 }
6339 else {
c277df42
IZ
6340 NEXT_OFF(scan) = val - scan;
6341 }
3dab1dad
YO
6342
6343 return exact;
a687059c 6344}
07be1b83 6345#endif
a687059c
LW
6346
6347/*
a687059c
LW
6348 - regcurly - a little FSA that accepts {\d+,?\d*}
6349 */
79072805 6350STATIC I32
5f66b61c 6351S_regcurly(register const char *s)
a687059c
LW
6352{
6353 if (*s++ != '{')
6354 return FALSE;
f0fcb552 6355 if (!isDIGIT(*s))
a687059c 6356 return FALSE;
f0fcb552 6357 while (isDIGIT(*s))
a687059c
LW
6358 s++;
6359 if (*s == ',')
6360 s++;
f0fcb552 6361 while (isDIGIT(*s))
a687059c
LW
6362 s++;
6363 if (*s != '}')
6364 return FALSE;
6365 return TRUE;
6366}
6367
a687059c
LW
6368
6369/*
fd181c75 6370 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6371 */
6372void
097eb12c 6373Perl_regdump(pTHX_ const regexp *r)
a687059c 6374{
35ff7856 6375#ifdef DEBUGGING
97aff369 6376 dVAR;
c445ea15 6377 SV * const sv = sv_newmortal();
ab3bbdeb 6378 SV *dsv= sv_newmortal();
a687059c 6379
4f639d21 6380 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
6381
6382 /* Header fields of interest. */
ab3bbdeb
YO
6383 if (r->anchored_substr) {
6384 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6385 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 6386 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6387 "anchored %s%s at %"IVdf" ",
6388 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 6389 (IV)r->anchored_offset);
ab3bbdeb
YO
6390 } else if (r->anchored_utf8) {
6391 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6392 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 6393 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6394 "anchored utf8 %s%s at %"IVdf" ",
6395 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 6396 (IV)r->anchored_offset);
ab3bbdeb
YO
6397 }
6398 if (r->float_substr) {
6399 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6400 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 6401 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6402 "floating %s%s at %"IVdf"..%"UVuf" ",
6403 s, RE_SV_TAIL(r->float_substr),
7b0972df 6404 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
6405 } else if (r->float_utf8) {
6406 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6407 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 6408 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6409 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6410 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 6411 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 6412 }
33b8afdf 6413 if (r->check_substr || r->check_utf8)
b81d288d 6414 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
6415 (const char *)
6416 (r->check_substr == r->float_substr
6417 && r->check_utf8 == r->float_utf8
6418 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
6419 if (r->reganch & ROPT_NOSCAN)
6420 PerlIO_printf(Perl_debug_log, " noscan");
6421 if (r->reganch & ROPT_CHECK_ALL)
6422 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6423 if (r->check_substr || r->check_utf8)
c277df42
IZ
6424 PerlIO_printf(Perl_debug_log, ") ");
6425
46fc3d4c 6426 if (r->regstclass) {
32fc9b6a 6427 regprop(r, sv, r->regstclass);
3f7c398e 6428 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6429 }
774d564b 6430 if (r->reganch & ROPT_ANCH) {
6431 PerlIO_printf(Perl_debug_log, "anchored");
6432 if (r->reganch & ROPT_ANCH_BOL)
6433 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6434 if (r->reganch & ROPT_ANCH_MBOL)
6435 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6436 if (r->reganch & ROPT_ANCH_SBOL)
6437 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6438 if (r->reganch & ROPT_ANCH_GPOS)
6439 PerlIO_printf(Perl_debug_log, "(GPOS)");
6440 PerlIO_putc(Perl_debug_log, ' ');
6441 }
c277df42
IZ
6442 if (r->reganch & ROPT_GPOS_SEEN)
6443 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6444 if (r->reganch & ROPT_SKIP)
760ac839 6445 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6446 if (r->reganch & ROPT_IMPLICIT)
760ac839 6447 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6448 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6449 if (r->reganch & ROPT_EVAL_SEEN)
6450 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6451 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6452#else
96a5add6 6453 PERL_UNUSED_CONTEXT;
65e66c80 6454 PERL_UNUSED_ARG(r);
17c3b450 6455#endif /* DEBUGGING */
a687059c
LW
6456}
6457
6458/*
a0d0e21e
LW
6459- regprop - printable representation of opcode
6460*/
46fc3d4c 6461void
32fc9b6a 6462Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6463{
35ff7856 6464#ifdef DEBUGGING
97aff369 6465 dVAR;
9b155405 6466 register int k;
a0d0e21e 6467
54dc92de 6468 sv_setpvn(sv, "", 0);
9b155405 6469 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6470 /* It would be nice to FAIL() here, but this may be called from
6471 regexec.c, and it would be hard to supply pRExC_state. */
6472 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6473 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6474
3dab1dad 6475 k = PL_regkind[OP(o)];
9b155405 6476
2a782b5b 6477 if (k == EXACT) {
396482e1 6478 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
6479 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6480 * is a crude hack but it may be the best for now since
6481 * we have no flag "this EXACTish node was UTF-8"
6482 * --jhi */
6483 const char * const s =
ddc5bc0f 6484 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
6485 PL_colors[0], PL_colors[1],
6486 PERL_PV_ESCAPE_UNI_DETECT |
6487 PERL_PV_PRETTY_ELIPSES |
6488 PERL_PV_PRETTY_LTGT
6489 );
6490 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 6491 } else if (k == TRIE) {
3dab1dad
YO
6492 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6493 /* print the details of the trie in dumpuntil instead, as
4f639d21 6494 * prog->data isn't available here */
a3621e74 6495 } else if (k == CURLY) {
cb434fcc 6496 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6497 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6498 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6499 }
2c2d71f5
JH
6500 else if (k == WHILEM && o->flags) /* Ordinal/of */
6501 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6502 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6503 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6504 else if (k == LOGICAL)
04ebc1ab 6505 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6506 else if (k == ANYOF) {
6507 int i, rangestart = -1;
2d03de9c 6508 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6509
6510 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6511 static const char * const anyofs[] = {
653099ff
GS
6512 "\\w",
6513 "\\W",
6514 "\\s",
6515 "\\S",
6516 "\\d",
6517 "\\D",
6518 "[:alnum:]",
6519 "[:^alnum:]",
6520 "[:alpha:]",
6521 "[:^alpha:]",
6522 "[:ascii:]",
6523 "[:^ascii:]",
6524 "[:ctrl:]",
6525 "[:^ctrl:]",
6526 "[:graph:]",
6527 "[:^graph:]",
6528 "[:lower:]",
6529 "[:^lower:]",
6530 "[:print:]",
6531 "[:^print:]",
6532 "[:punct:]",
6533 "[:^punct:]",
6534 "[:upper:]",
aaa51d5e 6535 "[:^upper:]",
653099ff 6536 "[:xdigit:]",
aaa51d5e
JF
6537 "[:^xdigit:]",
6538 "[:space:]",
6539 "[:^space:]",
6540 "[:blank:]",
6541 "[:^blank:]"
653099ff
GS
6542 };
6543
19860706 6544 if (flags & ANYOF_LOCALE)
396482e1 6545 sv_catpvs(sv, "{loc}");
19860706 6546 if (flags & ANYOF_FOLD)
396482e1 6547 sv_catpvs(sv, "{i}");
653099ff 6548 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6549 if (flags & ANYOF_INVERT)
396482e1 6550 sv_catpvs(sv, "^");
ffc61ed2
JH
6551 for (i = 0; i <= 256; i++) {
6552 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6553 if (rangestart == -1)
6554 rangestart = i;
6555 } else if (rangestart != -1) {
6556 if (i <= rangestart + 3)
6557 for (; rangestart < i; rangestart++)
653099ff 6558 put_byte(sv, rangestart);
ffc61ed2
JH
6559 else {
6560 put_byte(sv, rangestart);
396482e1 6561 sv_catpvs(sv, "-");
ffc61ed2 6562 put_byte(sv, i - 1);
653099ff 6563 }
ffc61ed2 6564 rangestart = -1;
653099ff 6565 }
847a199f 6566 }
ffc61ed2
JH
6567
6568 if (o->flags & ANYOF_CLASS)
bb7a0f54 6569 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6570 if (ANYOF_CLASS_TEST(o,i))
6571 sv_catpv(sv, anyofs[i]);
6572
6573 if (flags & ANYOF_UNICODE)
396482e1 6574 sv_catpvs(sv, "{unicode}");
1aa99e6b 6575 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6576 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6577
6578 {
6579 SV *lv;
32fc9b6a 6580 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6581
ffc61ed2
JH
6582 if (lv) {
6583 if (sw) {
89ebb4a3 6584 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6585
ffc61ed2 6586 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6587 uvchr_to_utf8(s, i);
ffc61ed2 6588
3568d838 6589 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6590 if (rangestart == -1)
6591 rangestart = i;
6592 } else if (rangestart != -1) {
ffc61ed2
JH
6593 if (i <= rangestart + 3)
6594 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6595 const U8 * const e = uvchr_to_utf8(s,rangestart);
6596 U8 *p;
6597 for(p = s; p < e; p++)
ffc61ed2
JH
6598 put_byte(sv, *p);
6599 }
6600 else {
2d03de9c
AL
6601 const U8 *e = uvchr_to_utf8(s,rangestart);
6602 U8 *p;
6603 for (p = s; p < e; p++)
ffc61ed2 6604 put_byte(sv, *p);
396482e1 6605 sv_catpvs(sv, "-");
2d03de9c
AL
6606 e = uvchr_to_utf8(s, i-1);
6607 for (p = s; p < e; p++)
1df70142 6608 put_byte(sv, *p);
ffc61ed2
JH
6609 }
6610 rangestart = -1;
6611 }
19860706 6612 }
ffc61ed2 6613
396482e1 6614 sv_catpvs(sv, "..."); /* et cetera */
19860706 6615 }
fde631ed 6616
ffc61ed2 6617 {
2e0de35c 6618 char *s = savesvpv(lv);
c445ea15 6619 char * const origs = s;
b81d288d 6620
3dab1dad
YO
6621 while (*s && *s != '\n')
6622 s++;
b81d288d 6623
ffc61ed2 6624 if (*s == '\n') {
2d03de9c 6625 const char * const t = ++s;
ffc61ed2
JH
6626
6627 while (*s) {
6628 if (*s == '\n')
6629 *s = ' ';
6630 s++;
6631 }
6632 if (s[-1] == ' ')
6633 s[-1] = 0;
6634
6635 sv_catpv(sv, t);
fde631ed 6636 }
b81d288d 6637
ffc61ed2 6638 Safefree(origs);
fde631ed
JH
6639 }
6640 }
653099ff 6641 }
ffc61ed2 6642
653099ff
GS
6643 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6644 }
9b155405 6645 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 6646 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 6647#else
96a5add6 6648 PERL_UNUSED_CONTEXT;
65e66c80
SP
6649 PERL_UNUSED_ARG(sv);
6650 PERL_UNUSED_ARG(o);
f9049ba1 6651 PERL_UNUSED_ARG(prog);
17c3b450 6652#endif /* DEBUGGING */
35ff7856 6653}
a687059c 6654
cad2e5aa
JH
6655SV *
6656Perl_re_intuit_string(pTHX_ regexp *prog)
6657{ /* Assume that RE_INTUIT is set */
97aff369 6658 dVAR;
a3621e74 6659 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6660 PERL_UNUSED_CONTEXT;
6661
a3621e74 6662 DEBUG_COMPILE_r(
cfd0369c 6663 {
2d03de9c 6664 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6665 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6666
6667 if (!PL_colorset) reginitcolors();
6668 PerlIO_printf(Perl_debug_log,
a0288114 6669 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6670 PL_colors[4],
6671 prog->check_substr ? "" : "utf8 ",
6672 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6673 s,
6674 PL_colors[1],
6675 (strlen(s) > 60 ? "..." : ""));
6676 } );
6677
33b8afdf 6678 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6679}
6680
2b69d0c2 6681void
864dbfa3 6682Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6683{
27da23d5 6684 dVAR;
0df25f3d
YO
6685
6686
6687
fc32ee4a 6688 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6689
7821416a
IZ
6690 if (!r || (--r->refcnt > 0))
6691 return;
ab3bbdeb 6692 DEBUG_COMPILE_r({
0df25f3d
YO
6693 if (!PL_colorset)
6694 reginitcolors();
ab3bbdeb
YO
6695 if (RX_DEBUG(r)){
6696 SV *dsv= sv_newmortal();
6697 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
6698 dsv, r->precomp, r->prelen, 60);
6699 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
6700 PL_colors[4],PL_colors[5],s);
6701 }
9e55ce06 6702 });
cad2e5aa 6703
43c5f42d
NC
6704 /* gcov results gave these as non-null 100% of the time, so there's no
6705 optimisation in checking them before calling Safefree */
6706 Safefree(r->precomp);
6707 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6708 RX_MATCH_COPY_FREE(r);
f8c7b90f 6709#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6710 if (r->saved_copy)
6711 SvREFCNT_dec(r->saved_copy);
6712#endif
a193d654
GS
6713 if (r->substrs) {
6714 if (r->anchored_substr)
6715 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6716 if (r->anchored_utf8)
6717 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6718 if (r->float_substr)
6719 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6720 if (r->float_utf8)
6721 SvREFCNT_dec(r->float_utf8);
2779dcf1 6722 Safefree(r->substrs);
a193d654 6723 }
c277df42
IZ
6724 if (r->data) {
6725 int n = r->data->count;
f3548bdc
DM
6726 PAD* new_comppad = NULL;
6727 PAD* old_comppad;
4026c95a 6728 PADOFFSET refcnt;
dfad63ad 6729
c277df42 6730 while (--n >= 0) {
261faec3 6731 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6732 switch (r->data->what[n]) {
6733 case 's':
6734 SvREFCNT_dec((SV*)r->data->data[n]);
6735 break;
653099ff
GS
6736 case 'f':
6737 Safefree(r->data->data[n]);
6738 break;
dfad63ad
HS
6739 case 'p':
6740 new_comppad = (AV*)r->data->data[n];
6741 break;
c277df42 6742 case 'o':
dfad63ad 6743 if (new_comppad == NULL)
cea2e8a9 6744 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6745 PAD_SAVE_LOCAL(old_comppad,
6746 /* Watch out for global destruction's random ordering. */
c445ea15 6747 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6748 );
b34c0dd4 6749 OP_REFCNT_LOCK;
4026c95a
SH
6750 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6751 OP_REFCNT_UNLOCK;
6752 if (!refcnt)
9b978d73 6753 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6754
f3548bdc 6755 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6756 SvREFCNT_dec((SV*)new_comppad);
6757 new_comppad = NULL;
c277df42
IZ
6758 break;
6759 case 'n':
9e55ce06 6760 break;
07be1b83 6761 case 'T':
be8e71aa
YO
6762 { /* Aho Corasick add-on structure for a trie node.
6763 Used in stclass optimization only */
07be1b83
YO
6764 U32 refcount;
6765 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6766 OP_REFCNT_LOCK;
6767 refcount = --aho->refcount;
6768 OP_REFCNT_UNLOCK;
6769 if ( !refcount ) {
6770 Safefree(aho->states);
6771 Safefree(aho->fail);
6772 aho->trie=NULL; /* not necessary to free this as it is
6773 handled by the 't' case */
6774 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 6775 Safefree(r->regstclass);
07be1b83
YO
6776 }
6777 }
6778 break;
a3621e74 6779 case 't':
07be1b83 6780 {
be8e71aa 6781 /* trie structure. */
07be1b83
YO
6782 U32 refcount;
6783 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6784 OP_REFCNT_LOCK;
6785 refcount = --trie->refcount;
6786 OP_REFCNT_UNLOCK;
6787 if ( !refcount ) {
6788 Safefree(trie->charmap);
6789 if (trie->widecharmap)
6790 SvREFCNT_dec((SV*)trie->widecharmap);
6791 Safefree(trie->states);
6792 Safefree(trie->trans);
6793 if (trie->bitmap)
6794 Safefree(trie->bitmap);
6795 if (trie->wordlen)
6796 Safefree(trie->wordlen);
a3621e74 6797#ifdef DEBUGGING
be8e71aa
YO
6798 if (RX_DEBUG(r)) {
6799 if (trie->words)
6800 SvREFCNT_dec((SV*)trie->words);
6801 if (trie->revcharmap)
6802 SvREFCNT_dec((SV*)trie->revcharmap);
6803 }
a3621e74 6804#endif
07be1b83 6805 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 6806 }
07be1b83
YO
6807 }
6808 break;
c277df42 6809 default:
830247a4 6810 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6811 }
6812 }
6813 Safefree(r->data->what);
6814 Safefree(r->data);
a0d0e21e
LW
6815 }
6816 Safefree(r->startp);
6817 Safefree(r->endp);
6818 Safefree(r);
a687059c 6819}
c277df42 6820
76234dfb 6821#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6822/*
6823 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6824 */
6825regnode *
864dbfa3 6826Perl_regnext(pTHX_ register regnode *p)
c277df42 6827{
97aff369 6828 dVAR;
c277df42
IZ
6829 register I32 offset;
6830
3280af22 6831 if (p == &PL_regdummy)
c277df42
IZ
6832 return(NULL);
6833
6834 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6835 if (offset == 0)
6836 return(NULL);
6837
c277df42 6838 return(p+offset);
c277df42 6839}
76234dfb 6840#endif
c277df42 6841
01f988be 6842STATIC void
cea2e8a9 6843S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6844{
6845 va_list args;
6846 STRLEN l1 = strlen(pat1);
6847 STRLEN l2 = strlen(pat2);
6848 char buf[512];
06bf62c7 6849 SV *msv;
73d840c0 6850 const char *message;
c277df42
IZ
6851
6852 if (l1 > 510)
6853 l1 = 510;
6854 if (l1 + l2 > 510)
6855 l2 = 510 - l1;
6856 Copy(pat1, buf, l1 , char);
6857 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6858 buf[l1 + l2] = '\n';
6859 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6860#ifdef I_STDARG
6861 /* ANSI variant takes additional second argument */
c277df42 6862 va_start(args, pat2);
8736538c
AS
6863#else
6864 va_start(args);
6865#endif
5a844595 6866 msv = vmess(buf, &args);
c277df42 6867 va_end(args);
cfd0369c 6868 message = SvPV_const(msv,l1);
c277df42
IZ
6869 if (l1 > 512)
6870 l1 = 512;
6871 Copy(message, buf, l1 , char);
197cf9b9 6872 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6873 Perl_croak(aTHX_ "%s", buf);
c277df42 6874}
a0ed51b3
LW
6875
6876/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6877
76234dfb 6878#ifndef PERL_IN_XSUB_RE
a0ed51b3 6879void
864dbfa3 6880Perl_save_re_context(pTHX)
b81d288d 6881{
97aff369 6882 dVAR;
1ade1aa1
NC
6883
6884 struct re_save_state *state;
6885
6886 SAVEVPTR(PL_curcop);
6887 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6888
6889 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6890 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6891 SSPUSHINT(SAVEt_RE_STATE);
6892
46ab3289 6893 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6894
a0ed51b3 6895 PL_reg_start_tmp = 0;
a0ed51b3 6896 PL_reg_start_tmpl = 0;
c445ea15 6897 PL_reg_oldsaved = NULL;
a5db57d6 6898 PL_reg_oldsavedlen = 0;
a5db57d6 6899 PL_reg_maxiter = 0;
a5db57d6 6900 PL_reg_leftiter = 0;
c445ea15 6901 PL_reg_poscache = NULL;
a5db57d6 6902 PL_reg_poscache_size = 0;
1ade1aa1
NC
6903#ifdef PERL_OLD_COPY_ON_WRITE
6904 PL_nrs = NULL;
6905#endif
ada6e8a9 6906
c445ea15
AL
6907 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6908 if (PL_curpm) {
6909 const REGEXP * const rx = PM_GETRE(PL_curpm);
6910 if (rx) {
1df70142 6911 U32 i;
ada6e8a9 6912 for (i = 1; i <= rx->nparens; i++) {
1df70142 6913 char digits[TYPE_CHARS(long)];
d9fad198 6914 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6915 GV *const *const gvp
6916 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6917
b37c2d43
AL
6918 if (gvp) {
6919 GV * const gv = *gvp;
6920 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6921 save_scalar(gv);
49f27e4b 6922 }
ada6e8a9
AMS
6923 }
6924 }
6925 }
a0ed51b3 6926}
76234dfb 6927#endif
51371543 6928
51371543 6929static void
acfe0abc 6930clear_re(pTHX_ void *r)
51371543 6931{
97aff369 6932 dVAR;
51371543
GS
6933 ReREFCNT_dec((regexp *)r);
6934}
ffbc6a93 6935
a28509cc
AL
6936#ifdef DEBUGGING
6937
6938STATIC void
6939S_put_byte(pTHX_ SV *sv, int c)
6940{
6941 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6942 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6943 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6944 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6945 else
6946 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6947}
6948
3dab1dad
YO
6949#define CLEAR_OPTSTART \
6950 if (optstart) STMT_START { \
07be1b83 6951 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
6952 optstart=NULL; \
6953 } STMT_END
6954
6955#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6956
b5a2f8d8
NC
6957STATIC const regnode *
6958S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6959 const regnode *last, SV* sv, I32 l)
a28509cc 6960{
97aff369 6961 dVAR;
a28509cc 6962 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6963 register const regnode *next;
3dab1dad
YO
6964 const regnode *optstart= NULL;
6965 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6966
6967 while (op != END && (!last || node < last)) {
6968 /* While that wasn't END last time... */
6969
6970 NODE_ALIGN(node);
6971 op = OP(node);
6972 if (op == CLOSE)
6973 l--;
b5a2f8d8 6974 next = regnext((regnode *)node);
07be1b83 6975
a28509cc 6976 /* Where, what. */
8e11feef 6977 if (OP(node) == OPTIMIZED) {
e68ec53f 6978 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 6979 optstart = node;
3dab1dad 6980 else
8e11feef 6981 goto after_print;
3dab1dad
YO
6982 } else
6983 CLEAR_OPTSTART;
07be1b83 6984
32fc9b6a 6985 regprop(r, sv, node);
a28509cc
AL
6986 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6987 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6988
6989 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
6990 if (next == NULL) /* Next ptr. */
6991 PerlIO_printf(Perl_debug_log, "(0)");
6992 else
6993 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6994 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6995 }
6996
a28509cc
AL
6997 after_print:
6998 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
6999 assert(next);
7000 {
7001 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
7002 ? regnext((regnode *)next)
7003 : next);
be8e71aa
YO
7004 if (last && nnode > last)
7005 nnode = last;
7006 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
7007 }
a28509cc
AL
7008 }
7009 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 7010 assert(next);
3dab1dad 7011 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
7012 }
7013 else if ( PL_regkind[(U8)op] == TRIE ) {
7014 const I32 n = ARG(node);
4f639d21 7015 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
7016 const I32 arry_len = av_len(trie->words)+1;
7017 I32 word_idx;
7018 PerlIO_printf(Perl_debug_log,
07be1b83 7019 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
8e11feef
RGS
7020 (int)(2*(l+3)),
7021 "",
7022 trie->startstate,
7023 TRIE_WORDCOUNT(trie),
7024 (int)TRIE_CHARCOUNT(trie),
7025 trie->uniquecharcount,
7026 (IV)TRIE_LASTSTATE(trie)-1,
be8e71aa
YO
7027 (int)trie->minlen,
7028 (int)trie->maxlen
8e11feef 7029 );
3dab1dad
YO
7030 if (trie->bitmap) {
7031 int i;
7032 int rangestart= -1;
7033 sv_setpvn(sv, "", 0);
7034 for (i = 0; i <= 256; i++) {
7035 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
7036 if (rangestart == -1)
7037 rangestart = i;
7038 } else if (rangestart != -1) {
7039 if (i <= rangestart + 3)
7040 for (; rangestart < i; rangestart++)
7041 put_byte(sv, rangestart);
7042 else {
7043 put_byte(sv, rangestart);
7044 sv_catpvs(sv, "-");
7045 put_byte(sv, i - 1);
7046 }
7047 rangestart = -1;
7048 }
7049 }
07be1b83 7050 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
3dab1dad 7051 } else
07be1b83 7052 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
a28509cc
AL
7053
7054 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 7055 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
ab3bbdeb
YO
7056 if (elem_ptr)
7057 PerlIO_printf(Perl_debug_log, "%*s%s\n",
a28509cc 7058 (int)(2*(l+4)), "",
ddc5bc0f 7059 pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
7060 PL_colors[0], PL_colors[1],
7061 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7062 PERL_PV_PRETTY_ELIPSES |
7063 PERL_PV_PRETTY_LTGT
7064 )
a28509cc 7065 );
a28509cc 7066 }
a28509cc
AL
7067
7068 node = NEXTOPER(node);
7069 node += regarglen[(U8)op];
7070
7071 }
7072 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 7073 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7074 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7075 }
7076 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7077 assert(next);
3dab1dad 7078 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7079 next, sv, l + 1);
7080 }
7081 else if ( op == PLUS || op == STAR) {
3dab1dad 7082 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
7083 }
7084 else if (op == ANYOF) {
7085 /* arglen 1 + class block */
7086 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7087 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7088 node = NEXTOPER(node);
7089 }
7090 else if (PL_regkind[(U8)op] == EXACT) {
7091 /* Literal string, where present. */
7092 node += NODE_SZ_STR(node) - 1;
7093 node = NEXTOPER(node);
7094 }
7095 else {
7096 node = NEXTOPER(node);
7097 node += regarglen[(U8)op];
7098 }
7099 if (op == CURLYX || op == OPEN)
7100 l++;
7101 else if (op == WHILEM)
7102 l--;
7103 }
3dab1dad 7104 CLEAR_OPTSTART;
a28509cc
AL
7105 return node;
7106}
7107
7108#endif /* DEBUGGING */
7109
241d1a3b
NC
7110/*
7111 * Local variables:
7112 * c-indentation-style: bsd
7113 * c-basic-offset: 4
7114 * indent-tabs-mode: t
7115 * End:
7116 *
37442d52
RGS
7117 * ex: set ts=8 sts=4 sw=4 noet:
7118 */