This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove the C89 checking with gcc
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
174#define TRIE_STCLASS
175#endif
2c2d71f5
JH
176/* Length of a variant. */
177
178typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
cb434fcc 195 I32 *last_closep;
653099ff 196 struct regnode_charclass_class *start_class;
2c2d71f5
JH
197} scan_data_t;
198
a687059c 199/*
e50aee73 200 * Forward declarations for pregcomp()'s friends.
a687059c 201 */
a0d0e21e 202
27da23d5
JH
203static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
205
206#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
207#define SF_BEFORE_SEOL 0x0001
208#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
209#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
09b7f37c
CB
212#ifdef NO_UNARY_PLUS
213# define SF_FIX_SHIFT_EOL (0+2)
214# define SF_FL_SHIFT_EOL (0+4)
215#else
216# define SF_FIX_SHIFT_EOL (+2)
217# define SF_FL_SHIFT_EOL (+4)
218#endif
c277df42
IZ
219
220#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
225#define SF_IS_INF 0x0040
226#define SF_HAS_PAR 0x0080
227#define SF_IN_PAR 0x0100
228#define SF_HAS_EVAL 0x0200
229#define SCF_DO_SUBSTR 0x0400
653099ff
GS
230#define SCF_DO_STCLASS_AND 0x0800
231#define SCF_DO_STCLASS_OR 0x1000
232#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 233#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 234
07be1b83
YO
235#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
07be1b83 393 * Position is 1 indexed.
fac92740
MJD
394 */
395
ccb2c380
MP
396#define Set_Node_Offset_To_R(node,byte) STMT_START { \
397 if (! SIZE_ONLY) { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 399 __LINE__, (node), (int)(byte))); \
ccb2c380 400 if((node) < 0) { \
551405c4 401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
402 } else { \
403 RExC_offsets[2*(node)-1] = (byte); \
404 } \
405 } \
406} STMT_END
407
408#define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
411
412#define Set_Node_Length_To_R(node,len) STMT_START { \
413 if (! SIZE_ONLY) { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 415 __LINE__, (int)(node), (int)(len))); \
ccb2c380 416 if((node) < 0) { \
551405c4 417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
418 } else { \
419 RExC_offsets[2*(node)] = (len); \
420 } \
421 } \
422} STMT_END
423
424#define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427#define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
429
430/* Get offsets and lengths */
431#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
433
07be1b83
YO
434#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
437} STMT_END
438
439
440#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441#define EXPERIMENTAL_INPLACESCAN
442#endif
443
acfe0abc 444static void clear_re(pTHX_ void *r);
4327152a 445
653099ff
GS
446/* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
449
4327152a 450STATIC void
097eb12c 451S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 452{
e1ec3a88
AL
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 455
c277df42 456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 457 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
b81d288d 461 data->flags
c277df42
IZ
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
463 else
464 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
465 }
466 else {
c277df42 467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
468 data->offset_float_max = (l
469 ? data->last_start_max
c277df42 470 : data->pos_min + data->pos_delta);
9051bda5
HS
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
c277df42 473 if (data->flags & SF_BEFORE_EOL)
b81d288d 474 data->flags
c277df42
IZ
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
476 else
477 data->flags &= ~SF_FL_BEFORE_EOL;
478 }
479 }
480 SvCUR_set(data->last_found, 0);
0eda9292 481 {
a28509cc 482 SV * const sv = data->last_found;
097eb12c
AL
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
485 if (mg)
486 mg->mg_len = 0;
487 }
0eda9292 488 }
c277df42
IZ
489 data->last_end = -1;
490 data->flags &= ~SF_BEFORE_EOL;
491}
492
653099ff
GS
493/* Can match anything (initialization) */
494STATIC void
097eb12c 495S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 496{
653099ff 497 ANYOF_CLASS_ZERO(cl);
f8bef550 498 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
500 if (LOC)
501 cl->flags |= ANYOF_LOCALE;
502}
503
504/* Can match anything (initialization) */
505STATIC int
5f66b61c 506S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
507{
508 int value;
509
aaa51d5e 510 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
512 return 1;
1aa99e6b
IH
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
514 return 0;
f8bef550
NC
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 return 0;
653099ff
GS
517 return 1;
518}
519
520/* Can match anything (initialization) */
521STATIC void
097eb12c 522S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
8ecf7187 524 Zero(cl, 1, struct regnode_charclass_class);
653099ff 525 cl->type = ANYOF;
830247a4 526 cl_anything(pRExC_state, cl);
653099ff
GS
527}
528
529STATIC void
097eb12c 530S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 531{
8ecf7187 532 Zero(cl, 1, struct regnode_charclass_class);
653099ff 533 cl->type = ANYOF;
830247a4 534 cl_anything(pRExC_state, cl);
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* 'And' a given class with another one. Can create false positives */
540/* We assume that cl is not inverted */
541STATIC void
5f66b61c 542S_cl_and(struct regnode_charclass_class *cl,
a28509cc 543 const struct regnode_charclass_class *and_with)
653099ff 544{
653099ff
GS
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
550 int i;
551
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
555 else
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
1aa99e6b 561
14ebb1a2
JH
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
567 }
14ebb1a2
JH
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 570 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 573 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
574}
575
576/* 'OR' a given class with another one. Can create false positives */
577/* We assume that cl is not inverted */
578STATIC void
097eb12c 579S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 580{
653099ff
GS
581 if (or_with->flags & ANYOF_INVERT) {
582 /* We do not use
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
590 */
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
594 int i;
595
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
599 else {
830247a4 600 cl_anything(pRExC_state, cl);
653099ff
GS
601 }
602 } else {
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 605 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
606 || (cl->flags & ANYOF_FOLD)) ) {
607 int i;
608
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
616 }
617 }
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 619 cl_anything(pRExC_state, cl);
653099ff
GS
620 }
621 }
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
624
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
633 }
653099ff
GS
634}
635
5d1c421c 636/*
a3621e74 637
3dab1dad 638 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 646 depth : indent depth
a3621e74
YO
647
648Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
649
650A trie is an N'ary tree where the branches are determined by digital
651decomposition of the key. IE, at the root node you look up the 1st character and
652follow that branch repeat until you find the end of the branches. Nodes can be
653marked as "accepting" meaning they represent a complete word. Eg:
654
655 /he|she|his|hers/
656
657would convert into the following structure. Numbers represent states, letters
658following numbers represent valid transitions on the letter from that state, if
659the number is in square brackets it represents an accepting state, otherwise it
660will be in parenthesis.
661
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
663 | |
664 | (2)
665 | |
666 (1) +-i->(6)-+-s->[7]
667 |
668 +-s->(3)-+-h->(4)-+-e->[5]
669
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
671
672This shows that when matching against the string 'hers' we will begin at state 1
673read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676single traverse. We store a mapping from accepting to state to which word was
677matched, and then when we have multiple possibilities we try to complete the
678rest of the regex in the order in which they occured in the alternation.
679
680The only prior NFA like behaviour that would be changed by the TRIE support is
681the silent ignoring of duplicate alternations which are of the form:
682
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
684
685Thus EVAL blocks follwing a trie may be called a different number of times with
686and without the optimisation. With the optimisations dupes will be silently
687ignored. This inconsistant behaviour of EVAL type nodes is well established as
688the following demonstrates:
689
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
691
692which prints out 'word' three times, but
693
694 'words'=~/(word|word|word)(?{ print $1 })S/
695
696which doesnt print it out at all. This is due to other optimisations kicking in.
697
698Example of what happens on a structural level:
699
700The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: BRANCH(8)
704 6: EXACT <ac>(16)
705 8: BRANCH(11)
706 9: EXACT <ad>(16)
707 11: BRANCH(14)
708 12: EXACT <ab>(16)
709 16: SUCCEED(0)
710 17: NOTHING(18)
711 18: END(0)
712
713This would be optimizable with startbranch=5, first=5, last=16, tail=16
714and should turn into:
715
716 1: CURLYM[1] {1,32767}(18)
717 5: TRIE(16)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
719 <ac>
720 <ad>
721 <ab>
722 16: SUCCEED(0)
723 17: NOTHING(18)
724 18: END(0)
725
726Cases where tail != last would be like /(?foo|bar)baz/:
727
728 1: BRANCH(4)
729 2: EXACT <foo>(8)
730 4: BRANCH(7)
731 5: EXACT <bar>(8)
732 7: TAIL(8)
733 8: EXACT <baz>(10)
734 10: END(0)
735
736which would be optimizable with startbranch=1, first=1, last=7, tail=8
737and would end up looking like:
738
739 1: TRIE(8)
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
741 <foo>
742 <bar>
743 7: TAIL(8)
744 8: EXACT <baz>(10)
745 10: END(0)
746
3dab1dad
YO
747 d = uvuni_to_utf8_flags(d, uv, 0);
748
749is the recommended Unicode-aware way of saying
750
751 *(d++) = uv;
a3621e74
YO
752*/
753
3dab1dad
YO
754#define TRIE_STORE_REVCHAR \
755 STMT_START { \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
ab3bbdeb 757 if (UTF) SvUTF8_on(tmp); \
3dab1dad
YO
758 av_push( TRIE_REVCHARMAP(trie), tmp ); \
759 } STMT_END
a3621e74
YO
760
761#define TRIE_READ_CHAR STMT_START { \
07be1b83 762 wordlen++; \
a3621e74
YO
763 if ( UTF ) { \
764 if ( folder ) { \
765 if ( foldlen > 0 ) { \
766 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
767 foldlen -= len; \
768 scan += len; \
769 len = 0; \
770 } else { \
e1ec3a88 771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
772 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
773 foldlen -= UNISKIP( uvc ); \
774 scan = foldbuf + UNISKIP( uvc ); \
775 } \
776 } else { \
e1ec3a88 777 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
778 } \
779 } else { \
780 uvc = (U32)*uc; \
781 len = 1; \
782 } \
783} STMT_END
784
785
786#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
787#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
788#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
789#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790
791#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
792 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
793 TRIE_LIST_LEN( state ) *= 2; \
794 Renew( trie->states[ state ].trans.list, \
795 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 } \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
798 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
799 TRIE_LIST_CUR( state )++; \
800} STMT_END
801
802#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 803 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
804 4, reg_trie_trans_le ); \
805 TRIE_LIST_CUR( state ) = 1; \
806 TRIE_LIST_LEN( state ) = 4; \
807} STMT_END
808
07be1b83
YO
809#define TRIE_HANDLE_WORD(state) STMT_START { \
810 if ( !trie->states[ state ].wordnum ) { \
811 /* we haven't inserted this word into the structure yet. */ \
812 if (trie->wordlen) \
813 trie->wordlen[ curword ] = wordlen; \
3dab1dad
YO
814 trie->states[ state ].wordnum = ++curword; \
815 DEBUG_r({ \
816 /* store the word for dumping */ \
817 SV* tmp; \
07be1b83
YO
818 if (OP(noper) != NOTHING) \
819 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
3dab1dad 820 else \
07be1b83 821 tmp = newSVpvn( "", 0 ); \
3dab1dad
YO
822 if ( UTF ) SvUTF8_on( tmp ); \
823 av_push( trie->words, tmp ); \
824 }); \
825 } else { \
826 NOOP; /* It's a dupe. So ignore it. */ \
827 } \
828} STMT_END
829
830#ifdef DEBUGGING
07be1b83 831/*
3dab1dad
YO
832 dump_trie(trie)
833 dump_trie_interim_list(trie,next_alloc)
834 dump_trie_interim_table(trie,next_alloc)
835
836 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
837 The _interim_ variants are used for debugging the interim
838 tables that are used to generate the final compressed
839 representation which is what dump_trie expects.
840
3dab1dad
YO
841 Part of the reason for their existance is to provide a form
842 of documentation as to how the different representations function.
07be1b83
YO
843
844*/
3dab1dad
YO
845
846/*
847 dump_trie(trie)
848 Dumps the final compressed table form of the trie to Perl_debug_log.
849 Used for debugging make_trie().
850*/
851
852STATIC void
853S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
854{
855 U32 state;
ab3bbdeb
YO
856 SV *sv=sv_newmortal();
857 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
858 GET_RE_DEBUG_FLAGS_DECL;
859
ab3bbdeb 860
3dab1dad
YO
861 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
862 (int)depth * 2 + 2,"",
863 "Match","Base","Ofs" );
864
865 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 866 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 867 if ( tmp ) {
ab3bbdeb
YO
868 PerlIO_printf( Perl_debug_log, "%*s",
869 colwidth,
ddc5bc0f 870 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
871 PL_colors[0], PL_colors[1],
872 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
873 PERL_PV_ESCAPE_FIRSTCHAR
874 )
875 );
3dab1dad
YO
876 }
877 }
878 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
879 (int)depth * 2 + 2,"");
880
881 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 882 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
883 PerlIO_printf( Perl_debug_log, "\n");
884
885 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
be8e71aa 886 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
887
888 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
889
890 if ( trie->states[ state ].wordnum ) {
891 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
892 } else {
893 PerlIO_printf( Perl_debug_log, "%6s", "" );
894 }
895
896 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
897
898 if ( base ) {
899 U32 ofs = 0;
900
901 while( ( base + ofs < trie->uniquecharcount ) ||
902 ( base + ofs - trie->uniquecharcount < trie->lasttrans
903 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
904 ofs++;
905
906 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
907
908 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
909 if ( ( base + ofs >= trie->uniquecharcount ) &&
910 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
911 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
912 {
ab3bbdeb
YO
913 PerlIO_printf( Perl_debug_log, "%*"UVXf,
914 colwidth,
3dab1dad
YO
915 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
916 } else {
ab3bbdeb 917 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
918 }
919 }
920
921 PerlIO_printf( Perl_debug_log, "]");
922
923 }
924 PerlIO_printf( Perl_debug_log, "\n" );
925 }
926}
927/*
928 dump_trie_interim_list(trie,next_alloc)
929 Dumps a fully constructed but uncompressed trie in list form.
930 List tries normally only are used for construction when the number of
931 possible chars (trie->uniquecharcount) is very high.
932 Used for debugging make_trie().
933*/
934STATIC void
935S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
936{
937 U32 state;
ab3bbdeb
YO
938 SV *sv=sv_newmortal();
939 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
940 GET_RE_DEBUG_FLAGS_DECL;
941 /* print out the table precompression. */
ab3bbdeb
YO
942 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
943 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
944 "------:-----+-----------------\n" );
3dab1dad
YO
945
946 for( state=1 ; state < next_alloc ; state ++ ) {
947 U16 charid;
948
ab3bbdeb 949 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
950 (int)depth * 2 + 2,"", (UV)state );
951 if ( ! trie->states[ state ].wordnum ) {
952 PerlIO_printf( Perl_debug_log, "%5s| ","");
953 } else {
954 PerlIO_printf( Perl_debug_log, "W%4x| ",
955 trie->states[ state ].wordnum
956 );
957 }
958 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 959 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
960 if ( tmp ) {
961 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
962 colwidth,
ddc5bc0f 963 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
964 PL_colors[0], PL_colors[1],
965 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
966 PERL_PV_ESCAPE_FIRSTCHAR
967 ) ,
3dab1dad
YO
968 TRIE_LIST_ITEM(state,charid).forid,
969 (UV)TRIE_LIST_ITEM(state,charid).newstate
970 );
971 }
ab3bbdeb
YO
972 }
973 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
974 }
975}
976
977/*
978 dump_trie_interim_table(trie,next_alloc)
979 Dumps a fully constructed but uncompressed trie in table form.
980 This is the normal DFA style state transition table, with a few
981 twists to facilitate compression later.
982 Used for debugging make_trie().
983*/
984STATIC void
985S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
986{
987 U32 state;
988 U16 charid;
ab3bbdeb
YO
989 SV *sv=sv_newmortal();
990 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
991 GET_RE_DEBUG_FLAGS_DECL;
992
993 /*
994 print out the table precompression so that we can do a visual check
995 that they are identical.
996 */
997
998 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
999
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 1001 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 1002 if ( tmp ) {
ab3bbdeb
YO
1003 PerlIO_printf( Perl_debug_log, "%*s",
1004 colwidth,
ddc5bc0f 1005 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1006 PL_colors[0], PL_colors[1],
1007 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1008 PERL_PV_ESCAPE_FIRSTCHAR
1009 )
1010 );
3dab1dad
YO
1011 }
1012 }
1013
1014 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1015
1016 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1017 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1018 }
1019
1020 PerlIO_printf( Perl_debug_log, "\n" );
1021
1022 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1023
1024 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1025 (int)depth * 2 + 2,"",
1026 (UV)TRIE_NODENUM( state ) );
1027
1028 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1029 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1030 if (v)
1031 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1032 else
1033 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1034 }
1035 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1036 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1037 } else {
1038 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1039 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1040 }
1041 }
07be1b83 1042}
3dab1dad
YO
1043
1044#endif
1045
07be1b83
YO
1046#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1047 ( ( base + charid >= ucharcount \
1048 && base + charid < ubound \
1049 && state == trie->trans[ base - ucharcount + charid ].check \
1050 && trie->trans[ base - ucharcount + charid ].next ) \
1051 ? trie->trans[ base - ucharcount + charid ].next \
1052 : ( state==1 ? special : 0 ) \
1053 )
3dab1dad 1054
07be1b83
YO
1055STATIC void
1056S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1057{
1058/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1059
1060 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1061 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1062 ISBN 0-201-10088-6
1063
1064 We find the fail state for each state in the trie, this state is the longest proper
1065 suffix of the current states 'word' that is also a proper prefix of another word in our
1066 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1067 the DFA not to have to restart after its tried and failed a word at a given point, it
1068 simply continues as though it had been matching the other word in the first place.
1069 Consider
1070 'abcdgu'=~/abcdefg|cdgu/
1071 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1072 fail, which would bring use to the state representing 'd' in the second word where we would
1073 try 'g' and succeed, prodceding to match 'cdgu'.
1074 */
1075 /* add a fail transition */
1076 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1077 U32 *q;
be8e71aa
YO
1078 const U32 ucharcount = trie->uniquecharcount;
1079 const U32 numstates = trie->laststate;
1080 const U32 ubound = trie->lasttrans + ucharcount;
07be1b83
YO
1081 U32 q_read = 0;
1082 U32 q_write = 0;
1083 U32 charid;
1084 U32 base = trie->states[ 1 ].trans.base;
07be1b83
YO
1085 U32 *fail;
1086 reg_ac_data *aho;
1087 const U32 data_slot = add_data( pRExC_state, 1, "T" );
72f13be8 1088 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
1089#ifndef DEBUGGING
1090 PERL_UNUSED_ARG(depth);
1091#endif
72f13be8 1092
07be1b83
YO
1093
1094 ARG_SET( stclass, data_slot );
1095 Newxz( aho, 1, reg_ac_data );
1096 RExC_rx->data->data[ data_slot ] = (void*)aho;
1097 aho->trie=trie;
1098 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1099 (trie->laststate+1)*sizeof(reg_trie_state));
1100 Newxz( q, numstates, U32);
1101 Newxz( aho->fail, numstates, U32 );
283d8f99
YO
1102 aho->refcount = 1;
1103 fail = aho->fail;
07be1b83
YO
1104 fail[ 0 ] = fail[ 1 ] = 1;
1105
1106 for ( charid = 0; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1107 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1108 if ( newstate ) {
07be1b83
YO
1109 q[ q_write ] = newstate;
1110 /* set to point at the root */
1111 fail[ q[ q_write++ ] ]=1;
1112 }
1113 }
1114 while ( q_read < q_write) {
be8e71aa 1115 const U32 cur = q[ q_read++ % numstates ];
07be1b83
YO
1116 base = trie->states[ cur ].trans.base;
1117
1118 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1119 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1120 if (ch_state) {
07be1b83
YO
1121 U32 fail_state = cur;
1122 U32 fail_base;
1123 do {
1124 fail_state = fail[ fail_state ];
1125 fail_base = aho->states[ fail_state ].trans.base;
1126 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1127
1128 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1129 fail[ ch_state ] = fail_state;
1130 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1131 {
1132 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1133 }
1134 q[ q_write++ % numstates] = ch_state;
1135 }
1136 }
1137 }
1138
1139 DEBUG_TRIE_COMPILE_MORE_r({
1140 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1141 for( q_read=2; q_read<numstates; q_read++ ) {
1142 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1143 }
1144 PerlIO_printf(Perl_debug_log, "\n");
1145 });
1146 Safefree(q);
1147 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1148}
3dab1dad
YO
1149
1150
1151
a3621e74 1152STATIC I32
3dab1dad 1153S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1154{
27da23d5 1155 dVAR;
a3621e74
YO
1156 /* first pass, loop through and scan words */
1157 reg_trie_data *trie;
1158 regnode *cur;
9f7f3913 1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1160 STRLEN len = 0;
1161 UV uvc = 0;
1162 U16 curword = 0;
1163 U32 next_alloc = 0;
1164 /* we just use folder as a flag in utf8 */
e1ec3a88 1165 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1166 ? PL_fold
1167 : ( flags == EXACTFL
1168 ? PL_fold_locale
1169 : NULL
1170 )
1171 );
1172
e1ec3a88 1173 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1174 SV *re_trie_maxbuff;
3dab1dad
YO
1175#ifndef DEBUGGING
1176 /* these are only used during construction but are useful during
8e11feef
RGS
1177 * debugging so we store them in the struct when debugging.
1178 * Wordcount is actually superfluous in debugging as we have
1179 * (AV*)trie->words to use for it, but that's not available when
1180 * not debugging... We could make the macro use the AV during
1181 * debugging though...
1182 */
3dab1dad
YO
1183 U16 trie_wordcount=0;
1184 STRLEN trie_charcount=0;
07be1b83 1185 /*U32 trie_laststate=0;*/
3dab1dad
YO
1186 AV *trie_revcharmap;
1187#endif
a3621e74 1188 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1189#ifndef DEBUGGING
1190 PERL_UNUSED_ARG(depth);
1191#endif
a3621e74 1192
a02a5408 1193 Newxz( trie, 1, reg_trie_data );
a3621e74 1194 trie->refcount = 1;
3dab1dad 1195 trie->startstate = 1;
a3621e74 1196 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1197 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1198 if (!(UTF && folder))
1199 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1200 DEBUG_r({
1201 trie->words = newAV();
a3621e74 1202 });
3dab1dad 1203 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1204
0111c4fd 1205 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1206 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1207 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1208 }
3dab1dad
YO
1209 DEBUG_OPTIMISE_r({
1210 PerlIO_printf( Perl_debug_log,
1211 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1212 (int)depth * 2 + 2, "",
1213 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1214 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1215 });
a3621e74
YO
1216 /* -- First loop and Setup --
1217
1218 We first traverse the branches and scan each word to determine if it
1219 contains widechars, and how many unique chars there are, this is
1220 important as we have to build a table with at least as many columns as we
1221 have unique chars.
1222
1223 We use an array of integers to represent the character codes 0..255
1224 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1225 native representation of the character value as the key and IV's for the
1226 coded index.
1227
1228 *TODO* If we keep track of how many times each character is used we can
1229 remap the columns so that the table compression later on is more
1230 efficient in terms of memory by ensuring most common value is in the
1231 middle and the least common are on the outside. IMO this would be better
1232 than a most to least common mapping as theres a decent chance the most
1233 common letter will share a node with the least common, meaning the node
1234 will not be compressable. With a middle is most common approach the worst
1235 case is when we have the least common nodes twice.
1236
1237 */
1238
a3621e74 1239 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1240 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1241 const U8 *uc = (U8*)STRING( noper );
a28509cc 1242 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1243 STRLEN foldlen = 0;
1244 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1245 const U8 *scan = (U8*)NULL;
07be1b83 1246 U32 wordlen = 0; /* required init */
3dab1dad 1247 STRLEN chars=0;
a3621e74 1248
3dab1dad
YO
1249 TRIE_WORDCOUNT(trie)++;
1250 if (OP(noper) == NOTHING) {
1251 trie->minlen= 0;
1252 continue;
1253 }
1254 if (trie->bitmap) {
1255 TRIE_BITMAP_SET(trie,*uc);
1256 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1257 }
a3621e74 1258 for ( ; uc < e ; uc += len ) {
3dab1dad 1259 TRIE_CHARCOUNT(trie)++;
a3621e74 1260 TRIE_READ_CHAR;
3dab1dad 1261 chars++;
a3621e74
YO
1262 if ( uvc < 256 ) {
1263 if ( !trie->charmap[ uvc ] ) {
1264 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1265 if ( folder )
1266 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1267 TRIE_STORE_REVCHAR;
a3621e74
YO
1268 }
1269 } else {
1270 SV** svpp;
1271 if ( !trie->widecharmap )
1272 trie->widecharmap = newHV();
1273
1274 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1275
1276 if ( !svpp )
e4584336 1277 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1278
1279 if ( !SvTRUE( *svpp ) ) {
1280 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1281 TRIE_STORE_REVCHAR;
a3621e74
YO
1282 }
1283 }
1284 }
3dab1dad
YO
1285 if( cur == first ) {
1286 trie->minlen=chars;
1287 trie->maxlen=chars;
1288 } else if (chars < trie->minlen) {
1289 trie->minlen=chars;
1290 } else if (chars > trie->maxlen) {
1291 trie->maxlen=chars;
1292 }
1293
a3621e74
YO
1294 } /* end first pass */
1295 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1296 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1297 (int)depth * 2 + 2,"",
1298 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
be8e71aa
YO
1299 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1300 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1301 );
07be1b83 1302 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
a3621e74
YO
1303
1304 /*
1305 We now know what we are dealing with in terms of unique chars and
1306 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1307 representation using a flat table will take. If it's over a reasonable
1308 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1309 conservative but potentially much slower representation using an array
1310 of lists.
1311
1312 At the end we convert both representations into the same compressed
1313 form that will be used in regexec.c for matching with. The latter
1314 is a form that cannot be used to construct with but has memory
1315 properties similar to the list form and access properties similar
1316 to the table form making it both suitable for fast searches and
1317 small enough that its feasable to store for the duration of a program.
1318
1319 See the comment in the code where the compressed table is produced
1320 inplace from the flat tabe representation for an explanation of how
1321 the compression works.
1322
1323 */
1324
1325
3dab1dad 1326 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1327 /*
1328 Second Pass -- Array Of Lists Representation
1329
1330 Each state will be represented by a list of charid:state records
1331 (reg_trie_trans_le) the first such element holds the CUR and LEN
1332 points of the allocated array. (See defines above).
1333
1334 We build the initial structure using the lists, and then convert
1335 it into the compressed table form which allows faster lookups
1336 (but cant be modified once converted).
a3621e74
YO
1337 */
1338
a3621e74
YO
1339 STRLEN transcount = 1;
1340
3dab1dad 1341 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1342 TRIE_LIST_NEW(1);
1343 next_alloc = 2;
1344
1345 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1346
c445ea15
AL
1347 regnode * const noper = NEXTOPER( cur );
1348 U8 *uc = (U8*)STRING( noper );
1349 const U8 * const e = uc + STR_LEN( noper );
1350 U32 state = 1; /* required init */
1351 U16 charid = 0; /* sanity init */
1352 U8 *scan = (U8*)NULL; /* sanity init */
1353 STRLEN foldlen = 0; /* required init */
07be1b83 1354 U32 wordlen = 0; /* required init */
c445ea15
AL
1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1356
3dab1dad 1357 if (OP(noper) != NOTHING) {
c445ea15
AL
1358 for ( ; uc < e ; uc += len ) {
1359
1360 TRIE_READ_CHAR;
1361
1362 if ( uvc < 256 ) {
1363 charid = trie->charmap[ uvc ];
1364 } else {
1365 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1366 if ( !svpp ) {
1367 charid = 0;
1368 } else {
1369 charid=(U16)SvIV( *svpp );
1370 }
1371 }
1372 if ( charid ) {
a3621e74 1373
c445ea15
AL
1374 U16 check;
1375 U32 newstate = 0;
a3621e74 1376
c445ea15
AL
1377 charid--;
1378 if ( !trie->states[ state ].trans.list ) {
1379 TRIE_LIST_NEW( state );
1380 }
1381 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1382 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1383 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1384 break;
1385 }
1386 }
1387 if ( ! newstate ) {
1388 newstate = next_alloc++;
1389 TRIE_LIST_PUSH( state, charid, newstate );
1390 transcount++;
1391 }
1392 state = newstate;
1393 } else {
1394 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1395 }
c445ea15
AL
1396 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1397 }
c445ea15 1398 }
3dab1dad 1399 TRIE_HANDLE_WORD(state);
a3621e74
YO
1400
1401 } /* end second pass */
1402
3dab1dad 1403 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1404 Renew( trie->states, next_alloc, reg_trie_state );
1405
3dab1dad
YO
1406 /* and now dump it out before we compress it */
1407 DEBUG_TRIE_COMPILE_MORE_r(
1408 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1409 );
a3621e74 1410
a02a5408 1411 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1412 {
1413 U32 state;
a3621e74
YO
1414 U32 tp = 0;
1415 U32 zp = 0;
1416
1417
1418 for( state=1 ; state < next_alloc ; state ++ ) {
1419 U32 base=0;
1420
1421 /*
1422 DEBUG_TRIE_COMPILE_MORE_r(
1423 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1424 );
1425 */
1426
1427 if (trie->states[state].trans.list) {
1428 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1429 U16 maxid=minid;
a28509cc 1430 U16 idx;
a3621e74
YO
1431
1432 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1433 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1434 if ( forid < minid ) {
1435 minid=forid;
1436 } else if ( forid > maxid ) {
1437 maxid=forid;
1438 }
a3621e74
YO
1439 }
1440 if ( transcount < tp + maxid - minid + 1) {
1441 transcount *= 2;
1442 Renew( trie->trans, transcount, reg_trie_trans );
1443 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1444 }
1445 base = trie->uniquecharcount + tp - minid;
1446 if ( maxid == minid ) {
1447 U32 set = 0;
1448 for ( ; zp < tp ; zp++ ) {
1449 if ( ! trie->trans[ zp ].next ) {
1450 base = trie->uniquecharcount + zp - minid;
1451 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1452 trie->trans[ zp ].check = state;
1453 set = 1;
1454 break;
1455 }
1456 }
1457 if ( !set ) {
1458 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1459 trie->trans[ tp ].check = state;
1460 tp++;
1461 zp = tp;
1462 }
1463 } else {
1464 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1465 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1466 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1467 trie->trans[ tid ].check = state;
1468 }
1469 tp += ( maxid - minid + 1 );
1470 }
1471 Safefree(trie->states[ state ].trans.list);
1472 }
1473 /*
1474 DEBUG_TRIE_COMPILE_MORE_r(
1475 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1476 );
1477 */
1478 trie->states[ state ].trans.base=base;
1479 }
cc601c31 1480 trie->lasttrans = tp + 1;
a3621e74
YO
1481 }
1482 } else {
1483 /*
1484 Second Pass -- Flat Table Representation.
1485
1486 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1487 We know that we will need Charcount+1 trans at most to store the data
1488 (one row per char at worst case) So we preallocate both structures
1489 assuming worst case.
1490
1491 We then construct the trie using only the .next slots of the entry
1492 structs.
1493
1494 We use the .check field of the first entry of the node temporarily to
1495 make compression both faster and easier by keeping track of how many non
1496 zero fields are in the node.
1497
1498 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1499 transition.
1500
1501 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1502 number representing the first entry of the node, and state as a
1503 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1504 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1505 are 2 entrys per node. eg:
1506
1507 A B A B
1508 1. 2 4 1. 3 7
1509 2. 0 3 3. 0 5
1510 3. 0 0 5. 0 0
1511 4. 0 0 7. 0 0
1512
1513 The table is internally in the right hand, idx form. However as we also
1514 have to deal with the states array which is indexed by nodenum we have to
1515 use TRIE_NODENUM() to convert.
1516
1517 */
1518
3dab1dad
YO
1519
1520 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1521 reg_trie_trans );
3dab1dad 1522 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1523 next_alloc = trie->uniquecharcount + 1;
1524
3dab1dad 1525
a3621e74
YO
1526 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1527
c445ea15 1528 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1529 const U8 *uc = (U8*)STRING( noper );
1530 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1531
1532 U32 state = 1; /* required init */
1533
1534 U16 charid = 0; /* sanity init */
1535 U32 accept_state = 0; /* sanity init */
1536 U8 *scan = (U8*)NULL; /* sanity init */
1537
1538 STRLEN foldlen = 0; /* required init */
07be1b83 1539 U32 wordlen = 0; /* required init */
a3621e74
YO
1540 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1541
3dab1dad 1542 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1543 for ( ; uc < e ; uc += len ) {
1544
1545 TRIE_READ_CHAR;
1546
1547 if ( uvc < 256 ) {
1548 charid = trie->charmap[ uvc ];
1549 } else {
c445ea15
AL
1550 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1551 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1552 }
1553 if ( charid ) {
1554 charid--;
1555 if ( !trie->trans[ state + charid ].next ) {
1556 trie->trans[ state + charid ].next = next_alloc;
1557 trie->trans[ state ].check++;
1558 next_alloc += trie->uniquecharcount;
1559 }
1560 state = trie->trans[ state + charid ].next;
1561 } else {
e4584336 1562 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1563 }
1564 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1565 }
a3621e74 1566 }
3dab1dad
YO
1567 accept_state = TRIE_NODENUM( state );
1568 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1569
1570 } /* end second pass */
1571
3dab1dad
YO
1572 /* and now dump it out before we compress it */
1573 DEBUG_TRIE_COMPILE_MORE_r(
1574 dump_trie_interim_table(trie,next_alloc,depth+1)
1575 );
a3621e74 1576
a3621e74
YO
1577 {
1578 /*
1579 * Inplace compress the table.*
1580
1581 For sparse data sets the table constructed by the trie algorithm will
1582 be mostly 0/FAIL transitions or to put it another way mostly empty.
1583 (Note that leaf nodes will not contain any transitions.)
1584
1585 This algorithm compresses the tables by eliminating most such
1586 transitions, at the cost of a modest bit of extra work during lookup:
1587
1588 - Each states[] entry contains a .base field which indicates the
1589 index in the state[] array wheres its transition data is stored.
1590
1591 - If .base is 0 there are no valid transitions from that node.
1592
1593 - If .base is nonzero then charid is added to it to find an entry in
1594 the trans array.
1595
1596 -If trans[states[state].base+charid].check!=state then the
1597 transition is taken to be a 0/Fail transition. Thus if there are fail
1598 transitions at the front of the node then the .base offset will point
1599 somewhere inside the previous nodes data (or maybe even into a node
1600 even earlier), but the .check field determines if the transition is
1601 valid.
1602
1603 The following process inplace converts the table to the compressed
1604 table: We first do not compress the root node 1,and mark its all its
1605 .check pointers as 1 and set its .base pointer as 1 as well. This
1606 allows to do a DFA construction from the compressed table later, and
1607 ensures that any .base pointers we calculate later are greater than
1608 0.
1609
1610 - We set 'pos' to indicate the first entry of the second node.
1611
1612 - We then iterate over the columns of the node, finding the first and
1613 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1614 and set the .check pointers accordingly, and advance pos
1615 appropriately and repreat for the next node. Note that when we copy
1616 the next pointers we have to convert them from the original
1617 NODEIDX form to NODENUM form as the former is not valid post
1618 compression.
1619
1620 - If a node has no transitions used we mark its base as 0 and do not
1621 advance the pos pointer.
1622
1623 - If a node only has one transition we use a second pointer into the
1624 structure to fill in allocated fail transitions from other states.
1625 This pointer is independent of the main pointer and scans forward
1626 looking for null transitions that are allocated to a state. When it
1627 finds one it writes the single transition into the "hole". If the
1628 pointer doesnt find one the single transition is appeneded as normal.
1629
1630 - Once compressed we can Renew/realloc the structures to release the
1631 excess space.
1632
1633 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1634 specifically Fig 3.47 and the associated pseudocode.
1635
1636 demq
1637 */
a3b680e6 1638 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1639 U32 state, charid;
a3621e74 1640 U32 pos = 0, zp=0;
3dab1dad 1641 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1642
1643 for ( state = 1 ; state < laststate ; state++ ) {
1644 U8 flag = 0;
a28509cc
AL
1645 const U32 stateidx = TRIE_NODEIDX( state );
1646 const U32 o_used = trie->trans[ stateidx ].check;
1647 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1648 trie->trans[ stateidx ].check = 0;
1649
1650 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1651 if ( flag || trie->trans[ stateidx + charid ].next ) {
1652 if ( trie->trans[ stateidx + charid ].next ) {
1653 if (o_used == 1) {
1654 for ( ; zp < pos ; zp++ ) {
1655 if ( ! trie->trans[ zp ].next ) {
1656 break;
1657 }
1658 }
1659 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1660 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1661 trie->trans[ zp ].check = state;
1662 if ( ++zp > pos ) pos = zp;
1663 break;
1664 }
1665 used--;
1666 }
1667 if ( !flag ) {
1668 flag = 1;
1669 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1670 }
1671 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1672 trie->trans[ pos ].check = state;
1673 pos++;
1674 }
1675 }
1676 }
cc601c31 1677 trie->lasttrans = pos + 1;
a3621e74
YO
1678 Renew( trie->states, laststate + 1, reg_trie_state);
1679 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1680 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1681 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1682 (int)depth * 2 + 2,"",
1683 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1684 (IV)next_alloc,
1685 (IV)pos,
a3621e74
YO
1686 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1687 );
1688
1689 } /* end table compress */
1690 }
cc601c31
YO
1691 /* resize the trans array to remove unused space */
1692 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1693
3dab1dad
YO
1694 /* and now dump out the compressed format */
1695 DEBUG_TRIE_COMPILE_r(
1696 dump_trie(trie,depth+1)
1697 );
07be1b83 1698
3dab1dad
YO
1699 { /* Modify the program and insert the new TRIE node*/
1700 regnode *convert;
1701 U8 nodetype =(U8)(flags & 0xFF);
1702 char *str=NULL;
07be1b83
YO
1703#ifdef DEBUGGING
1704 U32 mjd_offset;
1705 U32 mjd_nodelen;
1706#endif
a3621e74 1707 /*
3dab1dad
YO
1708 This means we convert either the first branch or the first Exact,
1709 depending on whether the thing following (in 'last') is a branch
1710 or not and whther first is the startbranch (ie is it a sub part of
1711 the alternation or is it the whole thing.)
1712 Assuming its a sub part we conver the EXACT otherwise we convert
1713 the whole branch sequence, including the first.
a3621e74 1714 */
3dab1dad
YO
1715 /* Find the node we are going to overwrite */
1716 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1717 /* whole branch chain */
3dab1dad 1718 convert = first;
07be1b83
YO
1719 DEBUG_r({
1720 const regnode *nop = NEXTOPER( convert );
1721 mjd_offset= Node_Offset((nop));
1722 mjd_nodelen= Node_Length((nop));
1723 });
1724 } else {
1725 /* branch sub-chain */
3dab1dad
YO
1726 convert = NEXTOPER( first );
1727 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1728 DEBUG_r({
1729 mjd_offset= Node_Offset((convert));
1730 mjd_nodelen= Node_Length((convert));
1731 });
1732 }
1733 DEBUG_OPTIMISE_r(
1734 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1735 (int)depth * 2 + 2, "",
1736 mjd_offset,mjd_nodelen)
1737 );
a3621e74 1738
3dab1dad
YO
1739 /* But first we check to see if there is a common prefix we can
1740 split out as an EXACT and put in front of the TRIE node. */
1741 trie->startstate= 1;
1742 if ( trie->bitmap && !trie->widecharmap ) {
1743 U32 state;
1744 DEBUG_OPTIMISE_r(
8e11feef
RGS
1745 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1746 (int)depth * 2 + 2, "",
1747 TRIE_LASTSTATE(trie))
1748 );
1749 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1750 U32 ofs = 0;
8e11feef
RGS
1751 I32 idx = -1;
1752 U32 count = 0;
1753 const U32 base = trie->states[ state ].trans.base;
a3621e74 1754
3dab1dad 1755 if ( trie->states[state].wordnum )
8e11feef 1756 count = 1;
a3621e74 1757
8e11feef 1758 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1759 if ( ( base + ofs >= trie->uniquecharcount ) &&
1760 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1761 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1762 {
3dab1dad 1763 if ( ++count > 1 ) {
8e11feef 1764 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1765 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1766 if ( state == 1 ) break;
3dab1dad
YO
1767 if ( count == 2 ) {
1768 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1769 DEBUG_OPTIMISE_r(
8e11feef
RGS
1770 PerlIO_printf(Perl_debug_log,
1771 "%*sNew Start State=%"UVuf" Class: [",
1772 (int)depth * 2 + 2, "",
3dab1dad 1773 state));
be8e71aa
YO
1774 if (idx >= 0) {
1775 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1776 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1777
3dab1dad 1778 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1779 if ( folder )
1780 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1781 DEBUG_OPTIMISE_r(
07be1b83 1782 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1783 );
8e11feef
RGS
1784 }
1785 }
1786 TRIE_BITMAP_SET(trie,*ch);
1787 if ( folder )
1788 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1789 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1790 }
1791 idx = ofs;
1792 }
3dab1dad
YO
1793 }
1794 if ( count == 1 ) {
1795 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1796 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1797 DEBUG_OPTIMISE_r(
8e11feef
RGS
1798 PerlIO_printf( Perl_debug_log,
1799 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1800 (int)depth * 2 + 2, "",
1801 state, idx, ch)
3dab1dad
YO
1802 );
1803 if ( state==1 ) {
1804 OP( convert ) = nodetype;
1805 str=STRING(convert);
1806 STR_LEN(convert)=0;
1807 }
1808 *str++=*ch;
1809 STR_LEN(convert)++;
a3621e74 1810
8e11feef 1811 } else {
f9049ba1 1812#ifdef DEBUGGING
8e11feef
RGS
1813 if (state>1)
1814 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1815#endif
8e11feef
RGS
1816 break;
1817 }
1818 }
3dab1dad 1819 if (str) {
8e11feef 1820 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1821 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1822 trie->startstate = state;
07be1b83
YO
1823 trie->minlen -= (state - 1);
1824 trie->maxlen -= (state - 1);
1825 DEBUG_r({
1826 regnode *fix = convert;
1827 mjd_nodelen++;
1828 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1829 while( ++fix < n ) {
1830 Set_Node_Offset_Length(fix, 0, 0);
1831 }
1832 });
8e11feef
RGS
1833 if (trie->maxlen) {
1834 convert = n;
1835 } else {
3dab1dad
YO
1836 NEXT_OFF(convert) = (U16)(tail - convert);
1837 }
1838 }
1839 }
1840 if ( trie->maxlen ) {
1841 OP( convert ) = TRIE;
8e11feef
RGS
1842 NEXT_OFF( convert ) = (U16)(tail - convert);
1843 ARG_SET( convert, data_slot );
a3621e74 1844
3dab1dad
YO
1845 /* store the type in the flags */
1846 convert->flags = nodetype;
1847 /* XXX We really should free up the resource in trie now, as we wont use them */
1848 }
a3621e74
YO
1849 /* needed for dumping*/
1850 DEBUG_r({
1851 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
07be1b83
YO
1852 regnode *opt = convert;
1853 while (++opt<optimize) {
1854 Set_Node_Offset_Length(opt,0,0);
1855 }
a3621e74
YO
1856 /* We now need to mark all of the space originally used by the
1857 branches as optimized away. This keeps the dumpuntil from
1858 throwing a wobbly as it doesnt use regnext() to traverse the
1859 opcodes.
07be1b83 1860 We also "fix" the offsets
a3621e74
YO
1861 */
1862 while( optimize < last ) {
07be1b83 1863 mjd_nodelen += Node_Length((optimize));
a3621e74 1864 OP( optimize ) = OPTIMIZED;
07be1b83 1865 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1866 optimize++;
1867 }
07be1b83 1868 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1869 });
1870 } /* end node insert */
07be1b83 1871#ifndef DEBUGGING
6e8b4190 1872 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1873#endif
a3621e74
YO
1874 return 1;
1875}
1876
a3621e74 1877/*
5d1c421c
JH
1878 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1879 * These need to be revisited when a newer toolchain becomes available.
1880 */
1881#if defined(__sparc64__) && defined(__GNUC__)
1882# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1883# undef SPARC64_GCC_WORKAROUND
1884# define SPARC64_GCC_WORKAROUND 1
1885# endif
1886#endif
1887
07be1b83
YO
1888#define DEBUG_PEEP(str,scan,depth) \
1889 DEBUG_OPTIMISE_r({ \
1890 SV * const mysv=sv_newmortal(); \
1891 regnode *Next = regnext(scan); \
1892 regprop(RExC_rx, mysv, scan); \
1893 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1894 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1895 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1896 });
1897
1898#define JOIN_EXACT(scan,min,flags) \
1899 if (PL_regkind[OP(scan)] == EXACT) \
1900 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1901
be8e71aa 1902STATIC U32
07be1b83
YO
1903S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1904 /* Merge several consecutive EXACTish nodes into one. */
1905 regnode *n = regnext(scan);
1906 U32 stringok = 1;
1907 regnode *next = scan + NODE_SZ_STR(scan);
1908 U32 merged = 0;
1909 U32 stopnow = 0;
1910#ifdef DEBUGGING
1911 regnode *stop = scan;
72f13be8 1912 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 1913#else
d47053eb
RGS
1914 PERL_UNUSED_ARG(depth);
1915#endif
1916#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
1917 PERL_UNUSED_ARG(flags);
1918 PERL_UNUSED_ARG(val);
07be1b83 1919#endif
07be1b83
YO
1920 DEBUG_PEEP("join",scan,depth);
1921
1922 /* Skip NOTHING, merge EXACT*. */
1923 while (n &&
1924 ( PL_regkind[OP(n)] == NOTHING ||
1925 (stringok && (OP(n) == OP(scan))))
1926 && NEXT_OFF(n)
1927 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1928
1929 if (OP(n) == TAIL || n > next)
1930 stringok = 0;
1931 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
1932 DEBUG_PEEP("skip:",n,depth);
1933 NEXT_OFF(scan) += NEXT_OFF(n);
1934 next = n + NODE_STEP_REGNODE;
1935#ifdef DEBUGGING
1936 if (stringok)
1937 stop = n;
1938#endif
1939 n = regnext(n);
1940 }
1941 else if (stringok) {
1942 const int oldl = STR_LEN(scan);
1943 regnode * const nnext = regnext(n);
1944
1945 DEBUG_PEEP("merg",n,depth);
1946
1947 merged++;
1948 if (oldl + STR_LEN(n) > U8_MAX)
1949 break;
1950 NEXT_OFF(scan) += NEXT_OFF(n);
1951 STR_LEN(scan) += STR_LEN(n);
1952 next = n + NODE_SZ_STR(n);
1953 /* Now we can overwrite *n : */
1954 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1955#ifdef DEBUGGING
1956 stop = next - 1;
1957#endif
1958 n = nnext;
1959 if (stopnow) break;
1960 }
1961
d47053eb
RGS
1962#ifdef EXPERIMENTAL_INPLACESCAN
1963 if (flags && !NEXT_OFF(n)) {
1964 DEBUG_PEEP("atch", val, depth);
1965 if (reg_off_by_arg[OP(n)]) {
1966 ARG_SET(n, val - n);
1967 }
1968 else {
1969 NEXT_OFF(n) = val - n;
1970 }
1971 stopnow = 1;
1972 }
07be1b83
YO
1973#endif
1974 }
1975
1976 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1977 /*
1978 Two problematic code points in Unicode casefolding of EXACT nodes:
1979
1980 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1981 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1982
1983 which casefold to
1984
1985 Unicode UTF-8
1986
1987 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1988 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1989
1990 This means that in case-insensitive matching (or "loose matching",
1991 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1992 length of the above casefolded versions) can match a target string
1993 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1994 This would rather mess up the minimum length computation.
1995
1996 What we'll do is to look for the tail four bytes, and then peek
1997 at the preceding two bytes to see whether we need to decrease
1998 the minimum length by four (six minus two).
1999
2000 Thanks to the design of UTF-8, there cannot be false matches:
2001 A sequence of valid UTF-8 bytes cannot be a subsequence of
2002 another valid sequence of UTF-8 bytes.
2003
2004 */
2005 char * const s0 = STRING(scan), *s, *t;
2006 char * const s1 = s0 + STR_LEN(scan) - 1;
2007 char * const s2 = s1 - 4;
2008 const char t0[] = "\xcc\x88\xcc\x81";
2009 const char * const t1 = t0 + 3;
2010
2011 for (s = s0 + 2;
2012 s < s2 && (t = ninstr(s, s1, t0, t1));
2013 s = t + 4) {
2014 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2015 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2016 *min -= 4;
2017 }
2018 }
2019
2020#ifdef DEBUGGING
2021 /* Allow dumping */
2022 n = scan + NODE_SZ_STR(scan);
2023 while (n <= stop) {
2024 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2025 OP(n) = OPTIMIZED;
2026 NEXT_OFF(n) = 0;
2027 }
2028 n++;
2029 }
2030#endif
2031 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2032 return stopnow;
2033}
2034
653099ff
GS
2035/* REx optimizer. Converts nodes into quickier variants "in place".
2036 Finds fixed substrings. */
2037
a0288114 2038/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2039 to the position after last scanned or to NULL. */
2040
07be1b83
YO
2041
2042
76e3520e 2043STATIC I32
9a957fbc
AL
2044S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2045 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2046 /* scanp: Start here (read-write). */
2047 /* deltap: Write maxlen-minlen here. */
2048 /* last: Stop before this one. */
2049{
97aff369 2050 dVAR;
c277df42
IZ
2051 I32 min = 0, pars = 0, code;
2052 regnode *scan = *scanp, *next;
2053 I32 delta = 0;
2054 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2055 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2056 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2057 scan_data_t data_fake;
653099ff 2058 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
2059 SV *re_trie_maxbuff = NULL;
2060
2061 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2062#ifdef DEBUGGING
2063 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2064#endif
b81d288d 2065
c277df42
IZ
2066 while (scan && OP(scan) != END && scan < last) {
2067 /* Peephole optimizer: */
07be1b83 2068 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2069
07be1b83 2070 JOIN_EXACT(scan,&min,0);
a3621e74 2071
653099ff
GS
2072 /* Follow the next-chain of the current node and optimize
2073 away all the NOTHINGs from it. */
c277df42 2074 if (OP(scan) != CURLYX) {
a3b680e6 2075 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2076 ? I32_MAX
2077 /* I32 may be smaller than U16 on CRAYs! */
2078 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2079 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2080 int noff;
2081 regnode *n = scan;
b81d288d 2082
c277df42
IZ
2083 /* Skip NOTHING and LONGJMP. */
2084 while ((n = regnext(n))
3dab1dad 2085 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2086 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2087 && off + noff < max)
2088 off += noff;
2089 if (reg_off_by_arg[OP(scan)])
2090 ARG(scan) = off;
b81d288d 2091 else
c277df42
IZ
2092 NEXT_OFF(scan) = off;
2093 }
a3621e74 2094
07be1b83 2095
3dab1dad 2096
653099ff
GS
2097 /* The principal pseudo-switch. Cannot be a switch, since we
2098 look into several different things. */
b81d288d 2099 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2100 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2101 next = regnext(scan);
2102 code = OP(scan);
a3621e74 2103 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2104
2105 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 2106 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2107 struct regnode_charclass_class accum;
d4c19fe8 2108 regnode * const startbranch=scan;
c277df42 2109
653099ff 2110 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2111 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2112 if (flags & SCF_DO_STCLASS)
830247a4 2113 cl_init_zero(pRExC_state, &accum);
a3621e74 2114
c277df42 2115 while (OP(scan) == code) {
830247a4 2116 I32 deltanext, minnext, f = 0, fake;
653099ff 2117 struct regnode_charclass_class this_class;
c277df42
IZ
2118
2119 num++;
2120 data_fake.flags = 0;
b81d288d 2121 if (data) {
2c2d71f5 2122 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2123 data_fake.last_closep = data->last_closep;
2124 }
2125 else
2126 data_fake.last_closep = &fake;
c277df42
IZ
2127 next = regnext(scan);
2128 scan = NEXTOPER(scan);
2129 if (code != BRANCH)
2130 scan = NEXTOPER(scan);
653099ff 2131 if (flags & SCF_DO_STCLASS) {
830247a4 2132 cl_init(pRExC_state, &this_class);
653099ff
GS
2133 data_fake.start_class = &this_class;
2134 f = SCF_DO_STCLASS_AND;
b81d288d 2135 }
e1901655
IZ
2136 if (flags & SCF_WHILEM_VISITED_POS)
2137 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2138
653099ff 2139 /* we suppose the run is continuous, last=next...*/
830247a4 2140 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2141 next, &data_fake, f,depth+1);
b81d288d 2142 if (min1 > minnext)
c277df42
IZ
2143 min1 = minnext;
2144 if (max1 < minnext + deltanext)
2145 max1 = minnext + deltanext;
2146 if (deltanext == I32_MAX)
aca2d497 2147 is_inf = is_inf_internal = 1;
c277df42
IZ
2148 scan = next;
2149 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2150 pars++;
3dab1dad
YO
2151 if (data) {
2152 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2153 data->flags |= SF_HAS_EVAL;
2c2d71f5 2154 data->whilem_c = data_fake.whilem_c;
3dab1dad 2155 }
653099ff 2156 if (flags & SCF_DO_STCLASS)
830247a4 2157 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2158 if (code == SUSPEND)
c277df42
IZ
2159 break;
2160 }
2161 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2162 min1 = 0;
2163 if (flags & SCF_DO_SUBSTR) {
2164 data->pos_min += min1;
2165 data->pos_delta += max1 - min1;
2166 if (max1 != min1 || is_inf)
2167 data->longest = &(data->longest_float);
2168 }
2169 min += min1;
2170 delta += max1 - min1;
653099ff 2171 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2172 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2173 if (min1) {
2174 cl_and(data->start_class, &and_with);
2175 flags &= ~SCF_DO_STCLASS;
2176 }
2177 }
2178 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2179 if (min1) {
2180 cl_and(data->start_class, &accum);
653099ff 2181 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2182 }
2183 else {
b81d288d 2184 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2185 * data->start_class */
2186 StructCopy(data->start_class, &and_with,
2187 struct regnode_charclass_class);
2188 flags &= ~SCF_DO_STCLASS_AND;
2189 StructCopy(&accum, data->start_class,
2190 struct regnode_charclass_class);
2191 flags |= SCF_DO_STCLASS_OR;
2192 data->start_class->flags |= ANYOF_EOS;
2193 }
653099ff 2194 }
a3621e74
YO
2195
2196 /* demq.
2197
2198 Assuming this was/is a branch we are dealing with: 'scan' now
2199 points at the item that follows the branch sequence, whatever
2200 it is. We now start at the beginning of the sequence and look
2201 for subsequences of
2202
2203 BRANCH->EXACT=>X
2204 BRANCH->EXACT=>X
2205
2206 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2207
2208 If we can find such a subseqence we need to turn the first
2209 element into a trie and then add the subsequent branch exact
2210 strings to the trie.
2211
2212 We have two cases
2213
2214 1. patterns where the whole set of branch can be converted to a trie,
2215
2216 2. patterns where only a subset of the alternations can be
2217 converted to a trie.
2218
2219 In case 1 we can replace the whole set with a single regop
2220 for the trie. In case 2 we need to keep the start and end
2221 branchs so
2222
2223 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2224 becomes BRANCH TRIE; BRANCH X;
2225
2226 Hypthetically when we know the regex isnt anchored we can
2227 turn a case 1 into a DFA and let it rip... Every time it finds a match
2228 it would just call its tail, no WHILEM/CURLY needed.
2229
2230 */
07be1b83 2231 if (PERL_ENABLE_TRIE_OPTIMISATION) {
3dab1dad 2232 int made=0;
0111c4fd
RGS
2233 if (!re_trie_maxbuff) {
2234 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2235 if (!SvIOK(re_trie_maxbuff))
2236 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2237 }
a3621e74
YO
2238 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2239 regnode *cur;
2240 regnode *first = (regnode *)NULL;
2241 regnode *last = (regnode *)NULL;
2242 regnode *tail = scan;
2243 U8 optype = 0;
2244 U32 count=0;
2245
2246#ifdef DEBUGGING
c445ea15 2247 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2248#endif
2249 /* var tail is used because there may be a TAIL
2250 regop in the way. Ie, the exacts will point to the
2251 thing following the TAIL, but the last branch will
2252 point at the TAIL. So we advance tail. If we
2253 have nested (?:) we may have to move through several
2254 tails.
2255 */
2256
2257 while ( OP( tail ) == TAIL ) {
2258 /* this is the TAIL generated by (?:) */
2259 tail = regnext( tail );
2260 }
2261
3dab1dad 2262
a3621e74 2263 DEBUG_OPTIMISE_r({
32fc9b6a 2264 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2265 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2266 (int)depth * 2 + 2, "",
2267 "Looking for TRIE'able sequences. Tail node is: ",
2268 SvPV_nolen_const( mysv )
a3621e74
YO
2269 );
2270 });
3dab1dad 2271
a3621e74
YO
2272 /*
2273
2274 step through the branches, cur represents each
2275 branch, noper is the first thing to be matched
2276 as part of that branch and noper_next is the
2277 regnext() of that node. if noper is an EXACT
2278 and noper_next is the same as scan (our current
2279 position in the regex) then the EXACT branch is
2280 a possible optimization target. Once we have
2281 two or more consequetive such branches we can
2282 create a trie of the EXACT's contents and stich
2283 it in place. If the sequence represents all of
2284 the branches we eliminate the whole thing and
2285 replace it with a single TRIE. If it is a
2286 subsequence then we need to stitch it in. This
2287 means the first branch has to remain, and needs
2288 to be repointed at the item on the branch chain
2289 following the last branch optimized. This could
2290 be either a BRANCH, in which case the
2291 subsequence is internal, or it could be the
2292 item following the branch sequence in which
2293 case the subsequence is at the end.
2294
2295 */
2296
2297 /* dont use tail as the end marker for this traverse */
2298 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2299 regnode * const noper = NEXTOPER( cur );
2300 regnode * const noper_next = regnext( noper );
a3621e74 2301
a3621e74 2302 DEBUG_OPTIMISE_r({
32fc9b6a 2303 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2304 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2305 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2306
32fc9b6a 2307 regprop(RExC_rx, mysv, noper);
a3621e74 2308 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2309 SvPV_nolen_const(mysv));
a3621e74
YO
2310
2311 if ( noper_next ) {
32fc9b6a 2312 regprop(RExC_rx, mysv, noper_next );
a3621e74 2313 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2314 SvPV_nolen_const(mysv));
a3621e74 2315 }
3dab1dad
YO
2316 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2317 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2318 });
3dab1dad
YO
2319 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2320 : PL_regkind[ OP( noper ) ] == EXACT )
2321 || OP(noper) == NOTHING )
a3621e74
YO
2322 && noper_next == tail && count<U16_MAX)
2323 {
2324 count++;
3dab1dad
YO
2325 if ( !first || optype == NOTHING ) {
2326 if (!first) first = cur;
a3621e74
YO
2327 optype = OP( noper );
2328 } else {
a3621e74 2329 last = cur;
a3621e74
YO
2330 }
2331 } else {
2332 if ( last ) {
3dab1dad 2333 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2334 }
3dab1dad 2335 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2336 && noper_next == tail )
2337 {
2338 count = 1;
2339 first = cur;
2340 optype = OP( noper );
2341 } else {
2342 count = 0;
2343 first = NULL;
2344 optype = 0;
2345 }
2346 last = NULL;
2347 }
2348 }
2349 DEBUG_OPTIMISE_r({
32fc9b6a 2350 regprop(RExC_rx, mysv, cur);
a3621e74 2351 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2352 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2353 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2354
2355 });
2356 if ( last ) {
3dab1dad
YO
2357 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2358#ifdef TRIE_STUDY_OPT
07be1b83
YO
2359 if ( made && startbranch == first ) {
2360 if ( OP(first)!=TRIE )
2361 flags |= SCF_EXACT_TRIE;
2362 else {
2363 regnode *chk=*scanp;
2364 while ( OP( chk ) == OPEN )
2365 chk = regnext( chk );
2366 if (chk==first)
2367 flags |= SCF_EXACT_TRIE;
2368 }
2369 }
3dab1dad 2370#endif
07be1b83 2371 }
a3621e74 2372 }
3dab1dad
YO
2373
2374 } /* do trie */
a0ed51b3 2375 }
a3621e74 2376 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2377 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2378 } else /* single branch is optimized. */
c277df42
IZ
2379 scan = NEXTOPER(scan);
2380 continue;
a0ed51b3
LW
2381 }
2382 else if (OP(scan) == EXACT) {
cd439c50 2383 I32 l = STR_LEN(scan);
c445ea15 2384 UV uc;
a0ed51b3 2385 if (UTF) {
a3b680e6 2386 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2387 l = utf8_length(s, s + l);
9041c2e3 2388 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2389 } else {
2390 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2391 }
2392 min += l;
c277df42 2393 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2394 /* The code below prefers earlier match for fixed
2395 offset, later match for variable offset. */
2396 if (data->last_end == -1) { /* Update the start info. */
2397 data->last_start_min = data->pos_min;
2398 data->last_start_max = is_inf
b81d288d 2399 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2400 }
cd439c50 2401 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2402 if (UTF)
2403 SvUTF8_on(data->last_found);
0eda9292 2404 {
9a957fbc 2405 SV * const sv = data->last_found;
a28509cc 2406 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2407 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2408 if (mg && mg->mg_len >= 0)
5e43f467
JH
2409 mg->mg_len += utf8_length((U8*)STRING(scan),
2410 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2411 }
c277df42
IZ
2412 data->last_end = data->pos_min + l;
2413 data->pos_min += l; /* As in the first entry. */
2414 data->flags &= ~SF_BEFORE_EOL;
2415 }
653099ff
GS
2416 if (flags & SCF_DO_STCLASS_AND) {
2417 /* Check whether it is compatible with what we know already! */
2418 int compat = 1;
2419
1aa99e6b 2420 if (uc >= 0x100 ||
516a5887 2421 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2422 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2423 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2424 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2425 )
653099ff
GS
2426 compat = 0;
2427 ANYOF_CLASS_ZERO(data->start_class);
2428 ANYOF_BITMAP_ZERO(data->start_class);
2429 if (compat)
1aa99e6b 2430 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2431 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2432 if (uc < 0x100)
2433 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2434 }
2435 else if (flags & SCF_DO_STCLASS_OR) {
2436 /* false positive possible if the class is case-folded */
1aa99e6b 2437 if (uc < 0x100)
9b877dbb
IH
2438 ANYOF_BITMAP_SET(data->start_class, uc);
2439 else
2440 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2441 data->start_class->flags &= ~ANYOF_EOS;
2442 cl_and(data->start_class, &and_with);
2443 }
2444 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2445 }
3dab1dad 2446 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2447 I32 l = STR_LEN(scan);
1aa99e6b 2448 UV uc = *((U8*)STRING(scan));
653099ff
GS
2449
2450 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2451 if (flags & SCF_DO_SUBSTR) {
2452 assert(data);
830247a4 2453 scan_commit(pRExC_state, data);
ecaa9b9c 2454 }
a0ed51b3 2455 if (UTF) {
6136c704 2456 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2457 l = utf8_length(s, s + l);
9041c2e3 2458 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2459 }
2460 min += l;
ecaa9b9c 2461 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2462 data->pos_min += l;
653099ff
GS
2463 if (flags & SCF_DO_STCLASS_AND) {
2464 /* Check whether it is compatible with what we know already! */
2465 int compat = 1;
2466
1aa99e6b 2467 if (uc >= 0x100 ||
516a5887 2468 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2469 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2470 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2471 compat = 0;
2472 ANYOF_CLASS_ZERO(data->start_class);
2473 ANYOF_BITMAP_ZERO(data->start_class);
2474 if (compat) {
1aa99e6b 2475 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2476 data->start_class->flags &= ~ANYOF_EOS;
2477 data->start_class->flags |= ANYOF_FOLD;
2478 if (OP(scan) == EXACTFL)
2479 data->start_class->flags |= ANYOF_LOCALE;
2480 }
2481 }
2482 else if (flags & SCF_DO_STCLASS_OR) {
2483 if (data->start_class->flags & ANYOF_FOLD) {
2484 /* false positive possible if the class is case-folded.
2485 Assume that the locale settings are the same... */
1aa99e6b
IH
2486 if (uc < 0x100)
2487 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2488 data->start_class->flags &= ~ANYOF_EOS;
2489 }
2490 cl_and(data->start_class, &and_with);
2491 }
2492 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2493 }
3dab1dad
YO
2494#ifdef TRIE_STUDY_OPT
2495 else if (OP(scan) == TRIE) {
2496 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2497 min += trie->minlen;
07be1b83 2498 delta += (trie->maxlen - trie->minlen);
3dab1dad
YO
2499 flags &= ~SCF_DO_STCLASS; /* xxx */
2500 if (flags & SCF_DO_SUBSTR) {
2501 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2502 data->pos_min += trie->minlen;
07be1b83
YO
2503 data->pos_delta += (trie->maxlen - trie->minlen);
2504 if (trie->maxlen != trie->minlen)
2505 data->longest = &(data->longest_float);
3dab1dad
YO
2506 }
2507 }
2508#endif
bfed75c6 2509 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2510 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2511 I32 f = flags, pos_before = 0;
d4c19fe8 2512 regnode * const oscan = scan;
653099ff
GS
2513 struct regnode_charclass_class this_class;
2514 struct regnode_charclass_class *oclass = NULL;
727f22e3 2515 I32 next_is_eval = 0;
653099ff 2516
3dab1dad 2517 switch (PL_regkind[OP(scan)]) {
653099ff 2518 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2519 scan = NEXTOPER(scan);
2520 goto finish;
2521 case PLUS:
653099ff 2522 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2523 next = NEXTOPER(scan);
653099ff 2524 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2525 mincount = 1;
2526 maxcount = REG_INFTY;
c277df42
IZ
2527 next = regnext(scan);
2528 scan = NEXTOPER(scan);
2529 goto do_curly;
2530 }
2531 }
2532 if (flags & SCF_DO_SUBSTR)
2533 data->pos_min++;
2534 min++;
2535 /* Fall through. */
2536 case STAR:
653099ff
GS
2537 if (flags & SCF_DO_STCLASS) {
2538 mincount = 0;
b81d288d 2539 maxcount = REG_INFTY;
653099ff
GS
2540 next = regnext(scan);
2541 scan = NEXTOPER(scan);
2542 goto do_curly;
2543 }
b81d288d 2544 is_inf = is_inf_internal = 1;
c277df42
IZ
2545 scan = regnext(scan);
2546 if (flags & SCF_DO_SUBSTR) {
830247a4 2547 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2548 data->longest = &(data->longest_float);
2549 }
2550 goto optimize_curly_tail;
2551 case CURLY:
b81d288d 2552 mincount = ARG1(scan);
c277df42
IZ
2553 maxcount = ARG2(scan);
2554 next = regnext(scan);
cb434fcc
IZ
2555 if (OP(scan) == CURLYX) {
2556 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2557 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2558 }
c277df42 2559 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2560 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2561 do_curly:
2562 if (flags & SCF_DO_SUBSTR) {
830247a4 2563 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2564 pos_before = data->pos_min;
2565 }
2566 if (data) {
2567 fl = data->flags;
2568 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2569 if (is_inf)
2570 data->flags |= SF_IS_INF;
2571 }
653099ff 2572 if (flags & SCF_DO_STCLASS) {
830247a4 2573 cl_init(pRExC_state, &this_class);
653099ff
GS
2574 oclass = data->start_class;
2575 data->start_class = &this_class;
2576 f |= SCF_DO_STCLASS_AND;
2577 f &= ~SCF_DO_STCLASS_OR;
2578 }
e1901655
IZ
2579 /* These are the cases when once a subexpression
2580 fails at a particular position, it cannot succeed
2581 even after backtracking at the enclosing scope.
b81d288d 2582
e1901655
IZ
2583 XXXX what if minimal match and we are at the
2584 initial run of {n,m}? */
2585 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2586 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2587
c277df42 2588 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2589 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2590 (mincount == 0
2591 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2592
2593 if (flags & SCF_DO_STCLASS)
2594 data->start_class = oclass;
2595 if (mincount == 0 || minnext == 0) {
2596 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2597 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2598 }
2599 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2600 /* Switch to OR mode: cache the old value of
653099ff
GS
2601 * data->start_class */
2602 StructCopy(data->start_class, &and_with,
2603 struct regnode_charclass_class);
2604 flags &= ~SCF_DO_STCLASS_AND;
2605 StructCopy(&this_class, data->start_class,
2606 struct regnode_charclass_class);
2607 flags |= SCF_DO_STCLASS_OR;
2608 data->start_class->flags |= ANYOF_EOS;
2609 }
2610 } else { /* Non-zero len */
2611 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2612 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2613 cl_and(data->start_class, &and_with);
2614 }
2615 else if (flags & SCF_DO_STCLASS_AND)
2616 cl_and(data->start_class, &this_class);
2617 flags &= ~SCF_DO_STCLASS;
2618 }
c277df42
IZ
2619 if (!scan) /* It was not CURLYX, but CURLY. */
2620 scan = next;
041457d9
DM
2621 if ( /* ? quantifier ok, except for (?{ ... }) */
2622 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2623 && (minnext == 0) && (deltanext == 0)
99799961 2624 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2625 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2626 && ckWARN(WARN_REGEXP))
b45f050a 2627 {
830247a4 2628 vWARN(RExC_parse,
b45f050a
JF
2629 "Quantifier unexpected on zero-length expression");
2630 }
2631
c277df42 2632 min += minnext * mincount;
b81d288d 2633 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2634 && (minnext + deltanext) > 0)
2635 || deltanext == I32_MAX);
aca2d497 2636 is_inf |= is_inf_internal;
c277df42
IZ
2637 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2638
2639 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2640 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2641 && data->flags & SF_IN_PAR
2642 && !(data->flags & SF_HAS_EVAL)
2643 && !deltanext && minnext == 1 ) {
2644 /* Try to optimize to CURLYN. */
2645 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2646 regnode * const nxt1 = nxt;
497b47a8
JH
2647#ifdef DEBUGGING
2648 regnode *nxt2;
2649#endif
c277df42
IZ
2650
2651 /* Skip open. */
2652 nxt = regnext(nxt);
bfed75c6 2653 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2654 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2655 && STR_LEN(nxt) == 1))
c277df42 2656 goto nogo;
497b47a8 2657#ifdef DEBUGGING
c277df42 2658 nxt2 = nxt;
497b47a8 2659#endif
c277df42 2660 nxt = regnext(nxt);
b81d288d 2661 if (OP(nxt) != CLOSE)
c277df42
IZ
2662 goto nogo;
2663 /* Now we know that nxt2 is the only contents: */
eb160463 2664 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2665 OP(oscan) = CURLYN;
2666 OP(nxt1) = NOTHING; /* was OPEN. */
2667#ifdef DEBUGGING
2668 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2669 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2670 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2671 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2672 OP(nxt + 1) = OPTIMIZED; /* was count. */
2673 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2674#endif
c277df42 2675 }
c277df42
IZ
2676 nogo:
2677
2678 /* Try optimization CURLYX => CURLYM. */
b81d288d 2679 if ( OP(oscan) == CURLYX && data
c277df42 2680 && !(data->flags & SF_HAS_PAR)
c277df42 2681 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2682 && !deltanext /* atom is fixed width */
2683 && minnext != 0 /* CURLYM can't handle zero width */
2684 ) {
c277df42
IZ
2685 /* XXXX How to optimize if data == 0? */
2686 /* Optimize to a simpler form. */
2687 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2688 regnode *nxt2;
2689
2690 OP(oscan) = CURLYM;
2691 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2692 && (OP(nxt2) != WHILEM))
c277df42
IZ
2693 nxt = nxt2;
2694 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2695 /* Need to optimize away parenths. */
2696 if (data->flags & SF_IN_PAR) {
2697 /* Set the parenth number. */
2698 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2699
b81d288d 2700 if (OP(nxt) != CLOSE)
b45f050a 2701 FAIL("Panic opt close");
eb160463 2702 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2703 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2704 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2705#ifdef DEBUGGING
2706 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2707 OP(nxt + 1) = OPTIMIZED; /* was count. */
2708 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2709 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2710#endif
c277df42
IZ
2711#if 0
2712 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2713 regnode *nnxt = regnext(nxt1);
b81d288d 2714
c277df42
IZ
2715 if (nnxt == nxt) {
2716 if (reg_off_by_arg[OP(nxt1)])
2717 ARG_SET(nxt1, nxt2 - nxt1);
2718 else if (nxt2 - nxt1 < U16_MAX)
2719 NEXT_OFF(nxt1) = nxt2 - nxt1;
2720 else
2721 OP(nxt) = NOTHING; /* Cannot beautify */
2722 }
2723 nxt1 = nnxt;
2724 }
2725#endif
2726 /* Optimize again: */
b81d288d 2727 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2728 NULL, 0,depth+1);
a0ed51b3
LW
2729 }
2730 else
c277df42 2731 oscan->flags = 0;
c277df42 2732 }
e1901655
IZ
2733 else if ((OP(oscan) == CURLYX)
2734 && (flags & SCF_WHILEM_VISITED_POS)
2735 /* See the comment on a similar expression above.
2736 However, this time it not a subexpression
2737 we care about, but the expression itself. */
2738 && (maxcount == REG_INFTY)
2739 && data && ++data->whilem_c < 16) {
2740 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2741 /* Find WHILEM (as in regexec.c) */
2742 regnode *nxt = oscan + NEXT_OFF(oscan);
2743
2744 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2745 nxt += ARG(nxt);
eb160463
GS
2746 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2747 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2748 }
b81d288d 2749 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2750 pars++;
2751 if (flags & SCF_DO_SUBSTR) {
c445ea15 2752 SV *last_str = NULL;
c277df42
IZ
2753 int counted = mincount != 0;
2754
2755 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2756#if defined(SPARC64_GCC_WORKAROUND)
2757 I32 b = 0;
2758 STRLEN l = 0;
cfd0369c 2759 const char *s = NULL;
5d1c421c
JH
2760 I32 old = 0;
2761
2762 if (pos_before >= data->last_start_min)
2763 b = pos_before;
2764 else
2765 b = data->last_start_min;
2766
2767 l = 0;
cfd0369c 2768 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2769 old = b - data->last_start_min;
2770
2771#else
b81d288d 2772 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2773 ? pos_before : data->last_start_min;
2774 STRLEN l;
d4c19fe8 2775 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2776 I32 old = b - data->last_start_min;
5d1c421c 2777#endif
a0ed51b3
LW
2778
2779 if (UTF)
2780 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2781
a0ed51b3 2782 l -= old;
c277df42 2783 /* Get the added string: */
79cb57f6 2784 last_str = newSVpvn(s + old, l);
0e933229
IH
2785 if (UTF)
2786 SvUTF8_on(last_str);
c277df42
IZ
2787 if (deltanext == 0 && pos_before == b) {
2788 /* What was added is a constant string */
2789 if (mincount > 1) {
2790 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2791 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2792 SvPVX_const(last_str), l, mincount - 1);
b162af07 2793 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2794 /* Add additional parts. */
b81d288d 2795 SvCUR_set(data->last_found,
c277df42
IZ
2796 SvCUR(data->last_found) - l);
2797 sv_catsv(data->last_found, last_str);
0eda9292
JH
2798 {
2799 SV * sv = data->last_found;
2800 MAGIC *mg =
2801 SvUTF8(sv) && SvMAGICAL(sv) ?
2802 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2803 if (mg && mg->mg_len >= 0)
2804 mg->mg_len += CHR_SVLEN(last_str);
2805 }
c277df42
IZ
2806 data->last_end += l * (mincount - 1);
2807 }
2a8d9689
HS
2808 } else {
2809 /* start offset must point into the last copy */
2810 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2811 data->last_start_max += is_inf ? I32_MAX
2812 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2813 }
2814 }
2815 /* It is counted once already... */
2816 data->pos_min += minnext * (mincount - counted);
2817 data->pos_delta += - counted * deltanext +
2818 (minnext + deltanext) * maxcount - minnext * mincount;
2819 if (mincount != maxcount) {
653099ff
GS
2820 /* Cannot extend fixed substrings found inside
2821 the group. */
830247a4 2822 scan_commit(pRExC_state,data);
c277df42 2823 if (mincount && last_str) {
d4c19fe8
AL
2824 SV * const sv = data->last_found;
2825 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2826 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2827
2828 if (mg)
2829 mg->mg_len = -1;
2830 sv_setsv(sv, last_str);
c277df42 2831 data->last_end = data->pos_min;
b81d288d 2832 data->last_start_min =
a0ed51b3 2833 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2834 data->last_start_max = is_inf
2835 ? I32_MAX
c277df42 2836 : data->pos_min + data->pos_delta
a0ed51b3 2837 - CHR_SVLEN(last_str);
c277df42
IZ
2838 }
2839 data->longest = &(data->longest_float);
2840 }
aca2d497 2841 SvREFCNT_dec(last_str);
c277df42 2842 }
405ff068 2843 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2844 data->flags |= SF_HAS_EVAL;
2845 optimize_curly_tail:
c277df42 2846 if (OP(oscan) != CURLYX) {
3dab1dad 2847 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2848 && NEXT_OFF(next))
2849 NEXT_OFF(oscan) += NEXT_OFF(next);
2850 }
c277df42 2851 continue;
653099ff 2852 default: /* REF and CLUMP only? */
c277df42 2853 if (flags & SCF_DO_SUBSTR) {
830247a4 2854 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2855 data->longest = &(data->longest_float);
2856 }
aca2d497 2857 is_inf = is_inf_internal = 1;
653099ff 2858 if (flags & SCF_DO_STCLASS_OR)
830247a4 2859 cl_anything(pRExC_state, data->start_class);
653099ff 2860 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2861 break;
2862 }
a0ed51b3 2863 }
bfed75c6 2864 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2865 int value = 0;
653099ff 2866
c277df42 2867 if (flags & SCF_DO_SUBSTR) {
830247a4 2868 scan_commit(pRExC_state,data);
c277df42
IZ
2869 data->pos_min++;
2870 }
2871 min++;
653099ff
GS
2872 if (flags & SCF_DO_STCLASS) {
2873 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2874
2875 /* Some of the logic below assumes that switching
2876 locale on will only add false positives. */
3dab1dad 2877 switch (PL_regkind[OP(scan)]) {
653099ff 2878 case SANY:
653099ff
GS
2879 default:
2880 do_default:
2881 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2882 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2883 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2884 break;
2885 case REG_ANY:
2886 if (OP(scan) == SANY)
2887 goto do_default;
2888 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2889 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2890 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2891 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2892 }
2893 if (flags & SCF_DO_STCLASS_AND || !value)
2894 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2895 break;
2896 case ANYOF:
2897 if (flags & SCF_DO_STCLASS_AND)
2898 cl_and(data->start_class,
2899 (struct regnode_charclass_class*)scan);
2900 else
830247a4 2901 cl_or(pRExC_state, data->start_class,
653099ff
GS
2902 (struct regnode_charclass_class*)scan);
2903 break;
2904 case ALNUM:
2905 if (flags & SCF_DO_STCLASS_AND) {
2906 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2907 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2908 for (value = 0; value < 256; value++)
2909 if (!isALNUM(value))
2910 ANYOF_BITMAP_CLEAR(data->start_class, value);
2911 }
2912 }
2913 else {
2914 if (data->start_class->flags & ANYOF_LOCALE)
2915 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2916 else {
2917 for (value = 0; value < 256; value++)
2918 if (isALNUM(value))
b81d288d 2919 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2920 }
2921 }
2922 break;
2923 case ALNUML:
2924 if (flags & SCF_DO_STCLASS_AND) {
2925 if (data->start_class->flags & ANYOF_LOCALE)
2926 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2927 }
2928 else {
2929 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2930 data->start_class->flags |= ANYOF_LOCALE;
2931 }
2932 break;
2933 case NALNUM:
2934 if (flags & SCF_DO_STCLASS_AND) {
2935 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2936 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2937 for (value = 0; value < 256; value++)
2938 if (isALNUM(value))
2939 ANYOF_BITMAP_CLEAR(data->start_class, value);
2940 }
2941 }
2942 else {
2943 if (data->start_class->flags & ANYOF_LOCALE)
2944 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2945 else {
2946 for (value = 0; value < 256; value++)
2947 if (!isALNUM(value))
b81d288d 2948 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2949 }
2950 }
2951 break;
2952 case NALNUML:
2953 if (flags & SCF_DO_STCLASS_AND) {
2954 if (data->start_class->flags & ANYOF_LOCALE)
2955 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2956 }
2957 else {
2958 data->start_class->flags |= ANYOF_LOCALE;
2959 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2960 }
2961 break;
2962 case SPACE:
2963 if (flags & SCF_DO_STCLASS_AND) {
2964 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2965 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2966 for (value = 0; value < 256; value++)
2967 if (!isSPACE(value))
2968 ANYOF_BITMAP_CLEAR(data->start_class, value);
2969 }
2970 }
2971 else {
2972 if (data->start_class->flags & ANYOF_LOCALE)
2973 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2974 else {
2975 for (value = 0; value < 256; value++)
2976 if (isSPACE(value))
b81d288d 2977 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2978 }
2979 }
2980 break;
2981 case SPACEL:
2982 if (flags & SCF_DO_STCLASS_AND) {
2983 if (data->start_class->flags & ANYOF_LOCALE)
2984 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2985 }
2986 else {
2987 data->start_class->flags |= ANYOF_LOCALE;
2988 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2989 }
2990 break;
2991 case NSPACE:
2992 if (flags & SCF_DO_STCLASS_AND) {
2993 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2994 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2995 for (value = 0; value < 256; value++)
2996 if (isSPACE(value))
2997 ANYOF_BITMAP_CLEAR(data->start_class, value);
2998 }
2999 }
3000 else {
3001 if (data->start_class->flags & ANYOF_LOCALE)
3002 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3003 else {
3004 for (value = 0; value < 256; value++)
3005 if (!isSPACE(value))
b81d288d 3006 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3007 }
3008 }
3009 break;
3010 case NSPACEL:
3011 if (flags & SCF_DO_STCLASS_AND) {
3012 if (data->start_class->flags & ANYOF_LOCALE) {
3013 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3014 for (value = 0; value < 256; value++)
3015 if (!isSPACE(value))
3016 ANYOF_BITMAP_CLEAR(data->start_class, value);
3017 }
3018 }
3019 else {
3020 data->start_class->flags |= ANYOF_LOCALE;
3021 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3022 }
3023 break;
3024 case DIGIT:
3025 if (flags & SCF_DO_STCLASS_AND) {
3026 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3027 for (value = 0; value < 256; value++)
3028 if (!isDIGIT(value))
3029 ANYOF_BITMAP_CLEAR(data->start_class, value);
3030 }
3031 else {
3032 if (data->start_class->flags & ANYOF_LOCALE)
3033 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3034 else {
3035 for (value = 0; value < 256; value++)
3036 if (isDIGIT(value))
b81d288d 3037 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3038 }
3039 }
3040 break;
3041 case NDIGIT:
3042 if (flags & SCF_DO_STCLASS_AND) {
3043 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3044 for (value = 0; value < 256; value++)
3045 if (isDIGIT(value))
3046 ANYOF_BITMAP_CLEAR(data->start_class, value);
3047 }
3048 else {
3049 if (data->start_class->flags & ANYOF_LOCALE)
3050 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3051 else {
3052 for (value = 0; value < 256; value++)
3053 if (!isDIGIT(value))
b81d288d 3054 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3055 }
3056 }
3057 break;
3058 }
3059 if (flags & SCF_DO_STCLASS_OR)
3060 cl_and(data->start_class, &and_with);
3061 flags &= ~SCF_DO_STCLASS;
3062 }
a0ed51b3 3063 }
3dab1dad 3064 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3065 data->flags |= (OP(scan) == MEOL
3066 ? SF_BEFORE_MEOL
3067 : SF_BEFORE_SEOL);
a0ed51b3 3068 }
3dab1dad 3069 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3070 /* Lookbehind, or need to calculate parens/evals/stclass: */
3071 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3072 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3073 /* Lookahead/lookbehind */
cb434fcc 3074 I32 deltanext, minnext, fake = 0;
c277df42 3075 regnode *nscan;
653099ff
GS
3076 struct regnode_charclass_class intrnl;
3077 int f = 0;
c277df42
IZ
3078
3079 data_fake.flags = 0;
b81d288d 3080 if (data) {
2c2d71f5 3081 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3082 data_fake.last_closep = data->last_closep;
3083 }
3084 else
3085 data_fake.last_closep = &fake;
653099ff
GS
3086 if ( flags & SCF_DO_STCLASS && !scan->flags
3087 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3088 cl_init(pRExC_state, &intrnl);
653099ff 3089 data_fake.start_class = &intrnl;
e1901655 3090 f |= SCF_DO_STCLASS_AND;
653099ff 3091 }
e1901655
IZ
3092 if (flags & SCF_WHILEM_VISITED_POS)
3093 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3094 next = regnext(scan);
3095 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3096 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3097 if (scan->flags) {
3098 if (deltanext) {
9baa0206 3099 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
3100 }
3101 else if (minnext > U8_MAX) {
9baa0206 3102 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3103 }
eb160463 3104 scan->flags = (U8)minnext;
c277df42 3105 }
be8e71aa
YO
3106 if (data) {
3107 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3108 pars++;
3109 if (data_fake.flags & SF_HAS_EVAL)
3110 data->flags |= SF_HAS_EVAL;
2c2d71f5 3111 data->whilem_c = data_fake.whilem_c;
be8e71aa 3112 }
e1901655 3113 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3114 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3115
3116 cl_and(data->start_class, &intrnl);
3117 if (was)
3118 data->start_class->flags |= ANYOF_EOS;
3119 }
a0ed51b3
LW
3120 }
3121 else if (OP(scan) == OPEN) {
c277df42 3122 pars++;
a0ed51b3 3123 }
cb434fcc 3124 else if (OP(scan) == CLOSE) {
eb160463 3125 if ((I32)ARG(scan) == is_par) {
cb434fcc 3126 next = regnext(scan);
c277df42 3127
cb434fcc
IZ
3128 if ( next && (OP(next) != WHILEM) && next < last)
3129 is_par = 0; /* Disable optimization */
3130 }
3131 if (data)
3132 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3133 }
3134 else if (OP(scan) == EVAL) {
c277df42
IZ
3135 if (data)
3136 data->flags |= SF_HAS_EVAL;
3137 }
96776eda 3138 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3139 if (flags & SCF_DO_SUBSTR) {
830247a4 3140 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3141 data->longest = &(data->longest_float);
3142 }
3143 is_inf = is_inf_internal = 1;
653099ff 3144 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3145 cl_anything(pRExC_state, data->start_class);
96776eda 3146 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3147 }
c277df42
IZ
3148 /* Else: zero-length, ignore. */
3149 scan = regnext(scan);
3150 }
3151
3152 finish:
3153 *scanp = scan;
aca2d497 3154 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3155 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
3156 data->pos_delta = I32_MAX - data->pos_min;
3157 if (is_par > U8_MAX)
3158 is_par = 0;
3159 if (is_par && pars==1 && data) {
3160 data->flags |= SF_IN_PAR;
3161 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3162 }
3163 else if (pars && data) {
c277df42
IZ
3164 data->flags |= SF_HAS_PAR;
3165 data->flags &= ~SF_IN_PAR;
3166 }
653099ff
GS
3167 if (flags & SCF_DO_STCLASS_OR)
3168 cl_and(data->start_class, &and_with);
07be1b83
YO
3169 if (flags & SCF_EXACT_TRIE)
3170 data->flags |= SCF_EXACT_TRIE;
c277df42
IZ
3171 return min;
3172}
3173
76e3520e 3174STATIC I32
5f66b61c 3175S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3176{
830247a4 3177 if (RExC_rx->data) {
b81d288d
AB
3178 Renewc(RExC_rx->data,
3179 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3180 char, struct reg_data);
830247a4
IZ
3181 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3182 RExC_rx->data->count += n;
a0ed51b3
LW
3183 }
3184 else {
a02a5408 3185 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3186 char, struct reg_data);
a02a5408 3187 Newx(RExC_rx->data->what, n, U8);
830247a4 3188 RExC_rx->data->count = n;
c277df42 3189 }
830247a4
IZ
3190 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3191 return RExC_rx->data->count - n;
c277df42
IZ
3192}
3193
76234dfb 3194#ifndef PERL_IN_XSUB_RE
d88dccdf 3195void
864dbfa3 3196Perl_reginitcolors(pTHX)
d88dccdf 3197{
97aff369 3198 dVAR;
1df70142 3199 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3200 if (s) {
1df70142
AL
3201 char *t = savepv(s);
3202 int i = 0;
3203 PL_colors[0] = t;
d88dccdf 3204 while (++i < 6) {
1df70142
AL
3205 t = strchr(t, '\t');
3206 if (t) {
3207 *t = '\0';
3208 PL_colors[i] = ++t;
d88dccdf
IZ
3209 }
3210 else
1df70142 3211 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3212 }
3213 } else {
1df70142 3214 int i = 0;
b81d288d 3215 while (i < 6)
06b5626a 3216 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3217 }
3218 PL_colorset = 1;
3219}
76234dfb 3220#endif
8615cb43 3221
07be1b83 3222
a687059c 3223/*
e50aee73 3224 - pregcomp - compile a regular expression into internal code
a687059c
LW
3225 *
3226 * We can't allocate space until we know how big the compiled form will be,
3227 * but we can't compile it (and thus know how big it is) until we've got a
3228 * place to put the code. So we cheat: we compile it twice, once with code
3229 * generation turned off and size counting turned on, and once "for real".
3230 * This also means that we don't allocate space until we are sure that the
3231 * thing really will compile successfully, and we never have to move the
3232 * code and thus invalidate pointers into it. (Note that it has to be in
3233 * one piece because free() must be able to free it all.) [NB: not true in perl]
3234 *
3235 * Beware that the optimization-preparation code in here knows about some
3236 * of the structure of the compiled regexp. [I'll say.]
3237 */
3238regexp *
864dbfa3 3239Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3240{
97aff369 3241 dVAR;
a0d0e21e 3242 register regexp *r;
c277df42 3243 regnode *scan;
c277df42 3244 regnode *first;
a0d0e21e 3245 I32 flags;
a0d0e21e
LW
3246 I32 minlen = 0;
3247 I32 sawplus = 0;
3248 I32 sawopen = 0;
2c2d71f5 3249 scan_data_t data;
830247a4 3250 RExC_state_t RExC_state;
be8e71aa 3251 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3252#ifdef TRIE_STUDY_OPT
3253 int restudied= 0;
3254 RExC_state_t copyRExC_state;
3255#endif
a0d0e21e 3256
a3621e74
YO
3257 GET_RE_DEBUG_FLAGS_DECL;
3258
a0d0e21e 3259 if (exp == NULL)
c277df42 3260 FAIL("NULL regexp argument");
a0d0e21e 3261
a5961de5 3262 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3263
5cfc7842 3264 RExC_precomp = exp;
a3621e74
YO
3265 DEBUG_r(if (!PL_colorset) reginitcolors());
3266 DEBUG_COMPILE_r({
ab3bbdeb
YO
3267 SV *dsv= sv_newmortal();
3268 RE_PV_QUOTED_DECL(s, RExC_utf8,
3269 dsv, RExC_precomp, (xend - exp), 60);
3270 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3271 PL_colors[4],PL_colors[5],s);
a5961de5 3272 });
e2509266 3273 RExC_flags = pm->op_pmflags;
830247a4 3274 RExC_sawback = 0;
bbce6d69 3275
830247a4
IZ
3276 RExC_seen = 0;
3277 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3278 RExC_seen_evals = 0;
3279 RExC_extralen = 0;
c277df42 3280
bbce6d69 3281 /* First pass: determine size, legality. */
830247a4 3282 RExC_parse = exp;
fac92740 3283 RExC_start = exp;
830247a4
IZ
3284 RExC_end = xend;
3285 RExC_naughty = 0;
3286 RExC_npar = 1;
3287 RExC_size = 0L;
3288 RExC_emit = &PL_regdummy;
3289 RExC_whilem_seen = 0;
85ddcde9
JH
3290#if 0 /* REGC() is (currently) a NOP at the first pass.
3291 * Clever compilers notice this and complain. --jhi */
830247a4 3292 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3293#endif
3dab1dad
YO
3294 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3295 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3296 RExC_precomp = NULL;
a0d0e21e
LW
3297 return(NULL);
3298 }
3dab1dad
YO
3299 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3300 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3301 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3302 DEBUG_PARSE_r({
3303 RExC_lastnum=0;
3304 RExC_lastparse=NULL;
3305 });
c277df42 3306
07be1b83 3307
c277df42
IZ
3308 /* Small enough for pointer-storage convention?
3309 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3310 if (RExC_size >= 0x10000L && RExC_extralen)
3311 RExC_size += RExC_extralen;
c277df42 3312 else
830247a4
IZ
3313 RExC_extralen = 0;
3314 if (RExC_whilem_seen > 15)
3315 RExC_whilem_seen = 15;
a0d0e21e 3316
bbce6d69 3317 /* Allocate space and initialize. */
a02a5408 3318 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3319 char, regexp);
a0d0e21e 3320 if (r == NULL)
b45f050a
JF
3321 FAIL("Regexp out of space");
3322
0f79a09d
GS
3323#ifdef DEBUGGING
3324 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3325 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3326#endif
c277df42 3327 r->refcnt = 1;
bbce6d69 3328 r->prelen = xend - exp;
5cfc7842 3329 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3330 r->subbeg = NULL;
f8c7b90f 3331#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3332 r->saved_copy = NULL;
ed252734 3333#endif
cf93c79d 3334 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3335 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3336 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3337
3338 r->substrs = 0; /* Useful during FAIL. */
3339 r->startp = 0; /* Useful during FAIL. */
3340 r->endp = 0; /* Useful during FAIL. */
3341
a02a5408 3342 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3343 if (r->offsets) {
2af232bd 3344 r->offsets[0] = RExC_size;
fac92740 3345 }
a3621e74 3346 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3347 "%s %"UVuf" bytes for offset annotations.\n",
3348 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3349 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3350
830247a4 3351 RExC_rx = r;
bbce6d69 3352
3353 /* Second pass: emit code. */
e2509266 3354 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3355 RExC_parse = exp;
3356 RExC_end = xend;
3357 RExC_naughty = 0;
3358 RExC_npar = 1;
fac92740 3359 RExC_emit_start = r->program;
830247a4 3360 RExC_emit = r->program;
2cd61cdb 3361 /* Store the count of eval-groups for security checks: */
eb160463 3362 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3363 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3364 r->data = 0;
3dab1dad 3365 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3366 return(NULL);
07be1b83
YO
3367 /* XXXX To minimize changes to RE engine we always allocate
3368 3-units-long substrs field. */
3369 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3370
07be1b83
YO
3371reStudy:
3372 Zero(r->substrs, 1, struct reg_substr_data);
3373 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3374
07be1b83
YO
3375#ifdef TRIE_STUDY_OPT
3376 if ( restudied ) {
3377 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3378 RExC_state=copyRExC_state;
3379 if (data.longest_fixed)
3380 SvREFCNT_dec(data.longest_fixed);
3381 if (data.longest_float)
3382 SvREFCNT_dec(data.longest_float);
3383 if (data.last_found)
3384 SvREFCNT_dec(data.last_found);
3385 } else {
3386 copyRExC_state=RExC_state;
3387 }
3388#endif
a0d0e21e 3389 /* Dig out information for optimizations. */
cf93c79d 3390 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3391 pm->op_pmflags = RExC_flags;
a0ed51b3 3392 if (UTF)
5ff6fc6d 3393 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3394 r->regstclass = NULL;
830247a4 3395 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3396 r->reganch |= ROPT_NAUGHTY;
c277df42 3397 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3398
653099ff 3399 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3400 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3401 I32 fake;
c5254dd6 3402 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3403 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3404 int stclass_flag;
07be1b83 3405 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3406
3407 first = scan;
c277df42 3408 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3409 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3410 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3411 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3412 /* for now we can't handle lookbehind IFMATCH*/
3413 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3414 (OP(first) == PLUS) ||
3415 (OP(first) == MINMOD) ||
653099ff 3416 /* An {n,m} with n>0 */
07be1b83
YO
3417 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3418 {
3419 DEBUG_PEEP("first:",first,0);
a0d0e21e
LW
3420 if (OP(first) == PLUS)
3421 sawplus = 1;
3422 else
3dab1dad 3423 first += regarglen[OP(first)];
07be1b83
YO
3424 if (OP(first) == IFMATCH) {
3425 first = NEXTOPER(first);
3426 first += EXTRA_STEP_2ARGS;
7c167cea 3427 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3428 first = NEXTOPER(first);
a687059c
LW
3429 }
3430
a0d0e21e
LW
3431 /* Starting-point info. */
3432 again:
07be1b83 3433 /* Ignore EXACT as we deal with it later. */
3dab1dad 3434 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3435 if (OP(first) == EXACT)
6f207bd3 3436 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3437 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3438 r->regstclass = first;
3439 }
07be1b83
YO
3440#ifdef TRIE_STCLASS
3441 else if (OP(first) == TRIE &&
3442 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3443 {
3444 /* this can happen only on restudy */
3445 struct regnode_1 *trie_op;
3446 Newxz(trie_op,1,struct regnode_1);
3447 StructCopy(first,trie_op,struct regnode_1);
3448 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3449 r->regstclass = (regnode *)trie_op;
3450 }
3451#endif
bfed75c6 3452 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3453 r->regstclass = first;
3dab1dad
YO
3454 else if (PL_regkind[OP(first)] == BOUND ||
3455 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3456 r->regstclass = first;
3dab1dad 3457 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3458 r->reganch |= (OP(first) == MBOL
3459 ? ROPT_ANCH_MBOL
3460 : (OP(first) == SBOL
3461 ? ROPT_ANCH_SBOL
3462 : ROPT_ANCH_BOL));
a0d0e21e 3463 first = NEXTOPER(first);
774d564b 3464 goto again;
3465 }
3466 else if (OP(first) == GPOS) {
3467 r->reganch |= ROPT_ANCH_GPOS;
3468 first = NEXTOPER(first);
3469 goto again;
a0d0e21e 3470 }
e09294f4 3471 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3472 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3473 !(r->reganch & ROPT_ANCH) )
3474 {
3475 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3476 const int type =
3477 (OP(NEXTOPER(first)) == REG_ANY)
3478 ? ROPT_ANCH_MBOL
3479 : ROPT_ANCH_SBOL;
cad2e5aa 3480 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3481 first = NEXTOPER(first);
774d564b 3482 goto again;
a0d0e21e 3483 }
b81d288d 3484 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3485 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3486 /* x+ must match at the 1st pos of run of x's */
3487 r->reganch |= ROPT_SKIP;
a0d0e21e 3488
c277df42 3489 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3490#ifdef TRIE_STUDY_OPT
3491 DEBUG_COMPILE_r(
3492 if (!restudied)
3493 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3494 (IV)(first - scan + 1))
3495 );
3496#else
3497 DEBUG_COMPILE_r(
3498 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3499 (IV)(first - scan + 1))
3500 );
3501#endif
3502
3503
a0d0e21e
LW
3504 /*
3505 * If there's something expensive in the r.e., find the
3506 * longest literal string that must appear and make it the
3507 * regmust. Resolve ties in favor of later strings, since
3508 * the regstart check works with the beginning of the r.e.
3509 * and avoiding duplication strengthens checking. Not a
3510 * strong reason, but sufficient in the absence of others.
3511 * [Now we resolve ties in favor of the earlier string if
c277df42 3512 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3513 * earlier string may buy us something the later one won't.]
3514 */
a0d0e21e 3515 minlen = 0;
a687059c 3516
396482e1
GA
3517 data.longest_fixed = newSVpvs("");
3518 data.longest_float = newSVpvs("");
3519 data.last_found = newSVpvs("");
c277df42
IZ
3520 data.longest = &(data.longest_fixed);
3521 first = scan;
653099ff 3522 if (!r->regstclass) {
830247a4 3523 cl_init(pRExC_state, &ch_class);
653099ff
GS
3524 data.start_class = &ch_class;
3525 stclass_flag = SCF_DO_STCLASS_AND;
3526 } else /* XXXX Check for BOUND? */
3527 stclass_flag = 0;
cb434fcc 3528 data.last_closep = &last_close;
653099ff 3529
830247a4 3530 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3531 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83
YO
3532
3533#ifdef TRIE_STUDY_OPT
3534 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3535 goto reStudy;
3536 }
3537#endif
3538
830247a4 3539 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3540 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3541 && !RExC_seen_zerolen
3542 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3543 r->reganch |= ROPT_CHECK_ALL;
830247a4 3544 scan_commit(pRExC_state, &data);
c277df42
IZ
3545 SvREFCNT_dec(data.last_found);
3546
a0ed51b3 3547 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3548 if (longest_float_length
c277df42
IZ
3549 || (data.flags & SF_FL_BEFORE_EOL
3550 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3551 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3552 int t;
3553
a0ed51b3 3554 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3555 && data.offset_fixed == data.offset_float_min
3556 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3557 goto remove_float; /* As in (a)+. */
3558
33b8afdf
JH
3559 if (SvUTF8(data.longest_float)) {
3560 r->float_utf8 = data.longest_float;
c445ea15 3561 r->float_substr = NULL;
33b8afdf
JH
3562 } else {
3563 r->float_substr = data.longest_float;
c445ea15 3564 r->float_utf8 = NULL;
33b8afdf 3565 }
c277df42
IZ
3566 r->float_min_offset = data.offset_float_min;
3567 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3568 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3569 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3570 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3571 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3572 }
3573 else {
aca2d497 3574 remove_float:
c445ea15 3575 r->float_substr = r->float_utf8 = NULL;
c277df42 3576 SvREFCNT_dec(data.longest_float);
c5254dd6 3577 longest_float_length = 0;
a0d0e21e 3578 }
c277df42 3579
a0ed51b3 3580 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3581 if (longest_fixed_length
c277df42
IZ
3582 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3583 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3584 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3585 int t;
3586
33b8afdf
JH
3587 if (SvUTF8(data.longest_fixed)) {
3588 r->anchored_utf8 = data.longest_fixed;
c445ea15 3589 r->anchored_substr = NULL;
33b8afdf
JH
3590 } else {
3591 r->anchored_substr = data.longest_fixed;
c445ea15 3592 r->anchored_utf8 = NULL;
33b8afdf 3593 }
c277df42 3594 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3595 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3596 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3597 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3598 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3599 }
3600 else {
c445ea15 3601 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3602 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3603 longest_fixed_length = 0;
a0d0e21e 3604 }
b81d288d 3605 if (r->regstclass
ffc61ed2 3606 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3607 r->regstclass = NULL;
33b8afdf
JH
3608 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3609 && stclass_flag
653099ff 3610 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3611 && !cl_is_anything(data.start_class))
3612 {
1df70142 3613 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3614
a02a5408 3615 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3616 struct regnode_charclass_class);
3617 StructCopy(data.start_class,
830247a4 3618 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3619 struct regnode_charclass_class);
830247a4 3620 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3621 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3622 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3623 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3624 PerlIO_printf(Perl_debug_log,
a0288114 3625 "synthetic stclass \"%s\".\n",
3f7c398e 3626 SvPVX_const(sv));});
653099ff 3627 }
c277df42
IZ
3628
3629 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3630 if (longest_fixed_length > longest_float_length) {
c277df42 3631 r->check_substr = r->anchored_substr;
33b8afdf 3632 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3633 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3634 if (r->reganch & ROPT_ANCH_SINGLE)
3635 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3636 }
3637 else {
c277df42 3638 r->check_substr = r->float_substr;
33b8afdf 3639 r->check_utf8 = r->float_utf8;
c277df42
IZ
3640 r->check_offset_min = data.offset_float_min;
3641 r->check_offset_max = data.offset_float_max;
a0d0e21e 3642 }
30382c73
IZ
3643 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3644 This should be changed ASAP! */
33b8afdf 3645 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3646 r->reganch |= RE_USE_INTUIT;
33b8afdf 3647 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3648 r->reganch |= RE_INTUIT_TAIL;
3649 }
a0ed51b3
LW
3650 }
3651 else {
c277df42
IZ
3652 /* Several toplevels. Best we can is to set minlen. */
3653 I32 fake;
653099ff 3654 struct regnode_charclass_class ch_class;
cb434fcc 3655 I32 last_close = 0;
c277df42 3656
a3621e74 3657 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3658
c277df42 3659 scan = r->program + 1;
830247a4 3660 cl_init(pRExC_state, &ch_class);
653099ff 3661 data.start_class = &ch_class;
cb434fcc 3662 data.last_closep = &last_close;
07be1b83
YO
3663
3664 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3665 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3666
3667#ifdef TRIE_STUDY_OPT
3668 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3669 goto reStudy;
3670 }
3671#endif
3672
33b8afdf 3673 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3674 = r->float_substr = r->float_utf8 = NULL;
653099ff 3675 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3676 && !cl_is_anything(data.start_class))
3677 {
1df70142 3678 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3679
a02a5408 3680 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3681 struct regnode_charclass_class);
3682 StructCopy(data.start_class,
830247a4 3683 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3684 struct regnode_charclass_class);
830247a4 3685 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3686 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3687 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3688 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3689 PerlIO_printf(Perl_debug_log,
a0288114 3690 "synthetic stclass \"%s\".\n",
3f7c398e 3691 SvPVX_const(sv));});
653099ff 3692 }
a0d0e21e
LW
3693 }
3694
a0d0e21e 3695 r->minlen = minlen;
b81d288d 3696 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3697 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3698 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3699 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3700 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3701 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3702 if (RExC_seen & REG_SEEN_CANY)
3703 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3704 Newxz(r->startp, RExC_npar, I32);
3705 Newxz(r->endp, RExC_npar, I32);
07be1b83 3706
f2278c82 3707 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
3708 DEBUG_DUMP_r({
3709 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
3710 regdump(r);
3711 });
8e9a8a48
YO
3712 DEBUG_OFFSETS_r(if (r->offsets) {
3713 const U32 len = r->offsets[0];
3714 U32 i;
3715 GET_RE_DEBUG_FLAGS_DECL;
3716 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3717 for (i = 1; i <= len; i++) {
3718 if (r->offsets[i*2-1] || r->offsets[i*2])
3719 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3720 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3721 }
3722 PerlIO_printf(Perl_debug_log, "\n");
3723 });
a0d0e21e 3724 return(r);
a687059c
LW
3725}
3726
3dab1dad
YO
3727
3728#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3729 int rem=(int)(RExC_end - RExC_parse); \
3730 int cut; \
3731 int num; \
3732 int iscut=0; \
3733 if (rem>10) { \
3734 rem=10; \
3735 iscut=1; \
3736 } \
3737 cut=10-rem; \
3738 if (RExC_lastparse!=RExC_parse) \
3739 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3740 rem, RExC_parse, \
3741 cut + 4, \
3742 iscut ? "..." : "<" \
3743 ); \
3744 else \
3745 PerlIO_printf(Perl_debug_log,"%16s",""); \
3746 \
3747 if (SIZE_ONLY) \
3748 num=RExC_size; \
3749 else \
3750 num=REG_NODE_NUM(RExC_emit); \
3751 if (RExC_lastnum!=num) \
be8e71aa 3752 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 3753 else \
be8e71aa
YO
3754 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3755 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3756 (int)((depth*2)), "", \
3dab1dad
YO
3757 (funcname) \
3758 ); \
3759 RExC_lastnum=num; \
3760 RExC_lastparse=RExC_parse; \
3761})
3762
07be1b83
YO
3763
3764
3dab1dad
YO
3765#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3766 DEBUG_PARSE_MSG((funcname)); \
3767 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3768})
a687059c
LW
3769/*
3770 - reg - regular expression, i.e. main body or parenthesized thing
3771 *
3772 * Caller must absorb opening parenthesis.
3773 *
3774 * Combining parenthesis handling with the base level of regular expression
3775 * is a trifle forced, but the need to tie the tails of the branches to what
3776 * follows makes it hard to avoid.
3777 */
07be1b83
YO
3778#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3779#ifdef DEBUGGING
3780#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3781#else
3782#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3783#endif
3dab1dad 3784
76e3520e 3785STATIC regnode *
3dab1dad 3786S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3787 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3788{
27da23d5 3789 dVAR;
c277df42
IZ
3790 register regnode *ret; /* Will be the head of the group. */
3791 register regnode *br;
3792 register regnode *lastbr;
cbbf8932 3793 register regnode *ender = NULL;
a0d0e21e 3794 register I32 parno = 0;
cbbf8932
AL
3795 I32 flags;
3796 const I32 oregflags = RExC_flags;
6136c704
AL
3797 bool have_branch = 0;
3798 bool is_open = 0;
9d1d55b5
JP
3799
3800 /* for (?g), (?gc), and (?o) warnings; warning
3801 about (?c) will warn about (?g) -- japhy */
3802
6136c704
AL
3803#define WASTED_O 0x01
3804#define WASTED_G 0x02
3805#define WASTED_C 0x04
3806#define WASTED_GC (0x02|0x04)
cbbf8932 3807 I32 wastedflags = 0x00;
9d1d55b5 3808
fac92740 3809 char * parse_start = RExC_parse; /* MJD */
a28509cc 3810 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3811
3dab1dad
YO
3812 GET_RE_DEBUG_FLAGS_DECL;
3813 DEBUG_PARSE("reg ");
3814
3815
821b33a5 3816 *flagp = 0; /* Tentatively. */
a0d0e21e 3817
9d1d55b5 3818
a0d0e21e
LW
3819 /* Make an OPEN node, if parenthesized. */
3820 if (paren) {
fac92740 3821 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3822 U32 posflags = 0, negflags = 0;
3823 U32 *flagsp = &posflags;
6136c704 3824 bool is_logical = 0;
a28509cc 3825 const char * const seqstart = RExC_parse;
ca9dfc88 3826
830247a4
IZ
3827 RExC_parse++;
3828 paren = *RExC_parse++;
c277df42 3829 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3830 switch (paren) {
fac92740 3831 case '<': /* (?<...) */
830247a4 3832 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3833 if (*RExC_parse == '!')
c277df42 3834 paren = ',';
b81d288d 3835 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3836 goto unknown;
830247a4 3837 RExC_parse++;
fac92740
MJD
3838 case '=': /* (?=...) */
3839 case '!': /* (?!...) */
830247a4 3840 RExC_seen_zerolen++;
fac92740
MJD
3841 case ':': /* (?:...) */
3842 case '>': /* (?>...) */
a0d0e21e 3843 break;
fac92740
MJD
3844 case '$': /* (?$...) */
3845 case '@': /* (?@...) */
8615cb43 3846 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3847 break;
fac92740 3848 case '#': /* (?#...) */
830247a4
IZ
3849 while (*RExC_parse && *RExC_parse != ')')
3850 RExC_parse++;
3851 if (*RExC_parse != ')')
c277df42 3852 FAIL("Sequence (?#... not terminated");
830247a4 3853 nextchar(pRExC_state);
a0d0e21e
LW
3854 *flagp = TRYAGAIN;
3855 return NULL;
fac92740 3856 case 'p': /* (?p...) */
9014280d 3857 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3858 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3859 /* FALL THROUGH*/
fac92740 3860 case '?': /* (??...) */
6136c704 3861 is_logical = 1;
438a3801
YST
3862 if (*RExC_parse != '{')
3863 goto unknown;
830247a4 3864 paren = *RExC_parse++;
0f5d15d6 3865 /* FALL THROUGH */
fac92740 3866 case '{': /* (?{...}) */
c277df42 3867 {
c277df42
IZ
3868 I32 count = 1, n = 0;
3869 char c;
830247a4 3870 char *s = RExC_parse;
c277df42 3871
830247a4
IZ
3872 RExC_seen_zerolen++;
3873 RExC_seen |= REG_SEEN_EVAL;
3874 while (count && (c = *RExC_parse)) {
6136c704
AL
3875 if (c == '\\') {
3876 if (RExC_parse[1])
3877 RExC_parse++;
3878 }
b81d288d 3879 else if (c == '{')
c277df42 3880 count++;
b81d288d 3881 else if (c == '}')
c277df42 3882 count--;
830247a4 3883 RExC_parse++;
c277df42 3884 }
6136c704 3885 if (*RExC_parse != ')') {
b81d288d 3886 RExC_parse = s;
b45f050a
JF
3887 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3888 }
c277df42 3889 if (!SIZE_ONLY) {
f3548bdc 3890 PAD *pad;
6136c704
AL
3891 OP_4tree *sop, *rop;
3892 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3893
569233ed
SB
3894 ENTER;
3895 Perl_save_re_context(aTHX);
f3548bdc 3896 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3897 sop->op_private |= OPpREFCOUNTED;
3898 /* re_dup will OpREFCNT_inc */
3899 OpREFCNT_set(sop, 1);
569233ed 3900 LEAVE;
c277df42 3901
830247a4
IZ
3902 n = add_data(pRExC_state, 3, "nop");
3903 RExC_rx->data->data[n] = (void*)rop;
3904 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3905 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3906 SvREFCNT_dec(sv);
a0ed51b3 3907 }
e24b16f9 3908 else { /* First pass */
830247a4 3909 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3910 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3911 /* No compiled RE interpolated, has runtime
3912 components ===> unsafe. */
3913 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3914 if (PL_tainting && PL_tainted)
cc6b7395 3915 FAIL("Eval-group in insecure regular expression");
54df2634 3916#if PERL_VERSION > 8
923e4eb5 3917 if (IN_PERL_COMPILETIME)
b5c19bd7 3918 PL_cv_has_eval = 1;
54df2634 3919#endif
c277df42 3920 }
b5c19bd7 3921
830247a4 3922 nextchar(pRExC_state);
6136c704 3923 if (is_logical) {
830247a4 3924 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3925 if (!SIZE_ONLY)
3926 ret->flags = 2;
3dab1dad 3927 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3928 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3929 return ret;
3930 }
ccb2c380
MP
3931 ret = reganode(pRExC_state, EVAL, n);
3932 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3933 Set_Node_Offset(ret, parse_start);
3934 return ret;
c277df42 3935 }
fac92740 3936 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3937 {
fac92740 3938 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3939 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3940 || RExC_parse[1] == '<'
830247a4 3941 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3942 I32 flag;
3943
830247a4 3944 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3945 if (!SIZE_ONLY)
3946 ret->flags = 1;
3dab1dad 3947 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3948 goto insert_if;
b81d288d 3949 }
a0ed51b3 3950 }
830247a4 3951 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3952 /* (?(1)...) */
6136c704 3953 char c;
830247a4 3954 parno = atoi(RExC_parse++);
c277df42 3955
830247a4
IZ
3956 while (isDIGIT(*RExC_parse))
3957 RExC_parse++;
fac92740 3958 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3959
830247a4 3960 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3961 vFAIL("Switch condition not recognized");
c277df42 3962 insert_if:
3dab1dad
YO
3963 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3964 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3965 if (br == NULL)
830247a4 3966 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3967 else
3dab1dad 3968 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3969 c = *nextchar(pRExC_state);
d1b80229
IZ
3970 if (flags&HASWIDTH)
3971 *flagp |= HASWIDTH;
c277df42 3972 if (c == '|') {
830247a4 3973 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3974 regbranch(pRExC_state, &flags, 1,depth+1);
3975 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3976 if (flags&HASWIDTH)
3977 *flagp |= HASWIDTH;
830247a4 3978 c = *nextchar(pRExC_state);
a0ed51b3
LW
3979 }
3980 else
c277df42
IZ
3981 lastbr = NULL;
3982 if (c != ')')
8615cb43 3983 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3984 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3985 REGTAIL(pRExC_state, br, ender);
c277df42 3986 if (lastbr) {
3dab1dad
YO
3987 REGTAIL(pRExC_state, lastbr, ender);
3988 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3989 }
3990 else
3dab1dad 3991 REGTAIL(pRExC_state, ret, ender);
c277df42 3992 return ret;
a0ed51b3
LW
3993 }
3994 else {
830247a4 3995 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3996 }
3997 }
1b1626e4 3998 case 0:
830247a4 3999 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4000 vFAIL("Sequence (? incomplete");
1b1626e4 4001 break;
a0d0e21e 4002 default:
830247a4 4003 --RExC_parse;
fac92740 4004 parse_flags: /* (?i) */
830247a4 4005 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4006 /* (?g), (?gc) and (?o) are useless here
4007 and must be globally applied -- japhy */
4008
4009 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4010 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4011 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4012 if (! (wastedflags & wflagbit) ) {
4013 wastedflags |= wflagbit;
4014 vWARN5(
4015 RExC_parse + 1,
4016 "Useless (%s%c) - %suse /%c modifier",
4017 flagsp == &negflags ? "?-" : "?",
4018 *RExC_parse,
4019 flagsp == &negflags ? "don't " : "",
4020 *RExC_parse
4021 );
4022 }
4023 }
4024 }
4025 else if (*RExC_parse == 'c') {
4026 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4027 if (! (wastedflags & WASTED_C) ) {
4028 wastedflags |= WASTED_GC;
9d1d55b5
JP
4029 vWARN3(
4030 RExC_parse + 1,
4031 "Useless (%sc) - %suse /gc modifier",
4032 flagsp == &negflags ? "?-" : "?",
4033 flagsp == &negflags ? "don't " : ""
4034 );
4035 }
4036 }
4037 }
4038 else { pmflag(flagsp, *RExC_parse); }
4039
830247a4 4040 ++RExC_parse;
ca9dfc88 4041 }
830247a4 4042 if (*RExC_parse == '-') {
ca9dfc88 4043 flagsp = &negflags;
9d1d55b5 4044 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4045 ++RExC_parse;
ca9dfc88 4046 goto parse_flags;
48c036b1 4047 }
e2509266
JH
4048 RExC_flags |= posflags;
4049 RExC_flags &= ~negflags;
830247a4
IZ
4050 if (*RExC_parse == ':') {
4051 RExC_parse++;
ca9dfc88
IZ
4052 paren = ':';
4053 break;
4054 }
c277df42 4055 unknown:
830247a4
IZ
4056 if (*RExC_parse != ')') {
4057 RExC_parse++;
4058 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4059 }
830247a4 4060 nextchar(pRExC_state);
a0d0e21e
LW
4061 *flagp = TRYAGAIN;
4062 return NULL;
4063 }
4064 }
fac92740 4065 else { /* (...) */
830247a4
IZ
4066 parno = RExC_npar;
4067 RExC_npar++;
4068 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4069 Set_Node_Length(ret, 1); /* MJD */
4070 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4071 is_open = 1;
a0d0e21e 4072 }
a0ed51b3 4073 }
fac92740 4074 else /* ! paren */
a0d0e21e
LW
4075 ret = NULL;
4076
4077 /* Pick up the branches, linking them together. */
fac92740 4078 parse_start = RExC_parse; /* MJD */
3dab1dad 4079 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4080 /* branch_len = (paren != 0); */
2af232bd 4081
a0d0e21e
LW
4082 if (br == NULL)
4083 return(NULL);
830247a4
IZ
4084 if (*RExC_parse == '|') {
4085 if (!SIZE_ONLY && RExC_extralen) {
4086 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4087 }
fac92740 4088 else { /* MJD */
830247a4 4089 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4090 Set_Node_Length(br, paren != 0);
4091 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4092 }
c277df42
IZ
4093 have_branch = 1;
4094 if (SIZE_ONLY)
830247a4 4095 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4096 }
4097 else if (paren == ':') {
c277df42
IZ
4098 *flagp |= flags&SIMPLE;
4099 }
6136c704 4100 if (is_open) { /* Starts with OPEN. */
3dab1dad 4101 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4102 }
4103 else if (paren != '?') /* Not Conditional */
a0d0e21e 4104 ret = br;
32a0ca98 4105 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4106 lastbr = br;
830247a4
IZ
4107 while (*RExC_parse == '|') {
4108 if (!SIZE_ONLY && RExC_extralen) {
4109 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4110 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4111 }
4112 if (SIZE_ONLY)
830247a4
IZ
4113 RExC_extralen += 2; /* Account for LONGJMP. */
4114 nextchar(pRExC_state);
3dab1dad 4115 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4116
a687059c 4117 if (br == NULL)
a0d0e21e 4118 return(NULL);
3dab1dad 4119 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4120 lastbr = br;
821b33a5
IZ
4121 if (flags&HASWIDTH)
4122 *flagp |= HASWIDTH;
a687059c 4123 *flagp |= flags&SPSTART;
a0d0e21e
LW
4124 }
4125
c277df42
IZ
4126 if (have_branch || paren != ':') {
4127 /* Make a closing node, and hook it on the end. */
4128 switch (paren) {
4129 case ':':
830247a4 4130 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4131 break;
4132 case 1:
830247a4 4133 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4134 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4135 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4136 break;
4137 case '<':
c277df42
IZ
4138 case ',':
4139 case '=':
4140 case '!':
c277df42 4141 *flagp &= ~HASWIDTH;
821b33a5
IZ
4142 /* FALL THROUGH */
4143 case '>':
830247a4 4144 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4145 break;
4146 case 0:
830247a4 4147 ender = reg_node(pRExC_state, END);
c277df42
IZ
4148 break;
4149 }
07be1b83 4150 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4151
9674d46a 4152 if (have_branch && !SIZE_ONLY) {
c277df42 4153 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4154 for (br = ret; br; br = regnext(br)) {
4155 const U8 op = PL_regkind[OP(br)];
4156 if (op == BRANCH) {
07be1b83 4157 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4158 }
4159 else if (op == BRANCHJ) {
07be1b83 4160 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4161 }
c277df42
IZ
4162 }
4163 }
a0d0e21e 4164 }
c277df42
IZ
4165
4166 {
e1ec3a88
AL
4167 const char *p;
4168 static const char parens[] = "=!<,>";
c277df42
IZ
4169
4170 if (paren && (p = strchr(parens, paren))) {
eb160463 4171 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4172 int flag = (p - parens) > 1;
4173
4174 if (paren == '>')
4175 node = SUSPEND, flag = 0;
830247a4 4176 reginsert(pRExC_state, node,ret);
45948336
EP
4177 Set_Node_Cur_Length(ret);
4178 Set_Node_Offset(ret, parse_start + 1);
c277df42 4179 ret->flags = flag;
07be1b83 4180 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4181 }
a0d0e21e
LW
4182 }
4183
4184 /* Check for proper termination. */
ce3e6498 4185 if (paren) {
e2509266 4186 RExC_flags = oregflags;
830247a4
IZ
4187 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4188 RExC_parse = oregcomp_parse;
380a0633 4189 vFAIL("Unmatched (");
ce3e6498 4190 }
a0ed51b3 4191 }
830247a4
IZ
4192 else if (!paren && RExC_parse < RExC_end) {
4193 if (*RExC_parse == ')') {
4194 RExC_parse++;
380a0633 4195 vFAIL("Unmatched )");
a0ed51b3
LW
4196 }
4197 else
b45f050a 4198 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4199 /* NOTREACHED */
4200 }
a687059c 4201
a0d0e21e 4202 return(ret);
a687059c
LW
4203}
4204
4205/*
4206 - regbranch - one alternative of an | operator
4207 *
4208 * Implements the concatenation operator.
4209 */
76e3520e 4210STATIC regnode *
3dab1dad 4211S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4212{
97aff369 4213 dVAR;
c277df42
IZ
4214 register regnode *ret;
4215 register regnode *chain = NULL;
4216 register regnode *latest;
4217 I32 flags = 0, c = 0;
3dab1dad
YO
4218 GET_RE_DEBUG_FLAGS_DECL;
4219 DEBUG_PARSE("brnc");
b81d288d 4220 if (first)
c277df42
IZ
4221 ret = NULL;
4222 else {
b81d288d 4223 if (!SIZE_ONLY && RExC_extralen)
830247a4 4224 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4225 else {
830247a4 4226 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4227 Set_Node_Length(ret, 1);
4228 }
c277df42
IZ
4229 }
4230
b81d288d 4231 if (!first && SIZE_ONLY)
830247a4 4232 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4233
c277df42 4234 *flagp = WORST; /* Tentatively. */
a0d0e21e 4235
830247a4
IZ
4236 RExC_parse--;
4237 nextchar(pRExC_state);
4238 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4239 flags &= ~TRYAGAIN;
3dab1dad 4240 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4241 if (latest == NULL) {
4242 if (flags & TRYAGAIN)
4243 continue;
4244 return(NULL);
a0ed51b3
LW
4245 }
4246 else if (ret == NULL)
c277df42 4247 ret = latest;
a0d0e21e 4248 *flagp |= flags&HASWIDTH;
c277df42 4249 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4250 *flagp |= flags&SPSTART;
4251 else {
830247a4 4252 RExC_naughty++;
3dab1dad 4253 REGTAIL(pRExC_state, chain, latest);
a687059c 4254 }
a0d0e21e 4255 chain = latest;
c277df42
IZ
4256 c++;
4257 }
4258 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4259 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4260 if (ret == NULL)
4261 ret = chain;
4262 }
4263 if (c == 1) {
4264 *flagp |= flags&SIMPLE;
a0d0e21e 4265 }
a687059c 4266
d4c19fe8 4267 return ret;
a687059c
LW
4268}
4269
4270/*
4271 - regpiece - something followed by possible [*+?]
4272 *
4273 * Note that the branching code sequences used for ? and the general cases
4274 * of * and + are somewhat optimized: they use the same NOTHING node as
4275 * both the endmarker for their branch list and the body of the last branch.
4276 * It might seem that this node could be dispensed with entirely, but the
4277 * endmarker role is not redundant.
4278 */
76e3520e 4279STATIC regnode *
3dab1dad 4280S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4281{
97aff369 4282 dVAR;
c277df42 4283 register regnode *ret;
a0d0e21e
LW
4284 register char op;
4285 register char *next;
4286 I32 flags;
1df70142 4287 const char * const origparse = RExC_parse;
a0d0e21e 4288 I32 min;
c277df42 4289 I32 max = REG_INFTY;
fac92740 4290 char *parse_start;
3dab1dad
YO
4291 GET_RE_DEBUG_FLAGS_DECL;
4292 DEBUG_PARSE("piec");
a0d0e21e 4293
3dab1dad 4294 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4295 if (ret == NULL) {
4296 if (flags & TRYAGAIN)
4297 *flagp |= TRYAGAIN;
4298 return(NULL);
4299 }
4300
830247a4 4301 op = *RExC_parse;
a0d0e21e 4302
830247a4 4303 if (op == '{' && regcurly(RExC_parse)) {
3dab1dad 4304 const char *maxpos = NULL;
fac92740 4305 parse_start = RExC_parse; /* MJD */
830247a4 4306 next = RExC_parse + 1;
a0d0e21e
LW
4307 while (isDIGIT(*next) || *next == ',') {
4308 if (*next == ',') {
4309 if (maxpos)
4310 break;
4311 else
4312 maxpos = next;
a687059c 4313 }
a0d0e21e
LW
4314 next++;
4315 }
4316 if (*next == '}') { /* got one */
4317 if (!maxpos)
4318 maxpos = next;
830247a4
IZ
4319 RExC_parse++;
4320 min = atoi(RExC_parse);
a0d0e21e
LW
4321 if (*maxpos == ',')
4322 maxpos++;
4323 else
830247a4 4324 maxpos = RExC_parse;
a0d0e21e
LW
4325 max = atoi(maxpos);
4326 if (!max && *maxpos != '0')
c277df42
IZ
4327 max = REG_INFTY; /* meaning "infinity" */
4328 else if (max >= REG_INFTY)
8615cb43 4329 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4330 RExC_parse = next;
4331 nextchar(pRExC_state);
a0d0e21e
LW
4332
4333 do_curly:
4334 if ((flags&SIMPLE)) {
830247a4
IZ
4335 RExC_naughty += 2 + RExC_naughty / 2;
4336 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4337 Set_Node_Offset(ret, parse_start+1); /* MJD */
4338 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4339 }
4340 else {
3dab1dad 4341 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4342
4343 w->flags = 0;
3dab1dad 4344 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4345 if (!SIZE_ONLY && RExC_extralen) {
4346 reginsert(pRExC_state, LONGJMP,ret);
4347 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4348 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4349 }
830247a4 4350 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4351 /* MJD hk */
4352 Set_Node_Offset(ret, parse_start+1);
2af232bd 4353 Set_Node_Length(ret,
fac92740 4354 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4355
830247a4 4356 if (!SIZE_ONLY && RExC_extralen)
c277df42 4357 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4358 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4359 if (SIZE_ONLY)
830247a4
IZ
4360 RExC_whilem_seen++, RExC_extralen += 3;
4361 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4362 }
c277df42 4363 ret->flags = 0;
a0d0e21e
LW
4364
4365 if (min > 0)
821b33a5
IZ
4366 *flagp = WORST;
4367 if (max > 0)
4368 *flagp |= HASWIDTH;
a0d0e21e 4369 if (max && max < min)
8615cb43 4370 vFAIL("Can't do {n,m} with n > m");
c277df42 4371 if (!SIZE_ONLY) {
eb160463
GS
4372 ARG1_SET(ret, (U16)min);
4373 ARG2_SET(ret, (U16)max);
a687059c 4374 }
a687059c 4375
a0d0e21e 4376 goto nest_check;
a687059c 4377 }
a0d0e21e 4378 }
a687059c 4379
a0d0e21e
LW
4380 if (!ISMULT1(op)) {
4381 *flagp = flags;
a687059c 4382 return(ret);
a0d0e21e 4383 }
bb20fd44 4384
c277df42 4385#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4386
4387 /* if this is reinstated, don't forget to put this back into perldiag:
4388
4389 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4390
4391 (F) The part of the regexp subject to either the * or + quantifier
4392 could match an empty string. The {#} shows in the regular
4393 expression about where the problem was discovered.
4394
4395 */
4396
bb20fd44 4397 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4398 vFAIL("Regexp *+ operand could be empty");
b81d288d 4399#endif
bb20fd44 4400
fac92740 4401 parse_start = RExC_parse;
830247a4 4402 nextchar(pRExC_state);
a0d0e21e 4403
821b33a5 4404 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4405
4406 if (op == '*' && (flags&SIMPLE)) {
830247a4 4407 reginsert(pRExC_state, STAR, ret);
c277df42 4408 ret->flags = 0;
830247a4 4409 RExC_naughty += 4;
a0d0e21e
LW
4410 }
4411 else if (op == '*') {
4412 min = 0;
4413 goto do_curly;
a0ed51b3
LW
4414 }
4415 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4416 reginsert(pRExC_state, PLUS, ret);
c277df42 4417 ret->flags = 0;
830247a4 4418 RExC_naughty += 3;
a0d0e21e
LW
4419 }
4420 else if (op == '+') {
4421 min = 1;
4422 goto do_curly;
a0ed51b3
LW
4423 }
4424 else if (op == '?') {
a0d0e21e
LW
4425 min = 0; max = 1;
4426 goto do_curly;
4427 }
4428 nest_check:
041457d9 4429 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4430 vWARN3(RExC_parse,
b45f050a 4431 "%.*s matches null string many times",
afd78fd5 4432 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4433 origparse);
a0d0e21e
LW
4434 }
4435
830247a4
IZ
4436 if (*RExC_parse == '?') {
4437 nextchar(pRExC_state);
4438 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4439 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4440 }
830247a4
IZ
4441 if (ISMULT2(RExC_parse)) {
4442 RExC_parse++;
b45f050a
JF
4443 vFAIL("Nested quantifiers");
4444 }
a0d0e21e
LW
4445
4446 return(ret);
a687059c
LW
4447}
4448
4449/*
4450 - regatom - the lowest level
4451 *
4452 * Optimization: gobbles an entire sequence of ordinary characters so that
4453 * it can turn them into a single node, which is smaller to store and
4454 * faster to run. Backslashed characters are exceptions, each becoming a
4455 * separate node; the code is simpler that way and it's not worth fixing.
4456 *
7f6f358c
YO
4457 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4458 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4459 */
76e3520e 4460STATIC regnode *
3dab1dad 4461S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4462{
97aff369 4463 dVAR;
cbbf8932 4464 register regnode *ret = NULL;
a0d0e21e 4465 I32 flags;
45948336 4466 char *parse_start = RExC_parse;
3dab1dad
YO
4467 GET_RE_DEBUG_FLAGS_DECL;
4468 DEBUG_PARSE("atom");
a0d0e21e
LW
4469 *flagp = WORST; /* Tentatively. */
4470
4471tryagain:
830247a4 4472 switch (*RExC_parse) {
a0d0e21e 4473 case '^':
830247a4
IZ
4474 RExC_seen_zerolen++;
4475 nextchar(pRExC_state);
e2509266 4476 if (RExC_flags & PMf_MULTILINE)
830247a4 4477 ret = reg_node(pRExC_state, MBOL);
e2509266 4478 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4479 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4480 else
830247a4 4481 ret = reg_node(pRExC_state, BOL);
fac92740 4482 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4483 break;
4484 case '$':
830247a4 4485 nextchar(pRExC_state);
b81d288d 4486 if (*RExC_parse)
830247a4 4487 RExC_seen_zerolen++;
e2509266 4488 if (RExC_flags & PMf_MULTILINE)
830247a4 4489 ret = reg_node(pRExC_state, MEOL);
e2509266 4490 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4491 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4492 else
830247a4 4493 ret = reg_node(pRExC_state, EOL);
fac92740 4494 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4495 break;
4496 case '.':
830247a4 4497 nextchar(pRExC_state);
e2509266 4498 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4499 ret = reg_node(pRExC_state, SANY);
4500 else
4501 ret = reg_node(pRExC_state, REG_ANY);
4502 *flagp |= HASWIDTH|SIMPLE;
830247a4 4503 RExC_naughty++;
fac92740 4504 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4505 break;
4506 case '[':
b45f050a 4507 {
3dab1dad
YO
4508 char * const oregcomp_parse = ++RExC_parse;
4509 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4510 if (*RExC_parse != ']') {
4511 RExC_parse = oregcomp_parse;
b45f050a
JF
4512 vFAIL("Unmatched [");
4513 }
830247a4 4514 nextchar(pRExC_state);
a0d0e21e 4515 *flagp |= HASWIDTH|SIMPLE;
fac92740 4516 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4517 break;
b45f050a 4518 }
a0d0e21e 4519 case '(':
830247a4 4520 nextchar(pRExC_state);
3dab1dad 4521 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4522 if (ret == NULL) {
bf93d4cc 4523 if (flags & TRYAGAIN) {
830247a4 4524 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4525 /* Make parent create an empty node if needed. */
4526 *flagp |= TRYAGAIN;
4527 return(NULL);
4528 }
a0d0e21e 4529 goto tryagain;
bf93d4cc 4530 }
a0d0e21e
LW
4531 return(NULL);
4532 }
c277df42 4533 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4534 break;
4535 case '|':
4536 case ')':
4537 if (flags & TRYAGAIN) {
4538 *flagp |= TRYAGAIN;
4539 return NULL;
4540 }
b45f050a 4541 vFAIL("Internal urp");
a0d0e21e
LW
4542 /* Supposed to be caught earlier. */
4543 break;
85afd4ae 4544 case '{':
830247a4
IZ
4545 if (!regcurly(RExC_parse)) {
4546 RExC_parse++;
85afd4ae
CS
4547 goto defchar;
4548 }
4549 /* FALL THROUGH */
a0d0e21e
LW
4550 case '?':
4551 case '+':
4552 case '*':
830247a4 4553 RExC_parse++;
b45f050a 4554 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4555 break;
4556 case '\\':
830247a4 4557 switch (*++RExC_parse) {
a0d0e21e 4558 case 'A':
830247a4
IZ
4559 RExC_seen_zerolen++;
4560 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4561 *flagp |= SIMPLE;
830247a4 4562 nextchar(pRExC_state);
fac92740 4563 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4564 break;
4565 case 'G':
830247a4
IZ
4566 ret = reg_node(pRExC_state, GPOS);
4567 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4568 *flagp |= SIMPLE;
830247a4 4569 nextchar(pRExC_state);
fac92740 4570 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4571 break;
4572 case 'Z':
830247a4 4573 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4574 *flagp |= SIMPLE;
a1917ab9 4575 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4576 nextchar(pRExC_state);
a0d0e21e 4577 break;
b85d18e9 4578 case 'z':
830247a4 4579 ret = reg_node(pRExC_state, EOS);
b85d18e9 4580 *flagp |= SIMPLE;
830247a4
IZ
4581 RExC_seen_zerolen++; /* Do not optimize RE away */
4582 nextchar(pRExC_state);
fac92740 4583 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4584 break;
4a2d328f 4585 case 'C':
f33976b4
DB
4586 ret = reg_node(pRExC_state, CANY);
4587 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4588 *flagp |= HASWIDTH|SIMPLE;
830247a4 4589 nextchar(pRExC_state);
fac92740 4590 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4591 break;
4592 case 'X':
830247a4 4593 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4594 *flagp |= HASWIDTH;
830247a4 4595 nextchar(pRExC_state);
fac92740 4596 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4597 break;
a0d0e21e 4598 case 'w':
eb160463 4599 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4600 *flagp |= HASWIDTH|SIMPLE;
830247a4 4601 nextchar(pRExC_state);
fac92740 4602 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4603 break;
4604 case 'W':
eb160463 4605 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4606 *flagp |= HASWIDTH|SIMPLE;
830247a4 4607 nextchar(pRExC_state);
fac92740 4608 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4609 break;
4610 case 'b':
830247a4
IZ
4611 RExC_seen_zerolen++;
4612 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4613 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4614 *flagp |= SIMPLE;
830247a4 4615 nextchar(pRExC_state);
fac92740 4616 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4617 break;
4618 case 'B':
830247a4
IZ
4619 RExC_seen_zerolen++;
4620 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4621 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4622 *flagp |= SIMPLE;
830247a4 4623 nextchar(pRExC_state);
fac92740 4624 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4625 break;
4626 case 's':
eb160463 4627 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4628 *flagp |= HASWIDTH|SIMPLE;
830247a4 4629 nextchar(pRExC_state);
fac92740 4630 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4631 break;
4632 case 'S':
eb160463 4633 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4634 *flagp |= HASWIDTH|SIMPLE;
830247a4 4635 nextchar(pRExC_state);
fac92740 4636 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4637 break;
4638 case 'd':
ffc61ed2 4639 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4640 *flagp |= HASWIDTH|SIMPLE;
830247a4 4641 nextchar(pRExC_state);
fac92740 4642 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4643 break;
4644 case 'D':
ffc61ed2 4645 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4646 *flagp |= HASWIDTH|SIMPLE;
830247a4 4647 nextchar(pRExC_state);
fac92740 4648 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4649 break;
a14b48bc
LW
4650 case 'p':
4651 case 'P':
3568d838 4652 {
3dab1dad 4653 char* const oldregxend = RExC_end;
ccb2c380 4654 char* parse_start = RExC_parse - 2;
a14b48bc 4655
830247a4 4656 if (RExC_parse[1] == '{') {
3568d838 4657 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4658 RExC_end = strchr(RExC_parse, '}');
4659 if (!RExC_end) {
3dab1dad 4660 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4661 RExC_parse += 2;
4662 RExC_end = oldregxend;
0da60cf5 4663 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4664 }
830247a4 4665 RExC_end++;
a14b48bc 4666 }
af6f566e 4667 else {
830247a4 4668 RExC_end = RExC_parse + 2;
af6f566e
HS
4669 if (RExC_end > oldregxend)
4670 RExC_end = oldregxend;
4671 }
830247a4 4672 RExC_parse--;
a14b48bc 4673
3dab1dad 4674 ret = regclass(pRExC_state,depth+1);
a14b48bc 4675
830247a4
IZ
4676 RExC_end = oldregxend;
4677 RExC_parse--;
ccb2c380
MP
4678
4679 Set_Node_Offset(ret, parse_start + 2);
4680 Set_Node_Cur_Length(ret);
830247a4 4681 nextchar(pRExC_state);
a14b48bc
LW
4682 *flagp |= HASWIDTH|SIMPLE;
4683 }
4684 break;
a0d0e21e
LW
4685 case 'n':
4686 case 'r':
4687 case 't':
4688 case 'f':
4689 case 'e':
4690 case 'a':
4691 case 'x':
4692 case 'c':
4693 case '0':
4694 goto defchar;
4695 case '1': case '2': case '3': case '4':
4696 case '5': case '6': case '7': case '8': case '9':
4697 {
1df70142 4698 const I32 num = atoi(RExC_parse);
a0d0e21e 4699
830247a4 4700 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4701 goto defchar;
4702 else {
3dab1dad 4703 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4704 while (isDIGIT(*RExC_parse))
4705 RExC_parse++;
b45f050a 4706
eb160463 4707 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4708 vFAIL("Reference to nonexistent group");
830247a4 4709 RExC_sawback = 1;
eb160463
GS
4710 ret = reganode(pRExC_state,
4711 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4712 num);
a0d0e21e 4713 *flagp |= HASWIDTH;
2af232bd 4714
fac92740 4715 /* override incorrect value set in reganode MJD */
2af232bd 4716 Set_Node_Offset(ret, parse_start+1);
fac92740 4717 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4718 RExC_parse--;
4719 nextchar(pRExC_state);
a0d0e21e
LW
4720 }
4721 }
4722 break;
4723 case '\0':
830247a4 4724 if (RExC_parse >= RExC_end)
b45f050a 4725 FAIL("Trailing \\");
a0d0e21e
LW
4726 /* FALL THROUGH */
4727 default:
a0288114 4728 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4729 back into the quick-grab loop below */
45948336 4730 parse_start--;
a0d0e21e
LW
4731 goto defchar;
4732 }
4733 break;
4633a7c4
LW
4734
4735 case '#':
e2509266 4736 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4737 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4738 RExC_parse++;
830247a4 4739 if (RExC_parse < RExC_end)
4633a7c4
LW
4740 goto tryagain;
4741 }
4742 /* FALL THROUGH */
4743
a0d0e21e 4744 default: {
ba210ebe 4745 register STRLEN len;
58ae7d3f 4746 register UV ender;
a0d0e21e 4747 register char *p;
3dab1dad 4748 char *s;
80aecb99 4749 STRLEN foldlen;
89ebb4a3 4750 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4751
4752 parse_start = RExC_parse - 1;
a0d0e21e 4753
830247a4 4754 RExC_parse++;
a0d0e21e
LW
4755
4756 defchar:
58ae7d3f 4757 ender = 0;
eb160463
GS
4758 ret = reg_node(pRExC_state,
4759 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4760 s = STRING(ret);
830247a4
IZ
4761 for (len = 0, p = RExC_parse - 1;
4762 len < 127 && p < RExC_end;
a0d0e21e
LW
4763 len++)
4764 {
3dab1dad 4765 char * const oldp = p;
5b5a24f7 4766
e2509266 4767 if (RExC_flags & PMf_EXTENDED)
830247a4 4768 p = regwhite(p, RExC_end);
a0d0e21e
LW
4769 switch (*p) {
4770 case '^':
4771 case '$':
4772 case '.':
4773 case '[':
4774 case '(':
4775 case ')':
4776 case '|':
4777 goto loopdone;
4778 case '\\':
4779 switch (*++p) {
4780 case 'A':
1ed8eac0
JF
4781 case 'C':
4782 case 'X':
a0d0e21e
LW
4783 case 'G':
4784 case 'Z':
b85d18e9 4785 case 'z':
a0d0e21e
LW
4786 case 'w':
4787 case 'W':
4788 case 'b':
4789 case 'B':
4790 case 's':
4791 case 'S':
4792 case 'd':
4793 case 'D':
a14b48bc
LW
4794 case 'p':
4795 case 'P':
a0d0e21e
LW
4796 --p;
4797 goto loopdone;
4798 case 'n':
4799 ender = '\n';
4800 p++;
a687059c 4801 break;
a0d0e21e
LW
4802 case 'r':
4803 ender = '\r';
4804 p++;
a687059c 4805 break;
a0d0e21e
LW
4806 case 't':
4807 ender = '\t';
4808 p++;
a687059c 4809 break;
a0d0e21e
LW
4810 case 'f':
4811 ender = '\f';
4812 p++;
a687059c 4813 break;
a0d0e21e 4814 case 'e':
c7f1f016 4815 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4816 p++;
a687059c 4817 break;
a0d0e21e 4818 case 'a':
c7f1f016 4819 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4820 p++;
a687059c 4821 break;
a0d0e21e 4822 case 'x':
a0ed51b3 4823 if (*++p == '{') {
1df70142 4824 char* const e = strchr(p, '}');
b81d288d 4825
b45f050a 4826 if (!e) {
830247a4 4827 RExC_parse = p + 1;
b45f050a
JF
4828 vFAIL("Missing right brace on \\x{}");
4829 }
de5f0749 4830 else {
a4c04bdc
NC
4831 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4832 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4833 STRLEN numlen = e - p - 1;
53305cf1 4834 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4835 if (ender > 0xff)
4836 RExC_utf8 = 1;
a0ed51b3
LW
4837 p = e + 1;
4838 }
a0ed51b3
LW
4839 }
4840 else {
a4c04bdc 4841 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4842 STRLEN numlen = 2;
53305cf1 4843 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4844 p += numlen;
4845 }
a687059c 4846 break;
a0d0e21e
LW
4847 case 'c':
4848 p++;
bbce6d69 4849 ender = UCHARAT(p++);
4850 ender = toCTRL(ender);
a687059c 4851 break;
a0d0e21e
LW
4852 case '0': case '1': case '2': case '3':case '4':
4853 case '5': case '6': case '7': case '8':case '9':
4854 if (*p == '0' ||
830247a4 4855 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4856 I32 flags = 0;
1df70142 4857 STRLEN numlen = 3;
53305cf1 4858 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4859 p += numlen;
4860 }
4861 else {
4862 --p;
4863 goto loopdone;
a687059c
LW
4864 }
4865 break;
a0d0e21e 4866 case '\0':
830247a4 4867 if (p >= RExC_end)
b45f050a 4868 FAIL("Trailing \\");
a687059c 4869 /* FALL THROUGH */
a0d0e21e 4870 default:
041457d9 4871 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4872 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4873 goto normal_default;
a0d0e21e
LW
4874 }
4875 break;
a687059c 4876 default:
a0ed51b3 4877 normal_default:
fd400ab9 4878 if (UTF8_IS_START(*p) && UTF) {
1df70142 4879 STRLEN numlen;
5e12f4fb 4880 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4881 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4882 p += numlen;
4883 }
4884 else
4885 ender = *p++;
a0d0e21e 4886 break;
a687059c 4887 }
e2509266 4888 if (RExC_flags & PMf_EXTENDED)
830247a4 4889 p = regwhite(p, RExC_end);
60a8b682
JH
4890 if (UTF && FOLD) {
4891 /* Prime the casefolded buffer. */
ac7e0132 4892 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4893 }
a0d0e21e
LW
4894 if (ISMULT2(p)) { /* Back off on ?+*. */
4895 if (len)
4896 p = oldp;
16ea2a2e 4897 else if (UTF) {
80aecb99 4898 if (FOLD) {
60a8b682 4899 /* Emit all the Unicode characters. */
1df70142 4900 STRLEN numlen;
80aecb99
JH
4901 for (foldbuf = tmpbuf;
4902 foldlen;
4903 foldlen -= numlen) {
4904 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4905 if (numlen > 0) {
71207a34 4906 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4907 s += unilen;
4908 len += unilen;
4909 /* In EBCDIC the numlen
4910 * and unilen can differ. */
9dc45d57 4911 foldbuf += numlen;
47654450
JH
4912 if (numlen >= foldlen)
4913 break;
9dc45d57
JH
4914 }
4915 else
4916 break; /* "Can't happen." */
80aecb99
JH
4917 }
4918 }
4919 else {
71207a34 4920 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4921 if (unilen > 0) {
0ebc6274
JH
4922 s += unilen;
4923 len += unilen;
9dc45d57 4924 }
80aecb99 4925 }
a0ed51b3 4926 }
a0d0e21e
LW
4927 else {
4928 len++;
eb160463 4929 REGC((char)ender, s++);
a0d0e21e
LW
4930 }
4931 break;
a687059c 4932 }
16ea2a2e 4933 if (UTF) {
80aecb99 4934 if (FOLD) {
60a8b682 4935 /* Emit all the Unicode characters. */
1df70142 4936 STRLEN numlen;
80aecb99
JH
4937 for (foldbuf = tmpbuf;
4938 foldlen;
4939 foldlen -= numlen) {
4940 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4941 if (numlen > 0) {
71207a34 4942 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4943 len += unilen;
4944 s += unilen;
4945 /* In EBCDIC the numlen
4946 * and unilen can differ. */
9dc45d57 4947 foldbuf += numlen;
47654450
JH
4948 if (numlen >= foldlen)
4949 break;
9dc45d57
JH
4950 }
4951 else
4952 break;
80aecb99
JH
4953 }
4954 }
4955 else {
71207a34 4956 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4957 if (unilen > 0) {
0ebc6274
JH
4958 s += unilen;
4959 len += unilen;
9dc45d57 4960 }
80aecb99
JH
4961 }
4962 len--;
a0ed51b3
LW
4963 }
4964 else
eb160463 4965 REGC((char)ender, s++);
a0d0e21e
LW
4966 }
4967 loopdone:
830247a4 4968 RExC_parse = p - 1;
fac92740 4969 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4970 nextchar(pRExC_state);
793db0cb
JH
4971 {
4972 /* len is STRLEN which is unsigned, need to copy to signed */
4973 IV iv = len;
4974 if (iv < 0)
4975 vFAIL("Internal disaster");
4976 }
a0d0e21e
LW
4977 if (len > 0)
4978 *flagp |= HASWIDTH;
090f7165 4979 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4980 *flagp |= SIMPLE;
3dab1dad 4981
cd439c50 4982 if (SIZE_ONLY)
830247a4 4983 RExC_size += STR_SZ(len);
3dab1dad
YO
4984 else {
4985 STR_LEN(ret) = len;
830247a4 4986 RExC_emit += STR_SZ(len);
07be1b83 4987 }
3dab1dad 4988 }
a0d0e21e
LW
4989 break;
4990 }
a687059c 4991
60a8b682
JH
4992 /* If the encoding pragma is in effect recode the text of
4993 * any EXACT-kind nodes. */
3dab1dad
YO
4994 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4995 const STRLEN oldlen = STR_LEN(ret);
4996 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
4997
4998 if (RExC_utf8)
4999 SvUTF8_on(sv);
5000 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5001 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5002 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5003
5004 if (SvUTF8(sv))
5005 RExC_utf8 = 1;
5006 if (!SIZE_ONLY) {
a3621e74
YO
5007 GET_RE_DEBUG_FLAGS_DECL;
5008 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5009 (int)oldlen, STRING(ret),
5010 (int)newlen, s));
5011 Copy(s, STRING(ret), newlen, char);
5012 STR_LEN(ret) += newlen - oldlen;
5013 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5014 } else
5015 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5016 }
a72c7584
JH
5017 }
5018
a0d0e21e 5019 return(ret);
a687059c
LW
5020}
5021
873ef191 5022STATIC char *
5f66b61c 5023S_regwhite(char *p, const char *e)
5b5a24f7
CS
5024{
5025 while (p < e) {
5026 if (isSPACE(*p))
5027 ++p;
5028 else if (*p == '#') {
5029 do {
5030 p++;
5031 } while (p < e && *p != '\n');
5032 }
5033 else
5034 break;
5035 }
5036 return p;
5037}
5038
b8c5462f
JH
5039/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5040 Character classes ([:foo:]) can also be negated ([:^foo:]).
5041 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5042 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5043 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5044
5045#define POSIXCC_DONE(c) ((c) == ':')
5046#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5047#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5048
b8c5462f 5049STATIC I32
830247a4 5050S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5051{
97aff369 5052 dVAR;
936ed897 5053 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5054
830247a4 5055 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5056 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5057 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5058 const char c = UCHARAT(RExC_parse);
097eb12c 5059 char* const s = RExC_parse++;
b81d288d 5060
9a86a77b 5061 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5062 RExC_parse++;
5063 if (RExC_parse == RExC_end)
620e46c5 5064 /* Grandfather lone [:, [=, [. */
830247a4 5065 RExC_parse = s;
620e46c5 5066 else {
3dab1dad 5067 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5068 assert(*t == c);
5069
9a86a77b 5070 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5071 const char *posixcc = s + 1;
830247a4 5072 RExC_parse++; /* skip over the ending ] */
3dab1dad 5073
b8c5462f 5074 if (*s == ':') {
1df70142
AL
5075 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5076 const I32 skip = t - posixcc;
80916619
NC
5077
5078 /* Initially switch on the length of the name. */
5079 switch (skip) {
5080 case 4:
3dab1dad
YO
5081 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5082 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5083 break;
80916619
NC
5084 case 5:
5085 /* Names all of length 5. */
5086 /* alnum alpha ascii blank cntrl digit graph lower
5087 print punct space upper */
5088 /* Offset 4 gives the best switch position. */
5089 switch (posixcc[4]) {
5090 case 'a':
3dab1dad
YO
5091 if (memEQ(posixcc, "alph", 4)) /* alpha */
5092 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5093 break;
5094 case 'e':
3dab1dad
YO
5095 if (memEQ(posixcc, "spac", 4)) /* space */
5096 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5097 break;
5098 case 'h':
3dab1dad
YO
5099 if (memEQ(posixcc, "grap", 4)) /* graph */
5100 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5101 break;
5102 case 'i':
3dab1dad
YO
5103 if (memEQ(posixcc, "asci", 4)) /* ascii */
5104 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5105 break;
5106 case 'k':
3dab1dad
YO
5107 if (memEQ(posixcc, "blan", 4)) /* blank */
5108 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5109 break;
5110 case 'l':
3dab1dad
YO
5111 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5112 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5113 break;
5114 case 'm':
3dab1dad
YO
5115 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5116 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5117 break;
5118 case 'r':
3dab1dad
YO
5119 if (memEQ(posixcc, "lowe", 4)) /* lower */
5120 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5121 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5122 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5123 break;
5124 case 't':
3dab1dad
YO
5125 if (memEQ(posixcc, "digi", 4)) /* digit */
5126 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5127 else if (memEQ(posixcc, "prin", 4)) /* print */
5128 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5129 else if (memEQ(posixcc, "punc", 4)) /* punct */
5130 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5131 break;
b8c5462f
JH
5132 }
5133 break;
80916619 5134 case 6:
3dab1dad
YO
5135 if (memEQ(posixcc, "xdigit", 6))
5136 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5137 break;
5138 }
80916619
NC
5139
5140 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5141 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5142 t - s - 1, s + 1);
80916619
NC
5143 assert (posixcc[skip] == ':');
5144 assert (posixcc[skip+1] == ']');
b45f050a 5145 } else if (!SIZE_ONLY) {
b8c5462f 5146 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5147
830247a4 5148 /* adjust RExC_parse so the warning shows after
b45f050a 5149 the class closes */
9a86a77b 5150 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5151 RExC_parse++;
b45f050a
JF
5152 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5153 }
b8c5462f
JH
5154 } else {
5155 /* Maternal grandfather:
5156 * "[:" ending in ":" but not in ":]" */
830247a4 5157 RExC_parse = s;
767d463e 5158 }
620e46c5
JH
5159 }
5160 }
5161
b8c5462f
JH
5162 return namedclass;
5163}
5164
5165STATIC void
830247a4 5166S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5167{
97aff369 5168 dVAR;
3dab1dad 5169 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5170 const char *s = RExC_parse;
5171 const char c = *s++;
b8c5462f 5172
3dab1dad 5173 while (isALNUM(*s))
b8c5462f
JH
5174 s++;
5175 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5176 if (ckWARN(WARN_REGEXP))
5177 vWARN3(s+2,
5178 "POSIX syntax [%c %c] belongs inside character classes",
5179 c, c);
b45f050a
JF
5180
5181 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5182 if (POSIXCC_NOTYET(c)) {
830247a4 5183 /* adjust RExC_parse so the error shows after
b45f050a 5184 the class closes */
9a86a77b 5185 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5186 NOOP;
b45f050a
JF
5187 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5188 }
b8c5462f
JH
5189 }
5190 }
620e46c5
JH
5191}
5192
7f6f358c
YO
5193
5194/*
5195 parse a class specification and produce either an ANYOF node that
5196 matches the pattern. If the pattern matches a single char only and
5197 that char is < 256 then we produce an EXACT node instead.
5198*/
76e3520e 5199STATIC regnode *
3dab1dad 5200S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5201{
97aff369 5202 dVAR;
07be1b83 5203 register UV value;
9a86a77b 5204 register UV nextvalue;
3568d838 5205 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5206 register IV range = 0;
c277df42 5207 register regnode *ret;
ba210ebe 5208 STRLEN numlen;
ffc61ed2 5209 IV namedclass;
cbbf8932 5210 char *rangebegin = NULL;
936ed897 5211 bool need_class = 0;
c445ea15 5212 SV *listsv = NULL;
ffc61ed2 5213 UV n;
9e55ce06 5214 bool optimize_invert = TRUE;
cbbf8932 5215 AV* unicode_alternate = NULL;
1b2d223b
JH
5216#ifdef EBCDIC
5217 UV literal_endpoint = 0;
5218#endif
7f6f358c 5219 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5220
3dab1dad 5221 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5222 case we need to change the emitted regop to an EXACT. */
07be1b83 5223 const char * orig_parse = RExC_parse;
72f13be8 5224 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
5225#ifndef DEBUGGING
5226 PERL_UNUSED_ARG(depth);
5227#endif
72f13be8 5228
3dab1dad 5229 DEBUG_PARSE("clas");
7f6f358c
YO
5230
5231 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5232 ret = reganode(pRExC_state, ANYOF, 0);
5233
5234 if (!SIZE_ONLY)
5235 ANYOF_FLAGS(ret) = 0;
5236
9a86a77b 5237 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5238 RExC_naughty++;
5239 RExC_parse++;
5240 if (!SIZE_ONLY)
5241 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5242 }
a0d0e21e 5243
73060fc4 5244 if (SIZE_ONLY) {
830247a4 5245 RExC_size += ANYOF_SKIP;
73060fc4
JH
5246 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5247 }
936ed897 5248 else {
830247a4 5249 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5250 if (FOLD)
5251 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5252 if (LOC)
5253 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5254 ANYOF_BITMAP_ZERO(ret);
396482e1 5255 listsv = newSVpvs("# comment\n");
a0d0e21e 5256 }
b8c5462f 5257
9a86a77b
JH
5258 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5259
b938889d 5260 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5261 checkposixcc(pRExC_state);
b8c5462f 5262
f064b6ad
HS
5263 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5264 if (UCHARAT(RExC_parse) == ']')
5265 goto charclassloop;
ffc61ed2 5266
9a86a77b 5267 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5268
5269 charclassloop:
5270
5271 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5272
73b437c8 5273 if (!range)
830247a4 5274 rangebegin = RExC_parse;
ffc61ed2 5275 if (UTF) {
5e12f4fb 5276 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5277 RExC_end - RExC_parse,
9f7f3913 5278 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5279 RExC_parse += numlen;
5280 }
5281 else
5282 value = UCHARAT(RExC_parse++);
7f6f358c 5283
9a86a77b
JH
5284 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5285 if (value == '[' && POSIXCC(nextvalue))
830247a4 5286 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5287 else if (value == '\\') {
ffc61ed2 5288 if (UTF) {
5e12f4fb 5289 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5290 RExC_end - RExC_parse,
9f7f3913 5291 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5292 RExC_parse += numlen;
5293 }
5294 else
5295 value = UCHARAT(RExC_parse++);
470c3474 5296 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5297 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5298 * be a problem later if we want switch on Unicode.
5299 * A similar issue a little bit later when switching on
5300 * namedclass. --jhi */
ffc61ed2 5301 switch ((I32)value) {
b8c5462f
JH
5302 case 'w': namedclass = ANYOF_ALNUM; break;
5303 case 'W': namedclass = ANYOF_NALNUM; break;
5304 case 's': namedclass = ANYOF_SPACE; break;
5305 case 'S': namedclass = ANYOF_NSPACE; break;
5306 case 'd': namedclass = ANYOF_DIGIT; break;
5307 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5308 case 'p':
5309 case 'P':
3dab1dad
YO
5310 {
5311 char *e;
af6f566e 5312 if (RExC_parse >= RExC_end)
2a4859cd 5313 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5314 if (*RExC_parse == '{') {
1df70142 5315 const U8 c = (U8)value;
ffc61ed2
JH
5316 e = strchr(RExC_parse++, '}');
5317 if (!e)
0da60cf5 5318 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5319 while (isSPACE(UCHARAT(RExC_parse)))
5320 RExC_parse++;
5321 if (e == RExC_parse)
0da60cf5 5322 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5323 n = e - RExC_parse;
ab13f0c7
JH
5324 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5325 n--;
ffc61ed2
JH
5326 }
5327 else {
5328 e = RExC_parse;
5329 n = 1;
5330 }
5331 if (!SIZE_ONLY) {
ab13f0c7
JH
5332 if (UCHARAT(RExC_parse) == '^') {
5333 RExC_parse++;
5334 n--;
5335 value = value == 'p' ? 'P' : 'p'; /* toggle */
5336 while (isSPACE(UCHARAT(RExC_parse))) {
5337 RExC_parse++;
5338 n--;
5339 }
5340 }
097eb12c
AL
5341 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5342 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5343 }
5344 RExC_parse = e + 1;
5345 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5346 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5347 }
f81125e2 5348 break;
b8c5462f
JH
5349 case 'n': value = '\n'; break;
5350 case 'r': value = '\r'; break;
5351 case 't': value = '\t'; break;
5352 case 'f': value = '\f'; break;
5353 case 'b': value = '\b'; break;
c7f1f016
NIS
5354 case 'e': value = ASCII_TO_NATIVE('\033');break;
5355 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5356 case 'x':
ffc61ed2 5357 if (*RExC_parse == '{') {
a4c04bdc
NC
5358 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5359 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5360 char * const e = strchr(RExC_parse++, '}');
b81d288d 5361 if (!e)
ffc61ed2 5362 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5363
5364 numlen = e - RExC_parse;
5365 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5366 RExC_parse = e + 1;
5367 }
5368 else {
a4c04bdc 5369 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5370 numlen = 2;
5371 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5372 RExC_parse += numlen;
5373 }
b8c5462f
JH
5374 break;
5375 case 'c':
830247a4 5376 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5377 value = toCTRL(value);
5378 break;
5379 case '0': case '1': case '2': case '3': case '4':
5380 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5381 {
5382 I32 flags = 0;
5383 numlen = 3;
5384 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5385 RExC_parse += numlen;
b8c5462f 5386 break;
53305cf1 5387 }
1028017a 5388 default:
041457d9 5389 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5390 vWARN2(RExC_parse,
5391 "Unrecognized escape \\%c in character class passed through",
5392 (int)value);
1028017a 5393 break;
b8c5462f 5394 }
ffc61ed2 5395 } /* end of \blah */
1b2d223b
JH
5396#ifdef EBCDIC
5397 else
5398 literal_endpoint++;
5399#endif
ffc61ed2
JH
5400
5401 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5402
5403 if (!SIZE_ONLY && !need_class)
936ed897 5404 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5405
936ed897 5406 need_class = 1;
ffc61ed2
JH
5407
5408 /* a bad range like a-\d, a-[:digit:] ? */
5409 if (range) {
73b437c8 5410 if (!SIZE_ONLY) {
afd78fd5 5411 if (ckWARN(WARN_REGEXP)) {
097eb12c 5412 const int w =
afd78fd5
JH
5413 RExC_parse >= rangebegin ?
5414 RExC_parse - rangebegin : 0;
830247a4 5415 vWARN4(RExC_parse,
b45f050a 5416 "False [] range \"%*.*s\"",
097eb12c 5417 w, w, rangebegin);
afd78fd5 5418 }
3568d838
JH
5419 if (prevvalue < 256) {
5420 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5421 ANYOF_BITMAP_SET(ret, '-');
5422 }
5423 else {
5424 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5425 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5426 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5427 }
b8c5462f 5428 }
ffc61ed2
JH
5429
5430 range = 0; /* this was not a true range */
73b437c8 5431 }
ffc61ed2 5432
73b437c8 5433 if (!SIZE_ONLY) {
c49a72a9
NC
5434 const char *what = NULL;
5435 char yesno = 0;
5436
3568d838
JH
5437 if (namedclass > OOB_NAMEDCLASS)
5438 optimize_invert = FALSE;
e2962f66
JH
5439 /* Possible truncation here but in some 64-bit environments
5440 * the compiler gets heartburn about switch on 64-bit values.
5441 * A similar issue a little earlier when switching on value.
98f323fa 5442 * --jhi */
e2962f66 5443 switch ((I32)namedclass) {
73b437c8
JH
5444 case ANYOF_ALNUM:
5445 if (LOC)
936ed897 5446 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5447 else {
5448 for (value = 0; value < 256; value++)
5449 if (isALNUM(value))
936ed897 5450 ANYOF_BITMAP_SET(ret, value);
73b437c8 5451 }
c49a72a9
NC
5452 yesno = '+';
5453 what = "Word";
73b437c8
JH
5454 break;
5455 case ANYOF_NALNUM:
5456 if (LOC)
936ed897 5457 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5458 else {
5459 for (value = 0; value < 256; value++)
5460 if (!isALNUM(value))
936ed897 5461 ANYOF_BITMAP_SET(ret, value);
73b437c8 5462 }
c49a72a9
NC
5463 yesno = '!';
5464 what = "Word";
73b437c8 5465 break;
ffc61ed2 5466 case ANYOF_ALNUMC:
73b437c8 5467 if (LOC)
ffc61ed2 5468 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5469 else {
5470 for (value = 0; value < 256; value++)
ffc61ed2 5471 if (isALNUMC(value))
936ed897 5472 ANYOF_BITMAP_SET(ret, value);
73b437c8 5473 }
c49a72a9
NC
5474 yesno = '+';
5475 what = "Alnum";
73b437c8
JH
5476 break;
5477 case ANYOF_NALNUMC:
5478 if (LOC)
936ed897 5479 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5480 else {
5481 for (value = 0; value < 256; value++)
5482 if (!isALNUMC(value))
936ed897 5483 ANYOF_BITMAP_SET(ret, value);
73b437c8 5484 }
c49a72a9
NC
5485 yesno = '!';
5486 what = "Alnum";
73b437c8
JH
5487 break;
5488 case ANYOF_ALPHA:
5489 if (LOC)
936ed897 5490 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5491 else {
5492 for (value = 0; value < 256; value++)
5493 if (isALPHA(value))
936ed897 5494 ANYOF_BITMAP_SET(ret, value);
73b437c8 5495 }
c49a72a9
NC
5496 yesno = '+';
5497 what = "Alpha";
73b437c8
JH
5498 break;
5499 case ANYOF_NALPHA:
5500 if (LOC)
936ed897 5501 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5502 else {
5503 for (value = 0; value < 256; value++)
5504 if (!isALPHA(value))
936ed897 5505 ANYOF_BITMAP_SET(ret, value);
73b437c8 5506 }
c49a72a9
NC
5507 yesno = '!';
5508 what = "Alpha";
73b437c8
JH
5509 break;
5510 case ANYOF_ASCII:
5511 if (LOC)
936ed897 5512 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5513 else {
c7f1f016 5514#ifndef EBCDIC
1ba5c669
JH
5515 for (value = 0; value < 128; value++)
5516 ANYOF_BITMAP_SET(ret, value);
5517#else /* EBCDIC */
ffbc6a93 5518 for (value = 0; value < 256; value++) {
3a3c4447
JH
5519 if (isASCII(value))
5520 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5521 }
1ba5c669 5522#endif /* EBCDIC */
73b437c8 5523 }
c49a72a9
NC
5524 yesno = '+';
5525 what = "ASCII";
73b437c8
JH
5526 break;
5527 case ANYOF_NASCII:
5528 if (LOC)
936ed897 5529 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5530 else {
c7f1f016 5531#ifndef EBCDIC
1ba5c669
JH
5532 for (value = 128; value < 256; value++)
5533 ANYOF_BITMAP_SET(ret, value);
5534#else /* EBCDIC */
ffbc6a93 5535 for (value = 0; value < 256; value++) {
3a3c4447
JH
5536 if (!isASCII(value))
5537 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5538 }
1ba5c669 5539#endif /* EBCDIC */
73b437c8 5540 }
c49a72a9
NC
5541 yesno = '!';
5542 what = "ASCII";
73b437c8 5543 break;
aaa51d5e
JF
5544 case ANYOF_BLANK:
5545 if (LOC)
5546 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5547 else {
5548 for (value = 0; value < 256; value++)
5549 if (isBLANK(value))
5550 ANYOF_BITMAP_SET(ret, value);
5551 }
c49a72a9
NC
5552 yesno = '+';
5553 what = "Blank";
aaa51d5e
JF
5554 break;
5555 case ANYOF_NBLANK:
5556 if (LOC)
5557 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5558 else {
5559 for (value = 0; value < 256; value++)
5560 if (!isBLANK(value))
5561 ANYOF_BITMAP_SET(ret, value);
5562 }
c49a72a9
NC
5563 yesno = '!';
5564 what = "Blank";
aaa51d5e 5565 break;
73b437c8
JH
5566 case ANYOF_CNTRL:
5567 if (LOC)
936ed897 5568 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5569 else {
5570 for (value = 0; value < 256; value++)
5571 if (isCNTRL(value))
936ed897 5572 ANYOF_BITMAP_SET(ret, value);
73b437c8 5573 }
c49a72a9
NC
5574 yesno = '+';
5575 what = "Cntrl";
73b437c8
JH
5576 break;
5577 case ANYOF_NCNTRL:
5578 if (LOC)
936ed897 5579 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5580 else {
5581 for (value = 0; value < 256; value++)
5582 if (!isCNTRL(value))
936ed897 5583 ANYOF_BITMAP_SET(ret, value);
73b437c8 5584 }
c49a72a9
NC
5585 yesno = '!';
5586 what = "Cntrl";
ffc61ed2
JH
5587 break;
5588 case ANYOF_DIGIT:
5589 if (LOC)
5590 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5591 else {
5592 /* consecutive digits assumed */
5593 for (value = '0'; value <= '9'; value++)
5594 ANYOF_BITMAP_SET(ret, value);
5595 }
c49a72a9
NC
5596 yesno = '+';
5597 what = "Digit";
ffc61ed2
JH
5598 break;
5599 case ANYOF_NDIGIT:
5600 if (LOC)
5601 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5602 else {
5603 /* consecutive digits assumed */
5604 for (value = 0; value < '0'; value++)
5605 ANYOF_BITMAP_SET(ret, value);
5606 for (value = '9' + 1; value < 256; value++)
5607 ANYOF_BITMAP_SET(ret, value);
5608 }
c49a72a9
NC
5609 yesno = '!';
5610 what = "Digit";
73b437c8
JH
5611 break;
5612 case ANYOF_GRAPH:
5613 if (LOC)
936ed897 5614 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5615 else {
5616 for (value = 0; value < 256; value++)
5617 if (isGRAPH(value))
936ed897 5618 ANYOF_BITMAP_SET(ret, value);
73b437c8 5619 }
c49a72a9
NC
5620 yesno = '+';
5621 what = "Graph";
73b437c8
JH
5622 break;
5623 case ANYOF_NGRAPH:
5624 if (LOC)
936ed897 5625 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5626 else {
5627 for (value = 0; value < 256; value++)
5628 if (!isGRAPH(value))
936ed897 5629 ANYOF_BITMAP_SET(ret, value);
73b437c8 5630 }
c49a72a9
NC
5631 yesno = '!';
5632 what = "Graph";
73b437c8
JH
5633 break;
5634 case ANYOF_LOWER:
5635 if (LOC)
936ed897 5636 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5637 else {
5638 for (value = 0; value < 256; value++)
5639 if (isLOWER(value))
936ed897 5640 ANYOF_BITMAP_SET(ret, value);
73b437c8 5641 }
c49a72a9
NC
5642 yesno = '+';
5643 what = "Lower";
73b437c8
JH
5644 break;
5645 case ANYOF_NLOWER:
5646 if (LOC)
936ed897 5647 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5648 else {
5649 for (value = 0; value < 256; value++)
5650 if (!isLOWER(value))
936ed897 5651 ANYOF_BITMAP_SET(ret, value);
73b437c8 5652 }
c49a72a9
NC
5653 yesno = '!';
5654 what = "Lower";
73b437c8
JH
5655 break;
5656 case ANYOF_PRINT:
5657 if (LOC)
936ed897 5658 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5659 else {
5660 for (value = 0; value < 256; value++)
5661 if (isPRINT(value))
936ed897 5662 ANYOF_BITMAP_SET(ret, value);
73b437c8 5663 }
c49a72a9
NC
5664 yesno = '+';
5665 what = "Print";
73b437c8
JH
5666 break;
5667 case ANYOF_NPRINT:
5668 if (LOC)
936ed897 5669 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5670 else {
5671 for (value = 0; value < 256; value++)
5672 if (!isPRINT(value))
936ed897 5673 ANYOF_BITMAP_SET(ret, value);
73b437c8 5674 }
c49a72a9
NC
5675 yesno = '!';
5676 what = "Print";
73b437c8 5677 break;
aaa51d5e
JF
5678 case ANYOF_PSXSPC:
5679 if (LOC)
5680 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5681 else {
5682 for (value = 0; value < 256; value++)
5683 if (isPSXSPC(value))
5684 ANYOF_BITMAP_SET(ret, value);
5685 }
c49a72a9
NC
5686 yesno = '+';
5687 what = "Space";
aaa51d5e
JF
5688 break;
5689 case ANYOF_NPSXSPC:
5690 if (LOC)
5691 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5692 else {
5693 for (value = 0; value < 256; value++)
5694 if (!isPSXSPC(value))
5695 ANYOF_BITMAP_SET(ret, value);
5696 }
c49a72a9
NC
5697 yesno = '!';
5698 what = "Space";
aaa51d5e 5699 break;
73b437c8
JH
5700 case ANYOF_PUNCT:
5701 if (LOC)
936ed897 5702 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5703 else {
5704 for (value = 0; value < 256; value++)
5705 if (isPUNCT(value))
936ed897 5706 ANYOF_BITMAP_SET(ret, value);
73b437c8 5707 }
c49a72a9
NC
5708 yesno = '+';
5709 what = "Punct";
73b437c8
JH
5710 break;
5711 case ANYOF_NPUNCT:
5712 if (LOC)
936ed897 5713 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5714 else {
5715 for (value = 0; value < 256; value++)
5716 if (!isPUNCT(value))
936ed897 5717 ANYOF_BITMAP_SET(ret, value);
73b437c8 5718 }
c49a72a9
NC
5719 yesno = '!';
5720 what = "Punct";
ffc61ed2
JH
5721 break;
5722 case ANYOF_SPACE:
5723 if (LOC)
5724 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5725 else {
5726 for (value = 0; value < 256; value++)
5727 if (isSPACE(value))
5728 ANYOF_BITMAP_SET(ret, value);
5729 }
c49a72a9
NC
5730 yesno = '+';
5731 what = "SpacePerl";
ffc61ed2
JH
5732 break;
5733 case ANYOF_NSPACE:
5734 if (LOC)
5735 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5736 else {
5737 for (value = 0; value < 256; value++)
5738 if (!isSPACE(value))
5739 ANYOF_BITMAP_SET(ret, value);
5740 }
c49a72a9
NC
5741 yesno = '!';
5742 what = "SpacePerl";
73b437c8
JH
5743 break;
5744 case ANYOF_UPPER:
5745 if (LOC)
936ed897 5746 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5747 else {
5748 for (value = 0; value < 256; value++)
5749 if (isUPPER(value))
936ed897 5750 ANYOF_BITMAP_SET(ret, value);
73b437c8 5751 }
c49a72a9
NC
5752 yesno = '+';
5753 what = "Upper";
73b437c8
JH
5754 break;
5755 case ANYOF_NUPPER:
5756 if (LOC)
936ed897 5757 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5758 else {
5759 for (value = 0; value < 256; value++)
5760 if (!isUPPER(value))
936ed897 5761 ANYOF_BITMAP_SET(ret, value);
73b437c8 5762 }
c49a72a9
NC
5763 yesno = '!';
5764 what = "Upper";
73b437c8
JH
5765 break;
5766 case ANYOF_XDIGIT:
5767 if (LOC)
936ed897 5768 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5769 else {
5770 for (value = 0; value < 256; value++)
5771 if (isXDIGIT(value))
936ed897 5772 ANYOF_BITMAP_SET(ret, value);
73b437c8 5773 }
c49a72a9
NC
5774 yesno = '+';
5775 what = "XDigit";
73b437c8
JH
5776 break;
5777 case ANYOF_NXDIGIT:
5778 if (LOC)
936ed897 5779 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5780 else {
5781 for (value = 0; value < 256; value++)
5782 if (!isXDIGIT(value))
936ed897 5783 ANYOF_BITMAP_SET(ret, value);
73b437c8 5784 }
c49a72a9
NC
5785 yesno = '!';
5786 what = "XDigit";
73b437c8 5787 break;
f81125e2
JP
5788 case ANYOF_MAX:
5789 /* this is to handle \p and \P */
5790 break;
73b437c8 5791 default:
b45f050a 5792 vFAIL("Invalid [::] class");
73b437c8 5793 break;
b8c5462f 5794 }
c49a72a9
NC
5795 if (what) {
5796 /* Strings such as "+utf8::isWord\n" */
5797 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5798 }
b8c5462f 5799 if (LOC)
936ed897 5800 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5801 continue;
a0d0e21e 5802 }
ffc61ed2
JH
5803 } /* end of namedclass \blah */
5804
a0d0e21e 5805 if (range) {
eb160463 5806 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5807 const int w = RExC_parse - rangebegin;
5808 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5809 range = 0; /* not a valid range */
73b437c8 5810 }
a0d0e21e
LW
5811 }
5812 else {
3568d838 5813 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5814 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5815 RExC_parse[1] != ']') {
5816 RExC_parse++;
ffc61ed2
JH
5817
5818 /* a bad range like \w-, [:word:]- ? */
5819 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5820 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5821 const int w =
afd78fd5
JH
5822 RExC_parse >= rangebegin ?
5823 RExC_parse - rangebegin : 0;
830247a4 5824 vWARN4(RExC_parse,
b45f050a 5825 "False [] range \"%*.*s\"",
097eb12c 5826 w, w, rangebegin);
afd78fd5 5827 }
73b437c8 5828 if (!SIZE_ONLY)
936ed897 5829 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5830 } else
ffc61ed2
JH
5831 range = 1; /* yeah, it's a range! */
5832 continue; /* but do it the next time */
a0d0e21e 5833 }
a687059c 5834 }
ffc61ed2 5835
93733859 5836 /* now is the next time */
07be1b83 5837 /*stored += (value - prevvalue + 1);*/
ae5c130c 5838 if (!SIZE_ONLY) {
3568d838 5839 if (prevvalue < 256) {
1df70142 5840 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5841 IV i;
3568d838 5842#ifdef EBCDIC
1b2d223b
JH
5843 /* In EBCDIC [\x89-\x91] should include
5844 * the \x8e but [i-j] should not. */
5845 if (literal_endpoint == 2 &&
5846 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5847 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5848 {
3568d838
JH
5849 if (isLOWER(prevvalue)) {
5850 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5851 if (isLOWER(i))
5852 ANYOF_BITMAP_SET(ret, i);
5853 } else {
3568d838 5854 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5855 if (isUPPER(i))
5856 ANYOF_BITMAP_SET(ret, i);
5857 }
8ada0baa 5858 }
ffc61ed2 5859 else
8ada0baa 5860#endif
07be1b83
YO
5861 for (i = prevvalue; i <= ceilvalue; i++) {
5862 if (!ANYOF_BITMAP_TEST(ret,i)) {
5863 stored++;
5864 ANYOF_BITMAP_SET(ret, i);
5865 }
5866 }
3568d838 5867 }
a5961de5 5868 if (value > 255 || UTF) {
1df70142
AL
5869 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5870 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 5871 stored+=2; /* can't optimize this class */
ffc61ed2 5872 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5873 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5874 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5875 prevnatvalue, natvalue);
5876 }
5877 else if (prevnatvalue == natvalue) {
5878 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5879 if (FOLD) {
89ebb4a3 5880 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5881 STRLEN foldlen;
1df70142 5882 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5883
c840d2a2
JH
5884 /* If folding and foldable and a single
5885 * character, insert also the folded version
5886 * to the charclass. */
9e55ce06 5887 if (f != value) {
eb160463 5888 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5889 Perl_sv_catpvf(aTHX_ listsv,
5890 "%04"UVxf"\n", f);
5891 else {
5892 /* Any multicharacter foldings
5893 * require the following transform:
5894 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5895 * where E folds into "pq" and F folds
5896 * into "rst", all other characters
5897 * fold to single characters. We save
5898 * away these multicharacter foldings,
5899 * to be later saved as part of the
5900 * additional "s" data. */
5901 SV *sv;
5902
5903 if (!unicode_alternate)
5904 unicode_alternate = newAV();
5905 sv = newSVpvn((char*)foldbuf, foldlen);
5906 SvUTF8_on(sv);
5907 av_push(unicode_alternate, sv);
5908 }
5909 }
254ba52a 5910
60a8b682
JH
5911 /* If folding and the value is one of the Greek
5912 * sigmas insert a few more sigmas to make the
5913 * folding rules of the sigmas to work right.
5914 * Note that not all the possible combinations
5915 * are handled here: some of them are handled
9e55ce06
JH
5916 * by the standard folding rules, and some of
5917 * them (literal or EXACTF cases) are handled
5918 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5919 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5920 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5921 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5922 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5923 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5924 }
5925 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5926 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5927 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5928 }
5929 }
ffc61ed2 5930 }
1b2d223b
JH
5931#ifdef EBCDIC
5932 literal_endpoint = 0;
5933#endif
8ada0baa 5934 }
ffc61ed2
JH
5935
5936 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5937 }
ffc61ed2 5938
936ed897 5939 if (need_class) {
4f66b38d 5940 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5941 if (SIZE_ONLY)
830247a4 5942 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5943 else
830247a4 5944 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5945 }
ffc61ed2 5946
7f6f358c
YO
5947
5948 if (SIZE_ONLY)
5949 return ret;
5950 /****** !SIZE_ONLY AFTER HERE *********/
5951
5952 if( stored == 1 && value < 256
5953 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5954 ) {
5955 /* optimize single char class to an EXACT node
5956 but *only* when its not a UTF/high char */
07be1b83
YO
5957 const char * cur_parse= RExC_parse;
5958 RExC_emit = (regnode *)orig_emit;
5959 RExC_parse = (char *)orig_parse;
7f6f358c
YO
5960 ret = reg_node(pRExC_state,
5961 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 5962 RExC_parse = (char *)cur_parse;
7f6f358c
YO
5963 *STRING(ret)= (char)value;
5964 STR_LEN(ret)= 1;
5965 RExC_emit += STR_SZ(1);
5966 return ret;
5967 }
ae5c130c 5968 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5969 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5970 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5971 ) {
a0ed51b3 5972 for (value = 0; value < 256; ++value) {
936ed897 5973 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5974 UV fold = PL_fold[value];
ffc61ed2
JH
5975
5976 if (fold != value)
5977 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5978 }
5979 }
936ed897 5980 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5981 }
ffc61ed2 5982
ae5c130c 5983 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5984 if (optimize_invert &&
ffc61ed2
JH
5985 /* If the only flag is inversion. */
5986 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5987 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5988 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5989 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5990 }
7f6f358c 5991 {
097eb12c 5992 AV * const av = newAV();
ffc61ed2 5993 SV *rv;
9e55ce06 5994 /* The 0th element stores the character class description
6a0407ee 5995 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5996 * to initialize the appropriate swash (which gets stored in
5997 * the 1st element), and also useful for dumping the regnode.
5998 * The 2nd element stores the multicharacter foldings,
6a0407ee 5999 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6000 av_store(av, 0, listsv);
6001 av_store(av, 1, NULL);
9e55ce06 6002 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6003 rv = newRV_noinc((SV*)av);
19860706 6004 n = add_data(pRExC_state, 1, "s");
830247a4 6005 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6006 ARG_SET(ret, n);
a0ed51b3 6007 }
a0ed51b3
LW
6008 return ret;
6009}
6010
76e3520e 6011STATIC char*
830247a4 6012S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6013{
097eb12c 6014 char* const retval = RExC_parse++;
a0d0e21e 6015
4633a7c4 6016 for (;;) {
830247a4
IZ
6017 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6018 RExC_parse[2] == '#') {
e994fd66
AE
6019 while (*RExC_parse != ')') {
6020 if (RExC_parse == RExC_end)
6021 FAIL("Sequence (?#... not terminated");
830247a4 6022 RExC_parse++;
e994fd66 6023 }
830247a4 6024 RExC_parse++;
4633a7c4
LW
6025 continue;
6026 }
e2509266 6027 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6028 if (isSPACE(*RExC_parse)) {
6029 RExC_parse++;
748a9306
LW
6030 continue;
6031 }
830247a4 6032 else if (*RExC_parse == '#') {
e994fd66
AE
6033 while (RExC_parse < RExC_end)
6034 if (*RExC_parse++ == '\n') break;
748a9306
LW
6035 continue;
6036 }
748a9306 6037 }
4633a7c4 6038 return retval;
a0d0e21e 6039 }
a687059c
LW
6040}
6041
6042/*
c277df42 6043- reg_node - emit a node
a0d0e21e 6044*/
76e3520e 6045STATIC regnode * /* Location. */
830247a4 6046S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6047{
97aff369 6048 dVAR;
c277df42 6049 register regnode *ptr;
504618e9 6050 regnode * const ret = RExC_emit;
07be1b83 6051 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6052
c277df42 6053 if (SIZE_ONLY) {
830247a4
IZ
6054 SIZE_ALIGN(RExC_size);
6055 RExC_size += 1;
a0d0e21e
LW
6056 return(ret);
6057 }
c277df42 6058 NODE_ALIGN_FILL(ret);
a0d0e21e 6059 ptr = ret;
c277df42 6060 FILL_ADVANCE_NODE(ptr, op);
fac92740 6061 if (RExC_offsets) { /* MJD */
07be1b83 6062 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6063 "reg_node", __LINE__,
6064 reg_name[op],
07be1b83
YO
6065 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6066 ? "Overwriting end of array!\n" : "OK",
6067 (UV)(RExC_emit - RExC_emit_start),
6068 (UV)(RExC_parse - RExC_start),
6069 (UV)RExC_offsets[0]));
ccb2c380 6070 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6071 }
07be1b83 6072
830247a4 6073 RExC_emit = ptr;
a687059c 6074
a0d0e21e 6075 return(ret);
a687059c
LW
6076}
6077
6078/*
a0d0e21e
LW
6079- reganode - emit a node with an argument
6080*/
76e3520e 6081STATIC regnode * /* Location. */
830247a4 6082S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6083{
97aff369 6084 dVAR;
c277df42 6085 register regnode *ptr;
504618e9 6086 regnode * const ret = RExC_emit;
07be1b83 6087 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6088
c277df42 6089 if (SIZE_ONLY) {
830247a4
IZ
6090 SIZE_ALIGN(RExC_size);
6091 RExC_size += 2;
a0d0e21e
LW
6092 return(ret);
6093 }
fe14fcc3 6094
c277df42 6095 NODE_ALIGN_FILL(ret);
a0d0e21e 6096 ptr = ret;
c277df42 6097 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6098 if (RExC_offsets) { /* MJD */
07be1b83 6099 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6100 "reganode",
ccb2c380
MP
6101 __LINE__,
6102 reg_name[op],
07be1b83 6103 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6104 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6105 (UV)(RExC_emit - RExC_emit_start),
6106 (UV)(RExC_parse - RExC_start),
6107 (UV)RExC_offsets[0]));
ccb2c380 6108 Set_Cur_Node_Offset;
fac92740
MJD
6109 }
6110
830247a4 6111 RExC_emit = ptr;
fe14fcc3 6112
a0d0e21e 6113 return(ret);
fe14fcc3
LW
6114}
6115
6116/*
cd439c50 6117- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6118*/
71207a34
AL
6119STATIC STRLEN
6120S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6121{
97aff369 6122 dVAR;
71207a34 6123 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6124}
6125
6126/*
a0d0e21e
LW
6127- reginsert - insert an operator in front of already-emitted operand
6128*
6129* Means relocating the operand.
6130*/
76e3520e 6131STATIC void
830247a4 6132S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6133{
97aff369 6134 dVAR;
c277df42
IZ
6135 register regnode *src;
6136 register regnode *dst;
6137 register regnode *place;
504618e9 6138 const int offset = regarglen[(U8)op];
07be1b83 6139 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6140/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6141
6142 if (SIZE_ONLY) {
830247a4 6143 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6144 return;
6145 }
a687059c 6146
830247a4
IZ
6147 src = RExC_emit;
6148 RExC_emit += NODE_STEP_REGNODE + offset;
6149 dst = RExC_emit;
fac92740 6150 while (src > opnd) {
c277df42 6151 StructCopy(--src, --dst, regnode);
fac92740 6152 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6153 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6154 "reg_insert",
ccb2c380
MP
6155 __LINE__,
6156 reg_name[op],
07be1b83
YO
6157 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6158 ? "Overwriting end of array!\n" : "OK",
6159 (UV)(src - RExC_emit_start),
6160 (UV)(dst - RExC_emit_start),
6161 (UV)RExC_offsets[0]));
ccb2c380
MP
6162 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6163 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6164 }
6165 }
6166
a0d0e21e
LW
6167
6168 place = opnd; /* Op node, where operand used to be. */
fac92740 6169 if (RExC_offsets) { /* MJD */
07be1b83 6170 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6171 "reginsert",
ccb2c380
MP
6172 __LINE__,
6173 reg_name[op],
07be1b83 6174 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6175 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6176 (UV)(place - RExC_emit_start),
6177 (UV)(RExC_parse - RExC_start),
fac92740 6178 RExC_offsets[0]));
ccb2c380 6179 Set_Node_Offset(place, RExC_parse);
45948336 6180 Set_Node_Length(place, 1);
fac92740 6181 }
c277df42
IZ
6182 src = NEXTOPER(place);
6183 FILL_ADVANCE_NODE(place, op);
6184 Zero(src, offset, regnode);
a687059c
LW
6185}
6186
6187/*
c277df42 6188- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6189- SEE ALSO: regtail_study
a0d0e21e 6190*/
097eb12c 6191/* TODO: All three parms should be const */
76e3520e 6192STATIC void
3dab1dad 6193S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6194{
97aff369 6195 dVAR;
c277df42 6196 register regnode *scan;
72f13be8 6197 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
6198#ifndef DEBUGGING
6199 PERL_UNUSED_ARG(depth);
6200#endif
a0d0e21e 6201
c277df42 6202 if (SIZE_ONLY)
a0d0e21e
LW
6203 return;
6204
6205 /* Find last node. */
6206 scan = p;
6207 for (;;) {
504618e9 6208 regnode * const temp = regnext(scan);
3dab1dad
YO
6209 DEBUG_PARSE_r({
6210 SV * const mysv=sv_newmortal();
6211 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6212 regprop(RExC_rx, mysv, scan);
6213 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6214 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6215 });
6216 if (temp == NULL)
6217 break;
6218 scan = temp;
6219 }
6220
6221 if (reg_off_by_arg[OP(scan)]) {
6222 ARG_SET(scan, val - scan);
6223 }
6224 else {
6225 NEXT_OFF(scan) = val - scan;
6226 }
6227}
6228
07be1b83 6229#ifdef DEBUGGING
3dab1dad
YO
6230/*
6231- regtail_study - set the next-pointer at the end of a node chain of p to val.
6232- Look for optimizable sequences at the same time.
6233- currently only looks for EXACT chains.
07be1b83
YO
6234
6235This is expermental code. The idea is to use this routine to perform
6236in place optimizations on branches and groups as they are constructed,
6237with the long term intention of removing optimization from study_chunk so
6238that it is purely analytical.
6239
6240Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6241to control which is which.
6242
3dab1dad
YO
6243*/
6244/* TODO: All four parms should be const */
07be1b83 6245
3dab1dad
YO
6246STATIC U8
6247S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6248{
6249 dVAR;
6250 register regnode *scan;
07be1b83
YO
6251 U8 exact = PSEUDO;
6252#ifdef EXPERIMENTAL_INPLACESCAN
6253 I32 min = 0;
6254#endif
6255
3dab1dad
YO
6256 GET_RE_DEBUG_FLAGS_DECL;
6257
07be1b83 6258
3dab1dad
YO
6259 if (SIZE_ONLY)
6260 return exact;
6261
6262 /* Find last node. */
6263
6264 scan = p;
6265 for (;;) {
6266 regnode * const temp = regnext(scan);
07be1b83
YO
6267#ifdef EXPERIMENTAL_INPLACESCAN
6268 if (PL_regkind[OP(scan)] == EXACT)
6269 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6270 return EXACT;
6271#endif
3dab1dad
YO
6272 if ( exact ) {
6273 switch (OP(scan)) {
6274 case EXACT:
6275 case EXACTF:
6276 case EXACTFL:
6277 if( exact == PSEUDO )
6278 exact= OP(scan);
07be1b83
YO
6279 else if ( exact != OP(scan) )
6280 exact= 0;
3dab1dad
YO
6281 case NOTHING:
6282 break;
6283 default:
6284 exact= 0;
6285 }
6286 }
6287 DEBUG_PARSE_r({
6288 SV * const mysv=sv_newmortal();
6289 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6290 regprop(RExC_rx, mysv, scan);
6291 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6292 SvPV_nolen_const(mysv),
6293 reg_name[exact],
6294 REG_NODE_NUM(scan));
6295 });
a0d0e21e
LW
6296 if (temp == NULL)
6297 break;
6298 scan = temp;
6299 }
07be1b83
YO
6300 DEBUG_PARSE_r({
6301 SV * const mysv_val=sv_newmortal();
6302 DEBUG_PARSE_MSG("");
6303 regprop(RExC_rx, mysv_val, val);
6304 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6305 SvPV_nolen_const(mysv_val),
6306 REG_NODE_NUM(val),
6307 val - scan
6308 );
6309 });
c277df42
IZ
6310 if (reg_off_by_arg[OP(scan)]) {
6311 ARG_SET(scan, val - scan);
a0ed51b3
LW
6312 }
6313 else {
c277df42
IZ
6314 NEXT_OFF(scan) = val - scan;
6315 }
3dab1dad
YO
6316
6317 return exact;
a687059c 6318}
07be1b83 6319#endif
a687059c
LW
6320
6321/*
a687059c
LW
6322 - regcurly - a little FSA that accepts {\d+,?\d*}
6323 */
79072805 6324STATIC I32
5f66b61c 6325S_regcurly(register const char *s)
a687059c
LW
6326{
6327 if (*s++ != '{')
6328 return FALSE;
f0fcb552 6329 if (!isDIGIT(*s))
a687059c 6330 return FALSE;
f0fcb552 6331 while (isDIGIT(*s))
a687059c
LW
6332 s++;
6333 if (*s == ',')
6334 s++;
f0fcb552 6335 while (isDIGIT(*s))
a687059c
LW
6336 s++;
6337 if (*s != '}')
6338 return FALSE;
6339 return TRUE;
6340}
6341
a687059c
LW
6342
6343/*
fd181c75 6344 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6345 */
6346void
097eb12c 6347Perl_regdump(pTHX_ const regexp *r)
a687059c 6348{
35ff7856 6349#ifdef DEBUGGING
97aff369 6350 dVAR;
c445ea15 6351 SV * const sv = sv_newmortal();
ab3bbdeb 6352 SV *dsv= sv_newmortal();
a687059c 6353
4f639d21 6354 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
6355
6356 /* Header fields of interest. */
ab3bbdeb
YO
6357 if (r->anchored_substr) {
6358 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6359 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 6360 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6361 "anchored %s%s at %"IVdf" ",
6362 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 6363 (IV)r->anchored_offset);
ab3bbdeb
YO
6364 } else if (r->anchored_utf8) {
6365 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6366 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 6367 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6368 "anchored utf8 %s%s at %"IVdf" ",
6369 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 6370 (IV)r->anchored_offset);
ab3bbdeb
YO
6371 }
6372 if (r->float_substr) {
6373 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6374 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 6375 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6376 "floating %s%s at %"IVdf"..%"UVuf" ",
6377 s, RE_SV_TAIL(r->float_substr),
7b0972df 6378 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
6379 } else if (r->float_utf8) {
6380 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6381 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 6382 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6383 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6384 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 6385 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 6386 }
33b8afdf 6387 if (r->check_substr || r->check_utf8)
b81d288d
AB
6388 PerlIO_printf(Perl_debug_log,
6389 r->check_substr == r->float_substr
33b8afdf 6390 && r->check_utf8 == r->float_utf8
c277df42
IZ
6391 ? "(checking floating" : "(checking anchored");
6392 if (r->reganch & ROPT_NOSCAN)
6393 PerlIO_printf(Perl_debug_log, " noscan");
6394 if (r->reganch & ROPT_CHECK_ALL)
6395 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6396 if (r->check_substr || r->check_utf8)
c277df42
IZ
6397 PerlIO_printf(Perl_debug_log, ") ");
6398
46fc3d4c 6399 if (r->regstclass) {
32fc9b6a 6400 regprop(r, sv, r->regstclass);
3f7c398e 6401 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6402 }
774d564b 6403 if (r->reganch & ROPT_ANCH) {
6404 PerlIO_printf(Perl_debug_log, "anchored");
6405 if (r->reganch & ROPT_ANCH_BOL)
6406 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6407 if (r->reganch & ROPT_ANCH_MBOL)
6408 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6409 if (r->reganch & ROPT_ANCH_SBOL)
6410 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6411 if (r->reganch & ROPT_ANCH_GPOS)
6412 PerlIO_printf(Perl_debug_log, "(GPOS)");
6413 PerlIO_putc(Perl_debug_log, ' ');
6414 }
c277df42
IZ
6415 if (r->reganch & ROPT_GPOS_SEEN)
6416 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6417 if (r->reganch & ROPT_SKIP)
760ac839 6418 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6419 if (r->reganch & ROPT_IMPLICIT)
760ac839 6420 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6421 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6422 if (r->reganch & ROPT_EVAL_SEEN)
6423 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6424 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6425#else
96a5add6 6426 PERL_UNUSED_CONTEXT;
65e66c80 6427 PERL_UNUSED_ARG(r);
17c3b450 6428#endif /* DEBUGGING */
a687059c
LW
6429}
6430
6431/*
a0d0e21e
LW
6432- regprop - printable representation of opcode
6433*/
46fc3d4c 6434void
32fc9b6a 6435Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6436{
35ff7856 6437#ifdef DEBUGGING
97aff369 6438 dVAR;
9b155405 6439 register int k;
a0d0e21e 6440
54dc92de 6441 sv_setpvn(sv, "", 0);
9b155405 6442 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6443 /* It would be nice to FAIL() here, but this may be called from
6444 regexec.c, and it would be hard to supply pRExC_state. */
6445 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6446 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6447
3dab1dad 6448 k = PL_regkind[OP(o)];
9b155405 6449
2a782b5b 6450 if (k == EXACT) {
396482e1 6451 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
6452 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6453 * is a crude hack but it may be the best for now since
6454 * we have no flag "this EXACTish node was UTF-8"
6455 * --jhi */
6456 const char * const s =
ddc5bc0f 6457 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
6458 PL_colors[0], PL_colors[1],
6459 PERL_PV_ESCAPE_UNI_DETECT |
6460 PERL_PV_PRETTY_ELIPSES |
6461 PERL_PV_PRETTY_LTGT
6462 );
6463 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 6464 } else if (k == TRIE) {
3dab1dad
YO
6465 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6466 /* print the details of the trie in dumpuntil instead, as
4f639d21 6467 * prog->data isn't available here */
a3621e74 6468 } else if (k == CURLY) {
cb434fcc 6469 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6470 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6471 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6472 }
2c2d71f5
JH
6473 else if (k == WHILEM && o->flags) /* Ordinal/of */
6474 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6475 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6476 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6477 else if (k == LOGICAL)
04ebc1ab 6478 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6479 else if (k == ANYOF) {
6480 int i, rangestart = -1;
2d03de9c 6481 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6482
6483 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6484 static const char * const anyofs[] = {
653099ff
GS
6485 "\\w",
6486 "\\W",
6487 "\\s",
6488 "\\S",
6489 "\\d",
6490 "\\D",
6491 "[:alnum:]",
6492 "[:^alnum:]",
6493 "[:alpha:]",
6494 "[:^alpha:]",
6495 "[:ascii:]",
6496 "[:^ascii:]",
6497 "[:ctrl:]",
6498 "[:^ctrl:]",
6499 "[:graph:]",
6500 "[:^graph:]",
6501 "[:lower:]",
6502 "[:^lower:]",
6503 "[:print:]",
6504 "[:^print:]",
6505 "[:punct:]",
6506 "[:^punct:]",
6507 "[:upper:]",
aaa51d5e 6508 "[:^upper:]",
653099ff 6509 "[:xdigit:]",
aaa51d5e
JF
6510 "[:^xdigit:]",
6511 "[:space:]",
6512 "[:^space:]",
6513 "[:blank:]",
6514 "[:^blank:]"
653099ff
GS
6515 };
6516
19860706 6517 if (flags & ANYOF_LOCALE)
396482e1 6518 sv_catpvs(sv, "{loc}");
19860706 6519 if (flags & ANYOF_FOLD)
396482e1 6520 sv_catpvs(sv, "{i}");
653099ff 6521 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6522 if (flags & ANYOF_INVERT)
396482e1 6523 sv_catpvs(sv, "^");
ffc61ed2
JH
6524 for (i = 0; i <= 256; i++) {
6525 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6526 if (rangestart == -1)
6527 rangestart = i;
6528 } else if (rangestart != -1) {
6529 if (i <= rangestart + 3)
6530 for (; rangestart < i; rangestart++)
653099ff 6531 put_byte(sv, rangestart);
ffc61ed2
JH
6532 else {
6533 put_byte(sv, rangestart);
396482e1 6534 sv_catpvs(sv, "-");
ffc61ed2 6535 put_byte(sv, i - 1);
653099ff 6536 }
ffc61ed2 6537 rangestart = -1;
653099ff 6538 }
847a199f 6539 }
ffc61ed2
JH
6540
6541 if (o->flags & ANYOF_CLASS)
bb7a0f54 6542 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6543 if (ANYOF_CLASS_TEST(o,i))
6544 sv_catpv(sv, anyofs[i]);
6545
6546 if (flags & ANYOF_UNICODE)
396482e1 6547 sv_catpvs(sv, "{unicode}");
1aa99e6b 6548 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6549 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6550
6551 {
6552 SV *lv;
32fc9b6a 6553 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6554
ffc61ed2
JH
6555 if (lv) {
6556 if (sw) {
89ebb4a3 6557 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6558
ffc61ed2 6559 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6560 uvchr_to_utf8(s, i);
ffc61ed2 6561
3568d838 6562 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6563 if (rangestart == -1)
6564 rangestart = i;
6565 } else if (rangestart != -1) {
ffc61ed2
JH
6566 if (i <= rangestart + 3)
6567 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6568 const U8 * const e = uvchr_to_utf8(s,rangestart);
6569 U8 *p;
6570 for(p = s; p < e; p++)
ffc61ed2
JH
6571 put_byte(sv, *p);
6572 }
6573 else {
2d03de9c
AL
6574 const U8 *e = uvchr_to_utf8(s,rangestart);
6575 U8 *p;
6576 for (p = s; p < e; p++)
ffc61ed2 6577 put_byte(sv, *p);
396482e1 6578 sv_catpvs(sv, "-");
2d03de9c
AL
6579 e = uvchr_to_utf8(s, i-1);
6580 for (p = s; p < e; p++)
1df70142 6581 put_byte(sv, *p);
ffc61ed2
JH
6582 }
6583 rangestart = -1;
6584 }
19860706 6585 }
ffc61ed2 6586
396482e1 6587 sv_catpvs(sv, "..."); /* et cetera */
19860706 6588 }
fde631ed 6589
ffc61ed2 6590 {
2e0de35c 6591 char *s = savesvpv(lv);
c445ea15 6592 char * const origs = s;
b81d288d 6593
3dab1dad
YO
6594 while (*s && *s != '\n')
6595 s++;
b81d288d 6596
ffc61ed2 6597 if (*s == '\n') {
2d03de9c 6598 const char * const t = ++s;
ffc61ed2
JH
6599
6600 while (*s) {
6601 if (*s == '\n')
6602 *s = ' ';
6603 s++;
6604 }
6605 if (s[-1] == ' ')
6606 s[-1] = 0;
6607
6608 sv_catpv(sv, t);
fde631ed 6609 }
b81d288d 6610
ffc61ed2 6611 Safefree(origs);
fde631ed
JH
6612 }
6613 }
653099ff 6614 }
ffc61ed2 6615
653099ff
GS
6616 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6617 }
9b155405 6618 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 6619 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 6620#else
96a5add6 6621 PERL_UNUSED_CONTEXT;
65e66c80
SP
6622 PERL_UNUSED_ARG(sv);
6623 PERL_UNUSED_ARG(o);
f9049ba1 6624 PERL_UNUSED_ARG(prog);
17c3b450 6625#endif /* DEBUGGING */
35ff7856 6626}
a687059c 6627
cad2e5aa
JH
6628SV *
6629Perl_re_intuit_string(pTHX_ regexp *prog)
6630{ /* Assume that RE_INTUIT is set */
97aff369 6631 dVAR;
a3621e74 6632 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6633 PERL_UNUSED_CONTEXT;
6634
a3621e74 6635 DEBUG_COMPILE_r(
cfd0369c 6636 {
2d03de9c 6637 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6638 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6639
6640 if (!PL_colorset) reginitcolors();
6641 PerlIO_printf(Perl_debug_log,
a0288114 6642 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6643 PL_colors[4],
6644 prog->check_substr ? "" : "utf8 ",
6645 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6646 s,
6647 PL_colors[1],
6648 (strlen(s) > 60 ? "..." : ""));
6649 } );
6650
33b8afdf 6651 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6652}
6653
2b69d0c2 6654void
864dbfa3 6655Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6656{
27da23d5 6657 dVAR;
0df25f3d
YO
6658
6659
6660
fc32ee4a 6661 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6662
7821416a
IZ
6663 if (!r || (--r->refcnt > 0))
6664 return;
ab3bbdeb 6665 DEBUG_COMPILE_r({
0df25f3d
YO
6666 if (!PL_colorset)
6667 reginitcolors();
ab3bbdeb
YO
6668 if (RX_DEBUG(r)){
6669 SV *dsv= sv_newmortal();
6670 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
6671 dsv, r->precomp, r->prelen, 60);
6672 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
6673 PL_colors[4],PL_colors[5],s);
6674 }
9e55ce06 6675 });
cad2e5aa 6676
43c5f42d
NC
6677 /* gcov results gave these as non-null 100% of the time, so there's no
6678 optimisation in checking them before calling Safefree */
6679 Safefree(r->precomp);
6680 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6681 RX_MATCH_COPY_FREE(r);
f8c7b90f 6682#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6683 if (r->saved_copy)
6684 SvREFCNT_dec(r->saved_copy);
6685#endif
a193d654
GS
6686 if (r->substrs) {
6687 if (r->anchored_substr)
6688 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6689 if (r->anchored_utf8)
6690 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6691 if (r->float_substr)
6692 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6693 if (r->float_utf8)
6694 SvREFCNT_dec(r->float_utf8);
2779dcf1 6695 Safefree(r->substrs);
a193d654 6696 }
c277df42
IZ
6697 if (r->data) {
6698 int n = r->data->count;
f3548bdc
DM
6699 PAD* new_comppad = NULL;
6700 PAD* old_comppad;
4026c95a 6701 PADOFFSET refcnt;
dfad63ad 6702
c277df42 6703 while (--n >= 0) {
261faec3 6704 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6705 switch (r->data->what[n]) {
6706 case 's':
6707 SvREFCNT_dec((SV*)r->data->data[n]);
6708 break;
653099ff
GS
6709 case 'f':
6710 Safefree(r->data->data[n]);
6711 break;
dfad63ad
HS
6712 case 'p':
6713 new_comppad = (AV*)r->data->data[n];
6714 break;
c277df42 6715 case 'o':
dfad63ad 6716 if (new_comppad == NULL)
cea2e8a9 6717 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6718 PAD_SAVE_LOCAL(old_comppad,
6719 /* Watch out for global destruction's random ordering. */
c445ea15 6720 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6721 );
b34c0dd4 6722 OP_REFCNT_LOCK;
4026c95a
SH
6723 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6724 OP_REFCNT_UNLOCK;
6725 if (!refcnt)
9b978d73 6726 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6727
f3548bdc 6728 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6729 SvREFCNT_dec((SV*)new_comppad);
6730 new_comppad = NULL;
c277df42
IZ
6731 break;
6732 case 'n':
9e55ce06 6733 break;
07be1b83 6734 case 'T':
be8e71aa
YO
6735 { /* Aho Corasick add-on structure for a trie node.
6736 Used in stclass optimization only */
07be1b83
YO
6737 U32 refcount;
6738 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6739 OP_REFCNT_LOCK;
6740 refcount = --aho->refcount;
6741 OP_REFCNT_UNLOCK;
6742 if ( !refcount ) {
6743 Safefree(aho->states);
6744 Safefree(aho->fail);
6745 aho->trie=NULL; /* not necessary to free this as it is
6746 handled by the 't' case */
6747 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 6748 Safefree(r->regstclass);
07be1b83
YO
6749 }
6750 }
6751 break;
a3621e74 6752 case 't':
07be1b83 6753 {
be8e71aa 6754 /* trie structure. */
07be1b83
YO
6755 U32 refcount;
6756 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6757 OP_REFCNT_LOCK;
6758 refcount = --trie->refcount;
6759 OP_REFCNT_UNLOCK;
6760 if ( !refcount ) {
6761 Safefree(trie->charmap);
6762 if (trie->widecharmap)
6763 SvREFCNT_dec((SV*)trie->widecharmap);
6764 Safefree(trie->states);
6765 Safefree(trie->trans);
6766 if (trie->bitmap)
6767 Safefree(trie->bitmap);
6768 if (trie->wordlen)
6769 Safefree(trie->wordlen);
a3621e74 6770#ifdef DEBUGGING
be8e71aa
YO
6771 if (RX_DEBUG(r)) {
6772 if (trie->words)
6773 SvREFCNT_dec((SV*)trie->words);
6774 if (trie->revcharmap)
6775 SvREFCNT_dec((SV*)trie->revcharmap);
6776 }
a3621e74 6777#endif
07be1b83 6778 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 6779 }
07be1b83
YO
6780 }
6781 break;
c277df42 6782 default:
830247a4 6783 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6784 }
6785 }
6786 Safefree(r->data->what);
6787 Safefree(r->data);
a0d0e21e
LW
6788 }
6789 Safefree(r->startp);
6790 Safefree(r->endp);
6791 Safefree(r);
a687059c 6792}
c277df42 6793
76234dfb 6794#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6795/*
6796 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6797 */
6798regnode *
864dbfa3 6799Perl_regnext(pTHX_ register regnode *p)
c277df42 6800{
97aff369 6801 dVAR;
c277df42
IZ
6802 register I32 offset;
6803
3280af22 6804 if (p == &PL_regdummy)
c277df42
IZ
6805 return(NULL);
6806
6807 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6808 if (offset == 0)
6809 return(NULL);
6810
c277df42 6811 return(p+offset);
c277df42 6812}
76234dfb 6813#endif
c277df42 6814
01f988be 6815STATIC void
cea2e8a9 6816S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6817{
6818 va_list args;
6819 STRLEN l1 = strlen(pat1);
6820 STRLEN l2 = strlen(pat2);
6821 char buf[512];
06bf62c7 6822 SV *msv;
73d840c0 6823 const char *message;
c277df42
IZ
6824
6825 if (l1 > 510)
6826 l1 = 510;
6827 if (l1 + l2 > 510)
6828 l2 = 510 - l1;
6829 Copy(pat1, buf, l1 , char);
6830 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6831 buf[l1 + l2] = '\n';
6832 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6833#ifdef I_STDARG
6834 /* ANSI variant takes additional second argument */
c277df42 6835 va_start(args, pat2);
8736538c
AS
6836#else
6837 va_start(args);
6838#endif
5a844595 6839 msv = vmess(buf, &args);
c277df42 6840 va_end(args);
cfd0369c 6841 message = SvPV_const(msv,l1);
c277df42
IZ
6842 if (l1 > 512)
6843 l1 = 512;
6844 Copy(message, buf, l1 , char);
197cf9b9 6845 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6846 Perl_croak(aTHX_ "%s", buf);
c277df42 6847}
a0ed51b3
LW
6848
6849/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6850
76234dfb 6851#ifndef PERL_IN_XSUB_RE
a0ed51b3 6852void
864dbfa3 6853Perl_save_re_context(pTHX)
b81d288d 6854{
97aff369 6855 dVAR;
1ade1aa1
NC
6856
6857 struct re_save_state *state;
6858
6859 SAVEVPTR(PL_curcop);
6860 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6861
6862 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6863 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6864 SSPUSHINT(SAVEt_RE_STATE);
6865
46ab3289 6866 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6867
a0ed51b3 6868 PL_reg_start_tmp = 0;
a0ed51b3 6869 PL_reg_start_tmpl = 0;
c445ea15 6870 PL_reg_oldsaved = NULL;
a5db57d6 6871 PL_reg_oldsavedlen = 0;
a5db57d6 6872 PL_reg_maxiter = 0;
a5db57d6 6873 PL_reg_leftiter = 0;
c445ea15 6874 PL_reg_poscache = NULL;
a5db57d6 6875 PL_reg_poscache_size = 0;
1ade1aa1
NC
6876#ifdef PERL_OLD_COPY_ON_WRITE
6877 PL_nrs = NULL;
6878#endif
ada6e8a9 6879
c445ea15
AL
6880 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6881 if (PL_curpm) {
6882 const REGEXP * const rx = PM_GETRE(PL_curpm);
6883 if (rx) {
1df70142 6884 U32 i;
ada6e8a9 6885 for (i = 1; i <= rx->nparens; i++) {
1df70142 6886 char digits[TYPE_CHARS(long)];
d9fad198 6887 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6888 GV *const *const gvp
6889 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6890
b37c2d43
AL
6891 if (gvp) {
6892 GV * const gv = *gvp;
6893 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6894 save_scalar(gv);
49f27e4b 6895 }
ada6e8a9
AMS
6896 }
6897 }
6898 }
a0ed51b3 6899}
76234dfb 6900#endif
51371543 6901
51371543 6902static void
acfe0abc 6903clear_re(pTHX_ void *r)
51371543 6904{
97aff369 6905 dVAR;
51371543
GS
6906 ReREFCNT_dec((regexp *)r);
6907}
ffbc6a93 6908
a28509cc
AL
6909#ifdef DEBUGGING
6910
6911STATIC void
6912S_put_byte(pTHX_ SV *sv, int c)
6913{
6914 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6915 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6916 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6917 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6918 else
6919 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6920}
6921
3dab1dad
YO
6922#define CLEAR_OPTSTART \
6923 if (optstart) STMT_START { \
07be1b83 6924 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
6925 optstart=NULL; \
6926 } STMT_END
6927
6928#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6929
b5a2f8d8
NC
6930STATIC const regnode *
6931S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6932 const regnode *last, SV* sv, I32 l)
a28509cc 6933{
97aff369 6934 dVAR;
a28509cc 6935 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6936 register const regnode *next;
3dab1dad
YO
6937 const regnode *optstart= NULL;
6938 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6939
6940 while (op != END && (!last || node < last)) {
6941 /* While that wasn't END last time... */
6942
6943 NODE_ALIGN(node);
6944 op = OP(node);
6945 if (op == CLOSE)
6946 l--;
b5a2f8d8 6947 next = regnext((regnode *)node);
07be1b83 6948
a28509cc 6949 /* Where, what. */
8e11feef 6950 if (OP(node) == OPTIMIZED) {
e68ec53f 6951 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 6952 optstart = node;
3dab1dad 6953 else
8e11feef 6954 goto after_print;
3dab1dad
YO
6955 } else
6956 CLEAR_OPTSTART;
07be1b83 6957
32fc9b6a 6958 regprop(r, sv, node);
a28509cc
AL
6959 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6960 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6961
6962 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
6963 if (next == NULL) /* Next ptr. */
6964 PerlIO_printf(Perl_debug_log, "(0)");
6965 else
6966 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6967 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6968 }
6969
a28509cc
AL
6970 after_print:
6971 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
6972 assert(next);
6973 {
6974 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
6975 ? regnext((regnode *)next)
6976 : next);
be8e71aa
YO
6977 if (last && nnode > last)
6978 nnode = last;
6979 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6980 }
a28509cc
AL
6981 }
6982 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 6983 assert(next);
3dab1dad 6984 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6985 }
6986 else if ( PL_regkind[(U8)op] == TRIE ) {
6987 const I32 n = ARG(node);
4f639d21 6988 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6989 const I32 arry_len = av_len(trie->words)+1;
6990 I32 word_idx;
6991 PerlIO_printf(Perl_debug_log,
07be1b83 6992 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
8e11feef
RGS
6993 (int)(2*(l+3)),
6994 "",
6995 trie->startstate,
6996 TRIE_WORDCOUNT(trie),
6997 (int)TRIE_CHARCOUNT(trie),
6998 trie->uniquecharcount,
6999 (IV)TRIE_LASTSTATE(trie)-1,
be8e71aa
YO
7000 (int)trie->minlen,
7001 (int)trie->maxlen
8e11feef 7002 );
3dab1dad
YO
7003 if (trie->bitmap) {
7004 int i;
7005 int rangestart= -1;
7006 sv_setpvn(sv, "", 0);
7007 for (i = 0; i <= 256; i++) {
7008 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
7009 if (rangestart == -1)
7010 rangestart = i;
7011 } else if (rangestart != -1) {
7012 if (i <= rangestart + 3)
7013 for (; rangestart < i; rangestart++)
7014 put_byte(sv, rangestart);
7015 else {
7016 put_byte(sv, rangestart);
7017 sv_catpvs(sv, "-");
7018 put_byte(sv, i - 1);
7019 }
7020 rangestart = -1;
7021 }
7022 }
07be1b83 7023 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
3dab1dad 7024 } else
07be1b83 7025 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
a28509cc
AL
7026
7027 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 7028 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
ab3bbdeb
YO
7029 if (elem_ptr)
7030 PerlIO_printf(Perl_debug_log, "%*s%s\n",
a28509cc 7031 (int)(2*(l+4)), "",
ddc5bc0f 7032 pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
7033 PL_colors[0], PL_colors[1],
7034 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7035 PERL_PV_PRETTY_ELIPSES |
7036 PERL_PV_PRETTY_LTGT
7037 )
a28509cc 7038 );
a28509cc 7039 }
a28509cc
AL
7040
7041 node = NEXTOPER(node);
7042 node += regarglen[(U8)op];
7043
7044 }
7045 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 7046 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7047 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7048 }
7049 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7050 assert(next);
3dab1dad 7051 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7052 next, sv, l + 1);
7053 }
7054 else if ( op == PLUS || op == STAR) {
3dab1dad 7055 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
7056 }
7057 else if (op == ANYOF) {
7058 /* arglen 1 + class block */
7059 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7060 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7061 node = NEXTOPER(node);
7062 }
7063 else if (PL_regkind[(U8)op] == EXACT) {
7064 /* Literal string, where present. */
7065 node += NODE_SZ_STR(node) - 1;
7066 node = NEXTOPER(node);
7067 }
7068 else {
7069 node = NEXTOPER(node);
7070 node += regarglen[(U8)op];
7071 }
7072 if (op == CURLYX || op == OPEN)
7073 l++;
7074 else if (op == WHILEM)
7075 l--;
7076 }
3dab1dad 7077 CLEAR_OPTSTART;
a28509cc
AL
7078 return node;
7079}
7080
7081#endif /* DEBUGGING */
7082
241d1a3b
NC
7083/*
7084 * Local variables:
7085 * c-indentation-style: bsd
7086 * c-basic-offset: 4
7087 * indent-tabs-mode: t
7088 * End:
7089 *
37442d52
RGS
7090 * ex: set ts=8 sts=4 sw=4 noet:
7091 */