This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
I thought I had fixed the spelling of DOES() in universal.t, but
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
174#define TRIE_STCLASS
175#endif
2c2d71f5
JH
176/* Length of a variant. */
177
178typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
cb434fcc 195 I32 *last_closep;
653099ff 196 struct regnode_charclass_class *start_class;
2c2d71f5
JH
197} scan_data_t;
198
a687059c 199/*
e50aee73 200 * Forward declarations for pregcomp()'s friends.
a687059c 201 */
a0d0e21e 202
27da23d5
JH
203static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
205
206#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
207#define SF_BEFORE_SEOL 0x0001
208#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
209#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
09b7f37c
CB
212#ifdef NO_UNARY_PLUS
213# define SF_FIX_SHIFT_EOL (0+2)
214# define SF_FL_SHIFT_EOL (0+4)
215#else
216# define SF_FIX_SHIFT_EOL (+2)
217# define SF_FL_SHIFT_EOL (+4)
218#endif
c277df42
IZ
219
220#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
225#define SF_IS_INF 0x0040
226#define SF_HAS_PAR 0x0080
227#define SF_IN_PAR 0x0100
228#define SF_HAS_EVAL 0x0200
229#define SCF_DO_SUBSTR 0x0400
653099ff
GS
230#define SCF_DO_STCLASS_AND 0x0800
231#define SCF_DO_STCLASS_OR 0x1000
232#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 233#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 234
07be1b83
YO
235#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
07be1b83 393 * Position is 1 indexed.
fac92740
MJD
394 */
395
ccb2c380
MP
396#define Set_Node_Offset_To_R(node,byte) STMT_START { \
397 if (! SIZE_ONLY) { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 399 __LINE__, (node), (int)(byte))); \
ccb2c380 400 if((node) < 0) { \
551405c4 401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
402 } else { \
403 RExC_offsets[2*(node)-1] = (byte); \
404 } \
405 } \
406} STMT_END
407
408#define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
411
412#define Set_Node_Length_To_R(node,len) STMT_START { \
413 if (! SIZE_ONLY) { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 415 __LINE__, (int)(node), (int)(len))); \
ccb2c380 416 if((node) < 0) { \
551405c4 417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
418 } else { \
419 RExC_offsets[2*(node)] = (len); \
420 } \
421 } \
422} STMT_END
423
424#define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427#define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
429
430/* Get offsets and lengths */
431#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
433
07be1b83
YO
434#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
437} STMT_END
438
439
440#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441#define EXPERIMENTAL_INPLACESCAN
442#endif
443
acfe0abc 444static void clear_re(pTHX_ void *r);
4327152a 445
653099ff
GS
446/* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
449
4327152a 450STATIC void
097eb12c 451S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 452{
e1ec3a88
AL
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 455
c277df42 456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 457 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
b81d288d 461 data->flags
c277df42
IZ
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
463 else
464 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
465 }
466 else {
c277df42 467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
468 data->offset_float_max = (l
469 ? data->last_start_max
c277df42 470 : data->pos_min + data->pos_delta);
9051bda5
HS
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
c277df42 473 if (data->flags & SF_BEFORE_EOL)
b81d288d 474 data->flags
c277df42
IZ
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
476 else
477 data->flags &= ~SF_FL_BEFORE_EOL;
478 }
479 }
480 SvCUR_set(data->last_found, 0);
0eda9292 481 {
a28509cc 482 SV * const sv = data->last_found;
097eb12c
AL
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
485 if (mg)
486 mg->mg_len = 0;
487 }
0eda9292 488 }
c277df42
IZ
489 data->last_end = -1;
490 data->flags &= ~SF_BEFORE_EOL;
491}
492
653099ff
GS
493/* Can match anything (initialization) */
494STATIC void
097eb12c 495S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 496{
653099ff 497 ANYOF_CLASS_ZERO(cl);
f8bef550 498 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
500 if (LOC)
501 cl->flags |= ANYOF_LOCALE;
502}
503
504/* Can match anything (initialization) */
505STATIC int
5f66b61c 506S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
507{
508 int value;
509
aaa51d5e 510 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
512 return 1;
1aa99e6b
IH
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
514 return 0;
f8bef550
NC
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 return 0;
653099ff
GS
517 return 1;
518}
519
520/* Can match anything (initialization) */
521STATIC void
097eb12c 522S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
8ecf7187 524 Zero(cl, 1, struct regnode_charclass_class);
653099ff 525 cl->type = ANYOF;
830247a4 526 cl_anything(pRExC_state, cl);
653099ff
GS
527}
528
529STATIC void
097eb12c 530S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 531{
8ecf7187 532 Zero(cl, 1, struct regnode_charclass_class);
653099ff 533 cl->type = ANYOF;
830247a4 534 cl_anything(pRExC_state, cl);
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* 'And' a given class with another one. Can create false positives */
540/* We assume that cl is not inverted */
541STATIC void
5f66b61c 542S_cl_and(struct regnode_charclass_class *cl,
a28509cc 543 const struct regnode_charclass_class *and_with)
653099ff 544{
653099ff
GS
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
550 int i;
551
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
555 else
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
1aa99e6b 561
14ebb1a2
JH
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
567 }
14ebb1a2
JH
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 570 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 573 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
574}
575
576/* 'OR' a given class with another one. Can create false positives */
577/* We assume that cl is not inverted */
578STATIC void
097eb12c 579S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 580{
653099ff
GS
581 if (or_with->flags & ANYOF_INVERT) {
582 /* We do not use
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
590 */
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
594 int i;
595
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
599 else {
830247a4 600 cl_anything(pRExC_state, cl);
653099ff
GS
601 }
602 } else {
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 605 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
606 || (cl->flags & ANYOF_FOLD)) ) {
607 int i;
608
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
616 }
617 }
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 619 cl_anything(pRExC_state, cl);
653099ff
GS
620 }
621 }
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
624
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
633 }
653099ff
GS
634}
635
5d1c421c 636/*
a3621e74 637
3dab1dad 638 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 646 depth : indent depth
a3621e74
YO
647
648Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
649
650A trie is an N'ary tree where the branches are determined by digital
651decomposition of the key. IE, at the root node you look up the 1st character and
652follow that branch repeat until you find the end of the branches. Nodes can be
653marked as "accepting" meaning they represent a complete word. Eg:
654
655 /he|she|his|hers/
656
657would convert into the following structure. Numbers represent states, letters
658following numbers represent valid transitions on the letter from that state, if
659the number is in square brackets it represents an accepting state, otherwise it
660will be in parenthesis.
661
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
663 | |
664 | (2)
665 | |
666 (1) +-i->(6)-+-s->[7]
667 |
668 +-s->(3)-+-h->(4)-+-e->[5]
669
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
671
672This shows that when matching against the string 'hers' we will begin at state 1
673read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676single traverse. We store a mapping from accepting to state to which word was
677matched, and then when we have multiple possibilities we try to complete the
678rest of the regex in the order in which they occured in the alternation.
679
680The only prior NFA like behaviour that would be changed by the TRIE support is
681the silent ignoring of duplicate alternations which are of the form:
682
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
684
685Thus EVAL blocks follwing a trie may be called a different number of times with
686and without the optimisation. With the optimisations dupes will be silently
687ignored. This inconsistant behaviour of EVAL type nodes is well established as
688the following demonstrates:
689
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
691
692which prints out 'word' three times, but
693
694 'words'=~/(word|word|word)(?{ print $1 })S/
695
696which doesnt print it out at all. This is due to other optimisations kicking in.
697
698Example of what happens on a structural level:
699
700The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: BRANCH(8)
704 6: EXACT <ac>(16)
705 8: BRANCH(11)
706 9: EXACT <ad>(16)
707 11: BRANCH(14)
708 12: EXACT <ab>(16)
709 16: SUCCEED(0)
710 17: NOTHING(18)
711 18: END(0)
712
713This would be optimizable with startbranch=5, first=5, last=16, tail=16
714and should turn into:
715
716 1: CURLYM[1] {1,32767}(18)
717 5: TRIE(16)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
719 <ac>
720 <ad>
721 <ab>
722 16: SUCCEED(0)
723 17: NOTHING(18)
724 18: END(0)
725
726Cases where tail != last would be like /(?foo|bar)baz/:
727
728 1: BRANCH(4)
729 2: EXACT <foo>(8)
730 4: BRANCH(7)
731 5: EXACT <bar>(8)
732 7: TAIL(8)
733 8: EXACT <baz>(10)
734 10: END(0)
735
736which would be optimizable with startbranch=1, first=1, last=7, tail=8
737and would end up looking like:
738
739 1: TRIE(8)
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
741 <foo>
742 <bar>
743 7: TAIL(8)
744 8: EXACT <baz>(10)
745 10: END(0)
746
3dab1dad
YO
747 d = uvuni_to_utf8_flags(d, uv, 0);
748
749is the recommended Unicode-aware way of saying
750
751 *(d++) = uv;
a3621e74
YO
752*/
753
3dab1dad
YO
754#define TRIE_STORE_REVCHAR \
755 STMT_START { \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
758 } STMT_END
a3621e74
YO
759
760#define TRIE_READ_CHAR STMT_START { \
07be1b83 761 wordlen++; \
a3621e74
YO
762 if ( UTF ) { \
763 if ( folder ) { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
766 foldlen -= len; \
767 scan += len; \
768 len = 0; \
769 } else { \
e1ec3a88 770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
774 } \
775 } else { \
e1ec3a88 776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
777 } \
778 } else { \
779 uvc = (U32)*uc; \
780 len = 1; \
781 } \
782} STMT_END
783
784
785#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
789
790#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
795 } \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
799} STMT_END
800
801#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 802 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
806} STMT_END
807
07be1b83
YO
808#define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
811 if (trie->wordlen) \
812 trie->wordlen[ curword ] = wordlen; \
3dab1dad
YO
813 trie->states[ state ].wordnum = ++curword; \
814 DEBUG_r({ \
815 /* store the word for dumping */ \
816 SV* tmp; \
07be1b83
YO
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
3dab1dad 819 else \
07be1b83 820 tmp = newSVpvn( "", 0 ); \
3dab1dad
YO
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
823 }); \
824 } else { \
825 NOOP; /* It's a dupe. So ignore it. */ \
826 } \
827} STMT_END
828
829#ifdef DEBUGGING
07be1b83 830/*
3dab1dad
YO
831 dump_trie(trie)
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
834
835 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
839
3dab1dad
YO
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
07be1b83
YO
842
843*/
3dab1dad
YO
844
845/*
846 dump_trie(trie)
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
849*/
850
851STATIC void
852S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
853{
854 U32 state;
855 GET_RE_DEBUG_FLAGS_DECL;
856
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
860
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad
YO
863 if ( tmp ) {
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
873
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
904 } else {
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
906 }
907 }
908
909 PerlIO_printf( Perl_debug_log, "]");
910
911 }
912 PerlIO_printf( Perl_debug_log, "\n" );
913 }
914}
915/*
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
924{
925 U32 state;
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
931
932 for( state=1 ; state < next_alloc ; state ++ ) {
933 U16 charid;
934
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
939 } else {
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
942 );
943 }
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
3dab1dad
YO
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
950 );
951 }
952
953 }
954}
955
956/*
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
962*/
963STATIC void
964S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
965{
966 U32 state;
967 U16 charid;
968 GET_RE_DEBUG_FLAGS_DECL;
969
970 /*
971 print out the table precompression so that we can do a visual check
972 that they are identical.
973 */
974
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
976
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad
YO
979 if ( tmp ) {
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1001 }
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1004 } else {
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1007 }
1008 }
07be1b83 1009}
3dab1dad
YO
1010
1011#endif
1012
07be1b83
YO
1013#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1020 )
3dab1dad 1021
07be1b83
YO
1022STATIC void
1023S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1024{
1025/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1026
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1029 ISBN 0-201-10088-6
1030
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1036 Consider
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1041 */
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1044 U32 *q;
be8e71aa
YO
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
07be1b83
YO
1048 U32 q_read = 0;
1049 U32 q_write = 0;
1050 U32 charid;
1051 U32 base = trie->states[ 1 ].trans.base;
07be1b83
YO
1052 U32 *fail;
1053 reg_ac_data *aho;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1055 GET_RE_DEBUG_FLAGS_DECL;
1056
1057 ARG_SET( stclass, data_slot );
1058 Newxz( aho, 1, reg_ac_data );
1059 RExC_rx->data->data[ data_slot ] = (void*)aho;
1060 aho->trie=trie;
1061 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1062 (trie->laststate+1)*sizeof(reg_trie_state));
1063 Newxz( q, numstates, U32);
1064 Newxz( aho->fail, numstates, U32 );
283d8f99
YO
1065 aho->refcount = 1;
1066 fail = aho->fail;
07be1b83
YO
1067 fail[ 0 ] = fail[ 1 ] = 1;
1068
1069 for ( charid = 0; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1070 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1071 if ( newstate ) {
07be1b83
YO
1072 q[ q_write ] = newstate;
1073 /* set to point at the root */
1074 fail[ q[ q_write++ ] ]=1;
1075 }
1076 }
1077 while ( q_read < q_write) {
be8e71aa 1078 const U32 cur = q[ q_read++ % numstates ];
07be1b83
YO
1079 base = trie->states[ cur ].trans.base;
1080
1081 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1082 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1083 if (ch_state) {
07be1b83
YO
1084 U32 fail_state = cur;
1085 U32 fail_base;
1086 do {
1087 fail_state = fail[ fail_state ];
1088 fail_base = aho->states[ fail_state ].trans.base;
1089 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1090
1091 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1092 fail[ ch_state ] = fail_state;
1093 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1094 {
1095 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1096 }
1097 q[ q_write++ % numstates] = ch_state;
1098 }
1099 }
1100 }
1101
1102 DEBUG_TRIE_COMPILE_MORE_r({
1103 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1104 for( q_read=2; q_read<numstates; q_read++ ) {
1105 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1106 }
1107 PerlIO_printf(Perl_debug_log, "\n");
1108 });
1109 Safefree(q);
1110 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1111}
3dab1dad
YO
1112
1113
1114
a3621e74 1115STATIC I32
3dab1dad 1116S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1117{
27da23d5 1118 dVAR;
a3621e74
YO
1119 /* first pass, loop through and scan words */
1120 reg_trie_data *trie;
1121 regnode *cur;
9f7f3913 1122 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1123 STRLEN len = 0;
1124 UV uvc = 0;
1125 U16 curword = 0;
1126 U32 next_alloc = 0;
1127 /* we just use folder as a flag in utf8 */
e1ec3a88 1128 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1129 ? PL_fold
1130 : ( flags == EXACTFL
1131 ? PL_fold_locale
1132 : NULL
1133 )
1134 );
1135
e1ec3a88 1136 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1137 SV *re_trie_maxbuff;
3dab1dad
YO
1138#ifndef DEBUGGING
1139 /* these are only used during construction but are useful during
8e11feef
RGS
1140 * debugging so we store them in the struct when debugging.
1141 * Wordcount is actually superfluous in debugging as we have
1142 * (AV*)trie->words to use for it, but that's not available when
1143 * not debugging... We could make the macro use the AV during
1144 * debugging though...
1145 */
3dab1dad
YO
1146 U16 trie_wordcount=0;
1147 STRLEN trie_charcount=0;
07be1b83 1148 /*U32 trie_laststate=0;*/
3dab1dad
YO
1149 AV *trie_revcharmap;
1150#endif
a3621e74
YO
1151 GET_RE_DEBUG_FLAGS_DECL;
1152
a02a5408 1153 Newxz( trie, 1, reg_trie_data );
a3621e74 1154 trie->refcount = 1;
3dab1dad 1155 trie->startstate = 1;
a3621e74 1156 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1157 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1158 if (!(UTF && folder))
1159 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1160 DEBUG_r({
1161 trie->words = newAV();
a3621e74 1162 });
3dab1dad 1163 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1164
0111c4fd 1165 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1166 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1167 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1168 }
3dab1dad
YO
1169 DEBUG_OPTIMISE_r({
1170 PerlIO_printf( Perl_debug_log,
1171 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1172 (int)depth * 2 + 2, "",
1173 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1174 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1175 });
a3621e74
YO
1176 /* -- First loop and Setup --
1177
1178 We first traverse the branches and scan each word to determine if it
1179 contains widechars, and how many unique chars there are, this is
1180 important as we have to build a table with at least as many columns as we
1181 have unique chars.
1182
1183 We use an array of integers to represent the character codes 0..255
1184 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1185 native representation of the character value as the key and IV's for the
1186 coded index.
1187
1188 *TODO* If we keep track of how many times each character is used we can
1189 remap the columns so that the table compression later on is more
1190 efficient in terms of memory by ensuring most common value is in the
1191 middle and the least common are on the outside. IMO this would be better
1192 than a most to least common mapping as theres a decent chance the most
1193 common letter will share a node with the least common, meaning the node
1194 will not be compressable. With a middle is most common approach the worst
1195 case is when we have the least common nodes twice.
1196
1197 */
1198
a3621e74 1199 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1200 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1201 const U8 *uc = (U8*)STRING( noper );
a28509cc 1202 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1203 STRLEN foldlen = 0;
1204 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1205 const U8 *scan = (U8*)NULL;
07be1b83 1206 U32 wordlen = 0; /* required init */
3dab1dad 1207 STRLEN chars=0;
a3621e74 1208
3dab1dad
YO
1209 TRIE_WORDCOUNT(trie)++;
1210 if (OP(noper) == NOTHING) {
1211 trie->minlen= 0;
1212 continue;
1213 }
1214 if (trie->bitmap) {
1215 TRIE_BITMAP_SET(trie,*uc);
1216 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1217 }
a3621e74 1218 for ( ; uc < e ; uc += len ) {
3dab1dad 1219 TRIE_CHARCOUNT(trie)++;
a3621e74 1220 TRIE_READ_CHAR;
3dab1dad 1221 chars++;
a3621e74
YO
1222 if ( uvc < 256 ) {
1223 if ( !trie->charmap[ uvc ] ) {
1224 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1225 if ( folder )
1226 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1227 TRIE_STORE_REVCHAR;
a3621e74
YO
1228 }
1229 } else {
1230 SV** svpp;
1231 if ( !trie->widecharmap )
1232 trie->widecharmap = newHV();
1233
1234 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1235
1236 if ( !svpp )
e4584336 1237 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1238
1239 if ( !SvTRUE( *svpp ) ) {
1240 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1241 TRIE_STORE_REVCHAR;
a3621e74
YO
1242 }
1243 }
1244 }
3dab1dad
YO
1245 if( cur == first ) {
1246 trie->minlen=chars;
1247 trie->maxlen=chars;
1248 } else if (chars < trie->minlen) {
1249 trie->minlen=chars;
1250 } else if (chars > trie->maxlen) {
1251 trie->maxlen=chars;
1252 }
1253
a3621e74
YO
1254 } /* end first pass */
1255 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1256 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1257 (int)depth * 2 + 2,"",
1258 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
be8e71aa
YO
1259 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1260 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1261 );
07be1b83 1262 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
a3621e74
YO
1263
1264 /*
1265 We now know what we are dealing with in terms of unique chars and
1266 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1267 representation using a flat table will take. If it's over a reasonable
1268 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1269 conservative but potentially much slower representation using an array
1270 of lists.
1271
1272 At the end we convert both representations into the same compressed
1273 form that will be used in regexec.c for matching with. The latter
1274 is a form that cannot be used to construct with but has memory
1275 properties similar to the list form and access properties similar
1276 to the table form making it both suitable for fast searches and
1277 small enough that its feasable to store for the duration of a program.
1278
1279 See the comment in the code where the compressed table is produced
1280 inplace from the flat tabe representation for an explanation of how
1281 the compression works.
1282
1283 */
1284
1285
3dab1dad 1286 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1287 /*
1288 Second Pass -- Array Of Lists Representation
1289
1290 Each state will be represented by a list of charid:state records
1291 (reg_trie_trans_le) the first such element holds the CUR and LEN
1292 points of the allocated array. (See defines above).
1293
1294 We build the initial structure using the lists, and then convert
1295 it into the compressed table form which allows faster lookups
1296 (but cant be modified once converted).
a3621e74
YO
1297 */
1298
a3621e74
YO
1299 STRLEN transcount = 1;
1300
3dab1dad 1301 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1302 TRIE_LIST_NEW(1);
1303 next_alloc = 2;
1304
1305 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1306
c445ea15
AL
1307 regnode * const noper = NEXTOPER( cur );
1308 U8 *uc = (U8*)STRING( noper );
1309 const U8 * const e = uc + STR_LEN( noper );
1310 U32 state = 1; /* required init */
1311 U16 charid = 0; /* sanity init */
1312 U8 *scan = (U8*)NULL; /* sanity init */
1313 STRLEN foldlen = 0; /* required init */
07be1b83 1314 U32 wordlen = 0; /* required init */
c445ea15
AL
1315 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1316
3dab1dad 1317 if (OP(noper) != NOTHING) {
c445ea15
AL
1318 for ( ; uc < e ; uc += len ) {
1319
1320 TRIE_READ_CHAR;
1321
1322 if ( uvc < 256 ) {
1323 charid = trie->charmap[ uvc ];
1324 } else {
1325 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1326 if ( !svpp ) {
1327 charid = 0;
1328 } else {
1329 charid=(U16)SvIV( *svpp );
1330 }
1331 }
1332 if ( charid ) {
a3621e74 1333
c445ea15
AL
1334 U16 check;
1335 U32 newstate = 0;
a3621e74 1336
c445ea15
AL
1337 charid--;
1338 if ( !trie->states[ state ].trans.list ) {
1339 TRIE_LIST_NEW( state );
1340 }
1341 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1342 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1343 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1344 break;
1345 }
1346 }
1347 if ( ! newstate ) {
1348 newstate = next_alloc++;
1349 TRIE_LIST_PUSH( state, charid, newstate );
1350 transcount++;
1351 }
1352 state = newstate;
1353 } else {
1354 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1355 }
c445ea15
AL
1356 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1357 }
c445ea15 1358 }
3dab1dad 1359 TRIE_HANDLE_WORD(state);
a3621e74
YO
1360
1361 } /* end second pass */
1362
3dab1dad 1363 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1364 Renew( trie->states, next_alloc, reg_trie_state );
1365
3dab1dad
YO
1366 /* and now dump it out before we compress it */
1367 DEBUG_TRIE_COMPILE_MORE_r(
1368 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1369 );
a3621e74 1370
a02a5408 1371 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1372 {
1373 U32 state;
a3621e74
YO
1374 U32 tp = 0;
1375 U32 zp = 0;
1376
1377
1378 for( state=1 ; state < next_alloc ; state ++ ) {
1379 U32 base=0;
1380
1381 /*
1382 DEBUG_TRIE_COMPILE_MORE_r(
1383 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1384 );
1385 */
1386
1387 if (trie->states[state].trans.list) {
1388 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1389 U16 maxid=minid;
a28509cc 1390 U16 idx;
a3621e74
YO
1391
1392 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1393 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1394 if ( forid < minid ) {
1395 minid=forid;
1396 } else if ( forid > maxid ) {
1397 maxid=forid;
1398 }
a3621e74
YO
1399 }
1400 if ( transcount < tp + maxid - minid + 1) {
1401 transcount *= 2;
1402 Renew( trie->trans, transcount, reg_trie_trans );
1403 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1404 }
1405 base = trie->uniquecharcount + tp - minid;
1406 if ( maxid == minid ) {
1407 U32 set = 0;
1408 for ( ; zp < tp ; zp++ ) {
1409 if ( ! trie->trans[ zp ].next ) {
1410 base = trie->uniquecharcount + zp - minid;
1411 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1412 trie->trans[ zp ].check = state;
1413 set = 1;
1414 break;
1415 }
1416 }
1417 if ( !set ) {
1418 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ tp ].check = state;
1420 tp++;
1421 zp = tp;
1422 }
1423 } else {
1424 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1425 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1426 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1427 trie->trans[ tid ].check = state;
1428 }
1429 tp += ( maxid - minid + 1 );
1430 }
1431 Safefree(trie->states[ state ].trans.list);
1432 }
1433 /*
1434 DEBUG_TRIE_COMPILE_MORE_r(
1435 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1436 );
1437 */
1438 trie->states[ state ].trans.base=base;
1439 }
cc601c31 1440 trie->lasttrans = tp + 1;
a3621e74
YO
1441 }
1442 } else {
1443 /*
1444 Second Pass -- Flat Table Representation.
1445
1446 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1447 We know that we will need Charcount+1 trans at most to store the data
1448 (one row per char at worst case) So we preallocate both structures
1449 assuming worst case.
1450
1451 We then construct the trie using only the .next slots of the entry
1452 structs.
1453
1454 We use the .check field of the first entry of the node temporarily to
1455 make compression both faster and easier by keeping track of how many non
1456 zero fields are in the node.
1457
1458 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1459 transition.
1460
1461 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1462 number representing the first entry of the node, and state as a
1463 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1464 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1465 are 2 entrys per node. eg:
1466
1467 A B A B
1468 1. 2 4 1. 3 7
1469 2. 0 3 3. 0 5
1470 3. 0 0 5. 0 0
1471 4. 0 0 7. 0 0
1472
1473 The table is internally in the right hand, idx form. However as we also
1474 have to deal with the states array which is indexed by nodenum we have to
1475 use TRIE_NODENUM() to convert.
1476
1477 */
1478
3dab1dad
YO
1479
1480 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1481 reg_trie_trans );
3dab1dad 1482 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1483 next_alloc = trie->uniquecharcount + 1;
1484
3dab1dad 1485
a3621e74
YO
1486 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1487
c445ea15 1488 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1489 const U8 *uc = (U8*)STRING( noper );
1490 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1491
1492 U32 state = 1; /* required init */
1493
1494 U16 charid = 0; /* sanity init */
1495 U32 accept_state = 0; /* sanity init */
1496 U8 *scan = (U8*)NULL; /* sanity init */
1497
1498 STRLEN foldlen = 0; /* required init */
07be1b83 1499 U32 wordlen = 0; /* required init */
a3621e74
YO
1500 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1501
3dab1dad 1502 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1503 for ( ; uc < e ; uc += len ) {
1504
1505 TRIE_READ_CHAR;
1506
1507 if ( uvc < 256 ) {
1508 charid = trie->charmap[ uvc ];
1509 } else {
c445ea15
AL
1510 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1511 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1512 }
1513 if ( charid ) {
1514 charid--;
1515 if ( !trie->trans[ state + charid ].next ) {
1516 trie->trans[ state + charid ].next = next_alloc;
1517 trie->trans[ state ].check++;
1518 next_alloc += trie->uniquecharcount;
1519 }
1520 state = trie->trans[ state + charid ].next;
1521 } else {
e4584336 1522 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1523 }
1524 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1525 }
a3621e74 1526 }
3dab1dad
YO
1527 accept_state = TRIE_NODENUM( state );
1528 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1529
1530 } /* end second pass */
1531
3dab1dad
YO
1532 /* and now dump it out before we compress it */
1533 DEBUG_TRIE_COMPILE_MORE_r(
1534 dump_trie_interim_table(trie,next_alloc,depth+1)
1535 );
a3621e74 1536
a3621e74
YO
1537 {
1538 /*
1539 * Inplace compress the table.*
1540
1541 For sparse data sets the table constructed by the trie algorithm will
1542 be mostly 0/FAIL transitions or to put it another way mostly empty.
1543 (Note that leaf nodes will not contain any transitions.)
1544
1545 This algorithm compresses the tables by eliminating most such
1546 transitions, at the cost of a modest bit of extra work during lookup:
1547
1548 - Each states[] entry contains a .base field which indicates the
1549 index in the state[] array wheres its transition data is stored.
1550
1551 - If .base is 0 there are no valid transitions from that node.
1552
1553 - If .base is nonzero then charid is added to it to find an entry in
1554 the trans array.
1555
1556 -If trans[states[state].base+charid].check!=state then the
1557 transition is taken to be a 0/Fail transition. Thus if there are fail
1558 transitions at the front of the node then the .base offset will point
1559 somewhere inside the previous nodes data (or maybe even into a node
1560 even earlier), but the .check field determines if the transition is
1561 valid.
1562
1563 The following process inplace converts the table to the compressed
1564 table: We first do not compress the root node 1,and mark its all its
1565 .check pointers as 1 and set its .base pointer as 1 as well. This
1566 allows to do a DFA construction from the compressed table later, and
1567 ensures that any .base pointers we calculate later are greater than
1568 0.
1569
1570 - We set 'pos' to indicate the first entry of the second node.
1571
1572 - We then iterate over the columns of the node, finding the first and
1573 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1574 and set the .check pointers accordingly, and advance pos
1575 appropriately and repreat for the next node. Note that when we copy
1576 the next pointers we have to convert them from the original
1577 NODEIDX form to NODENUM form as the former is not valid post
1578 compression.
1579
1580 - If a node has no transitions used we mark its base as 0 and do not
1581 advance the pos pointer.
1582
1583 - If a node only has one transition we use a second pointer into the
1584 structure to fill in allocated fail transitions from other states.
1585 This pointer is independent of the main pointer and scans forward
1586 looking for null transitions that are allocated to a state. When it
1587 finds one it writes the single transition into the "hole". If the
1588 pointer doesnt find one the single transition is appeneded as normal.
1589
1590 - Once compressed we can Renew/realloc the structures to release the
1591 excess space.
1592
1593 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1594 specifically Fig 3.47 and the associated pseudocode.
1595
1596 demq
1597 */
a3b680e6 1598 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1599 U32 state, charid;
a3621e74 1600 U32 pos = 0, zp=0;
3dab1dad 1601 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1602
1603 for ( state = 1 ; state < laststate ; state++ ) {
1604 U8 flag = 0;
a28509cc
AL
1605 const U32 stateidx = TRIE_NODEIDX( state );
1606 const U32 o_used = trie->trans[ stateidx ].check;
1607 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1608 trie->trans[ stateidx ].check = 0;
1609
1610 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1611 if ( flag || trie->trans[ stateidx + charid ].next ) {
1612 if ( trie->trans[ stateidx + charid ].next ) {
1613 if (o_used == 1) {
1614 for ( ; zp < pos ; zp++ ) {
1615 if ( ! trie->trans[ zp ].next ) {
1616 break;
1617 }
1618 }
1619 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1620 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1621 trie->trans[ zp ].check = state;
1622 if ( ++zp > pos ) pos = zp;
1623 break;
1624 }
1625 used--;
1626 }
1627 if ( !flag ) {
1628 flag = 1;
1629 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1630 }
1631 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1632 trie->trans[ pos ].check = state;
1633 pos++;
1634 }
1635 }
1636 }
cc601c31 1637 trie->lasttrans = pos + 1;
a3621e74
YO
1638 Renew( trie->states, laststate + 1, reg_trie_state);
1639 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1640 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1641 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1642 (int)depth * 2 + 2,"",
1643 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1644 (IV)next_alloc,
1645 (IV)pos,
a3621e74
YO
1646 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1647 );
1648
1649 } /* end table compress */
1650 }
cc601c31
YO
1651 /* resize the trans array to remove unused space */
1652 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1653
3dab1dad
YO
1654 /* and now dump out the compressed format */
1655 DEBUG_TRIE_COMPILE_r(
1656 dump_trie(trie,depth+1)
1657 );
07be1b83 1658
3dab1dad
YO
1659 { /* Modify the program and insert the new TRIE node*/
1660 regnode *convert;
1661 U8 nodetype =(U8)(flags & 0xFF);
1662 char *str=NULL;
07be1b83
YO
1663#ifdef DEBUGGING
1664 U32 mjd_offset;
1665 U32 mjd_nodelen;
1666#endif
a3621e74 1667 /*
3dab1dad
YO
1668 This means we convert either the first branch or the first Exact,
1669 depending on whether the thing following (in 'last') is a branch
1670 or not and whther first is the startbranch (ie is it a sub part of
1671 the alternation or is it the whole thing.)
1672 Assuming its a sub part we conver the EXACT otherwise we convert
1673 the whole branch sequence, including the first.
a3621e74 1674 */
3dab1dad
YO
1675 /* Find the node we are going to overwrite */
1676 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1677 /* whole branch chain */
3dab1dad 1678 convert = first;
07be1b83
YO
1679 DEBUG_r({
1680 const regnode *nop = NEXTOPER( convert );
1681 mjd_offset= Node_Offset((nop));
1682 mjd_nodelen= Node_Length((nop));
1683 });
1684 } else {
1685 /* branch sub-chain */
3dab1dad
YO
1686 convert = NEXTOPER( first );
1687 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1688 DEBUG_r({
1689 mjd_offset= Node_Offset((convert));
1690 mjd_nodelen= Node_Length((convert));
1691 });
1692 }
1693 DEBUG_OPTIMISE_r(
1694 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1695 (int)depth * 2 + 2, "",
1696 mjd_offset,mjd_nodelen)
1697 );
a3621e74 1698
3dab1dad
YO
1699 /* But first we check to see if there is a common prefix we can
1700 split out as an EXACT and put in front of the TRIE node. */
1701 trie->startstate= 1;
1702 if ( trie->bitmap && !trie->widecharmap ) {
1703 U32 state;
1704 DEBUG_OPTIMISE_r(
8e11feef
RGS
1705 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1706 (int)depth * 2 + 2, "",
1707 TRIE_LASTSTATE(trie))
1708 );
1709 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1710 U32 ofs = 0;
8e11feef
RGS
1711 I32 idx = -1;
1712 U32 count = 0;
1713 const U32 base = trie->states[ state ].trans.base;
a3621e74 1714
3dab1dad 1715 if ( trie->states[state].wordnum )
8e11feef 1716 count = 1;
a3621e74 1717
8e11feef 1718 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1719 if ( ( base + ofs >= trie->uniquecharcount ) &&
1720 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1721 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1722 {
3dab1dad 1723 if ( ++count > 1 ) {
8e11feef 1724 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1725 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1726 if ( state == 1 ) break;
3dab1dad
YO
1727 if ( count == 2 ) {
1728 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1729 DEBUG_OPTIMISE_r(
8e11feef
RGS
1730 PerlIO_printf(Perl_debug_log,
1731 "%*sNew Start State=%"UVuf" Class: [",
1732 (int)depth * 2 + 2, "",
3dab1dad 1733 state));
be8e71aa
YO
1734 if (idx >= 0) {
1735 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1736 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1737
3dab1dad 1738 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1739 if ( folder )
1740 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1741 DEBUG_OPTIMISE_r(
07be1b83 1742 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1743 );
8e11feef
RGS
1744 }
1745 }
1746 TRIE_BITMAP_SET(trie,*ch);
1747 if ( folder )
1748 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1750 }
1751 idx = ofs;
1752 }
3dab1dad
YO
1753 }
1754 if ( count == 1 ) {
1755 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1756 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1757 DEBUG_OPTIMISE_r(
8e11feef
RGS
1758 PerlIO_printf( Perl_debug_log,
1759 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1760 (int)depth * 2 + 2, "",
1761 state, idx, ch)
3dab1dad
YO
1762 );
1763 if ( state==1 ) {
1764 OP( convert ) = nodetype;
1765 str=STRING(convert);
1766 STR_LEN(convert)=0;
1767 }
1768 *str++=*ch;
1769 STR_LEN(convert)++;
a3621e74 1770
8e11feef
RGS
1771 } else {
1772 if (state>1)
1773 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1774 break;
1775 }
1776 }
3dab1dad 1777 if (str) {
8e11feef 1778 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1779 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1780 trie->startstate = state;
07be1b83
YO
1781 trie->minlen -= (state - 1);
1782 trie->maxlen -= (state - 1);
1783 DEBUG_r({
1784 regnode *fix = convert;
1785 mjd_nodelen++;
1786 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1787 while( ++fix < n ) {
1788 Set_Node_Offset_Length(fix, 0, 0);
1789 }
1790 });
8e11feef
RGS
1791 if (trie->maxlen) {
1792 convert = n;
1793 } else {
3dab1dad
YO
1794 NEXT_OFF(convert) = (U16)(tail - convert);
1795 }
1796 }
1797 }
1798 if ( trie->maxlen ) {
1799 OP( convert ) = TRIE;
8e11feef
RGS
1800 NEXT_OFF( convert ) = (U16)(tail - convert);
1801 ARG_SET( convert, data_slot );
a3621e74 1802
3dab1dad
YO
1803 /* store the type in the flags */
1804 convert->flags = nodetype;
1805 /* XXX We really should free up the resource in trie now, as we wont use them */
1806 }
a3621e74
YO
1807 /* needed for dumping*/
1808 DEBUG_r({
1809 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
07be1b83
YO
1810 regnode *opt = convert;
1811 while (++opt<optimize) {
1812 Set_Node_Offset_Length(opt,0,0);
1813 }
a3621e74
YO
1814 /* We now need to mark all of the space originally used by the
1815 branches as optimized away. This keeps the dumpuntil from
1816 throwing a wobbly as it doesnt use regnext() to traverse the
1817 opcodes.
07be1b83 1818 We also "fix" the offsets
a3621e74
YO
1819 */
1820 while( optimize < last ) {
07be1b83 1821 mjd_nodelen += Node_Length((optimize));
a3621e74 1822 OP( optimize ) = OPTIMIZED;
07be1b83 1823 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1824 optimize++;
1825 }
07be1b83 1826 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1827 });
1828 } /* end node insert */
07be1b83 1829#ifndef DEBUGGING
6e8b4190 1830 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1831#endif
a3621e74
YO
1832 return 1;
1833}
1834
a3621e74 1835/*
5d1c421c
JH
1836 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1837 * These need to be revisited when a newer toolchain becomes available.
1838 */
1839#if defined(__sparc64__) && defined(__GNUC__)
1840# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1841# undef SPARC64_GCC_WORKAROUND
1842# define SPARC64_GCC_WORKAROUND 1
1843# endif
1844#endif
1845
07be1b83
YO
1846#define DEBUG_PEEP(str,scan,depth) \
1847 DEBUG_OPTIMISE_r({ \
1848 SV * const mysv=sv_newmortal(); \
1849 regnode *Next = regnext(scan); \
1850 regprop(RExC_rx, mysv, scan); \
1851 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1852 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1853 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1854 });
1855
1856#define JOIN_EXACT(scan,min,flags) \
1857 if (PL_regkind[OP(scan)] == EXACT) \
1858 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1859
be8e71aa 1860STATIC U32
07be1b83
YO
1861S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1862 /* Merge several consecutive EXACTish nodes into one. */
1863 regnode *n = regnext(scan);
1864 U32 stringok = 1;
1865 regnode *next = scan + NODE_SZ_STR(scan);
1866 U32 merged = 0;
1867 U32 stopnow = 0;
1868#ifdef DEBUGGING
1869 regnode *stop = scan;
1870#endif
1871 GET_RE_DEBUG_FLAGS_DECL;
1872 DEBUG_PEEP("join",scan,depth);
1873
1874 /* Skip NOTHING, merge EXACT*. */
1875 while (n &&
1876 ( PL_regkind[OP(n)] == NOTHING ||
1877 (stringok && (OP(n) == OP(scan))))
1878 && NEXT_OFF(n)
1879 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1880
1881 if (OP(n) == TAIL || n > next)
1882 stringok = 0;
1883 if (PL_regkind[OP(n)] == NOTHING) {
1884
1885 DEBUG_PEEP("skip:",n,depth);
1886 NEXT_OFF(scan) += NEXT_OFF(n);
1887 next = n + NODE_STEP_REGNODE;
1888#ifdef DEBUGGING
1889 if (stringok)
1890 stop = n;
1891#endif
1892 n = regnext(n);
1893 }
1894 else if (stringok) {
1895 const int oldl = STR_LEN(scan);
1896 regnode * const nnext = regnext(n);
1897
1898 DEBUG_PEEP("merg",n,depth);
1899
1900 merged++;
1901 if (oldl + STR_LEN(n) > U8_MAX)
1902 break;
1903 NEXT_OFF(scan) += NEXT_OFF(n);
1904 STR_LEN(scan) += STR_LEN(n);
1905 next = n + NODE_SZ_STR(n);
1906 /* Now we can overwrite *n : */
1907 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1908#ifdef DEBUGGING
1909 stop = next - 1;
1910#endif
1911 n = nnext;
1912 if (stopnow) break;
1913 }
1914
1915#ifdef EXPERIMENTAL_INPLACESCAN
1916 if (flags && !NEXT_OFF(n)) {
1917 DEBUG_PEEP("atch",val,depth);
1918 if (reg_off_by_arg[OP(n)]) {
1919 ARG_SET(n, val - n);
1920 }
1921 else {
1922 NEXT_OFF(n) = val - n;
1923 }
1924 stopnow=1;
1925 }
1926#endif
1927 }
1928
1929 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1930 /*
1931 Two problematic code points in Unicode casefolding of EXACT nodes:
1932
1933 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1934 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1935
1936 which casefold to
1937
1938 Unicode UTF-8
1939
1940 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1941 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1942
1943 This means that in case-insensitive matching (or "loose matching",
1944 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1945 length of the above casefolded versions) can match a target string
1946 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1947 This would rather mess up the minimum length computation.
1948
1949 What we'll do is to look for the tail four bytes, and then peek
1950 at the preceding two bytes to see whether we need to decrease
1951 the minimum length by four (six minus two).
1952
1953 Thanks to the design of UTF-8, there cannot be false matches:
1954 A sequence of valid UTF-8 bytes cannot be a subsequence of
1955 another valid sequence of UTF-8 bytes.
1956
1957 */
1958 char * const s0 = STRING(scan), *s, *t;
1959 char * const s1 = s0 + STR_LEN(scan) - 1;
1960 char * const s2 = s1 - 4;
1961 const char t0[] = "\xcc\x88\xcc\x81";
1962 const char * const t1 = t0 + 3;
1963
1964 for (s = s0 + 2;
1965 s < s2 && (t = ninstr(s, s1, t0, t1));
1966 s = t + 4) {
1967 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1968 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1969 *min -= 4;
1970 }
1971 }
1972
1973#ifdef DEBUGGING
1974 /* Allow dumping */
1975 n = scan + NODE_SZ_STR(scan);
1976 while (n <= stop) {
1977 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1978 OP(n) = OPTIMIZED;
1979 NEXT_OFF(n) = 0;
1980 }
1981 n++;
1982 }
1983#endif
1984 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1985 return stopnow;
1986}
1987
653099ff
GS
1988/* REx optimizer. Converts nodes into quickier variants "in place".
1989 Finds fixed substrings. */
1990
a0288114 1991/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1992 to the position after last scanned or to NULL. */
1993
07be1b83
YO
1994
1995
76e3520e 1996STATIC I32
9a957fbc
AL
1997S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1998 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1999 /* scanp: Start here (read-write). */
2000 /* deltap: Write maxlen-minlen here. */
2001 /* last: Stop before this one. */
2002{
97aff369 2003 dVAR;
c277df42
IZ
2004 I32 min = 0, pars = 0, code;
2005 regnode *scan = *scanp, *next;
2006 I32 delta = 0;
2007 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2008 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2009 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2010 scan_data_t data_fake;
653099ff 2011 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
2012 SV *re_trie_maxbuff = NULL;
2013
2014 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2015#ifdef DEBUGGING
2016 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2017#endif
b81d288d 2018
c277df42
IZ
2019 while (scan && OP(scan) != END && scan < last) {
2020 /* Peephole optimizer: */
07be1b83 2021 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2022
07be1b83 2023 JOIN_EXACT(scan,&min,0);
a3621e74 2024
653099ff
GS
2025 /* Follow the next-chain of the current node and optimize
2026 away all the NOTHINGs from it. */
c277df42 2027 if (OP(scan) != CURLYX) {
a3b680e6 2028 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2029 ? I32_MAX
2030 /* I32 may be smaller than U16 on CRAYs! */
2031 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2032 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2033 int noff;
2034 regnode *n = scan;
b81d288d 2035
c277df42
IZ
2036 /* Skip NOTHING and LONGJMP. */
2037 while ((n = regnext(n))
3dab1dad 2038 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2039 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2040 && off + noff < max)
2041 off += noff;
2042 if (reg_off_by_arg[OP(scan)])
2043 ARG(scan) = off;
b81d288d 2044 else
c277df42
IZ
2045 NEXT_OFF(scan) = off;
2046 }
a3621e74 2047
07be1b83 2048
3dab1dad 2049
653099ff
GS
2050 /* The principal pseudo-switch. Cannot be a switch, since we
2051 look into several different things. */
b81d288d 2052 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2053 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2054 next = regnext(scan);
2055 code = OP(scan);
a3621e74 2056 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2057
2058 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 2059 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2060 struct regnode_charclass_class accum;
d4c19fe8 2061 regnode * const startbranch=scan;
c277df42 2062
653099ff 2063 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2064 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2065 if (flags & SCF_DO_STCLASS)
830247a4 2066 cl_init_zero(pRExC_state, &accum);
a3621e74 2067
c277df42 2068 while (OP(scan) == code) {
830247a4 2069 I32 deltanext, minnext, f = 0, fake;
653099ff 2070 struct regnode_charclass_class this_class;
c277df42
IZ
2071
2072 num++;
2073 data_fake.flags = 0;
b81d288d 2074 if (data) {
2c2d71f5 2075 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2076 data_fake.last_closep = data->last_closep;
2077 }
2078 else
2079 data_fake.last_closep = &fake;
c277df42
IZ
2080 next = regnext(scan);
2081 scan = NEXTOPER(scan);
2082 if (code != BRANCH)
2083 scan = NEXTOPER(scan);
653099ff 2084 if (flags & SCF_DO_STCLASS) {
830247a4 2085 cl_init(pRExC_state, &this_class);
653099ff
GS
2086 data_fake.start_class = &this_class;
2087 f = SCF_DO_STCLASS_AND;
b81d288d 2088 }
e1901655
IZ
2089 if (flags & SCF_WHILEM_VISITED_POS)
2090 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2091
653099ff 2092 /* we suppose the run is continuous, last=next...*/
830247a4 2093 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2094 next, &data_fake, f,depth+1);
b81d288d 2095 if (min1 > minnext)
c277df42
IZ
2096 min1 = minnext;
2097 if (max1 < minnext + deltanext)
2098 max1 = minnext + deltanext;
2099 if (deltanext == I32_MAX)
aca2d497 2100 is_inf = is_inf_internal = 1;
c277df42
IZ
2101 scan = next;
2102 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2103 pars++;
3dab1dad
YO
2104 if (data) {
2105 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2106 data->flags |= SF_HAS_EVAL;
2c2d71f5 2107 data->whilem_c = data_fake.whilem_c;
3dab1dad 2108 }
653099ff 2109 if (flags & SCF_DO_STCLASS)
830247a4 2110 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2111 if (code == SUSPEND)
c277df42
IZ
2112 break;
2113 }
2114 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2115 min1 = 0;
2116 if (flags & SCF_DO_SUBSTR) {
2117 data->pos_min += min1;
2118 data->pos_delta += max1 - min1;
2119 if (max1 != min1 || is_inf)
2120 data->longest = &(data->longest_float);
2121 }
2122 min += min1;
2123 delta += max1 - min1;
653099ff 2124 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2125 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2126 if (min1) {
2127 cl_and(data->start_class, &and_with);
2128 flags &= ~SCF_DO_STCLASS;
2129 }
2130 }
2131 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2132 if (min1) {
2133 cl_and(data->start_class, &accum);
653099ff 2134 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2135 }
2136 else {
b81d288d 2137 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2138 * data->start_class */
2139 StructCopy(data->start_class, &and_with,
2140 struct regnode_charclass_class);
2141 flags &= ~SCF_DO_STCLASS_AND;
2142 StructCopy(&accum, data->start_class,
2143 struct regnode_charclass_class);
2144 flags |= SCF_DO_STCLASS_OR;
2145 data->start_class->flags |= ANYOF_EOS;
2146 }
653099ff 2147 }
a3621e74
YO
2148
2149 /* demq.
2150
2151 Assuming this was/is a branch we are dealing with: 'scan' now
2152 points at the item that follows the branch sequence, whatever
2153 it is. We now start at the beginning of the sequence and look
2154 for subsequences of
2155
2156 BRANCH->EXACT=>X
2157 BRANCH->EXACT=>X
2158
2159 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2160
2161 If we can find such a subseqence we need to turn the first
2162 element into a trie and then add the subsequent branch exact
2163 strings to the trie.
2164
2165 We have two cases
2166
2167 1. patterns where the whole set of branch can be converted to a trie,
2168
2169 2. patterns where only a subset of the alternations can be
2170 converted to a trie.
2171
2172 In case 1 we can replace the whole set with a single regop
2173 for the trie. In case 2 we need to keep the start and end
2174 branchs so
2175
2176 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2177 becomes BRANCH TRIE; BRANCH X;
2178
2179 Hypthetically when we know the regex isnt anchored we can
2180 turn a case 1 into a DFA and let it rip... Every time it finds a match
2181 it would just call its tail, no WHILEM/CURLY needed.
2182
2183 */
07be1b83 2184 if (PERL_ENABLE_TRIE_OPTIMISATION) {
3dab1dad 2185 int made=0;
0111c4fd
RGS
2186 if (!re_trie_maxbuff) {
2187 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2188 if (!SvIOK(re_trie_maxbuff))
2189 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2190 }
a3621e74
YO
2191 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2192 regnode *cur;
2193 regnode *first = (regnode *)NULL;
2194 regnode *last = (regnode *)NULL;
2195 regnode *tail = scan;
2196 U8 optype = 0;
2197 U32 count=0;
2198
2199#ifdef DEBUGGING
c445ea15 2200 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2201#endif
2202 /* var tail is used because there may be a TAIL
2203 regop in the way. Ie, the exacts will point to the
2204 thing following the TAIL, but the last branch will
2205 point at the TAIL. So we advance tail. If we
2206 have nested (?:) we may have to move through several
2207 tails.
2208 */
2209
2210 while ( OP( tail ) == TAIL ) {
2211 /* this is the TAIL generated by (?:) */
2212 tail = regnext( tail );
2213 }
2214
3dab1dad 2215
a3621e74 2216 DEBUG_OPTIMISE_r({
32fc9b6a 2217 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2218 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2219 (int)depth * 2 + 2, "",
2220 "Looking for TRIE'able sequences. Tail node is: ",
2221 SvPV_nolen_const( mysv )
a3621e74
YO
2222 );
2223 });
3dab1dad 2224
a3621e74
YO
2225 /*
2226
2227 step through the branches, cur represents each
2228 branch, noper is the first thing to be matched
2229 as part of that branch and noper_next is the
2230 regnext() of that node. if noper is an EXACT
2231 and noper_next is the same as scan (our current
2232 position in the regex) then the EXACT branch is
2233 a possible optimization target. Once we have
2234 two or more consequetive such branches we can
2235 create a trie of the EXACT's contents and stich
2236 it in place. If the sequence represents all of
2237 the branches we eliminate the whole thing and
2238 replace it with a single TRIE. If it is a
2239 subsequence then we need to stitch it in. This
2240 means the first branch has to remain, and needs
2241 to be repointed at the item on the branch chain
2242 following the last branch optimized. This could
2243 be either a BRANCH, in which case the
2244 subsequence is internal, or it could be the
2245 item following the branch sequence in which
2246 case the subsequence is at the end.
2247
2248 */
2249
2250 /* dont use tail as the end marker for this traverse */
2251 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2252 regnode * const noper = NEXTOPER( cur );
2253 regnode * const noper_next = regnext( noper );
a3621e74 2254
a3621e74 2255 DEBUG_OPTIMISE_r({
32fc9b6a 2256 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2257 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2258 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2259
32fc9b6a 2260 regprop(RExC_rx, mysv, noper);
a3621e74 2261 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2262 SvPV_nolen_const(mysv));
a3621e74
YO
2263
2264 if ( noper_next ) {
32fc9b6a 2265 regprop(RExC_rx, mysv, noper_next );
a3621e74 2266 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2267 SvPV_nolen_const(mysv));
a3621e74 2268 }
3dab1dad
YO
2269 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2270 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2271 });
3dab1dad
YO
2272 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2273 : PL_regkind[ OP( noper ) ] == EXACT )
2274 || OP(noper) == NOTHING )
a3621e74
YO
2275 && noper_next == tail && count<U16_MAX)
2276 {
2277 count++;
3dab1dad
YO
2278 if ( !first || optype == NOTHING ) {
2279 if (!first) first = cur;
a3621e74
YO
2280 optype = OP( noper );
2281 } else {
a3621e74 2282 last = cur;
a3621e74
YO
2283 }
2284 } else {
2285 if ( last ) {
3dab1dad 2286 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2287 }
3dab1dad 2288 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2289 && noper_next == tail )
2290 {
2291 count = 1;
2292 first = cur;
2293 optype = OP( noper );
2294 } else {
2295 count = 0;
2296 first = NULL;
2297 optype = 0;
2298 }
2299 last = NULL;
2300 }
2301 }
2302 DEBUG_OPTIMISE_r({
32fc9b6a 2303 regprop(RExC_rx, mysv, cur);
a3621e74 2304 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2305 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2306 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2307
2308 });
2309 if ( last ) {
3dab1dad
YO
2310 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2311#ifdef TRIE_STUDY_OPT
07be1b83
YO
2312 if ( made && startbranch == first ) {
2313 if ( OP(first)!=TRIE )
2314 flags |= SCF_EXACT_TRIE;
2315 else {
2316 regnode *chk=*scanp;
2317 while ( OP( chk ) == OPEN )
2318 chk = regnext( chk );
2319 if (chk==first)
2320 flags |= SCF_EXACT_TRIE;
2321 }
2322 }
3dab1dad 2323#endif
07be1b83 2324 }
a3621e74 2325 }
3dab1dad
YO
2326
2327 } /* do trie */
a0ed51b3 2328 }
a3621e74 2329 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2330 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2331 } else /* single branch is optimized. */
c277df42
IZ
2332 scan = NEXTOPER(scan);
2333 continue;
a0ed51b3
LW
2334 }
2335 else if (OP(scan) == EXACT) {
cd439c50 2336 I32 l = STR_LEN(scan);
c445ea15 2337 UV uc;
a0ed51b3 2338 if (UTF) {
a3b680e6 2339 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2340 l = utf8_length(s, s + l);
9041c2e3 2341 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2342 } else {
2343 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2344 }
2345 min += l;
c277df42 2346 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2347 /* The code below prefers earlier match for fixed
2348 offset, later match for variable offset. */
2349 if (data->last_end == -1) { /* Update the start info. */
2350 data->last_start_min = data->pos_min;
2351 data->last_start_max = is_inf
b81d288d 2352 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2353 }
cd439c50 2354 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2355 if (UTF)
2356 SvUTF8_on(data->last_found);
0eda9292 2357 {
9a957fbc 2358 SV * const sv = data->last_found;
a28509cc 2359 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2360 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2361 if (mg && mg->mg_len >= 0)
5e43f467
JH
2362 mg->mg_len += utf8_length((U8*)STRING(scan),
2363 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2364 }
c277df42
IZ
2365 data->last_end = data->pos_min + l;
2366 data->pos_min += l; /* As in the first entry. */
2367 data->flags &= ~SF_BEFORE_EOL;
2368 }
653099ff
GS
2369 if (flags & SCF_DO_STCLASS_AND) {
2370 /* Check whether it is compatible with what we know already! */
2371 int compat = 1;
2372
1aa99e6b 2373 if (uc >= 0x100 ||
516a5887 2374 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2375 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2376 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2377 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2378 )
653099ff
GS
2379 compat = 0;
2380 ANYOF_CLASS_ZERO(data->start_class);
2381 ANYOF_BITMAP_ZERO(data->start_class);
2382 if (compat)
1aa99e6b 2383 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2384 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2385 if (uc < 0x100)
2386 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2387 }
2388 else if (flags & SCF_DO_STCLASS_OR) {
2389 /* false positive possible if the class is case-folded */
1aa99e6b 2390 if (uc < 0x100)
9b877dbb
IH
2391 ANYOF_BITMAP_SET(data->start_class, uc);
2392 else
2393 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2394 data->start_class->flags &= ~ANYOF_EOS;
2395 cl_and(data->start_class, &and_with);
2396 }
2397 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2398 }
3dab1dad 2399 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2400 I32 l = STR_LEN(scan);
1aa99e6b 2401 UV uc = *((U8*)STRING(scan));
653099ff
GS
2402
2403 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2404 if (flags & SCF_DO_SUBSTR) {
2405 assert(data);
830247a4 2406 scan_commit(pRExC_state, data);
ecaa9b9c 2407 }
a0ed51b3 2408 if (UTF) {
6136c704 2409 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2410 l = utf8_length(s, s + l);
9041c2e3 2411 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2412 }
2413 min += l;
ecaa9b9c 2414 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2415 data->pos_min += l;
653099ff
GS
2416 if (flags & SCF_DO_STCLASS_AND) {
2417 /* Check whether it is compatible with what we know already! */
2418 int compat = 1;
2419
1aa99e6b 2420 if (uc >= 0x100 ||
516a5887 2421 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2422 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2423 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2424 compat = 0;
2425 ANYOF_CLASS_ZERO(data->start_class);
2426 ANYOF_BITMAP_ZERO(data->start_class);
2427 if (compat) {
1aa99e6b 2428 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2429 data->start_class->flags &= ~ANYOF_EOS;
2430 data->start_class->flags |= ANYOF_FOLD;
2431 if (OP(scan) == EXACTFL)
2432 data->start_class->flags |= ANYOF_LOCALE;
2433 }
2434 }
2435 else if (flags & SCF_DO_STCLASS_OR) {
2436 if (data->start_class->flags & ANYOF_FOLD) {
2437 /* false positive possible if the class is case-folded.
2438 Assume that the locale settings are the same... */
1aa99e6b
IH
2439 if (uc < 0x100)
2440 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2441 data->start_class->flags &= ~ANYOF_EOS;
2442 }
2443 cl_and(data->start_class, &and_with);
2444 }
2445 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2446 }
3dab1dad
YO
2447#ifdef TRIE_STUDY_OPT
2448 else if (OP(scan) == TRIE) {
2449 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2450 min += trie->minlen;
07be1b83 2451 delta += (trie->maxlen - trie->minlen);
3dab1dad
YO
2452 flags &= ~SCF_DO_STCLASS; /* xxx */
2453 if (flags & SCF_DO_SUBSTR) {
2454 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2455 data->pos_min += trie->minlen;
07be1b83
YO
2456 data->pos_delta += (trie->maxlen - trie->minlen);
2457 if (trie->maxlen != trie->minlen)
2458 data->longest = &(data->longest_float);
3dab1dad
YO
2459 }
2460 }
2461#endif
bfed75c6 2462 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2463 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2464 I32 f = flags, pos_before = 0;
d4c19fe8 2465 regnode * const oscan = scan;
653099ff
GS
2466 struct regnode_charclass_class this_class;
2467 struct regnode_charclass_class *oclass = NULL;
727f22e3 2468 I32 next_is_eval = 0;
653099ff 2469
3dab1dad 2470 switch (PL_regkind[OP(scan)]) {
653099ff 2471 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2472 scan = NEXTOPER(scan);
2473 goto finish;
2474 case PLUS:
653099ff 2475 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2476 next = NEXTOPER(scan);
653099ff 2477 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2478 mincount = 1;
2479 maxcount = REG_INFTY;
c277df42
IZ
2480 next = regnext(scan);
2481 scan = NEXTOPER(scan);
2482 goto do_curly;
2483 }
2484 }
2485 if (flags & SCF_DO_SUBSTR)
2486 data->pos_min++;
2487 min++;
2488 /* Fall through. */
2489 case STAR:
653099ff
GS
2490 if (flags & SCF_DO_STCLASS) {
2491 mincount = 0;
b81d288d 2492 maxcount = REG_INFTY;
653099ff
GS
2493 next = regnext(scan);
2494 scan = NEXTOPER(scan);
2495 goto do_curly;
2496 }
b81d288d 2497 is_inf = is_inf_internal = 1;
c277df42
IZ
2498 scan = regnext(scan);
2499 if (flags & SCF_DO_SUBSTR) {
830247a4 2500 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2501 data->longest = &(data->longest_float);
2502 }
2503 goto optimize_curly_tail;
2504 case CURLY:
b81d288d 2505 mincount = ARG1(scan);
c277df42
IZ
2506 maxcount = ARG2(scan);
2507 next = regnext(scan);
cb434fcc
IZ
2508 if (OP(scan) == CURLYX) {
2509 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2510 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2511 }
c277df42 2512 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2513 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2514 do_curly:
2515 if (flags & SCF_DO_SUBSTR) {
830247a4 2516 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2517 pos_before = data->pos_min;
2518 }
2519 if (data) {
2520 fl = data->flags;
2521 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2522 if (is_inf)
2523 data->flags |= SF_IS_INF;
2524 }
653099ff 2525 if (flags & SCF_DO_STCLASS) {
830247a4 2526 cl_init(pRExC_state, &this_class);
653099ff
GS
2527 oclass = data->start_class;
2528 data->start_class = &this_class;
2529 f |= SCF_DO_STCLASS_AND;
2530 f &= ~SCF_DO_STCLASS_OR;
2531 }
e1901655
IZ
2532 /* These are the cases when once a subexpression
2533 fails at a particular position, it cannot succeed
2534 even after backtracking at the enclosing scope.
b81d288d 2535
e1901655
IZ
2536 XXXX what if minimal match and we are at the
2537 initial run of {n,m}? */
2538 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2539 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2540
c277df42 2541 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2542 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2543 (mincount == 0
2544 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2545
2546 if (flags & SCF_DO_STCLASS)
2547 data->start_class = oclass;
2548 if (mincount == 0 || minnext == 0) {
2549 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2550 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2551 }
2552 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2553 /* Switch to OR mode: cache the old value of
653099ff
GS
2554 * data->start_class */
2555 StructCopy(data->start_class, &and_with,
2556 struct regnode_charclass_class);
2557 flags &= ~SCF_DO_STCLASS_AND;
2558 StructCopy(&this_class, data->start_class,
2559 struct regnode_charclass_class);
2560 flags |= SCF_DO_STCLASS_OR;
2561 data->start_class->flags |= ANYOF_EOS;
2562 }
2563 } else { /* Non-zero len */
2564 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2565 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2566 cl_and(data->start_class, &and_with);
2567 }
2568 else if (flags & SCF_DO_STCLASS_AND)
2569 cl_and(data->start_class, &this_class);
2570 flags &= ~SCF_DO_STCLASS;
2571 }
c277df42
IZ
2572 if (!scan) /* It was not CURLYX, but CURLY. */
2573 scan = next;
041457d9
DM
2574 if ( /* ? quantifier ok, except for (?{ ... }) */
2575 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2576 && (minnext == 0) && (deltanext == 0)
99799961 2577 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2578 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2579 && ckWARN(WARN_REGEXP))
b45f050a 2580 {
830247a4 2581 vWARN(RExC_parse,
b45f050a
JF
2582 "Quantifier unexpected on zero-length expression");
2583 }
2584
c277df42 2585 min += minnext * mincount;
b81d288d 2586 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2587 && (minnext + deltanext) > 0)
2588 || deltanext == I32_MAX);
aca2d497 2589 is_inf |= is_inf_internal;
c277df42
IZ
2590 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2591
2592 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2593 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2594 && data->flags & SF_IN_PAR
2595 && !(data->flags & SF_HAS_EVAL)
2596 && !deltanext && minnext == 1 ) {
2597 /* Try to optimize to CURLYN. */
2598 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2599 regnode * const nxt1 = nxt;
497b47a8
JH
2600#ifdef DEBUGGING
2601 regnode *nxt2;
2602#endif
c277df42
IZ
2603
2604 /* Skip open. */
2605 nxt = regnext(nxt);
bfed75c6 2606 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2607 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2608 && STR_LEN(nxt) == 1))
c277df42 2609 goto nogo;
497b47a8 2610#ifdef DEBUGGING
c277df42 2611 nxt2 = nxt;
497b47a8 2612#endif
c277df42 2613 nxt = regnext(nxt);
b81d288d 2614 if (OP(nxt) != CLOSE)
c277df42
IZ
2615 goto nogo;
2616 /* Now we know that nxt2 is the only contents: */
eb160463 2617 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2618 OP(oscan) = CURLYN;
2619 OP(nxt1) = NOTHING; /* was OPEN. */
2620#ifdef DEBUGGING
2621 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2622 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2623 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2624 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2625 OP(nxt + 1) = OPTIMIZED; /* was count. */
2626 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2627#endif
c277df42 2628 }
c277df42
IZ
2629 nogo:
2630
2631 /* Try optimization CURLYX => CURLYM. */
b81d288d 2632 if ( OP(oscan) == CURLYX && data
c277df42 2633 && !(data->flags & SF_HAS_PAR)
c277df42 2634 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2635 && !deltanext /* atom is fixed width */
2636 && minnext != 0 /* CURLYM can't handle zero width */
2637 ) {
c277df42
IZ
2638 /* XXXX How to optimize if data == 0? */
2639 /* Optimize to a simpler form. */
2640 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2641 regnode *nxt2;
2642
2643 OP(oscan) = CURLYM;
2644 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2645 && (OP(nxt2) != WHILEM))
c277df42
IZ
2646 nxt = nxt2;
2647 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2648 /* Need to optimize away parenths. */
2649 if (data->flags & SF_IN_PAR) {
2650 /* Set the parenth number. */
2651 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2652
b81d288d 2653 if (OP(nxt) != CLOSE)
b45f050a 2654 FAIL("Panic opt close");
eb160463 2655 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2656 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2657 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2658#ifdef DEBUGGING
2659 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2660 OP(nxt + 1) = OPTIMIZED; /* was count. */
2661 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2662 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2663#endif
c277df42
IZ
2664#if 0
2665 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2666 regnode *nnxt = regnext(nxt1);
b81d288d 2667
c277df42
IZ
2668 if (nnxt == nxt) {
2669 if (reg_off_by_arg[OP(nxt1)])
2670 ARG_SET(nxt1, nxt2 - nxt1);
2671 else if (nxt2 - nxt1 < U16_MAX)
2672 NEXT_OFF(nxt1) = nxt2 - nxt1;
2673 else
2674 OP(nxt) = NOTHING; /* Cannot beautify */
2675 }
2676 nxt1 = nnxt;
2677 }
2678#endif
2679 /* Optimize again: */
b81d288d 2680 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2681 NULL, 0,depth+1);
a0ed51b3
LW
2682 }
2683 else
c277df42 2684 oscan->flags = 0;
c277df42 2685 }
e1901655
IZ
2686 else if ((OP(oscan) == CURLYX)
2687 && (flags & SCF_WHILEM_VISITED_POS)
2688 /* See the comment on a similar expression above.
2689 However, this time it not a subexpression
2690 we care about, but the expression itself. */
2691 && (maxcount == REG_INFTY)
2692 && data && ++data->whilem_c < 16) {
2693 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2694 /* Find WHILEM (as in regexec.c) */
2695 regnode *nxt = oscan + NEXT_OFF(oscan);
2696
2697 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2698 nxt += ARG(nxt);
eb160463
GS
2699 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2700 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2701 }
b81d288d 2702 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2703 pars++;
2704 if (flags & SCF_DO_SUBSTR) {
c445ea15 2705 SV *last_str = NULL;
c277df42
IZ
2706 int counted = mincount != 0;
2707
2708 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2709#if defined(SPARC64_GCC_WORKAROUND)
2710 I32 b = 0;
2711 STRLEN l = 0;
cfd0369c 2712 const char *s = NULL;
5d1c421c
JH
2713 I32 old = 0;
2714
2715 if (pos_before >= data->last_start_min)
2716 b = pos_before;
2717 else
2718 b = data->last_start_min;
2719
2720 l = 0;
cfd0369c 2721 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2722 old = b - data->last_start_min;
2723
2724#else
b81d288d 2725 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2726 ? pos_before : data->last_start_min;
2727 STRLEN l;
d4c19fe8 2728 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2729 I32 old = b - data->last_start_min;
5d1c421c 2730#endif
a0ed51b3
LW
2731
2732 if (UTF)
2733 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2734
a0ed51b3 2735 l -= old;
c277df42 2736 /* Get the added string: */
79cb57f6 2737 last_str = newSVpvn(s + old, l);
0e933229
IH
2738 if (UTF)
2739 SvUTF8_on(last_str);
c277df42
IZ
2740 if (deltanext == 0 && pos_before == b) {
2741 /* What was added is a constant string */
2742 if (mincount > 1) {
2743 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2744 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2745 SvPVX_const(last_str), l, mincount - 1);
b162af07 2746 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2747 /* Add additional parts. */
b81d288d 2748 SvCUR_set(data->last_found,
c277df42
IZ
2749 SvCUR(data->last_found) - l);
2750 sv_catsv(data->last_found, last_str);
0eda9292
JH
2751 {
2752 SV * sv = data->last_found;
2753 MAGIC *mg =
2754 SvUTF8(sv) && SvMAGICAL(sv) ?
2755 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2756 if (mg && mg->mg_len >= 0)
2757 mg->mg_len += CHR_SVLEN(last_str);
2758 }
c277df42
IZ
2759 data->last_end += l * (mincount - 1);
2760 }
2a8d9689
HS
2761 } else {
2762 /* start offset must point into the last copy */
2763 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2764 data->last_start_max += is_inf ? I32_MAX
2765 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2766 }
2767 }
2768 /* It is counted once already... */
2769 data->pos_min += minnext * (mincount - counted);
2770 data->pos_delta += - counted * deltanext +
2771 (minnext + deltanext) * maxcount - minnext * mincount;
2772 if (mincount != maxcount) {
653099ff
GS
2773 /* Cannot extend fixed substrings found inside
2774 the group. */
830247a4 2775 scan_commit(pRExC_state,data);
c277df42 2776 if (mincount && last_str) {
d4c19fe8
AL
2777 SV * const sv = data->last_found;
2778 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2779 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2780
2781 if (mg)
2782 mg->mg_len = -1;
2783 sv_setsv(sv, last_str);
c277df42 2784 data->last_end = data->pos_min;
b81d288d 2785 data->last_start_min =
a0ed51b3 2786 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2787 data->last_start_max = is_inf
2788 ? I32_MAX
c277df42 2789 : data->pos_min + data->pos_delta
a0ed51b3 2790 - CHR_SVLEN(last_str);
c277df42
IZ
2791 }
2792 data->longest = &(data->longest_float);
2793 }
aca2d497 2794 SvREFCNT_dec(last_str);
c277df42 2795 }
405ff068 2796 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2797 data->flags |= SF_HAS_EVAL;
2798 optimize_curly_tail:
c277df42 2799 if (OP(oscan) != CURLYX) {
3dab1dad 2800 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2801 && NEXT_OFF(next))
2802 NEXT_OFF(oscan) += NEXT_OFF(next);
2803 }
c277df42 2804 continue;
653099ff 2805 default: /* REF and CLUMP only? */
c277df42 2806 if (flags & SCF_DO_SUBSTR) {
830247a4 2807 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2808 data->longest = &(data->longest_float);
2809 }
aca2d497 2810 is_inf = is_inf_internal = 1;
653099ff 2811 if (flags & SCF_DO_STCLASS_OR)
830247a4 2812 cl_anything(pRExC_state, data->start_class);
653099ff 2813 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2814 break;
2815 }
a0ed51b3 2816 }
bfed75c6 2817 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2818 int value = 0;
653099ff 2819
c277df42 2820 if (flags & SCF_DO_SUBSTR) {
830247a4 2821 scan_commit(pRExC_state,data);
c277df42
IZ
2822 data->pos_min++;
2823 }
2824 min++;
653099ff
GS
2825 if (flags & SCF_DO_STCLASS) {
2826 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2827
2828 /* Some of the logic below assumes that switching
2829 locale on will only add false positives. */
3dab1dad 2830 switch (PL_regkind[OP(scan)]) {
653099ff 2831 case SANY:
653099ff
GS
2832 default:
2833 do_default:
2834 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2835 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2836 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2837 break;
2838 case REG_ANY:
2839 if (OP(scan) == SANY)
2840 goto do_default;
2841 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2842 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2843 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2844 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2845 }
2846 if (flags & SCF_DO_STCLASS_AND || !value)
2847 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2848 break;
2849 case ANYOF:
2850 if (flags & SCF_DO_STCLASS_AND)
2851 cl_and(data->start_class,
2852 (struct regnode_charclass_class*)scan);
2853 else
830247a4 2854 cl_or(pRExC_state, data->start_class,
653099ff
GS
2855 (struct regnode_charclass_class*)scan);
2856 break;
2857 case ALNUM:
2858 if (flags & SCF_DO_STCLASS_AND) {
2859 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2860 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2861 for (value = 0; value < 256; value++)
2862 if (!isALNUM(value))
2863 ANYOF_BITMAP_CLEAR(data->start_class, value);
2864 }
2865 }
2866 else {
2867 if (data->start_class->flags & ANYOF_LOCALE)
2868 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2869 else {
2870 for (value = 0; value < 256; value++)
2871 if (isALNUM(value))
b81d288d 2872 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2873 }
2874 }
2875 break;
2876 case ALNUML:
2877 if (flags & SCF_DO_STCLASS_AND) {
2878 if (data->start_class->flags & ANYOF_LOCALE)
2879 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2880 }
2881 else {
2882 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2883 data->start_class->flags |= ANYOF_LOCALE;
2884 }
2885 break;
2886 case NALNUM:
2887 if (flags & SCF_DO_STCLASS_AND) {
2888 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2889 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2890 for (value = 0; value < 256; value++)
2891 if (isALNUM(value))
2892 ANYOF_BITMAP_CLEAR(data->start_class, value);
2893 }
2894 }
2895 else {
2896 if (data->start_class->flags & ANYOF_LOCALE)
2897 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2898 else {
2899 for (value = 0; value < 256; value++)
2900 if (!isALNUM(value))
b81d288d 2901 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2902 }
2903 }
2904 break;
2905 case NALNUML:
2906 if (flags & SCF_DO_STCLASS_AND) {
2907 if (data->start_class->flags & ANYOF_LOCALE)
2908 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2909 }
2910 else {
2911 data->start_class->flags |= ANYOF_LOCALE;
2912 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2913 }
2914 break;
2915 case SPACE:
2916 if (flags & SCF_DO_STCLASS_AND) {
2917 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2918 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2919 for (value = 0; value < 256; value++)
2920 if (!isSPACE(value))
2921 ANYOF_BITMAP_CLEAR(data->start_class, value);
2922 }
2923 }
2924 else {
2925 if (data->start_class->flags & ANYOF_LOCALE)
2926 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2927 else {
2928 for (value = 0; value < 256; value++)
2929 if (isSPACE(value))
b81d288d 2930 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2931 }
2932 }
2933 break;
2934 case SPACEL:
2935 if (flags & SCF_DO_STCLASS_AND) {
2936 if (data->start_class->flags & ANYOF_LOCALE)
2937 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2938 }
2939 else {
2940 data->start_class->flags |= ANYOF_LOCALE;
2941 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2942 }
2943 break;
2944 case NSPACE:
2945 if (flags & SCF_DO_STCLASS_AND) {
2946 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2947 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2948 for (value = 0; value < 256; value++)
2949 if (isSPACE(value))
2950 ANYOF_BITMAP_CLEAR(data->start_class, value);
2951 }
2952 }
2953 else {
2954 if (data->start_class->flags & ANYOF_LOCALE)
2955 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2956 else {
2957 for (value = 0; value < 256; value++)
2958 if (!isSPACE(value))
b81d288d 2959 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2960 }
2961 }
2962 break;
2963 case NSPACEL:
2964 if (flags & SCF_DO_STCLASS_AND) {
2965 if (data->start_class->flags & ANYOF_LOCALE) {
2966 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2967 for (value = 0; value < 256; value++)
2968 if (!isSPACE(value))
2969 ANYOF_BITMAP_CLEAR(data->start_class, value);
2970 }
2971 }
2972 else {
2973 data->start_class->flags |= ANYOF_LOCALE;
2974 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2975 }
2976 break;
2977 case DIGIT:
2978 if (flags & SCF_DO_STCLASS_AND) {
2979 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2980 for (value = 0; value < 256; value++)
2981 if (!isDIGIT(value))
2982 ANYOF_BITMAP_CLEAR(data->start_class, value);
2983 }
2984 else {
2985 if (data->start_class->flags & ANYOF_LOCALE)
2986 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2987 else {
2988 for (value = 0; value < 256; value++)
2989 if (isDIGIT(value))
b81d288d 2990 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2991 }
2992 }
2993 break;
2994 case NDIGIT:
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2997 for (value = 0; value < 256; value++)
2998 if (isDIGIT(value))
2999 ANYOF_BITMAP_CLEAR(data->start_class, value);
3000 }
3001 else {
3002 if (data->start_class->flags & ANYOF_LOCALE)
3003 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3004 else {
3005 for (value = 0; value < 256; value++)
3006 if (!isDIGIT(value))
b81d288d 3007 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3008 }
3009 }
3010 break;
3011 }
3012 if (flags & SCF_DO_STCLASS_OR)
3013 cl_and(data->start_class, &and_with);
3014 flags &= ~SCF_DO_STCLASS;
3015 }
a0ed51b3 3016 }
3dab1dad 3017 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3018 data->flags |= (OP(scan) == MEOL
3019 ? SF_BEFORE_MEOL
3020 : SF_BEFORE_SEOL);
a0ed51b3 3021 }
3dab1dad 3022 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3023 /* Lookbehind, or need to calculate parens/evals/stclass: */
3024 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3025 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3026 /* Lookahead/lookbehind */
cb434fcc 3027 I32 deltanext, minnext, fake = 0;
c277df42 3028 regnode *nscan;
653099ff
GS
3029 struct regnode_charclass_class intrnl;
3030 int f = 0;
c277df42
IZ
3031
3032 data_fake.flags = 0;
b81d288d 3033 if (data) {
2c2d71f5 3034 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3035 data_fake.last_closep = data->last_closep;
3036 }
3037 else
3038 data_fake.last_closep = &fake;
653099ff
GS
3039 if ( flags & SCF_DO_STCLASS && !scan->flags
3040 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3041 cl_init(pRExC_state, &intrnl);
653099ff 3042 data_fake.start_class = &intrnl;
e1901655 3043 f |= SCF_DO_STCLASS_AND;
653099ff 3044 }
e1901655
IZ
3045 if (flags & SCF_WHILEM_VISITED_POS)
3046 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3047 next = regnext(scan);
3048 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3049 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3050 if (scan->flags) {
3051 if (deltanext) {
9baa0206 3052 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
3053 }
3054 else if (minnext > U8_MAX) {
9baa0206 3055 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3056 }
eb160463 3057 scan->flags = (U8)minnext;
c277df42 3058 }
be8e71aa
YO
3059 if (data) {
3060 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3061 pars++;
3062 if (data_fake.flags & SF_HAS_EVAL)
3063 data->flags |= SF_HAS_EVAL;
2c2d71f5 3064 data->whilem_c = data_fake.whilem_c;
be8e71aa 3065 }
e1901655 3066 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3067 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3068
3069 cl_and(data->start_class, &intrnl);
3070 if (was)
3071 data->start_class->flags |= ANYOF_EOS;
3072 }
a0ed51b3
LW
3073 }
3074 else if (OP(scan) == OPEN) {
c277df42 3075 pars++;
a0ed51b3 3076 }
cb434fcc 3077 else if (OP(scan) == CLOSE) {
eb160463 3078 if ((I32)ARG(scan) == is_par) {
cb434fcc 3079 next = regnext(scan);
c277df42 3080
cb434fcc
IZ
3081 if ( next && (OP(next) != WHILEM) && next < last)
3082 is_par = 0; /* Disable optimization */
3083 }
3084 if (data)
3085 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3086 }
3087 else if (OP(scan) == EVAL) {
c277df42
IZ
3088 if (data)
3089 data->flags |= SF_HAS_EVAL;
3090 }
96776eda 3091 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3092 if (flags & SCF_DO_SUBSTR) {
830247a4 3093 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3094 data->longest = &(data->longest_float);
3095 }
3096 is_inf = is_inf_internal = 1;
653099ff 3097 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3098 cl_anything(pRExC_state, data->start_class);
96776eda 3099 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3100 }
c277df42
IZ
3101 /* Else: zero-length, ignore. */
3102 scan = regnext(scan);
3103 }
3104
3105 finish:
3106 *scanp = scan;
aca2d497 3107 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3108 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
3109 data->pos_delta = I32_MAX - data->pos_min;
3110 if (is_par > U8_MAX)
3111 is_par = 0;
3112 if (is_par && pars==1 && data) {
3113 data->flags |= SF_IN_PAR;
3114 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3115 }
3116 else if (pars && data) {
c277df42
IZ
3117 data->flags |= SF_HAS_PAR;
3118 data->flags &= ~SF_IN_PAR;
3119 }
653099ff
GS
3120 if (flags & SCF_DO_STCLASS_OR)
3121 cl_and(data->start_class, &and_with);
07be1b83
YO
3122 if (flags & SCF_EXACT_TRIE)
3123 data->flags |= SCF_EXACT_TRIE;
c277df42
IZ
3124 return min;
3125}
3126
76e3520e 3127STATIC I32
5f66b61c 3128S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3129{
830247a4 3130 if (RExC_rx->data) {
b81d288d
AB
3131 Renewc(RExC_rx->data,
3132 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3133 char, struct reg_data);
830247a4
IZ
3134 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3135 RExC_rx->data->count += n;
a0ed51b3
LW
3136 }
3137 else {
a02a5408 3138 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3139 char, struct reg_data);
a02a5408 3140 Newx(RExC_rx->data->what, n, U8);
830247a4 3141 RExC_rx->data->count = n;
c277df42 3142 }
830247a4
IZ
3143 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3144 return RExC_rx->data->count - n;
c277df42
IZ
3145}
3146
76234dfb 3147#ifndef PERL_IN_XSUB_RE
d88dccdf 3148void
864dbfa3 3149Perl_reginitcolors(pTHX)
d88dccdf 3150{
97aff369 3151 dVAR;
1df70142 3152 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3153 if (s) {
1df70142
AL
3154 char *t = savepv(s);
3155 int i = 0;
3156 PL_colors[0] = t;
d88dccdf 3157 while (++i < 6) {
1df70142
AL
3158 t = strchr(t, '\t');
3159 if (t) {
3160 *t = '\0';
3161 PL_colors[i] = ++t;
d88dccdf
IZ
3162 }
3163 else
1df70142 3164 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3165 }
3166 } else {
1df70142 3167 int i = 0;
b81d288d 3168 while (i < 6)
06b5626a 3169 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3170 }
3171 PL_colorset = 1;
3172}
76234dfb 3173#endif
8615cb43 3174
07be1b83 3175
a687059c 3176/*
e50aee73 3177 - pregcomp - compile a regular expression into internal code
a687059c
LW
3178 *
3179 * We can't allocate space until we know how big the compiled form will be,
3180 * but we can't compile it (and thus know how big it is) until we've got a
3181 * place to put the code. So we cheat: we compile it twice, once with code
3182 * generation turned off and size counting turned on, and once "for real".
3183 * This also means that we don't allocate space until we are sure that the
3184 * thing really will compile successfully, and we never have to move the
3185 * code and thus invalidate pointers into it. (Note that it has to be in
3186 * one piece because free() must be able to free it all.) [NB: not true in perl]
3187 *
3188 * Beware that the optimization-preparation code in here knows about some
3189 * of the structure of the compiled regexp. [I'll say.]
3190 */
3191regexp *
864dbfa3 3192Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3193{
97aff369 3194 dVAR;
a0d0e21e 3195 register regexp *r;
c277df42 3196 regnode *scan;
c277df42 3197 regnode *first;
a0d0e21e 3198 I32 flags;
a0d0e21e
LW
3199 I32 minlen = 0;
3200 I32 sawplus = 0;
3201 I32 sawopen = 0;
2c2d71f5 3202 scan_data_t data;
830247a4 3203 RExC_state_t RExC_state;
be8e71aa 3204 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3205#ifdef TRIE_STUDY_OPT
3206 int restudied= 0;
3207 RExC_state_t copyRExC_state;
3208#endif
a0d0e21e 3209
a3621e74
YO
3210 GET_RE_DEBUG_FLAGS_DECL;
3211
a0d0e21e 3212 if (exp == NULL)
c277df42 3213 FAIL("NULL regexp argument");
a0d0e21e 3214
a5961de5 3215 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3216
5cfc7842 3217 RExC_precomp = exp;
a3621e74
YO
3218 DEBUG_r(if (!PL_colorset) reginitcolors());
3219 DEBUG_COMPILE_r({
3220 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3221 PL_colors[4],PL_colors[5],PL_colors[0],
3222 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3223 });
e2509266 3224 RExC_flags = pm->op_pmflags;
830247a4 3225 RExC_sawback = 0;
bbce6d69 3226
830247a4
IZ
3227 RExC_seen = 0;
3228 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3229 RExC_seen_evals = 0;
3230 RExC_extralen = 0;
c277df42 3231
bbce6d69 3232 /* First pass: determine size, legality. */
830247a4 3233 RExC_parse = exp;
fac92740 3234 RExC_start = exp;
830247a4
IZ
3235 RExC_end = xend;
3236 RExC_naughty = 0;
3237 RExC_npar = 1;
3238 RExC_size = 0L;
3239 RExC_emit = &PL_regdummy;
3240 RExC_whilem_seen = 0;
85ddcde9
JH
3241#if 0 /* REGC() is (currently) a NOP at the first pass.
3242 * Clever compilers notice this and complain. --jhi */
830247a4 3243 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3244#endif
3dab1dad
YO
3245 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3246 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3247 RExC_precomp = NULL;
a0d0e21e
LW
3248 return(NULL);
3249 }
3dab1dad
YO
3250 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3251 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3252 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3253 DEBUG_PARSE_r({
3254 RExC_lastnum=0;
3255 RExC_lastparse=NULL;
3256 });
c277df42 3257
07be1b83 3258
c277df42
IZ
3259 /* Small enough for pointer-storage convention?
3260 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3261 if (RExC_size >= 0x10000L && RExC_extralen)
3262 RExC_size += RExC_extralen;
c277df42 3263 else
830247a4
IZ
3264 RExC_extralen = 0;
3265 if (RExC_whilem_seen > 15)
3266 RExC_whilem_seen = 15;
a0d0e21e 3267
bbce6d69 3268 /* Allocate space and initialize. */
a02a5408 3269 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3270 char, regexp);
a0d0e21e 3271 if (r == NULL)
b45f050a
JF
3272 FAIL("Regexp out of space");
3273
0f79a09d
GS
3274#ifdef DEBUGGING
3275 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3276 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3277#endif
c277df42 3278 r->refcnt = 1;
bbce6d69 3279 r->prelen = xend - exp;
5cfc7842 3280 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3281 r->subbeg = NULL;
f8c7b90f 3282#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3283 r->saved_copy = NULL;
ed252734 3284#endif
cf93c79d 3285 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3286 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3287 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3288
3289 r->substrs = 0; /* Useful during FAIL. */
3290 r->startp = 0; /* Useful during FAIL. */
3291 r->endp = 0; /* Useful during FAIL. */
3292
a02a5408 3293 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3294 if (r->offsets) {
2af232bd 3295 r->offsets[0] = RExC_size;
fac92740 3296 }
a3621e74 3297 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3298 "%s %"UVuf" bytes for offset annotations.\n",
3299 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3300 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3301
830247a4 3302 RExC_rx = r;
bbce6d69 3303
3304 /* Second pass: emit code. */
e2509266 3305 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3306 RExC_parse = exp;
3307 RExC_end = xend;
3308 RExC_naughty = 0;
3309 RExC_npar = 1;
fac92740 3310 RExC_emit_start = r->program;
830247a4 3311 RExC_emit = r->program;
2cd61cdb 3312 /* Store the count of eval-groups for security checks: */
eb160463 3313 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3314 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3315 r->data = 0;
3dab1dad 3316 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3317 return(NULL);
07be1b83
YO
3318 /* XXXX To minimize changes to RE engine we always allocate
3319 3-units-long substrs field. */
3320 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3321
07be1b83
YO
3322reStudy:
3323 Zero(r->substrs, 1, struct reg_substr_data);
3324 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3325
07be1b83
YO
3326#ifdef TRIE_STUDY_OPT
3327 if ( restudied ) {
3328 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3329 RExC_state=copyRExC_state;
3330 if (data.longest_fixed)
3331 SvREFCNT_dec(data.longest_fixed);
3332 if (data.longest_float)
3333 SvREFCNT_dec(data.longest_float);
3334 if (data.last_found)
3335 SvREFCNT_dec(data.last_found);
3336 } else {
3337 copyRExC_state=RExC_state;
3338 }
3339#endif
a0d0e21e 3340 /* Dig out information for optimizations. */
cf93c79d 3341 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3342 pm->op_pmflags = RExC_flags;
a0ed51b3 3343 if (UTF)
5ff6fc6d 3344 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3345 r->regstclass = NULL;
830247a4 3346 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3347 r->reganch |= ROPT_NAUGHTY;
c277df42 3348 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3349
653099ff 3350 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3351 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3352 I32 fake;
c5254dd6 3353 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3354 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3355 int stclass_flag;
07be1b83 3356 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3357
3358 first = scan;
c277df42 3359 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3360 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3361 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3362 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3363 /* for now we can't handle lookbehind IFMATCH*/
3364 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3365 (OP(first) == PLUS) ||
3366 (OP(first) == MINMOD) ||
653099ff 3367 /* An {n,m} with n>0 */
07be1b83
YO
3368 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3369 {
3370 DEBUG_PEEP("first:",first,0);
a0d0e21e
LW
3371 if (OP(first) == PLUS)
3372 sawplus = 1;
3373 else
3dab1dad 3374 first += regarglen[OP(first)];
07be1b83
YO
3375 if (OP(first) == IFMATCH) {
3376 first = NEXTOPER(first);
3377 first += EXTRA_STEP_2ARGS;
3378 } else /*xxx possible optimisation for /(?=)/*/
3379 first = NEXTOPER(first);
a687059c
LW
3380 }
3381
a0d0e21e
LW
3382 /* Starting-point info. */
3383 again:
07be1b83 3384 /* Ignore EXACT as we deal with it later. */
3dab1dad 3385 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3386 if (OP(first) == EXACT)
6f207bd3 3387 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3388 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3389 r->regstclass = first;
3390 }
07be1b83
YO
3391#ifdef TRIE_STCLASS
3392 else if (OP(first) == TRIE &&
3393 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3394 {
3395 /* this can happen only on restudy */
3396 struct regnode_1 *trie_op;
3397 Newxz(trie_op,1,struct regnode_1);
3398 StructCopy(first,trie_op,struct regnode_1);
3399 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3400 r->regstclass = (regnode *)trie_op;
3401 }
3402#endif
bfed75c6 3403 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3404 r->regstclass = first;
3dab1dad
YO
3405 else if (PL_regkind[OP(first)] == BOUND ||
3406 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3407 r->regstclass = first;
3dab1dad 3408 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3409 r->reganch |= (OP(first) == MBOL
3410 ? ROPT_ANCH_MBOL
3411 : (OP(first) == SBOL
3412 ? ROPT_ANCH_SBOL
3413 : ROPT_ANCH_BOL));
a0d0e21e 3414 first = NEXTOPER(first);
774d564b 3415 goto again;
3416 }
3417 else if (OP(first) == GPOS) {
3418 r->reganch |= ROPT_ANCH_GPOS;
3419 first = NEXTOPER(first);
3420 goto again;
a0d0e21e 3421 }
e09294f4 3422 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3423 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3424 !(r->reganch & ROPT_ANCH) )
3425 {
3426 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3427 const int type =
3428 (OP(NEXTOPER(first)) == REG_ANY)
3429 ? ROPT_ANCH_MBOL
3430 : ROPT_ANCH_SBOL;
cad2e5aa 3431 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3432 first = NEXTOPER(first);
774d564b 3433 goto again;
a0d0e21e 3434 }
b81d288d 3435 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3436 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3437 /* x+ must match at the 1st pos of run of x's */
3438 r->reganch |= ROPT_SKIP;
a0d0e21e 3439
c277df42 3440 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3441#ifdef TRIE_STUDY_OPT
3442 DEBUG_COMPILE_r(
3443 if (!restudied)
3444 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3445 (IV)(first - scan + 1))
3446 );
3447#else
3448 DEBUG_COMPILE_r(
3449 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3450 (IV)(first - scan + 1))
3451 );
3452#endif
3453
3454
a0d0e21e
LW
3455 /*
3456 * If there's something expensive in the r.e., find the
3457 * longest literal string that must appear and make it the
3458 * regmust. Resolve ties in favor of later strings, since
3459 * the regstart check works with the beginning of the r.e.
3460 * and avoiding duplication strengthens checking. Not a
3461 * strong reason, but sufficient in the absence of others.
3462 * [Now we resolve ties in favor of the earlier string if
c277df42 3463 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3464 * earlier string may buy us something the later one won't.]
3465 */
a0d0e21e 3466 minlen = 0;
a687059c 3467
396482e1
GA
3468 data.longest_fixed = newSVpvs("");
3469 data.longest_float = newSVpvs("");
3470 data.last_found = newSVpvs("");
c277df42
IZ
3471 data.longest = &(data.longest_fixed);
3472 first = scan;
653099ff 3473 if (!r->regstclass) {
830247a4 3474 cl_init(pRExC_state, &ch_class);
653099ff
GS
3475 data.start_class = &ch_class;
3476 stclass_flag = SCF_DO_STCLASS_AND;
3477 } else /* XXXX Check for BOUND? */
3478 stclass_flag = 0;
cb434fcc 3479 data.last_closep = &last_close;
653099ff 3480
830247a4 3481 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3482 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83
YO
3483
3484#ifdef TRIE_STUDY_OPT
3485 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3486 goto reStudy;
3487 }
3488#endif
3489
830247a4 3490 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3491 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3492 && !RExC_seen_zerolen
3493 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3494 r->reganch |= ROPT_CHECK_ALL;
830247a4 3495 scan_commit(pRExC_state, &data);
c277df42
IZ
3496 SvREFCNT_dec(data.last_found);
3497
a0ed51b3 3498 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3499 if (longest_float_length
c277df42
IZ
3500 || (data.flags & SF_FL_BEFORE_EOL
3501 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3502 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3503 int t;
3504
a0ed51b3 3505 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3506 && data.offset_fixed == data.offset_float_min
3507 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3508 goto remove_float; /* As in (a)+. */
3509
33b8afdf
JH
3510 if (SvUTF8(data.longest_float)) {
3511 r->float_utf8 = data.longest_float;
c445ea15 3512 r->float_substr = NULL;
33b8afdf
JH
3513 } else {
3514 r->float_substr = data.longest_float;
c445ea15 3515 r->float_utf8 = NULL;
33b8afdf 3516 }
c277df42
IZ
3517 r->float_min_offset = data.offset_float_min;
3518 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3519 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3520 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3521 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3522 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3523 }
3524 else {
aca2d497 3525 remove_float:
c445ea15 3526 r->float_substr = r->float_utf8 = NULL;
c277df42 3527 SvREFCNT_dec(data.longest_float);
c5254dd6 3528 longest_float_length = 0;
a0d0e21e 3529 }
c277df42 3530
a0ed51b3 3531 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3532 if (longest_fixed_length
c277df42
IZ
3533 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3534 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3535 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3536 int t;
3537
33b8afdf
JH
3538 if (SvUTF8(data.longest_fixed)) {
3539 r->anchored_utf8 = data.longest_fixed;
c445ea15 3540 r->anchored_substr = NULL;
33b8afdf
JH
3541 } else {
3542 r->anchored_substr = data.longest_fixed;
c445ea15 3543 r->anchored_utf8 = NULL;
33b8afdf 3544 }
c277df42 3545 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3546 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3547 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3548 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3549 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3550 }
3551 else {
c445ea15 3552 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3553 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3554 longest_fixed_length = 0;
a0d0e21e 3555 }
b81d288d 3556 if (r->regstclass
ffc61ed2 3557 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3558 r->regstclass = NULL;
33b8afdf
JH
3559 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3560 && stclass_flag
653099ff 3561 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3562 && !cl_is_anything(data.start_class))
3563 {
1df70142 3564 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3565
a02a5408 3566 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3567 struct regnode_charclass_class);
3568 StructCopy(data.start_class,
830247a4 3569 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3570 struct regnode_charclass_class);
830247a4 3571 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3572 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3573 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3574 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3575 PerlIO_printf(Perl_debug_log,
a0288114 3576 "synthetic stclass \"%s\".\n",
3f7c398e 3577 SvPVX_const(sv));});
653099ff 3578 }
c277df42
IZ
3579
3580 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3581 if (longest_fixed_length > longest_float_length) {
c277df42 3582 r->check_substr = r->anchored_substr;
33b8afdf 3583 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3584 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3585 if (r->reganch & ROPT_ANCH_SINGLE)
3586 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3587 }
3588 else {
c277df42 3589 r->check_substr = r->float_substr;
33b8afdf 3590 r->check_utf8 = r->float_utf8;
c277df42
IZ
3591 r->check_offset_min = data.offset_float_min;
3592 r->check_offset_max = data.offset_float_max;
a0d0e21e 3593 }
30382c73
IZ
3594 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3595 This should be changed ASAP! */
33b8afdf 3596 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3597 r->reganch |= RE_USE_INTUIT;
33b8afdf 3598 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3599 r->reganch |= RE_INTUIT_TAIL;
3600 }
a0ed51b3
LW
3601 }
3602 else {
c277df42
IZ
3603 /* Several toplevels. Best we can is to set minlen. */
3604 I32 fake;
653099ff 3605 struct regnode_charclass_class ch_class;
cb434fcc 3606 I32 last_close = 0;
c277df42 3607
a3621e74 3608 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3609
c277df42 3610 scan = r->program + 1;
830247a4 3611 cl_init(pRExC_state, &ch_class);
653099ff 3612 data.start_class = &ch_class;
cb434fcc 3613 data.last_closep = &last_close;
07be1b83
YO
3614
3615 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3616 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3617
3618#ifdef TRIE_STUDY_OPT
3619 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3620 goto reStudy;
3621 }
3622#endif
3623
33b8afdf 3624 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3625 = r->float_substr = r->float_utf8 = NULL;
653099ff 3626 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3627 && !cl_is_anything(data.start_class))
3628 {
1df70142 3629 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3630
a02a5408 3631 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3632 struct regnode_charclass_class);
3633 StructCopy(data.start_class,
830247a4 3634 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3635 struct regnode_charclass_class);
830247a4 3636 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3637 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3638 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3639 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3640 PerlIO_printf(Perl_debug_log,
a0288114 3641 "synthetic stclass \"%s\".\n",
3f7c398e 3642 SvPVX_const(sv));});
653099ff 3643 }
a0d0e21e
LW
3644 }
3645
a0d0e21e 3646 r->minlen = minlen;
b81d288d 3647 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3648 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3649 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3650 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3651 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3652 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3653 if (RExC_seen & REG_SEEN_CANY)
3654 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3655 Newxz(r->startp, RExC_npar, I32);
3656 Newxz(r->endp, RExC_npar, I32);
07be1b83 3657
f2278c82 3658 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
3659 DEBUG_DUMP_r({
3660 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
3661 regdump(r);
3662 });
a0d0e21e 3663 return(r);
a687059c
LW
3664}
3665
3dab1dad
YO
3666
3667#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3668 int rem=(int)(RExC_end - RExC_parse); \
3669 int cut; \
3670 int num; \
3671 int iscut=0; \
3672 if (rem>10) { \
3673 rem=10; \
3674 iscut=1; \
3675 } \
3676 cut=10-rem; \
3677 if (RExC_lastparse!=RExC_parse) \
3678 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3679 rem, RExC_parse, \
3680 cut + 4, \
3681 iscut ? "..." : "<" \
3682 ); \
3683 else \
3684 PerlIO_printf(Perl_debug_log,"%16s",""); \
3685 \
3686 if (SIZE_ONLY) \
3687 num=RExC_size; \
3688 else \
3689 num=REG_NODE_NUM(RExC_emit); \
3690 if (RExC_lastnum!=num) \
be8e71aa 3691 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 3692 else \
be8e71aa
YO
3693 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3694 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3695 (int)((depth*2)), "", \
3dab1dad
YO
3696 (funcname) \
3697 ); \
3698 RExC_lastnum=num; \
3699 RExC_lastparse=RExC_parse; \
3700})
3701
07be1b83
YO
3702
3703
3dab1dad
YO
3704#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3705 DEBUG_PARSE_MSG((funcname)); \
3706 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3707})
a687059c
LW
3708/*
3709 - reg - regular expression, i.e. main body or parenthesized thing
3710 *
3711 * Caller must absorb opening parenthesis.
3712 *
3713 * Combining parenthesis handling with the base level of regular expression
3714 * is a trifle forced, but the need to tie the tails of the branches to what
3715 * follows makes it hard to avoid.
3716 */
07be1b83
YO
3717#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3718#ifdef DEBUGGING
3719#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3720#else
3721#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3722#endif
3dab1dad 3723
76e3520e 3724STATIC regnode *
3dab1dad 3725S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3726 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3727{
27da23d5 3728 dVAR;
c277df42
IZ
3729 register regnode *ret; /* Will be the head of the group. */
3730 register regnode *br;
3731 register regnode *lastbr;
cbbf8932 3732 register regnode *ender = NULL;
a0d0e21e 3733 register I32 parno = 0;
cbbf8932
AL
3734 I32 flags;
3735 const I32 oregflags = RExC_flags;
6136c704
AL
3736 bool have_branch = 0;
3737 bool is_open = 0;
9d1d55b5
JP
3738
3739 /* for (?g), (?gc), and (?o) warnings; warning
3740 about (?c) will warn about (?g) -- japhy */
3741
6136c704
AL
3742#define WASTED_O 0x01
3743#define WASTED_G 0x02
3744#define WASTED_C 0x04
3745#define WASTED_GC (0x02|0x04)
cbbf8932 3746 I32 wastedflags = 0x00;
9d1d55b5 3747
fac92740 3748 char * parse_start = RExC_parse; /* MJD */
a28509cc 3749 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3750
3dab1dad
YO
3751 GET_RE_DEBUG_FLAGS_DECL;
3752 DEBUG_PARSE("reg ");
3753
3754
821b33a5 3755 *flagp = 0; /* Tentatively. */
a0d0e21e 3756
9d1d55b5 3757
a0d0e21e
LW
3758 /* Make an OPEN node, if parenthesized. */
3759 if (paren) {
fac92740 3760 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3761 U32 posflags = 0, negflags = 0;
3762 U32 *flagsp = &posflags;
6136c704 3763 bool is_logical = 0;
a28509cc 3764 const char * const seqstart = RExC_parse;
ca9dfc88 3765
830247a4
IZ
3766 RExC_parse++;
3767 paren = *RExC_parse++;
c277df42 3768 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3769 switch (paren) {
fac92740 3770 case '<': /* (?<...) */
830247a4 3771 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3772 if (*RExC_parse == '!')
c277df42 3773 paren = ',';
b81d288d 3774 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3775 goto unknown;
830247a4 3776 RExC_parse++;
fac92740
MJD
3777 case '=': /* (?=...) */
3778 case '!': /* (?!...) */
830247a4 3779 RExC_seen_zerolen++;
fac92740
MJD
3780 case ':': /* (?:...) */
3781 case '>': /* (?>...) */
a0d0e21e 3782 break;
fac92740
MJD
3783 case '$': /* (?$...) */
3784 case '@': /* (?@...) */
8615cb43 3785 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3786 break;
fac92740 3787 case '#': /* (?#...) */
830247a4
IZ
3788 while (*RExC_parse && *RExC_parse != ')')
3789 RExC_parse++;
3790 if (*RExC_parse != ')')
c277df42 3791 FAIL("Sequence (?#... not terminated");
830247a4 3792 nextchar(pRExC_state);
a0d0e21e
LW
3793 *flagp = TRYAGAIN;
3794 return NULL;
fac92740 3795 case 'p': /* (?p...) */
9014280d 3796 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3797 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3798 /* FALL THROUGH*/
fac92740 3799 case '?': /* (??...) */
6136c704 3800 is_logical = 1;
438a3801
YST
3801 if (*RExC_parse != '{')
3802 goto unknown;
830247a4 3803 paren = *RExC_parse++;
0f5d15d6 3804 /* FALL THROUGH */
fac92740 3805 case '{': /* (?{...}) */
c277df42 3806 {
c277df42
IZ
3807 I32 count = 1, n = 0;
3808 char c;
830247a4 3809 char *s = RExC_parse;
c277df42 3810
830247a4
IZ
3811 RExC_seen_zerolen++;
3812 RExC_seen |= REG_SEEN_EVAL;
3813 while (count && (c = *RExC_parse)) {
6136c704
AL
3814 if (c == '\\') {
3815 if (RExC_parse[1])
3816 RExC_parse++;
3817 }
b81d288d 3818 else if (c == '{')
c277df42 3819 count++;
b81d288d 3820 else if (c == '}')
c277df42 3821 count--;
830247a4 3822 RExC_parse++;
c277df42 3823 }
6136c704 3824 if (*RExC_parse != ')') {
b81d288d 3825 RExC_parse = s;
b45f050a
JF
3826 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3827 }
c277df42 3828 if (!SIZE_ONLY) {
f3548bdc 3829 PAD *pad;
6136c704
AL
3830 OP_4tree *sop, *rop;
3831 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3832
569233ed
SB
3833 ENTER;
3834 Perl_save_re_context(aTHX);
f3548bdc 3835 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3836 sop->op_private |= OPpREFCOUNTED;
3837 /* re_dup will OpREFCNT_inc */
3838 OpREFCNT_set(sop, 1);
569233ed 3839 LEAVE;
c277df42 3840
830247a4
IZ
3841 n = add_data(pRExC_state, 3, "nop");
3842 RExC_rx->data->data[n] = (void*)rop;
3843 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3844 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3845 SvREFCNT_dec(sv);
a0ed51b3 3846 }
e24b16f9 3847 else { /* First pass */
830247a4 3848 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3849 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3850 /* No compiled RE interpolated, has runtime
3851 components ===> unsafe. */
3852 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3853 if (PL_tainting && PL_tainted)
cc6b7395 3854 FAIL("Eval-group in insecure regular expression");
54df2634 3855#if PERL_VERSION > 8
923e4eb5 3856 if (IN_PERL_COMPILETIME)
b5c19bd7 3857 PL_cv_has_eval = 1;
54df2634 3858#endif
c277df42 3859 }
b5c19bd7 3860
830247a4 3861 nextchar(pRExC_state);
6136c704 3862 if (is_logical) {
830247a4 3863 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3864 if (!SIZE_ONLY)
3865 ret->flags = 2;
3dab1dad 3866 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3867 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3868 return ret;
3869 }
ccb2c380
MP
3870 ret = reganode(pRExC_state, EVAL, n);
3871 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3872 Set_Node_Offset(ret, parse_start);
3873 return ret;
c277df42 3874 }
fac92740 3875 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3876 {
fac92740 3877 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3878 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3879 || RExC_parse[1] == '<'
830247a4 3880 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3881 I32 flag;
3882
830247a4 3883 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3884 if (!SIZE_ONLY)
3885 ret->flags = 1;
3dab1dad 3886 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3887 goto insert_if;
b81d288d 3888 }
a0ed51b3 3889 }
830247a4 3890 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3891 /* (?(1)...) */
6136c704 3892 char c;
830247a4 3893 parno = atoi(RExC_parse++);
c277df42 3894
830247a4
IZ
3895 while (isDIGIT(*RExC_parse))
3896 RExC_parse++;
fac92740 3897 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3898
830247a4 3899 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3900 vFAIL("Switch condition not recognized");
c277df42 3901 insert_if:
3dab1dad
YO
3902 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3903 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3904 if (br == NULL)
830247a4 3905 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3906 else
3dab1dad 3907 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3908 c = *nextchar(pRExC_state);
d1b80229
IZ
3909 if (flags&HASWIDTH)
3910 *flagp |= HASWIDTH;
c277df42 3911 if (c == '|') {
830247a4 3912 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3913 regbranch(pRExC_state, &flags, 1,depth+1);
3914 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3915 if (flags&HASWIDTH)
3916 *flagp |= HASWIDTH;
830247a4 3917 c = *nextchar(pRExC_state);
a0ed51b3
LW
3918 }
3919 else
c277df42
IZ
3920 lastbr = NULL;
3921 if (c != ')')
8615cb43 3922 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3923 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3924 REGTAIL(pRExC_state, br, ender);
c277df42 3925 if (lastbr) {
3dab1dad
YO
3926 REGTAIL(pRExC_state, lastbr, ender);
3927 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3928 }
3929 else
3dab1dad 3930 REGTAIL(pRExC_state, ret, ender);
c277df42 3931 return ret;
a0ed51b3
LW
3932 }
3933 else {
830247a4 3934 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3935 }
3936 }
1b1626e4 3937 case 0:
830247a4 3938 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3939 vFAIL("Sequence (? incomplete");
1b1626e4 3940 break;
a0d0e21e 3941 default:
830247a4 3942 --RExC_parse;
fac92740 3943 parse_flags: /* (?i) */
830247a4 3944 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3945 /* (?g), (?gc) and (?o) are useless here
3946 and must be globally applied -- japhy */
3947
3948 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3949 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3950 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3951 if (! (wastedflags & wflagbit) ) {
3952 wastedflags |= wflagbit;
3953 vWARN5(
3954 RExC_parse + 1,
3955 "Useless (%s%c) - %suse /%c modifier",
3956 flagsp == &negflags ? "?-" : "?",
3957 *RExC_parse,
3958 flagsp == &negflags ? "don't " : "",
3959 *RExC_parse
3960 );
3961 }
3962 }
3963 }
3964 else if (*RExC_parse == 'c') {
3965 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3966 if (! (wastedflags & WASTED_C) ) {
3967 wastedflags |= WASTED_GC;
9d1d55b5
JP
3968 vWARN3(
3969 RExC_parse + 1,
3970 "Useless (%sc) - %suse /gc modifier",
3971 flagsp == &negflags ? "?-" : "?",
3972 flagsp == &negflags ? "don't " : ""
3973 );
3974 }
3975 }
3976 }
3977 else { pmflag(flagsp, *RExC_parse); }
3978
830247a4 3979 ++RExC_parse;
ca9dfc88 3980 }
830247a4 3981 if (*RExC_parse == '-') {
ca9dfc88 3982 flagsp = &negflags;
9d1d55b5 3983 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3984 ++RExC_parse;
ca9dfc88 3985 goto parse_flags;
48c036b1 3986 }
e2509266
JH
3987 RExC_flags |= posflags;
3988 RExC_flags &= ~negflags;
830247a4
IZ
3989 if (*RExC_parse == ':') {
3990 RExC_parse++;
ca9dfc88
IZ
3991 paren = ':';
3992 break;
3993 }
c277df42 3994 unknown:
830247a4
IZ
3995 if (*RExC_parse != ')') {
3996 RExC_parse++;
3997 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3998 }
830247a4 3999 nextchar(pRExC_state);
a0d0e21e
LW
4000 *flagp = TRYAGAIN;
4001 return NULL;
4002 }
4003 }
fac92740 4004 else { /* (...) */
830247a4
IZ
4005 parno = RExC_npar;
4006 RExC_npar++;
4007 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4008 Set_Node_Length(ret, 1); /* MJD */
4009 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4010 is_open = 1;
a0d0e21e 4011 }
a0ed51b3 4012 }
fac92740 4013 else /* ! paren */
a0d0e21e
LW
4014 ret = NULL;
4015
4016 /* Pick up the branches, linking them together. */
fac92740 4017 parse_start = RExC_parse; /* MJD */
3dab1dad 4018 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4019 /* branch_len = (paren != 0); */
2af232bd 4020
a0d0e21e
LW
4021 if (br == NULL)
4022 return(NULL);
830247a4
IZ
4023 if (*RExC_parse == '|') {
4024 if (!SIZE_ONLY && RExC_extralen) {
4025 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4026 }
fac92740 4027 else { /* MJD */
830247a4 4028 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4029 Set_Node_Length(br, paren != 0);
4030 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4031 }
c277df42
IZ
4032 have_branch = 1;
4033 if (SIZE_ONLY)
830247a4 4034 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4035 }
4036 else if (paren == ':') {
c277df42
IZ
4037 *flagp |= flags&SIMPLE;
4038 }
6136c704 4039 if (is_open) { /* Starts with OPEN. */
3dab1dad 4040 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4041 }
4042 else if (paren != '?') /* Not Conditional */
a0d0e21e 4043 ret = br;
32a0ca98 4044 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4045 lastbr = br;
830247a4
IZ
4046 while (*RExC_parse == '|') {
4047 if (!SIZE_ONLY && RExC_extralen) {
4048 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4049 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4050 }
4051 if (SIZE_ONLY)
830247a4
IZ
4052 RExC_extralen += 2; /* Account for LONGJMP. */
4053 nextchar(pRExC_state);
3dab1dad 4054 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4055
a687059c 4056 if (br == NULL)
a0d0e21e 4057 return(NULL);
3dab1dad 4058 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4059 lastbr = br;
821b33a5
IZ
4060 if (flags&HASWIDTH)
4061 *flagp |= HASWIDTH;
a687059c 4062 *flagp |= flags&SPSTART;
a0d0e21e
LW
4063 }
4064
c277df42
IZ
4065 if (have_branch || paren != ':') {
4066 /* Make a closing node, and hook it on the end. */
4067 switch (paren) {
4068 case ':':
830247a4 4069 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4070 break;
4071 case 1:
830247a4 4072 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4073 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4074 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4075 break;
4076 case '<':
c277df42
IZ
4077 case ',':
4078 case '=':
4079 case '!':
c277df42 4080 *flagp &= ~HASWIDTH;
821b33a5
IZ
4081 /* FALL THROUGH */
4082 case '>':
830247a4 4083 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4084 break;
4085 case 0:
830247a4 4086 ender = reg_node(pRExC_state, END);
c277df42
IZ
4087 break;
4088 }
07be1b83 4089 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4090
9674d46a 4091 if (have_branch && !SIZE_ONLY) {
c277df42 4092 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4093 for (br = ret; br; br = regnext(br)) {
4094 const U8 op = PL_regkind[OP(br)];
4095 if (op == BRANCH) {
07be1b83 4096 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4097 }
4098 else if (op == BRANCHJ) {
07be1b83 4099 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4100 }
c277df42
IZ
4101 }
4102 }
a0d0e21e 4103 }
c277df42
IZ
4104
4105 {
e1ec3a88
AL
4106 const char *p;
4107 static const char parens[] = "=!<,>";
c277df42
IZ
4108
4109 if (paren && (p = strchr(parens, paren))) {
eb160463 4110 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4111 int flag = (p - parens) > 1;
4112
4113 if (paren == '>')
4114 node = SUSPEND, flag = 0;
830247a4 4115 reginsert(pRExC_state, node,ret);
45948336
EP
4116 Set_Node_Cur_Length(ret);
4117 Set_Node_Offset(ret, parse_start + 1);
c277df42 4118 ret->flags = flag;
07be1b83 4119 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4120 }
a0d0e21e
LW
4121 }
4122
4123 /* Check for proper termination. */
ce3e6498 4124 if (paren) {
e2509266 4125 RExC_flags = oregflags;
830247a4
IZ
4126 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4127 RExC_parse = oregcomp_parse;
380a0633 4128 vFAIL("Unmatched (");
ce3e6498 4129 }
a0ed51b3 4130 }
830247a4
IZ
4131 else if (!paren && RExC_parse < RExC_end) {
4132 if (*RExC_parse == ')') {
4133 RExC_parse++;
380a0633 4134 vFAIL("Unmatched )");
a0ed51b3
LW
4135 }
4136 else
b45f050a 4137 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4138 /* NOTREACHED */
4139 }
a687059c 4140
a0d0e21e 4141 return(ret);
a687059c
LW
4142}
4143
4144/*
4145 - regbranch - one alternative of an | operator
4146 *
4147 * Implements the concatenation operator.
4148 */
76e3520e 4149STATIC regnode *
3dab1dad 4150S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4151{
97aff369 4152 dVAR;
c277df42
IZ
4153 register regnode *ret;
4154 register regnode *chain = NULL;
4155 register regnode *latest;
4156 I32 flags = 0, c = 0;
3dab1dad
YO
4157 GET_RE_DEBUG_FLAGS_DECL;
4158 DEBUG_PARSE("brnc");
b81d288d 4159 if (first)
c277df42
IZ
4160 ret = NULL;
4161 else {
b81d288d 4162 if (!SIZE_ONLY && RExC_extralen)
830247a4 4163 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4164 else {
830247a4 4165 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4166 Set_Node_Length(ret, 1);
4167 }
c277df42
IZ
4168 }
4169
b81d288d 4170 if (!first && SIZE_ONLY)
830247a4 4171 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4172
c277df42 4173 *flagp = WORST; /* Tentatively. */
a0d0e21e 4174
830247a4
IZ
4175 RExC_parse--;
4176 nextchar(pRExC_state);
4177 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4178 flags &= ~TRYAGAIN;
3dab1dad 4179 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4180 if (latest == NULL) {
4181 if (flags & TRYAGAIN)
4182 continue;
4183 return(NULL);
a0ed51b3
LW
4184 }
4185 else if (ret == NULL)
c277df42 4186 ret = latest;
a0d0e21e 4187 *flagp |= flags&HASWIDTH;
c277df42 4188 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4189 *flagp |= flags&SPSTART;
4190 else {
830247a4 4191 RExC_naughty++;
3dab1dad 4192 REGTAIL(pRExC_state, chain, latest);
a687059c 4193 }
a0d0e21e 4194 chain = latest;
c277df42
IZ
4195 c++;
4196 }
4197 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4198 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4199 if (ret == NULL)
4200 ret = chain;
4201 }
4202 if (c == 1) {
4203 *flagp |= flags&SIMPLE;
a0d0e21e 4204 }
a687059c 4205
d4c19fe8 4206 return ret;
a687059c
LW
4207}
4208
4209/*
4210 - regpiece - something followed by possible [*+?]
4211 *
4212 * Note that the branching code sequences used for ? and the general cases
4213 * of * and + are somewhat optimized: they use the same NOTHING node as
4214 * both the endmarker for their branch list and the body of the last branch.
4215 * It might seem that this node could be dispensed with entirely, but the
4216 * endmarker role is not redundant.
4217 */
76e3520e 4218STATIC regnode *
3dab1dad 4219S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4220{
97aff369 4221 dVAR;
c277df42 4222 register regnode *ret;
a0d0e21e
LW
4223 register char op;
4224 register char *next;
4225 I32 flags;
1df70142 4226 const char * const origparse = RExC_parse;
a0d0e21e 4227 I32 min;
c277df42 4228 I32 max = REG_INFTY;
fac92740 4229 char *parse_start;
3dab1dad
YO
4230 GET_RE_DEBUG_FLAGS_DECL;
4231 DEBUG_PARSE("piec");
a0d0e21e 4232
3dab1dad 4233 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4234 if (ret == NULL) {
4235 if (flags & TRYAGAIN)
4236 *flagp |= TRYAGAIN;
4237 return(NULL);
4238 }
4239
830247a4 4240 op = *RExC_parse;
a0d0e21e 4241
830247a4 4242 if (op == '{' && regcurly(RExC_parse)) {
3dab1dad 4243 const char *maxpos = NULL;
fac92740 4244 parse_start = RExC_parse; /* MJD */
830247a4 4245 next = RExC_parse + 1;
a0d0e21e
LW
4246 while (isDIGIT(*next) || *next == ',') {
4247 if (*next == ',') {
4248 if (maxpos)
4249 break;
4250 else
4251 maxpos = next;
a687059c 4252 }
a0d0e21e
LW
4253 next++;
4254 }
4255 if (*next == '}') { /* got one */
4256 if (!maxpos)
4257 maxpos = next;
830247a4
IZ
4258 RExC_parse++;
4259 min = atoi(RExC_parse);
a0d0e21e
LW
4260 if (*maxpos == ',')
4261 maxpos++;
4262 else
830247a4 4263 maxpos = RExC_parse;
a0d0e21e
LW
4264 max = atoi(maxpos);
4265 if (!max && *maxpos != '0')
c277df42
IZ
4266 max = REG_INFTY; /* meaning "infinity" */
4267 else if (max >= REG_INFTY)
8615cb43 4268 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4269 RExC_parse = next;
4270 nextchar(pRExC_state);
a0d0e21e
LW
4271
4272 do_curly:
4273 if ((flags&SIMPLE)) {
830247a4
IZ
4274 RExC_naughty += 2 + RExC_naughty / 2;
4275 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4276 Set_Node_Offset(ret, parse_start+1); /* MJD */
4277 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4278 }
4279 else {
3dab1dad 4280 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4281
4282 w->flags = 0;
3dab1dad 4283 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4284 if (!SIZE_ONLY && RExC_extralen) {
4285 reginsert(pRExC_state, LONGJMP,ret);
4286 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4287 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4288 }
830247a4 4289 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4290 /* MJD hk */
4291 Set_Node_Offset(ret, parse_start+1);
2af232bd 4292 Set_Node_Length(ret,
fac92740 4293 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4294
830247a4 4295 if (!SIZE_ONLY && RExC_extralen)
c277df42 4296 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4297 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4298 if (SIZE_ONLY)
830247a4
IZ
4299 RExC_whilem_seen++, RExC_extralen += 3;
4300 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4301 }
c277df42 4302 ret->flags = 0;
a0d0e21e
LW
4303
4304 if (min > 0)
821b33a5
IZ
4305 *flagp = WORST;
4306 if (max > 0)
4307 *flagp |= HASWIDTH;
a0d0e21e 4308 if (max && max < min)
8615cb43 4309 vFAIL("Can't do {n,m} with n > m");
c277df42 4310 if (!SIZE_ONLY) {
eb160463
GS
4311 ARG1_SET(ret, (U16)min);
4312 ARG2_SET(ret, (U16)max);
a687059c 4313 }
a687059c 4314
a0d0e21e 4315 goto nest_check;
a687059c 4316 }
a0d0e21e 4317 }
a687059c 4318
a0d0e21e
LW
4319 if (!ISMULT1(op)) {
4320 *flagp = flags;
a687059c 4321 return(ret);
a0d0e21e 4322 }
bb20fd44 4323
c277df42 4324#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4325
4326 /* if this is reinstated, don't forget to put this back into perldiag:
4327
4328 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4329
4330 (F) The part of the regexp subject to either the * or + quantifier
4331 could match an empty string. The {#} shows in the regular
4332 expression about where the problem was discovered.
4333
4334 */
4335
bb20fd44 4336 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4337 vFAIL("Regexp *+ operand could be empty");
b81d288d 4338#endif
bb20fd44 4339
fac92740 4340 parse_start = RExC_parse;
830247a4 4341 nextchar(pRExC_state);
a0d0e21e 4342
821b33a5 4343 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4344
4345 if (op == '*' && (flags&SIMPLE)) {
830247a4 4346 reginsert(pRExC_state, STAR, ret);
c277df42 4347 ret->flags = 0;
830247a4 4348 RExC_naughty += 4;
a0d0e21e
LW
4349 }
4350 else if (op == '*') {
4351 min = 0;
4352 goto do_curly;
a0ed51b3
LW
4353 }
4354 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4355 reginsert(pRExC_state, PLUS, ret);
c277df42 4356 ret->flags = 0;
830247a4 4357 RExC_naughty += 3;
a0d0e21e
LW
4358 }
4359 else if (op == '+') {
4360 min = 1;
4361 goto do_curly;
a0ed51b3
LW
4362 }
4363 else if (op == '?') {
a0d0e21e
LW
4364 min = 0; max = 1;
4365 goto do_curly;
4366 }
4367 nest_check:
041457d9 4368 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4369 vWARN3(RExC_parse,
b45f050a 4370 "%.*s matches null string many times",
afd78fd5 4371 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4372 origparse);
a0d0e21e
LW
4373 }
4374
830247a4
IZ
4375 if (*RExC_parse == '?') {
4376 nextchar(pRExC_state);
4377 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4378 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4379 }
830247a4
IZ
4380 if (ISMULT2(RExC_parse)) {
4381 RExC_parse++;
b45f050a
JF
4382 vFAIL("Nested quantifiers");
4383 }
a0d0e21e
LW
4384
4385 return(ret);
a687059c
LW
4386}
4387
4388/*
4389 - regatom - the lowest level
4390 *
4391 * Optimization: gobbles an entire sequence of ordinary characters so that
4392 * it can turn them into a single node, which is smaller to store and
4393 * faster to run. Backslashed characters are exceptions, each becoming a
4394 * separate node; the code is simpler that way and it's not worth fixing.
4395 *
7f6f358c
YO
4396 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4397 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4398 */
76e3520e 4399STATIC regnode *
3dab1dad 4400S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4401{
97aff369 4402 dVAR;
cbbf8932 4403 register regnode *ret = NULL;
a0d0e21e 4404 I32 flags;
45948336 4405 char *parse_start = RExC_parse;
3dab1dad
YO
4406 GET_RE_DEBUG_FLAGS_DECL;
4407 DEBUG_PARSE("atom");
a0d0e21e
LW
4408 *flagp = WORST; /* Tentatively. */
4409
4410tryagain:
830247a4 4411 switch (*RExC_parse) {
a0d0e21e 4412 case '^':
830247a4
IZ
4413 RExC_seen_zerolen++;
4414 nextchar(pRExC_state);
e2509266 4415 if (RExC_flags & PMf_MULTILINE)
830247a4 4416 ret = reg_node(pRExC_state, MBOL);
e2509266 4417 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4418 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4419 else
830247a4 4420 ret = reg_node(pRExC_state, BOL);
fac92740 4421 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4422 break;
4423 case '$':
830247a4 4424 nextchar(pRExC_state);
b81d288d 4425 if (*RExC_parse)
830247a4 4426 RExC_seen_zerolen++;
e2509266 4427 if (RExC_flags & PMf_MULTILINE)
830247a4 4428 ret = reg_node(pRExC_state, MEOL);
e2509266 4429 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4430 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4431 else
830247a4 4432 ret = reg_node(pRExC_state, EOL);
fac92740 4433 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4434 break;
4435 case '.':
830247a4 4436 nextchar(pRExC_state);
e2509266 4437 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4438 ret = reg_node(pRExC_state, SANY);
4439 else
4440 ret = reg_node(pRExC_state, REG_ANY);
4441 *flagp |= HASWIDTH|SIMPLE;
830247a4 4442 RExC_naughty++;
fac92740 4443 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4444 break;
4445 case '[':
b45f050a 4446 {
3dab1dad
YO
4447 char * const oregcomp_parse = ++RExC_parse;
4448 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4449 if (*RExC_parse != ']') {
4450 RExC_parse = oregcomp_parse;
b45f050a
JF
4451 vFAIL("Unmatched [");
4452 }
830247a4 4453 nextchar(pRExC_state);
a0d0e21e 4454 *flagp |= HASWIDTH|SIMPLE;
fac92740 4455 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4456 break;
b45f050a 4457 }
a0d0e21e 4458 case '(':
830247a4 4459 nextchar(pRExC_state);
3dab1dad 4460 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4461 if (ret == NULL) {
bf93d4cc 4462 if (flags & TRYAGAIN) {
830247a4 4463 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4464 /* Make parent create an empty node if needed. */
4465 *flagp |= TRYAGAIN;
4466 return(NULL);
4467 }
a0d0e21e 4468 goto tryagain;
bf93d4cc 4469 }
a0d0e21e
LW
4470 return(NULL);
4471 }
c277df42 4472 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4473 break;
4474 case '|':
4475 case ')':
4476 if (flags & TRYAGAIN) {
4477 *flagp |= TRYAGAIN;
4478 return NULL;
4479 }
b45f050a 4480 vFAIL("Internal urp");
a0d0e21e
LW
4481 /* Supposed to be caught earlier. */
4482 break;
85afd4ae 4483 case '{':
830247a4
IZ
4484 if (!regcurly(RExC_parse)) {
4485 RExC_parse++;
85afd4ae
CS
4486 goto defchar;
4487 }
4488 /* FALL THROUGH */
a0d0e21e
LW
4489 case '?':
4490 case '+':
4491 case '*':
830247a4 4492 RExC_parse++;
b45f050a 4493 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4494 break;
4495 case '\\':
830247a4 4496 switch (*++RExC_parse) {
a0d0e21e 4497 case 'A':
830247a4
IZ
4498 RExC_seen_zerolen++;
4499 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4500 *flagp |= SIMPLE;
830247a4 4501 nextchar(pRExC_state);
fac92740 4502 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4503 break;
4504 case 'G':
830247a4
IZ
4505 ret = reg_node(pRExC_state, GPOS);
4506 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4507 *flagp |= SIMPLE;
830247a4 4508 nextchar(pRExC_state);
fac92740 4509 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4510 break;
4511 case 'Z':
830247a4 4512 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4513 *flagp |= SIMPLE;
a1917ab9 4514 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4515 nextchar(pRExC_state);
a0d0e21e 4516 break;
b85d18e9 4517 case 'z':
830247a4 4518 ret = reg_node(pRExC_state, EOS);
b85d18e9 4519 *flagp |= SIMPLE;
830247a4
IZ
4520 RExC_seen_zerolen++; /* Do not optimize RE away */
4521 nextchar(pRExC_state);
fac92740 4522 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4523 break;
4a2d328f 4524 case 'C':
f33976b4
DB
4525 ret = reg_node(pRExC_state, CANY);
4526 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4527 *flagp |= HASWIDTH|SIMPLE;
830247a4 4528 nextchar(pRExC_state);
fac92740 4529 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4530 break;
4531 case 'X':
830247a4 4532 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4533 *flagp |= HASWIDTH;
830247a4 4534 nextchar(pRExC_state);
fac92740 4535 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4536 break;
a0d0e21e 4537 case 'w':
eb160463 4538 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4539 *flagp |= HASWIDTH|SIMPLE;
830247a4 4540 nextchar(pRExC_state);
fac92740 4541 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4542 break;
4543 case 'W':
eb160463 4544 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4545 *flagp |= HASWIDTH|SIMPLE;
830247a4 4546 nextchar(pRExC_state);
fac92740 4547 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4548 break;
4549 case 'b':
830247a4
IZ
4550 RExC_seen_zerolen++;
4551 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4552 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4553 *flagp |= SIMPLE;
830247a4 4554 nextchar(pRExC_state);
fac92740 4555 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4556 break;
4557 case 'B':
830247a4
IZ
4558 RExC_seen_zerolen++;
4559 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4560 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4561 *flagp |= SIMPLE;
830247a4 4562 nextchar(pRExC_state);
fac92740 4563 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4564 break;
4565 case 's':
eb160463 4566 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4567 *flagp |= HASWIDTH|SIMPLE;
830247a4 4568 nextchar(pRExC_state);
fac92740 4569 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4570 break;
4571 case 'S':
eb160463 4572 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4573 *flagp |= HASWIDTH|SIMPLE;
830247a4 4574 nextchar(pRExC_state);
fac92740 4575 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4576 break;
4577 case 'd':
ffc61ed2 4578 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4579 *flagp |= HASWIDTH|SIMPLE;
830247a4 4580 nextchar(pRExC_state);
fac92740 4581 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4582 break;
4583 case 'D':
ffc61ed2 4584 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4585 *flagp |= HASWIDTH|SIMPLE;
830247a4 4586 nextchar(pRExC_state);
fac92740 4587 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4588 break;
a14b48bc
LW
4589 case 'p':
4590 case 'P':
3568d838 4591 {
3dab1dad 4592 char* const oldregxend = RExC_end;
ccb2c380 4593 char* parse_start = RExC_parse - 2;
a14b48bc 4594
830247a4 4595 if (RExC_parse[1] == '{') {
3568d838 4596 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4597 RExC_end = strchr(RExC_parse, '}');
4598 if (!RExC_end) {
3dab1dad 4599 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4600 RExC_parse += 2;
4601 RExC_end = oldregxend;
0da60cf5 4602 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4603 }
830247a4 4604 RExC_end++;
a14b48bc 4605 }
af6f566e 4606 else {
830247a4 4607 RExC_end = RExC_parse + 2;
af6f566e
HS
4608 if (RExC_end > oldregxend)
4609 RExC_end = oldregxend;
4610 }
830247a4 4611 RExC_parse--;
a14b48bc 4612
3dab1dad 4613 ret = regclass(pRExC_state,depth+1);
a14b48bc 4614
830247a4
IZ
4615 RExC_end = oldregxend;
4616 RExC_parse--;
ccb2c380
MP
4617
4618 Set_Node_Offset(ret, parse_start + 2);
4619 Set_Node_Cur_Length(ret);
830247a4 4620 nextchar(pRExC_state);
a14b48bc
LW
4621 *flagp |= HASWIDTH|SIMPLE;
4622 }
4623 break;
a0d0e21e
LW
4624 case 'n':
4625 case 'r':
4626 case 't':
4627 case 'f':
4628 case 'e':
4629 case 'a':
4630 case 'x':
4631 case 'c':
4632 case '0':
4633 goto defchar;
4634 case '1': case '2': case '3': case '4':
4635 case '5': case '6': case '7': case '8': case '9':
4636 {
1df70142 4637 const I32 num = atoi(RExC_parse);
a0d0e21e 4638
830247a4 4639 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4640 goto defchar;
4641 else {
3dab1dad 4642 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4643 while (isDIGIT(*RExC_parse))
4644 RExC_parse++;
b45f050a 4645
eb160463 4646 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4647 vFAIL("Reference to nonexistent group");
830247a4 4648 RExC_sawback = 1;
eb160463
GS
4649 ret = reganode(pRExC_state,
4650 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4651 num);
a0d0e21e 4652 *flagp |= HASWIDTH;
2af232bd 4653
fac92740 4654 /* override incorrect value set in reganode MJD */
2af232bd 4655 Set_Node_Offset(ret, parse_start+1);
fac92740 4656 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4657 RExC_parse--;
4658 nextchar(pRExC_state);
a0d0e21e
LW
4659 }
4660 }
4661 break;
4662 case '\0':
830247a4 4663 if (RExC_parse >= RExC_end)
b45f050a 4664 FAIL("Trailing \\");
a0d0e21e
LW
4665 /* FALL THROUGH */
4666 default:
a0288114 4667 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4668 back into the quick-grab loop below */
45948336 4669 parse_start--;
a0d0e21e
LW
4670 goto defchar;
4671 }
4672 break;
4633a7c4
LW
4673
4674 case '#':
e2509266 4675 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4676 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4677 RExC_parse++;
830247a4 4678 if (RExC_parse < RExC_end)
4633a7c4
LW
4679 goto tryagain;
4680 }
4681 /* FALL THROUGH */
4682
a0d0e21e 4683 default: {
ba210ebe 4684 register STRLEN len;
58ae7d3f 4685 register UV ender;
a0d0e21e 4686 register char *p;
3dab1dad 4687 char *s;
80aecb99 4688 STRLEN foldlen;
89ebb4a3 4689 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4690
4691 parse_start = RExC_parse - 1;
a0d0e21e 4692
830247a4 4693 RExC_parse++;
a0d0e21e
LW
4694
4695 defchar:
58ae7d3f 4696 ender = 0;
eb160463
GS
4697 ret = reg_node(pRExC_state,
4698 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4699 s = STRING(ret);
830247a4
IZ
4700 for (len = 0, p = RExC_parse - 1;
4701 len < 127 && p < RExC_end;
a0d0e21e
LW
4702 len++)
4703 {
3dab1dad 4704 char * const oldp = p;
5b5a24f7 4705
e2509266 4706 if (RExC_flags & PMf_EXTENDED)
830247a4 4707 p = regwhite(p, RExC_end);
a0d0e21e
LW
4708 switch (*p) {
4709 case '^':
4710 case '$':
4711 case '.':
4712 case '[':
4713 case '(':
4714 case ')':
4715 case '|':
4716 goto loopdone;
4717 case '\\':
4718 switch (*++p) {
4719 case 'A':
1ed8eac0
JF
4720 case 'C':
4721 case 'X':
a0d0e21e
LW
4722 case 'G':
4723 case 'Z':
b85d18e9 4724 case 'z':
a0d0e21e
LW
4725 case 'w':
4726 case 'W':
4727 case 'b':
4728 case 'B':
4729 case 's':
4730 case 'S':
4731 case 'd':
4732 case 'D':
a14b48bc
LW
4733 case 'p':
4734 case 'P':
a0d0e21e
LW
4735 --p;
4736 goto loopdone;
4737 case 'n':
4738 ender = '\n';
4739 p++;
a687059c 4740 break;
a0d0e21e
LW
4741 case 'r':
4742 ender = '\r';
4743 p++;
a687059c 4744 break;
a0d0e21e
LW
4745 case 't':
4746 ender = '\t';
4747 p++;
a687059c 4748 break;
a0d0e21e
LW
4749 case 'f':
4750 ender = '\f';
4751 p++;
a687059c 4752 break;
a0d0e21e 4753 case 'e':
c7f1f016 4754 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4755 p++;
a687059c 4756 break;
a0d0e21e 4757 case 'a':
c7f1f016 4758 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4759 p++;
a687059c 4760 break;
a0d0e21e 4761 case 'x':
a0ed51b3 4762 if (*++p == '{') {
1df70142 4763 char* const e = strchr(p, '}');
b81d288d 4764
b45f050a 4765 if (!e) {
830247a4 4766 RExC_parse = p + 1;
b45f050a
JF
4767 vFAIL("Missing right brace on \\x{}");
4768 }
de5f0749 4769 else {
a4c04bdc
NC
4770 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4771 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4772 STRLEN numlen = e - p - 1;
53305cf1 4773 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4774 if (ender > 0xff)
4775 RExC_utf8 = 1;
a0ed51b3
LW
4776 p = e + 1;
4777 }
a0ed51b3
LW
4778 }
4779 else {
a4c04bdc 4780 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4781 STRLEN numlen = 2;
53305cf1 4782 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4783 p += numlen;
4784 }
a687059c 4785 break;
a0d0e21e
LW
4786 case 'c':
4787 p++;
bbce6d69 4788 ender = UCHARAT(p++);
4789 ender = toCTRL(ender);
a687059c 4790 break;
a0d0e21e
LW
4791 case '0': case '1': case '2': case '3':case '4':
4792 case '5': case '6': case '7': case '8':case '9':
4793 if (*p == '0' ||
830247a4 4794 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4795 I32 flags = 0;
1df70142 4796 STRLEN numlen = 3;
53305cf1 4797 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4798 p += numlen;
4799 }
4800 else {
4801 --p;
4802 goto loopdone;
a687059c
LW
4803 }
4804 break;
a0d0e21e 4805 case '\0':
830247a4 4806 if (p >= RExC_end)
b45f050a 4807 FAIL("Trailing \\");
a687059c 4808 /* FALL THROUGH */
a0d0e21e 4809 default:
041457d9 4810 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4811 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4812 goto normal_default;
a0d0e21e
LW
4813 }
4814 break;
a687059c 4815 default:
a0ed51b3 4816 normal_default:
fd400ab9 4817 if (UTF8_IS_START(*p) && UTF) {
1df70142 4818 STRLEN numlen;
5e12f4fb 4819 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4820 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4821 p += numlen;
4822 }
4823 else
4824 ender = *p++;
a0d0e21e 4825 break;
a687059c 4826 }
e2509266 4827 if (RExC_flags & PMf_EXTENDED)
830247a4 4828 p = regwhite(p, RExC_end);
60a8b682
JH
4829 if (UTF && FOLD) {
4830 /* Prime the casefolded buffer. */
ac7e0132 4831 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4832 }
a0d0e21e
LW
4833 if (ISMULT2(p)) { /* Back off on ?+*. */
4834 if (len)
4835 p = oldp;
16ea2a2e 4836 else if (UTF) {
80aecb99 4837 if (FOLD) {
60a8b682 4838 /* Emit all the Unicode characters. */
1df70142 4839 STRLEN numlen;
80aecb99
JH
4840 for (foldbuf = tmpbuf;
4841 foldlen;
4842 foldlen -= numlen) {
4843 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4844 if (numlen > 0) {
71207a34 4845 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4846 s += unilen;
4847 len += unilen;
4848 /* In EBCDIC the numlen
4849 * and unilen can differ. */
9dc45d57 4850 foldbuf += numlen;
47654450
JH
4851 if (numlen >= foldlen)
4852 break;
9dc45d57
JH
4853 }
4854 else
4855 break; /* "Can't happen." */
80aecb99
JH
4856 }
4857 }
4858 else {
71207a34 4859 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4860 if (unilen > 0) {
0ebc6274
JH
4861 s += unilen;
4862 len += unilen;
9dc45d57 4863 }
80aecb99 4864 }
a0ed51b3 4865 }
a0d0e21e
LW
4866 else {
4867 len++;
eb160463 4868 REGC((char)ender, s++);
a0d0e21e
LW
4869 }
4870 break;
a687059c 4871 }
16ea2a2e 4872 if (UTF) {
80aecb99 4873 if (FOLD) {
60a8b682 4874 /* Emit all the Unicode characters. */
1df70142 4875 STRLEN numlen;
80aecb99
JH
4876 for (foldbuf = tmpbuf;
4877 foldlen;
4878 foldlen -= numlen) {
4879 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4880 if (numlen > 0) {
71207a34 4881 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4882 len += unilen;
4883 s += unilen;
4884 /* In EBCDIC the numlen
4885 * and unilen can differ. */
9dc45d57 4886 foldbuf += numlen;
47654450
JH
4887 if (numlen >= foldlen)
4888 break;
9dc45d57
JH
4889 }
4890 else
4891 break;
80aecb99
JH
4892 }
4893 }
4894 else {
71207a34 4895 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4896 if (unilen > 0) {
0ebc6274
JH
4897 s += unilen;
4898 len += unilen;
9dc45d57 4899 }
80aecb99
JH
4900 }
4901 len--;
a0ed51b3
LW
4902 }
4903 else
eb160463 4904 REGC((char)ender, s++);
a0d0e21e
LW
4905 }
4906 loopdone:
830247a4 4907 RExC_parse = p - 1;
fac92740 4908 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4909 nextchar(pRExC_state);
793db0cb
JH
4910 {
4911 /* len is STRLEN which is unsigned, need to copy to signed */
4912 IV iv = len;
4913 if (iv < 0)
4914 vFAIL("Internal disaster");
4915 }
a0d0e21e
LW
4916 if (len > 0)
4917 *flagp |= HASWIDTH;
090f7165 4918 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4919 *flagp |= SIMPLE;
3dab1dad 4920
cd439c50 4921 if (SIZE_ONLY)
830247a4 4922 RExC_size += STR_SZ(len);
3dab1dad
YO
4923 else {
4924 STR_LEN(ret) = len;
830247a4 4925 RExC_emit += STR_SZ(len);
07be1b83 4926 }
3dab1dad 4927 }
a0d0e21e
LW
4928 break;
4929 }
a687059c 4930
60a8b682
JH
4931 /* If the encoding pragma is in effect recode the text of
4932 * any EXACT-kind nodes. */
3dab1dad
YO
4933 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4934 const STRLEN oldlen = STR_LEN(ret);
4935 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
4936
4937 if (RExC_utf8)
4938 SvUTF8_on(sv);
4939 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4940 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4941 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4942
4943 if (SvUTF8(sv))
4944 RExC_utf8 = 1;
4945 if (!SIZE_ONLY) {
a3621e74
YO
4946 GET_RE_DEBUG_FLAGS_DECL;
4947 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4948 (int)oldlen, STRING(ret),
4949 (int)newlen, s));
4950 Copy(s, STRING(ret), newlen, char);
4951 STR_LEN(ret) += newlen - oldlen;
4952 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4953 } else
4954 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4955 }
a72c7584
JH
4956 }
4957
a0d0e21e 4958 return(ret);
a687059c
LW
4959}
4960
873ef191 4961STATIC char *
5f66b61c 4962S_regwhite(char *p, const char *e)
5b5a24f7
CS
4963{
4964 while (p < e) {
4965 if (isSPACE(*p))
4966 ++p;
4967 else if (*p == '#') {
4968 do {
4969 p++;
4970 } while (p < e && *p != '\n');
4971 }
4972 else
4973 break;
4974 }
4975 return p;
4976}
4977
b8c5462f
JH
4978/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4979 Character classes ([:foo:]) can also be negated ([:^foo:]).
4980 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4981 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4982 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4983
4984#define POSIXCC_DONE(c) ((c) == ':')
4985#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4986#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4987
b8c5462f 4988STATIC I32
830247a4 4989S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4990{
97aff369 4991 dVAR;
936ed897 4992 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4993
830247a4 4994 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4995 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4996 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4997 const char c = UCHARAT(RExC_parse);
097eb12c 4998 char* const s = RExC_parse++;
b81d288d 4999
9a86a77b 5000 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5001 RExC_parse++;
5002 if (RExC_parse == RExC_end)
620e46c5 5003 /* Grandfather lone [:, [=, [. */
830247a4 5004 RExC_parse = s;
620e46c5 5005 else {
3dab1dad 5006 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5007 assert(*t == c);
5008
9a86a77b 5009 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5010 const char *posixcc = s + 1;
830247a4 5011 RExC_parse++; /* skip over the ending ] */
3dab1dad 5012
b8c5462f 5013 if (*s == ':') {
1df70142
AL
5014 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5015 const I32 skip = t - posixcc;
80916619
NC
5016
5017 /* Initially switch on the length of the name. */
5018 switch (skip) {
5019 case 4:
3dab1dad
YO
5020 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5021 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5022 break;
80916619
NC
5023 case 5:
5024 /* Names all of length 5. */
5025 /* alnum alpha ascii blank cntrl digit graph lower
5026 print punct space upper */
5027 /* Offset 4 gives the best switch position. */
5028 switch (posixcc[4]) {
5029 case 'a':
3dab1dad
YO
5030 if (memEQ(posixcc, "alph", 4)) /* alpha */
5031 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5032 break;
5033 case 'e':
3dab1dad
YO
5034 if (memEQ(posixcc, "spac", 4)) /* space */
5035 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5036 break;
5037 case 'h':
3dab1dad
YO
5038 if (memEQ(posixcc, "grap", 4)) /* graph */
5039 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5040 break;
5041 case 'i':
3dab1dad
YO
5042 if (memEQ(posixcc, "asci", 4)) /* ascii */
5043 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5044 break;
5045 case 'k':
3dab1dad
YO
5046 if (memEQ(posixcc, "blan", 4)) /* blank */
5047 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5048 break;
5049 case 'l':
3dab1dad
YO
5050 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5051 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5052 break;
5053 case 'm':
3dab1dad
YO
5054 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5055 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5056 break;
5057 case 'r':
3dab1dad
YO
5058 if (memEQ(posixcc, "lowe", 4)) /* lower */
5059 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5060 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5061 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5062 break;
5063 case 't':
3dab1dad
YO
5064 if (memEQ(posixcc, "digi", 4)) /* digit */
5065 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5066 else if (memEQ(posixcc, "prin", 4)) /* print */
5067 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5068 else if (memEQ(posixcc, "punc", 4)) /* punct */
5069 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5070 break;
b8c5462f
JH
5071 }
5072 break;
80916619 5073 case 6:
3dab1dad
YO
5074 if (memEQ(posixcc, "xdigit", 6))
5075 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5076 break;
5077 }
80916619
NC
5078
5079 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5080 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5081 t - s - 1, s + 1);
80916619
NC
5082 assert (posixcc[skip] == ':');
5083 assert (posixcc[skip+1] == ']');
b45f050a 5084 } else if (!SIZE_ONLY) {
b8c5462f 5085 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5086
830247a4 5087 /* adjust RExC_parse so the warning shows after
b45f050a 5088 the class closes */
9a86a77b 5089 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5090 RExC_parse++;
b45f050a
JF
5091 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5092 }
b8c5462f
JH
5093 } else {
5094 /* Maternal grandfather:
5095 * "[:" ending in ":" but not in ":]" */
830247a4 5096 RExC_parse = s;
767d463e 5097 }
620e46c5
JH
5098 }
5099 }
5100
b8c5462f
JH
5101 return namedclass;
5102}
5103
5104STATIC void
830247a4 5105S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5106{
97aff369 5107 dVAR;
3dab1dad 5108 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5109 const char *s = RExC_parse;
5110 const char c = *s++;
b8c5462f 5111
3dab1dad 5112 while (isALNUM(*s))
b8c5462f
JH
5113 s++;
5114 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5115 if (ckWARN(WARN_REGEXP))
5116 vWARN3(s+2,
5117 "POSIX syntax [%c %c] belongs inside character classes",
5118 c, c);
b45f050a
JF
5119
5120 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5121 if (POSIXCC_NOTYET(c)) {
830247a4 5122 /* adjust RExC_parse so the error shows after
b45f050a 5123 the class closes */
9a86a77b 5124 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5125 NOOP;
b45f050a
JF
5126 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5127 }
b8c5462f
JH
5128 }
5129 }
620e46c5
JH
5130}
5131
7f6f358c
YO
5132
5133/*
5134 parse a class specification and produce either an ANYOF node that
5135 matches the pattern. If the pattern matches a single char only and
5136 that char is < 256 then we produce an EXACT node instead.
5137*/
76e3520e 5138STATIC regnode *
3dab1dad 5139S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5140{
97aff369 5141 dVAR;
07be1b83 5142 register UV value;
9a86a77b 5143 register UV nextvalue;
3568d838 5144 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5145 register IV range = 0;
c277df42 5146 register regnode *ret;
ba210ebe 5147 STRLEN numlen;
ffc61ed2 5148 IV namedclass;
cbbf8932 5149 char *rangebegin = NULL;
936ed897 5150 bool need_class = 0;
c445ea15 5151 SV *listsv = NULL;
ffc61ed2 5152 UV n;
9e55ce06 5153 bool optimize_invert = TRUE;
cbbf8932 5154 AV* unicode_alternate = NULL;
1b2d223b
JH
5155#ifdef EBCDIC
5156 UV literal_endpoint = 0;
5157#endif
7f6f358c 5158 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5159
3dab1dad 5160 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5161 case we need to change the emitted regop to an EXACT. */
07be1b83 5162 const char * orig_parse = RExC_parse;
3dab1dad
YO
5163 GET_RE_DEBUG_FLAGS_DECL;
5164 DEBUG_PARSE("clas");
7f6f358c
YO
5165
5166 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5167 ret = reganode(pRExC_state, ANYOF, 0);
5168
5169 if (!SIZE_ONLY)
5170 ANYOF_FLAGS(ret) = 0;
5171
9a86a77b 5172 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5173 RExC_naughty++;
5174 RExC_parse++;
5175 if (!SIZE_ONLY)
5176 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5177 }
a0d0e21e 5178
73060fc4 5179 if (SIZE_ONLY) {
830247a4 5180 RExC_size += ANYOF_SKIP;
73060fc4
JH
5181 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5182 }
936ed897 5183 else {
830247a4 5184 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5185 if (FOLD)
5186 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5187 if (LOC)
5188 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5189 ANYOF_BITMAP_ZERO(ret);
396482e1 5190 listsv = newSVpvs("# comment\n");
a0d0e21e 5191 }
b8c5462f 5192
9a86a77b
JH
5193 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5194
b938889d 5195 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5196 checkposixcc(pRExC_state);
b8c5462f 5197
f064b6ad
HS
5198 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5199 if (UCHARAT(RExC_parse) == ']')
5200 goto charclassloop;
ffc61ed2 5201
9a86a77b 5202 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5203
5204 charclassloop:
5205
5206 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5207
73b437c8 5208 if (!range)
830247a4 5209 rangebegin = RExC_parse;
ffc61ed2 5210 if (UTF) {
5e12f4fb 5211 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5212 RExC_end - RExC_parse,
9f7f3913 5213 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5214 RExC_parse += numlen;
5215 }
5216 else
5217 value = UCHARAT(RExC_parse++);
7f6f358c 5218
9a86a77b
JH
5219 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5220 if (value == '[' && POSIXCC(nextvalue))
830247a4 5221 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5222 else if (value == '\\') {
ffc61ed2 5223 if (UTF) {
5e12f4fb 5224 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5225 RExC_end - RExC_parse,
9f7f3913 5226 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5227 RExC_parse += numlen;
5228 }
5229 else
5230 value = UCHARAT(RExC_parse++);
470c3474 5231 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5232 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5233 * be a problem later if we want switch on Unicode.
5234 * A similar issue a little bit later when switching on
5235 * namedclass. --jhi */
ffc61ed2 5236 switch ((I32)value) {
b8c5462f
JH
5237 case 'w': namedclass = ANYOF_ALNUM; break;
5238 case 'W': namedclass = ANYOF_NALNUM; break;
5239 case 's': namedclass = ANYOF_SPACE; break;
5240 case 'S': namedclass = ANYOF_NSPACE; break;
5241 case 'd': namedclass = ANYOF_DIGIT; break;
5242 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5243 case 'p':
5244 case 'P':
3dab1dad
YO
5245 {
5246 char *e;
af6f566e 5247 if (RExC_parse >= RExC_end)
2a4859cd 5248 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5249 if (*RExC_parse == '{') {
1df70142 5250 const U8 c = (U8)value;
ffc61ed2
JH
5251 e = strchr(RExC_parse++, '}');
5252 if (!e)
0da60cf5 5253 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5254 while (isSPACE(UCHARAT(RExC_parse)))
5255 RExC_parse++;
5256 if (e == RExC_parse)
0da60cf5 5257 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5258 n = e - RExC_parse;
ab13f0c7
JH
5259 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5260 n--;
ffc61ed2
JH
5261 }
5262 else {
5263 e = RExC_parse;
5264 n = 1;
5265 }
5266 if (!SIZE_ONLY) {
ab13f0c7
JH
5267 if (UCHARAT(RExC_parse) == '^') {
5268 RExC_parse++;
5269 n--;
5270 value = value == 'p' ? 'P' : 'p'; /* toggle */
5271 while (isSPACE(UCHARAT(RExC_parse))) {
5272 RExC_parse++;
5273 n--;
5274 }
5275 }
097eb12c
AL
5276 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5277 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5278 }
5279 RExC_parse = e + 1;
5280 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5281 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5282 }
f81125e2 5283 break;
b8c5462f
JH
5284 case 'n': value = '\n'; break;
5285 case 'r': value = '\r'; break;
5286 case 't': value = '\t'; break;
5287 case 'f': value = '\f'; break;
5288 case 'b': value = '\b'; break;
c7f1f016
NIS
5289 case 'e': value = ASCII_TO_NATIVE('\033');break;
5290 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5291 case 'x':
ffc61ed2 5292 if (*RExC_parse == '{') {
a4c04bdc
NC
5293 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5294 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5295 char * const e = strchr(RExC_parse++, '}');
b81d288d 5296 if (!e)
ffc61ed2 5297 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5298
5299 numlen = e - RExC_parse;
5300 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5301 RExC_parse = e + 1;
5302 }
5303 else {
a4c04bdc 5304 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5305 numlen = 2;
5306 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5307 RExC_parse += numlen;
5308 }
b8c5462f
JH
5309 break;
5310 case 'c':
830247a4 5311 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5312 value = toCTRL(value);
5313 break;
5314 case '0': case '1': case '2': case '3': case '4':
5315 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5316 {
5317 I32 flags = 0;
5318 numlen = 3;
5319 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5320 RExC_parse += numlen;
b8c5462f 5321 break;
53305cf1 5322 }
1028017a 5323 default:
041457d9 5324 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5325 vWARN2(RExC_parse,
5326 "Unrecognized escape \\%c in character class passed through",
5327 (int)value);
1028017a 5328 break;
b8c5462f 5329 }
ffc61ed2 5330 } /* end of \blah */
1b2d223b
JH
5331#ifdef EBCDIC
5332 else
5333 literal_endpoint++;
5334#endif
ffc61ed2
JH
5335
5336 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5337
5338 if (!SIZE_ONLY && !need_class)
936ed897 5339 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5340
936ed897 5341 need_class = 1;
ffc61ed2
JH
5342
5343 /* a bad range like a-\d, a-[:digit:] ? */
5344 if (range) {
73b437c8 5345 if (!SIZE_ONLY) {
afd78fd5 5346 if (ckWARN(WARN_REGEXP)) {
097eb12c 5347 const int w =
afd78fd5
JH
5348 RExC_parse >= rangebegin ?
5349 RExC_parse - rangebegin : 0;
830247a4 5350 vWARN4(RExC_parse,
b45f050a 5351 "False [] range \"%*.*s\"",
097eb12c 5352 w, w, rangebegin);
afd78fd5 5353 }
3568d838
JH
5354 if (prevvalue < 256) {
5355 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5356 ANYOF_BITMAP_SET(ret, '-');
5357 }
5358 else {
5359 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5360 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5361 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5362 }
b8c5462f 5363 }
ffc61ed2
JH
5364
5365 range = 0; /* this was not a true range */
73b437c8 5366 }
ffc61ed2 5367
73b437c8 5368 if (!SIZE_ONLY) {
c49a72a9
NC
5369 const char *what = NULL;
5370 char yesno = 0;
5371
3568d838
JH
5372 if (namedclass > OOB_NAMEDCLASS)
5373 optimize_invert = FALSE;
e2962f66
JH
5374 /* Possible truncation here but in some 64-bit environments
5375 * the compiler gets heartburn about switch on 64-bit values.
5376 * A similar issue a little earlier when switching on value.
98f323fa 5377 * --jhi */
e2962f66 5378 switch ((I32)namedclass) {
73b437c8
JH
5379 case ANYOF_ALNUM:
5380 if (LOC)
936ed897 5381 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5382 else {
5383 for (value = 0; value < 256; value++)
5384 if (isALNUM(value))
936ed897 5385 ANYOF_BITMAP_SET(ret, value);
73b437c8 5386 }
c49a72a9
NC
5387 yesno = '+';
5388 what = "Word";
73b437c8
JH
5389 break;
5390 case ANYOF_NALNUM:
5391 if (LOC)
936ed897 5392 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5393 else {
5394 for (value = 0; value < 256; value++)
5395 if (!isALNUM(value))
936ed897 5396 ANYOF_BITMAP_SET(ret, value);
73b437c8 5397 }
c49a72a9
NC
5398 yesno = '!';
5399 what = "Word";
73b437c8 5400 break;
ffc61ed2 5401 case ANYOF_ALNUMC:
73b437c8 5402 if (LOC)
ffc61ed2 5403 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5404 else {
5405 for (value = 0; value < 256; value++)
ffc61ed2 5406 if (isALNUMC(value))
936ed897 5407 ANYOF_BITMAP_SET(ret, value);
73b437c8 5408 }
c49a72a9
NC
5409 yesno = '+';
5410 what = "Alnum";
73b437c8
JH
5411 break;
5412 case ANYOF_NALNUMC:
5413 if (LOC)
936ed897 5414 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5415 else {
5416 for (value = 0; value < 256; value++)
5417 if (!isALNUMC(value))
936ed897 5418 ANYOF_BITMAP_SET(ret, value);
73b437c8 5419 }
c49a72a9
NC
5420 yesno = '!';
5421 what = "Alnum";
73b437c8
JH
5422 break;
5423 case ANYOF_ALPHA:
5424 if (LOC)
936ed897 5425 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5426 else {
5427 for (value = 0; value < 256; value++)
5428 if (isALPHA(value))
936ed897 5429 ANYOF_BITMAP_SET(ret, value);
73b437c8 5430 }
c49a72a9
NC
5431 yesno = '+';
5432 what = "Alpha";
73b437c8
JH
5433 break;
5434 case ANYOF_NALPHA:
5435 if (LOC)
936ed897 5436 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5437 else {
5438 for (value = 0; value < 256; value++)
5439 if (!isALPHA(value))
936ed897 5440 ANYOF_BITMAP_SET(ret, value);
73b437c8 5441 }
c49a72a9
NC
5442 yesno = '!';
5443 what = "Alpha";
73b437c8
JH
5444 break;
5445 case ANYOF_ASCII:
5446 if (LOC)
936ed897 5447 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5448 else {
c7f1f016 5449#ifndef EBCDIC
1ba5c669
JH
5450 for (value = 0; value < 128; value++)
5451 ANYOF_BITMAP_SET(ret, value);
5452#else /* EBCDIC */
ffbc6a93 5453 for (value = 0; value < 256; value++) {
3a3c4447
JH
5454 if (isASCII(value))
5455 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5456 }
1ba5c669 5457#endif /* EBCDIC */
73b437c8 5458 }
c49a72a9
NC
5459 yesno = '+';
5460 what = "ASCII";
73b437c8
JH
5461 break;
5462 case ANYOF_NASCII:
5463 if (LOC)
936ed897 5464 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5465 else {
c7f1f016 5466#ifndef EBCDIC
1ba5c669
JH
5467 for (value = 128; value < 256; value++)
5468 ANYOF_BITMAP_SET(ret, value);
5469#else /* EBCDIC */
ffbc6a93 5470 for (value = 0; value < 256; value++) {
3a3c4447
JH
5471 if (!isASCII(value))
5472 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5473 }
1ba5c669 5474#endif /* EBCDIC */
73b437c8 5475 }
c49a72a9
NC
5476 yesno = '!';
5477 what = "ASCII";
73b437c8 5478 break;
aaa51d5e
JF
5479 case ANYOF_BLANK:
5480 if (LOC)
5481 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5482 else {
5483 for (value = 0; value < 256; value++)
5484 if (isBLANK(value))
5485 ANYOF_BITMAP_SET(ret, value);
5486 }
c49a72a9
NC
5487 yesno = '+';
5488 what = "Blank";
aaa51d5e
JF
5489 break;
5490 case ANYOF_NBLANK:
5491 if (LOC)
5492 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5493 else {
5494 for (value = 0; value < 256; value++)
5495 if (!isBLANK(value))
5496 ANYOF_BITMAP_SET(ret, value);
5497 }
c49a72a9
NC
5498 yesno = '!';
5499 what = "Blank";
aaa51d5e 5500 break;
73b437c8
JH
5501 case ANYOF_CNTRL:
5502 if (LOC)
936ed897 5503 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5504 else {
5505 for (value = 0; value < 256; value++)
5506 if (isCNTRL(value))
936ed897 5507 ANYOF_BITMAP_SET(ret, value);
73b437c8 5508 }
c49a72a9
NC
5509 yesno = '+';
5510 what = "Cntrl";
73b437c8
JH
5511 break;
5512 case ANYOF_NCNTRL:
5513 if (LOC)
936ed897 5514 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5515 else {
5516 for (value = 0; value < 256; value++)
5517 if (!isCNTRL(value))
936ed897 5518 ANYOF_BITMAP_SET(ret, value);
73b437c8 5519 }
c49a72a9
NC
5520 yesno = '!';
5521 what = "Cntrl";
ffc61ed2
JH
5522 break;
5523 case ANYOF_DIGIT:
5524 if (LOC)
5525 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5526 else {
5527 /* consecutive digits assumed */
5528 for (value = '0'; value <= '9'; value++)
5529 ANYOF_BITMAP_SET(ret, value);
5530 }
c49a72a9
NC
5531 yesno = '+';
5532 what = "Digit";
ffc61ed2
JH
5533 break;
5534 case ANYOF_NDIGIT:
5535 if (LOC)
5536 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5537 else {
5538 /* consecutive digits assumed */
5539 for (value = 0; value < '0'; value++)
5540 ANYOF_BITMAP_SET(ret, value);
5541 for (value = '9' + 1; value < 256; value++)
5542 ANYOF_BITMAP_SET(ret, value);
5543 }
c49a72a9
NC
5544 yesno = '!';
5545 what = "Digit";
73b437c8
JH
5546 break;
5547 case ANYOF_GRAPH:
5548 if (LOC)
936ed897 5549 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5550 else {
5551 for (value = 0; value < 256; value++)
5552 if (isGRAPH(value))
936ed897 5553 ANYOF_BITMAP_SET(ret, value);
73b437c8 5554 }
c49a72a9
NC
5555 yesno = '+';
5556 what = "Graph";
73b437c8
JH
5557 break;
5558 case ANYOF_NGRAPH:
5559 if (LOC)
936ed897 5560 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5561 else {
5562 for (value = 0; value < 256; value++)
5563 if (!isGRAPH(value))
936ed897 5564 ANYOF_BITMAP_SET(ret, value);
73b437c8 5565 }
c49a72a9
NC
5566 yesno = '!';
5567 what = "Graph";
73b437c8
JH
5568 break;
5569 case ANYOF_LOWER:
5570 if (LOC)
936ed897 5571 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5572 else {
5573 for (value = 0; value < 256; value++)
5574 if (isLOWER(value))
936ed897 5575 ANYOF_BITMAP_SET(ret, value);
73b437c8 5576 }
c49a72a9
NC
5577 yesno = '+';
5578 what = "Lower";
73b437c8
JH
5579 break;
5580 case ANYOF_NLOWER:
5581 if (LOC)
936ed897 5582 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5583 else {
5584 for (value = 0; value < 256; value++)
5585 if (!isLOWER(value))
936ed897 5586 ANYOF_BITMAP_SET(ret, value);
73b437c8 5587 }
c49a72a9
NC
5588 yesno = '!';
5589 what = "Lower";
73b437c8
JH
5590 break;
5591 case ANYOF_PRINT:
5592 if (LOC)
936ed897 5593 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5594 else {
5595 for (value = 0; value < 256; value++)
5596 if (isPRINT(value))
936ed897 5597 ANYOF_BITMAP_SET(ret, value);
73b437c8 5598 }
c49a72a9
NC
5599 yesno = '+';
5600 what = "Print";
73b437c8
JH
5601 break;
5602 case ANYOF_NPRINT:
5603 if (LOC)
936ed897 5604 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5605 else {
5606 for (value = 0; value < 256; value++)
5607 if (!isPRINT(value))
936ed897 5608 ANYOF_BITMAP_SET(ret, value);
73b437c8 5609 }
c49a72a9
NC
5610 yesno = '!';
5611 what = "Print";
73b437c8 5612 break;
aaa51d5e
JF
5613 case ANYOF_PSXSPC:
5614 if (LOC)
5615 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5616 else {
5617 for (value = 0; value < 256; value++)
5618 if (isPSXSPC(value))
5619 ANYOF_BITMAP_SET(ret, value);
5620 }
c49a72a9
NC
5621 yesno = '+';
5622 what = "Space";
aaa51d5e
JF
5623 break;
5624 case ANYOF_NPSXSPC:
5625 if (LOC)
5626 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5627 else {
5628 for (value = 0; value < 256; value++)
5629 if (!isPSXSPC(value))
5630 ANYOF_BITMAP_SET(ret, value);
5631 }
c49a72a9
NC
5632 yesno = '!';
5633 what = "Space";
aaa51d5e 5634 break;
73b437c8
JH
5635 case ANYOF_PUNCT:
5636 if (LOC)
936ed897 5637 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5638 else {
5639 for (value = 0; value < 256; value++)
5640 if (isPUNCT(value))
936ed897 5641 ANYOF_BITMAP_SET(ret, value);
73b437c8 5642 }
c49a72a9
NC
5643 yesno = '+';
5644 what = "Punct";
73b437c8
JH
5645 break;
5646 case ANYOF_NPUNCT:
5647 if (LOC)
936ed897 5648 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5649 else {
5650 for (value = 0; value < 256; value++)
5651 if (!isPUNCT(value))
936ed897 5652 ANYOF_BITMAP_SET(ret, value);
73b437c8 5653 }
c49a72a9
NC
5654 yesno = '!';
5655 what = "Punct";
ffc61ed2
JH
5656 break;
5657 case ANYOF_SPACE:
5658 if (LOC)
5659 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5660 else {
5661 for (value = 0; value < 256; value++)
5662 if (isSPACE(value))
5663 ANYOF_BITMAP_SET(ret, value);
5664 }
c49a72a9
NC
5665 yesno = '+';
5666 what = "SpacePerl";
ffc61ed2
JH
5667 break;
5668 case ANYOF_NSPACE:
5669 if (LOC)
5670 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5671 else {
5672 for (value = 0; value < 256; value++)
5673 if (!isSPACE(value))
5674 ANYOF_BITMAP_SET(ret, value);
5675 }
c49a72a9
NC
5676 yesno = '!';
5677 what = "SpacePerl";
73b437c8
JH
5678 break;
5679 case ANYOF_UPPER:
5680 if (LOC)
936ed897 5681 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5682 else {
5683 for (value = 0; value < 256; value++)
5684 if (isUPPER(value))
936ed897 5685 ANYOF_BITMAP_SET(ret, value);
73b437c8 5686 }
c49a72a9
NC
5687 yesno = '+';
5688 what = "Upper";
73b437c8
JH
5689 break;
5690 case ANYOF_NUPPER:
5691 if (LOC)
936ed897 5692 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5693 else {
5694 for (value = 0; value < 256; value++)
5695 if (!isUPPER(value))
936ed897 5696 ANYOF_BITMAP_SET(ret, value);
73b437c8 5697 }
c49a72a9
NC
5698 yesno = '!';
5699 what = "Upper";
73b437c8
JH
5700 break;
5701 case ANYOF_XDIGIT:
5702 if (LOC)
936ed897 5703 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5704 else {
5705 for (value = 0; value < 256; value++)
5706 if (isXDIGIT(value))
936ed897 5707 ANYOF_BITMAP_SET(ret, value);
73b437c8 5708 }
c49a72a9
NC
5709 yesno = '+';
5710 what = "XDigit";
73b437c8
JH
5711 break;
5712 case ANYOF_NXDIGIT:
5713 if (LOC)
936ed897 5714 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5715 else {
5716 for (value = 0; value < 256; value++)
5717 if (!isXDIGIT(value))
936ed897 5718 ANYOF_BITMAP_SET(ret, value);
73b437c8 5719 }
c49a72a9
NC
5720 yesno = '!';
5721 what = "XDigit";
73b437c8 5722 break;
f81125e2
JP
5723 case ANYOF_MAX:
5724 /* this is to handle \p and \P */
5725 break;
73b437c8 5726 default:
b45f050a 5727 vFAIL("Invalid [::] class");
73b437c8 5728 break;
b8c5462f 5729 }
c49a72a9
NC
5730 if (what) {
5731 /* Strings such as "+utf8::isWord\n" */
5732 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5733 }
b8c5462f 5734 if (LOC)
936ed897 5735 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5736 continue;
a0d0e21e 5737 }
ffc61ed2
JH
5738 } /* end of namedclass \blah */
5739
a0d0e21e 5740 if (range) {
eb160463 5741 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5742 const int w = RExC_parse - rangebegin;
5743 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5744 range = 0; /* not a valid range */
73b437c8 5745 }
a0d0e21e
LW
5746 }
5747 else {
3568d838 5748 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5749 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5750 RExC_parse[1] != ']') {
5751 RExC_parse++;
ffc61ed2
JH
5752
5753 /* a bad range like \w-, [:word:]- ? */
5754 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5755 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5756 const int w =
afd78fd5
JH
5757 RExC_parse >= rangebegin ?
5758 RExC_parse - rangebegin : 0;
830247a4 5759 vWARN4(RExC_parse,
b45f050a 5760 "False [] range \"%*.*s\"",
097eb12c 5761 w, w, rangebegin);
afd78fd5 5762 }
73b437c8 5763 if (!SIZE_ONLY)
936ed897 5764 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5765 } else
ffc61ed2
JH
5766 range = 1; /* yeah, it's a range! */
5767 continue; /* but do it the next time */
a0d0e21e 5768 }
a687059c 5769 }
ffc61ed2 5770
93733859 5771 /* now is the next time */
07be1b83 5772 /*stored += (value - prevvalue + 1);*/
ae5c130c 5773 if (!SIZE_ONLY) {
3568d838 5774 if (prevvalue < 256) {
1df70142 5775 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5776 IV i;
3568d838 5777#ifdef EBCDIC
1b2d223b
JH
5778 /* In EBCDIC [\x89-\x91] should include
5779 * the \x8e but [i-j] should not. */
5780 if (literal_endpoint == 2 &&
5781 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5782 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5783 {
3568d838
JH
5784 if (isLOWER(prevvalue)) {
5785 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5786 if (isLOWER(i))
5787 ANYOF_BITMAP_SET(ret, i);
5788 } else {
3568d838 5789 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5790 if (isUPPER(i))
5791 ANYOF_BITMAP_SET(ret, i);
5792 }
8ada0baa 5793 }
ffc61ed2 5794 else
8ada0baa 5795#endif
07be1b83
YO
5796 for (i = prevvalue; i <= ceilvalue; i++) {
5797 if (!ANYOF_BITMAP_TEST(ret,i)) {
5798 stored++;
5799 ANYOF_BITMAP_SET(ret, i);
5800 }
5801 }
3568d838 5802 }
a5961de5 5803 if (value > 255 || UTF) {
1df70142
AL
5804 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5805 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 5806 stored+=2; /* can't optimize this class */
ffc61ed2 5807 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5808 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5809 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5810 prevnatvalue, natvalue);
5811 }
5812 else if (prevnatvalue == natvalue) {
5813 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5814 if (FOLD) {
89ebb4a3 5815 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5816 STRLEN foldlen;
1df70142 5817 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5818
c840d2a2
JH
5819 /* If folding and foldable and a single
5820 * character, insert also the folded version
5821 * to the charclass. */
9e55ce06 5822 if (f != value) {
eb160463 5823 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5824 Perl_sv_catpvf(aTHX_ listsv,
5825 "%04"UVxf"\n", f);
5826 else {
5827 /* Any multicharacter foldings
5828 * require the following transform:
5829 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5830 * where E folds into "pq" and F folds
5831 * into "rst", all other characters
5832 * fold to single characters. We save
5833 * away these multicharacter foldings,
5834 * to be later saved as part of the
5835 * additional "s" data. */
5836 SV *sv;
5837
5838 if (!unicode_alternate)
5839 unicode_alternate = newAV();
5840 sv = newSVpvn((char*)foldbuf, foldlen);
5841 SvUTF8_on(sv);
5842 av_push(unicode_alternate, sv);
5843 }
5844 }
254ba52a 5845
60a8b682
JH
5846 /* If folding and the value is one of the Greek
5847 * sigmas insert a few more sigmas to make the
5848 * folding rules of the sigmas to work right.
5849 * Note that not all the possible combinations
5850 * are handled here: some of them are handled
9e55ce06
JH
5851 * by the standard folding rules, and some of
5852 * them (literal or EXACTF cases) are handled
5853 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5854 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5855 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5856 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5857 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5858 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5859 }
5860 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5861 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5862 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5863 }
5864 }
ffc61ed2 5865 }
1b2d223b
JH
5866#ifdef EBCDIC
5867 literal_endpoint = 0;
5868#endif
8ada0baa 5869 }
ffc61ed2
JH
5870
5871 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5872 }
ffc61ed2 5873
936ed897 5874 if (need_class) {
4f66b38d 5875 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5876 if (SIZE_ONLY)
830247a4 5877 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5878 else
830247a4 5879 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5880 }
ffc61ed2 5881
7f6f358c
YO
5882
5883 if (SIZE_ONLY)
5884 return ret;
5885 /****** !SIZE_ONLY AFTER HERE *********/
5886
5887 if( stored == 1 && value < 256
5888 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5889 ) {
5890 /* optimize single char class to an EXACT node
5891 but *only* when its not a UTF/high char */
07be1b83
YO
5892 const char * cur_parse= RExC_parse;
5893 RExC_emit = (regnode *)orig_emit;
5894 RExC_parse = (char *)orig_parse;
7f6f358c
YO
5895 ret = reg_node(pRExC_state,
5896 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 5897 RExC_parse = (char *)cur_parse;
7f6f358c
YO
5898 *STRING(ret)= (char)value;
5899 STR_LEN(ret)= 1;
5900 RExC_emit += STR_SZ(1);
5901 return ret;
5902 }
ae5c130c 5903 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5904 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5905 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5906 ) {
a0ed51b3 5907 for (value = 0; value < 256; ++value) {
936ed897 5908 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5909 UV fold = PL_fold[value];
ffc61ed2
JH
5910
5911 if (fold != value)
5912 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5913 }
5914 }
936ed897 5915 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5916 }
ffc61ed2 5917
ae5c130c 5918 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5919 if (optimize_invert &&
ffc61ed2
JH
5920 /* If the only flag is inversion. */
5921 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5922 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5923 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5924 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5925 }
7f6f358c 5926 {
097eb12c 5927 AV * const av = newAV();
ffc61ed2 5928 SV *rv;
9e55ce06 5929 /* The 0th element stores the character class description
6a0407ee 5930 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5931 * to initialize the appropriate swash (which gets stored in
5932 * the 1st element), and also useful for dumping the regnode.
5933 * The 2nd element stores the multicharacter foldings,
6a0407ee 5934 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5935 av_store(av, 0, listsv);
5936 av_store(av, 1, NULL);
9e55ce06 5937 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5938 rv = newRV_noinc((SV*)av);
19860706 5939 n = add_data(pRExC_state, 1, "s");
830247a4 5940 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5941 ARG_SET(ret, n);
a0ed51b3 5942 }
a0ed51b3
LW
5943 return ret;
5944}
5945
76e3520e 5946STATIC char*
830247a4 5947S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5948{
097eb12c 5949 char* const retval = RExC_parse++;
a0d0e21e 5950
4633a7c4 5951 for (;;) {
830247a4
IZ
5952 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5953 RExC_parse[2] == '#') {
e994fd66
AE
5954 while (*RExC_parse != ')') {
5955 if (RExC_parse == RExC_end)
5956 FAIL("Sequence (?#... not terminated");
830247a4 5957 RExC_parse++;
e994fd66 5958 }
830247a4 5959 RExC_parse++;
4633a7c4
LW
5960 continue;
5961 }
e2509266 5962 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5963 if (isSPACE(*RExC_parse)) {
5964 RExC_parse++;
748a9306
LW
5965 continue;
5966 }
830247a4 5967 else if (*RExC_parse == '#') {
e994fd66
AE
5968 while (RExC_parse < RExC_end)
5969 if (*RExC_parse++ == '\n') break;
748a9306
LW
5970 continue;
5971 }
748a9306 5972 }
4633a7c4 5973 return retval;
a0d0e21e 5974 }
a687059c
LW
5975}
5976
5977/*
c277df42 5978- reg_node - emit a node
a0d0e21e 5979*/
76e3520e 5980STATIC regnode * /* Location. */
830247a4 5981S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5982{
97aff369 5983 dVAR;
c277df42 5984 register regnode *ptr;
504618e9 5985 regnode * const ret = RExC_emit;
07be1b83 5986 GET_RE_DEBUG_FLAGS_DECL;
a687059c 5987
c277df42 5988 if (SIZE_ONLY) {
830247a4
IZ
5989 SIZE_ALIGN(RExC_size);
5990 RExC_size += 1;
a0d0e21e
LW
5991 return(ret);
5992 }
c277df42 5993 NODE_ALIGN_FILL(ret);
a0d0e21e 5994 ptr = ret;
c277df42 5995 FILL_ADVANCE_NODE(ptr, op);
fac92740 5996 if (RExC_offsets) { /* MJD */
07be1b83 5997 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
5998 "reg_node", __LINE__,
5999 reg_name[op],
07be1b83
YO
6000 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6001 ? "Overwriting end of array!\n" : "OK",
6002 (UV)(RExC_emit - RExC_emit_start),
6003 (UV)(RExC_parse - RExC_start),
6004 (UV)RExC_offsets[0]));
ccb2c380 6005 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6006 }
07be1b83 6007
830247a4 6008 RExC_emit = ptr;
a687059c 6009
a0d0e21e 6010 return(ret);
a687059c
LW
6011}
6012
6013/*
a0d0e21e
LW
6014- reganode - emit a node with an argument
6015*/
76e3520e 6016STATIC regnode * /* Location. */
830247a4 6017S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6018{
97aff369 6019 dVAR;
c277df42 6020 register regnode *ptr;
504618e9 6021 regnode * const ret = RExC_emit;
07be1b83 6022 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6023
c277df42 6024 if (SIZE_ONLY) {
830247a4
IZ
6025 SIZE_ALIGN(RExC_size);
6026 RExC_size += 2;
a0d0e21e
LW
6027 return(ret);
6028 }
fe14fcc3 6029
c277df42 6030 NODE_ALIGN_FILL(ret);
a0d0e21e 6031 ptr = ret;
c277df42 6032 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6033 if (RExC_offsets) { /* MJD */
07be1b83 6034 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6035 "reganode",
ccb2c380
MP
6036 __LINE__,
6037 reg_name[op],
07be1b83 6038 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6039 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6040 (UV)(RExC_emit - RExC_emit_start),
6041 (UV)(RExC_parse - RExC_start),
6042 (UV)RExC_offsets[0]));
ccb2c380 6043 Set_Cur_Node_Offset;
fac92740
MJD
6044 }
6045
830247a4 6046 RExC_emit = ptr;
fe14fcc3 6047
a0d0e21e 6048 return(ret);
fe14fcc3
LW
6049}
6050
6051/*
cd439c50 6052- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6053*/
71207a34
AL
6054STATIC STRLEN
6055S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6056{
97aff369 6057 dVAR;
71207a34 6058 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6059}
6060
6061/*
a0d0e21e
LW
6062- reginsert - insert an operator in front of already-emitted operand
6063*
6064* Means relocating the operand.
6065*/
76e3520e 6066STATIC void
830247a4 6067S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6068{
97aff369 6069 dVAR;
c277df42
IZ
6070 register regnode *src;
6071 register regnode *dst;
6072 register regnode *place;
504618e9 6073 const int offset = regarglen[(U8)op];
07be1b83 6074 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6075/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6076
6077 if (SIZE_ONLY) {
830247a4 6078 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6079 return;
6080 }
a687059c 6081
830247a4
IZ
6082 src = RExC_emit;
6083 RExC_emit += NODE_STEP_REGNODE + offset;
6084 dst = RExC_emit;
fac92740 6085 while (src > opnd) {
c277df42 6086 StructCopy(--src, --dst, regnode);
fac92740 6087 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6088 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6089 "reg_insert",
ccb2c380
MP
6090 __LINE__,
6091 reg_name[op],
07be1b83
YO
6092 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6093 ? "Overwriting end of array!\n" : "OK",
6094 (UV)(src - RExC_emit_start),
6095 (UV)(dst - RExC_emit_start),
6096 (UV)RExC_offsets[0]));
ccb2c380
MP
6097 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6098 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6099 }
6100 }
6101
a0d0e21e
LW
6102
6103 place = opnd; /* Op node, where operand used to be. */
fac92740 6104 if (RExC_offsets) { /* MJD */
07be1b83 6105 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6106 "reginsert",
ccb2c380
MP
6107 __LINE__,
6108 reg_name[op],
07be1b83 6109 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6110 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6111 (UV)(place - RExC_emit_start),
6112 (UV)(RExC_parse - RExC_start),
fac92740 6113 RExC_offsets[0]));
ccb2c380 6114 Set_Node_Offset(place, RExC_parse);
45948336 6115 Set_Node_Length(place, 1);
fac92740 6116 }
c277df42
IZ
6117 src = NEXTOPER(place);
6118 FILL_ADVANCE_NODE(place, op);
6119 Zero(src, offset, regnode);
a687059c
LW
6120}
6121
6122/*
c277df42 6123- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6124- SEE ALSO: regtail_study
a0d0e21e 6125*/
097eb12c 6126/* TODO: All three parms should be const */
76e3520e 6127STATIC void
3dab1dad 6128S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6129{
97aff369 6130 dVAR;
c277df42 6131 register regnode *scan;
3dab1dad 6132 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 6133
c277df42 6134 if (SIZE_ONLY)
a0d0e21e
LW
6135 return;
6136
6137 /* Find last node. */
6138 scan = p;
6139 for (;;) {
504618e9 6140 regnode * const temp = regnext(scan);
3dab1dad
YO
6141 DEBUG_PARSE_r({
6142 SV * const mysv=sv_newmortal();
6143 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6144 regprop(RExC_rx, mysv, scan);
6145 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6146 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6147 });
6148 if (temp == NULL)
6149 break;
6150 scan = temp;
6151 }
6152
6153 if (reg_off_by_arg[OP(scan)]) {
6154 ARG_SET(scan, val - scan);
6155 }
6156 else {
6157 NEXT_OFF(scan) = val - scan;
6158 }
6159}
6160
07be1b83 6161#ifdef DEBUGGING
3dab1dad
YO
6162/*
6163- regtail_study - set the next-pointer at the end of a node chain of p to val.
6164- Look for optimizable sequences at the same time.
6165- currently only looks for EXACT chains.
07be1b83
YO
6166
6167This is expermental code. The idea is to use this routine to perform
6168in place optimizations on branches and groups as they are constructed,
6169with the long term intention of removing optimization from study_chunk so
6170that it is purely analytical.
6171
6172Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6173to control which is which.
6174
3dab1dad
YO
6175*/
6176/* TODO: All four parms should be const */
07be1b83 6177
3dab1dad
YO
6178STATIC U8
6179S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6180{
6181 dVAR;
6182 register regnode *scan;
07be1b83
YO
6183 U8 exact = PSEUDO;
6184#ifdef EXPERIMENTAL_INPLACESCAN
6185 I32 min = 0;
6186#endif
6187
3dab1dad
YO
6188 GET_RE_DEBUG_FLAGS_DECL;
6189
07be1b83 6190
3dab1dad
YO
6191 if (SIZE_ONLY)
6192 return exact;
6193
6194 /* Find last node. */
6195
6196 scan = p;
6197 for (;;) {
6198 regnode * const temp = regnext(scan);
07be1b83
YO
6199#ifdef EXPERIMENTAL_INPLACESCAN
6200 if (PL_regkind[OP(scan)] == EXACT)
6201 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6202 return EXACT;
6203#endif
3dab1dad
YO
6204 if ( exact ) {
6205 switch (OP(scan)) {
6206 case EXACT:
6207 case EXACTF:
6208 case EXACTFL:
6209 if( exact == PSEUDO )
6210 exact= OP(scan);
07be1b83
YO
6211 else if ( exact != OP(scan) )
6212 exact= 0;
3dab1dad
YO
6213 case NOTHING:
6214 break;
6215 default:
6216 exact= 0;
6217 }
6218 }
6219 DEBUG_PARSE_r({
6220 SV * const mysv=sv_newmortal();
6221 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6222 regprop(RExC_rx, mysv, scan);
6223 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6224 SvPV_nolen_const(mysv),
6225 reg_name[exact],
6226 REG_NODE_NUM(scan));
6227 });
a0d0e21e
LW
6228 if (temp == NULL)
6229 break;
6230 scan = temp;
6231 }
07be1b83
YO
6232 DEBUG_PARSE_r({
6233 SV * const mysv_val=sv_newmortal();
6234 DEBUG_PARSE_MSG("");
6235 regprop(RExC_rx, mysv_val, val);
6236 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6237 SvPV_nolen_const(mysv_val),
6238 REG_NODE_NUM(val),
6239 val - scan
6240 );
6241 });
c277df42
IZ
6242 if (reg_off_by_arg[OP(scan)]) {
6243 ARG_SET(scan, val - scan);
a0ed51b3
LW
6244 }
6245 else {
c277df42
IZ
6246 NEXT_OFF(scan) = val - scan;
6247 }
3dab1dad
YO
6248
6249 return exact;
a687059c 6250}
07be1b83 6251#endif
a687059c
LW
6252
6253/*
a687059c
LW
6254 - regcurly - a little FSA that accepts {\d+,?\d*}
6255 */
79072805 6256STATIC I32
5f66b61c 6257S_regcurly(register const char *s)
a687059c
LW
6258{
6259 if (*s++ != '{')
6260 return FALSE;
f0fcb552 6261 if (!isDIGIT(*s))
a687059c 6262 return FALSE;
f0fcb552 6263 while (isDIGIT(*s))
a687059c
LW
6264 s++;
6265 if (*s == ',')
6266 s++;
f0fcb552 6267 while (isDIGIT(*s))
a687059c
LW
6268 s++;
6269 if (*s != '}')
6270 return FALSE;
6271 return TRUE;
6272}
6273
a687059c
LW
6274
6275/*
fd181c75 6276 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6277 */
6278void
097eb12c 6279Perl_regdump(pTHX_ const regexp *r)
a687059c 6280{
35ff7856 6281#ifdef DEBUGGING
97aff369 6282 dVAR;
c445ea15 6283 SV * const sv = sv_newmortal();
a687059c 6284
4f639d21 6285 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
6286
6287 /* Header fields of interest. */
c277df42 6288 if (r->anchored_substr)
7b0972df 6289 PerlIO_printf(Perl_debug_log,
a0288114 6290 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 6291 PL_colors[0],
7b0972df 6292 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 6293 SvPVX_const(r->anchored_substr),
3280af22 6294 PL_colors[1],
c277df42 6295 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 6296 (IV)r->anchored_offset);
33b8afdf
JH
6297 else if (r->anchored_utf8)
6298 PerlIO_printf(Perl_debug_log,
a0288114 6299 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
6300 PL_colors[0],
6301 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 6302 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
6303 PL_colors[1],
6304 SvTAIL(r->anchored_utf8) ? "$" : "",
6305 (IV)r->anchored_offset);
c277df42 6306 if (r->float_substr)
7b0972df 6307 PerlIO_printf(Perl_debug_log,
a0288114 6308 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 6309 PL_colors[0],
b81d288d 6310 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 6311 SvPVX_const(r->float_substr),
3280af22 6312 PL_colors[1],
c277df42 6313 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 6314 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
6315 else if (r->float_utf8)
6316 PerlIO_printf(Perl_debug_log,
a0288114 6317 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
6318 PL_colors[0],
6319 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 6320 SvPVX_const(r->float_utf8),
33b8afdf
JH
6321 PL_colors[1],
6322 SvTAIL(r->float_utf8) ? "$" : "",
6323 (IV)r->float_min_offset, (UV)r->float_max_offset);
6324 if (r->check_substr || r->check_utf8)
b81d288d
AB
6325 PerlIO_printf(Perl_debug_log,
6326 r->check_substr == r->float_substr
33b8afdf 6327 && r->check_utf8 == r->float_utf8
c277df42
IZ
6328 ? "(checking floating" : "(checking anchored");
6329 if (r->reganch & ROPT_NOSCAN)
6330 PerlIO_printf(Perl_debug_log, " noscan");
6331 if (r->reganch & ROPT_CHECK_ALL)
6332 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6333 if (r->check_substr || r->check_utf8)
c277df42
IZ
6334 PerlIO_printf(Perl_debug_log, ") ");
6335
46fc3d4c 6336 if (r->regstclass) {
32fc9b6a 6337 regprop(r, sv, r->regstclass);
3f7c398e 6338 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6339 }
774d564b 6340 if (r->reganch & ROPT_ANCH) {
6341 PerlIO_printf(Perl_debug_log, "anchored");
6342 if (r->reganch & ROPT_ANCH_BOL)
6343 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6344 if (r->reganch & ROPT_ANCH_MBOL)
6345 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6346 if (r->reganch & ROPT_ANCH_SBOL)
6347 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6348 if (r->reganch & ROPT_ANCH_GPOS)
6349 PerlIO_printf(Perl_debug_log, "(GPOS)");
6350 PerlIO_putc(Perl_debug_log, ' ');
6351 }
c277df42
IZ
6352 if (r->reganch & ROPT_GPOS_SEEN)
6353 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6354 if (r->reganch & ROPT_SKIP)
760ac839 6355 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6356 if (r->reganch & ROPT_IMPLICIT)
760ac839 6357 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6358 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6359 if (r->reganch & ROPT_EVAL_SEEN)
6360 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6361 PerlIO_printf(Perl_debug_log, "\n");
fac92740 6362 if (r->offsets) {
e4584336 6363 const U32 len = r->offsets[0];
a3621e74
YO
6364 GET_RE_DEBUG_FLAGS_DECL;
6365 DEBUG_OFFSETS_r({
1df70142 6366 U32 i;
e4584336 6367 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
07be1b83 6368 for (i = 1; i <= len; i++) {
be8e71aa
YO
6369 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6370 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
07be1b83 6371 }
e4584336 6372 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 6373 });
fac92740 6374 }
65e66c80 6375#else
96a5add6 6376 PERL_UNUSED_CONTEXT;
65e66c80 6377 PERL_UNUSED_ARG(r);
17c3b450 6378#endif /* DEBUGGING */
a687059c
LW
6379}
6380
6381/*
a0d0e21e
LW
6382- regprop - printable representation of opcode
6383*/
46fc3d4c 6384void
32fc9b6a 6385Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6386{
35ff7856 6387#ifdef DEBUGGING
97aff369 6388 dVAR;
9b155405 6389 register int k;
a0d0e21e 6390
54dc92de 6391 sv_setpvn(sv, "", 0);
9b155405 6392 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6393 /* It would be nice to FAIL() here, but this may be called from
6394 regexec.c, and it would be hard to supply pRExC_state. */
6395 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6396 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6397
3dab1dad 6398 k = PL_regkind[OP(o)];
9b155405 6399
2a782b5b 6400 if (k == EXACT) {
396482e1 6401 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
6402 /* Using is_utf8_string() is a crude hack but it may
6403 * be the best for now since we have no flag "this EXACTish
6404 * node was UTF-8" --jhi */
1df70142 6405 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 6406 const char * const s = do_utf8 ?
c728cb41
JH
6407 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6408 UNI_DISPLAY_REGEX) :
2a782b5b 6409 STRING(o);
e1ec3a88 6410 const int len = do_utf8 ?
2a782b5b
JH
6411 strlen(s) :
6412 STR_LEN(o);
6413 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6414 PL_colors[0],
6415 len, s,
6416 PL_colors[1]);
bb263b4e 6417 } else if (k == TRIE) {
3dab1dad
YO
6418 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6419 /* print the details of the trie in dumpuntil instead, as
4f639d21 6420 * prog->data isn't available here */
a3621e74 6421 } else if (k == CURLY) {
cb434fcc 6422 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6423 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6424 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6425 }
2c2d71f5
JH
6426 else if (k == WHILEM && o->flags) /* Ordinal/of */
6427 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6428 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6429 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6430 else if (k == LOGICAL)
04ebc1ab 6431 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6432 else if (k == ANYOF) {
6433 int i, rangestart = -1;
2d03de9c 6434 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6435
6436 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6437 static const char * const anyofs[] = {
653099ff
GS
6438 "\\w",
6439 "\\W",
6440 "\\s",
6441 "\\S",
6442 "\\d",
6443 "\\D",
6444 "[:alnum:]",
6445 "[:^alnum:]",
6446 "[:alpha:]",
6447 "[:^alpha:]",
6448 "[:ascii:]",
6449 "[:^ascii:]",
6450 "[:ctrl:]",
6451 "[:^ctrl:]",
6452 "[:graph:]",
6453 "[:^graph:]",
6454 "[:lower:]",
6455 "[:^lower:]",
6456 "[:print:]",
6457 "[:^print:]",
6458 "[:punct:]",
6459 "[:^punct:]",
6460 "[:upper:]",
aaa51d5e 6461 "[:^upper:]",
653099ff 6462 "[:xdigit:]",
aaa51d5e
JF
6463 "[:^xdigit:]",
6464 "[:space:]",
6465 "[:^space:]",
6466 "[:blank:]",
6467 "[:^blank:]"
653099ff
GS
6468 };
6469
19860706 6470 if (flags & ANYOF_LOCALE)
396482e1 6471 sv_catpvs(sv, "{loc}");
19860706 6472 if (flags & ANYOF_FOLD)
396482e1 6473 sv_catpvs(sv, "{i}");
653099ff 6474 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6475 if (flags & ANYOF_INVERT)
396482e1 6476 sv_catpvs(sv, "^");
ffc61ed2
JH
6477 for (i = 0; i <= 256; i++) {
6478 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6479 if (rangestart == -1)
6480 rangestart = i;
6481 } else if (rangestart != -1) {
6482 if (i <= rangestart + 3)
6483 for (; rangestart < i; rangestart++)
653099ff 6484 put_byte(sv, rangestart);
ffc61ed2
JH
6485 else {
6486 put_byte(sv, rangestart);
396482e1 6487 sv_catpvs(sv, "-");
ffc61ed2 6488 put_byte(sv, i - 1);
653099ff 6489 }
ffc61ed2 6490 rangestart = -1;
653099ff 6491 }
847a199f 6492 }
ffc61ed2
JH
6493
6494 if (o->flags & ANYOF_CLASS)
bb7a0f54 6495 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6496 if (ANYOF_CLASS_TEST(o,i))
6497 sv_catpv(sv, anyofs[i]);
6498
6499 if (flags & ANYOF_UNICODE)
396482e1 6500 sv_catpvs(sv, "{unicode}");
1aa99e6b 6501 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6502 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6503
6504 {
6505 SV *lv;
32fc9b6a 6506 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6507
ffc61ed2
JH
6508 if (lv) {
6509 if (sw) {
89ebb4a3 6510 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6511
ffc61ed2 6512 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6513 uvchr_to_utf8(s, i);
ffc61ed2 6514
3568d838 6515 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6516 if (rangestart == -1)
6517 rangestart = i;
6518 } else if (rangestart != -1) {
ffc61ed2
JH
6519 if (i <= rangestart + 3)
6520 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6521 const U8 * const e = uvchr_to_utf8(s,rangestart);
6522 U8 *p;
6523 for(p = s; p < e; p++)
ffc61ed2
JH
6524 put_byte(sv, *p);
6525 }
6526 else {
2d03de9c
AL
6527 const U8 *e = uvchr_to_utf8(s,rangestart);
6528 U8 *p;
6529 for (p = s; p < e; p++)
ffc61ed2 6530 put_byte(sv, *p);
396482e1 6531 sv_catpvs(sv, "-");
2d03de9c
AL
6532 e = uvchr_to_utf8(s, i-1);
6533 for (p = s; p < e; p++)
1df70142 6534 put_byte(sv, *p);
ffc61ed2
JH
6535 }
6536 rangestart = -1;
6537 }
19860706 6538 }
ffc61ed2 6539
396482e1 6540 sv_catpvs(sv, "..."); /* et cetera */
19860706 6541 }
fde631ed 6542
ffc61ed2 6543 {
2e0de35c 6544 char *s = savesvpv(lv);
c445ea15 6545 char * const origs = s;
b81d288d 6546
3dab1dad
YO
6547 while (*s && *s != '\n')
6548 s++;
b81d288d 6549
ffc61ed2 6550 if (*s == '\n') {
2d03de9c 6551 const char * const t = ++s;
ffc61ed2
JH
6552
6553 while (*s) {
6554 if (*s == '\n')
6555 *s = ' ';
6556 s++;
6557 }
6558 if (s[-1] == ' ')
6559 s[-1] = 0;
6560
6561 sv_catpv(sv, t);
fde631ed 6562 }
b81d288d 6563
ffc61ed2 6564 Safefree(origs);
fde631ed
JH
6565 }
6566 }
653099ff 6567 }
ffc61ed2 6568
653099ff
GS
6569 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6570 }
9b155405 6571 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 6572 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 6573#else
96a5add6 6574 PERL_UNUSED_CONTEXT;
65e66c80
SP
6575 PERL_UNUSED_ARG(sv);
6576 PERL_UNUSED_ARG(o);
17c3b450 6577#endif /* DEBUGGING */
35ff7856 6578}
a687059c 6579
cad2e5aa
JH
6580SV *
6581Perl_re_intuit_string(pTHX_ regexp *prog)
6582{ /* Assume that RE_INTUIT is set */
97aff369 6583 dVAR;
a3621e74 6584 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6585 PERL_UNUSED_CONTEXT;
6586
a3621e74 6587 DEBUG_COMPILE_r(
cfd0369c 6588 {
2d03de9c 6589 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6590 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6591
6592 if (!PL_colorset) reginitcolors();
6593 PerlIO_printf(Perl_debug_log,
a0288114 6594 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6595 PL_colors[4],
6596 prog->check_substr ? "" : "utf8 ",
6597 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6598 s,
6599 PL_colors[1],
6600 (strlen(s) > 60 ? "..." : ""));
6601 } );
6602
33b8afdf 6603 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6604}
6605
2b69d0c2 6606void
864dbfa3 6607Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6608{
27da23d5 6609 dVAR;
9e55ce06 6610#ifdef DEBUGGING
c445ea15 6611 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
9e55ce06 6612#endif
fc32ee4a 6613 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6614
7821416a
IZ
6615 if (!r || (--r->refcnt > 0))
6616 return;
be8e71aa 6617 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
c445ea15 6618 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6619 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6620 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6621 const int len = SvCUR(dsv);
9e55ce06
JH
6622 if (!PL_colorset)
6623 reginitcolors();
6624 PerlIO_printf(Perl_debug_log,
a3621e74 6625 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6626 PL_colors[4],PL_colors[5],PL_colors[0],
6627 len, len, s,
6628 PL_colors[1],
6629 len > 60 ? "..." : "");
6630 });
cad2e5aa 6631
43c5f42d
NC
6632 /* gcov results gave these as non-null 100% of the time, so there's no
6633 optimisation in checking them before calling Safefree */
6634 Safefree(r->precomp);
6635 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6636 RX_MATCH_COPY_FREE(r);
f8c7b90f 6637#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6638 if (r->saved_copy)
6639 SvREFCNT_dec(r->saved_copy);
6640#endif
a193d654
GS
6641 if (r->substrs) {
6642 if (r->anchored_substr)
6643 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6644 if (r->anchored_utf8)
6645 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6646 if (r->float_substr)
6647 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6648 if (r->float_utf8)
6649 SvREFCNT_dec(r->float_utf8);
2779dcf1 6650 Safefree(r->substrs);
a193d654 6651 }
c277df42
IZ
6652 if (r->data) {
6653 int n = r->data->count;
f3548bdc
DM
6654 PAD* new_comppad = NULL;
6655 PAD* old_comppad;
4026c95a 6656 PADOFFSET refcnt;
dfad63ad 6657
c277df42 6658 while (--n >= 0) {
261faec3 6659 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6660 switch (r->data->what[n]) {
6661 case 's':
6662 SvREFCNT_dec((SV*)r->data->data[n]);
6663 break;
653099ff
GS
6664 case 'f':
6665 Safefree(r->data->data[n]);
6666 break;
dfad63ad
HS
6667 case 'p':
6668 new_comppad = (AV*)r->data->data[n];
6669 break;
c277df42 6670 case 'o':
dfad63ad 6671 if (new_comppad == NULL)
cea2e8a9 6672 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6673 PAD_SAVE_LOCAL(old_comppad,
6674 /* Watch out for global destruction's random ordering. */
c445ea15 6675 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6676 );
b34c0dd4 6677 OP_REFCNT_LOCK;
4026c95a
SH
6678 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6679 OP_REFCNT_UNLOCK;
6680 if (!refcnt)
9b978d73 6681 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6682
f3548bdc 6683 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6684 SvREFCNT_dec((SV*)new_comppad);
6685 new_comppad = NULL;
c277df42
IZ
6686 break;
6687 case 'n':
9e55ce06 6688 break;
07be1b83 6689 case 'T':
be8e71aa
YO
6690 { /* Aho Corasick add-on structure for a trie node.
6691 Used in stclass optimization only */
07be1b83
YO
6692 U32 refcount;
6693 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6694 OP_REFCNT_LOCK;
6695 refcount = --aho->refcount;
6696 OP_REFCNT_UNLOCK;
6697 if ( !refcount ) {
6698 Safefree(aho->states);
6699 Safefree(aho->fail);
6700 aho->trie=NULL; /* not necessary to free this as it is
6701 handled by the 't' case */
6702 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 6703 Safefree(r->regstclass);
07be1b83
YO
6704 }
6705 }
6706 break;
a3621e74 6707 case 't':
07be1b83 6708 {
be8e71aa 6709 /* trie structure. */
07be1b83
YO
6710 U32 refcount;
6711 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6712 OP_REFCNT_LOCK;
6713 refcount = --trie->refcount;
6714 OP_REFCNT_UNLOCK;
6715 if ( !refcount ) {
6716 Safefree(trie->charmap);
6717 if (trie->widecharmap)
6718 SvREFCNT_dec((SV*)trie->widecharmap);
6719 Safefree(trie->states);
6720 Safefree(trie->trans);
6721 if (trie->bitmap)
6722 Safefree(trie->bitmap);
6723 if (trie->wordlen)
6724 Safefree(trie->wordlen);
a3621e74 6725#ifdef DEBUGGING
be8e71aa
YO
6726 if (RX_DEBUG(r)) {
6727 if (trie->words)
6728 SvREFCNT_dec((SV*)trie->words);
6729 if (trie->revcharmap)
6730 SvREFCNT_dec((SV*)trie->revcharmap);
6731 }
a3621e74 6732#endif
07be1b83 6733 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 6734 }
07be1b83
YO
6735 }
6736 break;
c277df42 6737 default:
830247a4 6738 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6739 }
6740 }
6741 Safefree(r->data->what);
6742 Safefree(r->data);
a0d0e21e
LW
6743 }
6744 Safefree(r->startp);
6745 Safefree(r->endp);
6746 Safefree(r);
a687059c 6747}
c277df42 6748
76234dfb 6749#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6750/*
6751 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6752 */
6753regnode *
864dbfa3 6754Perl_regnext(pTHX_ register regnode *p)
c277df42 6755{
97aff369 6756 dVAR;
c277df42
IZ
6757 register I32 offset;
6758
3280af22 6759 if (p == &PL_regdummy)
c277df42
IZ
6760 return(NULL);
6761
6762 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6763 if (offset == 0)
6764 return(NULL);
6765
c277df42 6766 return(p+offset);
c277df42 6767}
76234dfb 6768#endif
c277df42 6769
01f988be 6770STATIC void
cea2e8a9 6771S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6772{
6773 va_list args;
6774 STRLEN l1 = strlen(pat1);
6775 STRLEN l2 = strlen(pat2);
6776 char buf[512];
06bf62c7 6777 SV *msv;
73d840c0 6778 const char *message;
c277df42
IZ
6779
6780 if (l1 > 510)
6781 l1 = 510;
6782 if (l1 + l2 > 510)
6783 l2 = 510 - l1;
6784 Copy(pat1, buf, l1 , char);
6785 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6786 buf[l1 + l2] = '\n';
6787 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6788#ifdef I_STDARG
6789 /* ANSI variant takes additional second argument */
c277df42 6790 va_start(args, pat2);
8736538c
AS
6791#else
6792 va_start(args);
6793#endif
5a844595 6794 msv = vmess(buf, &args);
c277df42 6795 va_end(args);
cfd0369c 6796 message = SvPV_const(msv,l1);
c277df42
IZ
6797 if (l1 > 512)
6798 l1 = 512;
6799 Copy(message, buf, l1 , char);
197cf9b9 6800 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6801 Perl_croak(aTHX_ "%s", buf);
c277df42 6802}
a0ed51b3
LW
6803
6804/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6805
76234dfb 6806#ifndef PERL_IN_XSUB_RE
a0ed51b3 6807void
864dbfa3 6808Perl_save_re_context(pTHX)
b81d288d 6809{
97aff369 6810 dVAR;
1ade1aa1
NC
6811
6812 struct re_save_state *state;
6813
6814 SAVEVPTR(PL_curcop);
6815 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6816
6817 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6818 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6819 SSPUSHINT(SAVEt_RE_STATE);
6820
46ab3289 6821 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6822
a0ed51b3 6823 PL_reg_start_tmp = 0;
a0ed51b3 6824 PL_reg_start_tmpl = 0;
c445ea15 6825 PL_reg_oldsaved = NULL;
a5db57d6 6826 PL_reg_oldsavedlen = 0;
a5db57d6 6827 PL_reg_maxiter = 0;
a5db57d6 6828 PL_reg_leftiter = 0;
c445ea15 6829 PL_reg_poscache = NULL;
a5db57d6 6830 PL_reg_poscache_size = 0;
1ade1aa1
NC
6831#ifdef PERL_OLD_COPY_ON_WRITE
6832 PL_nrs = NULL;
6833#endif
ada6e8a9 6834
c445ea15
AL
6835 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6836 if (PL_curpm) {
6837 const REGEXP * const rx = PM_GETRE(PL_curpm);
6838 if (rx) {
1df70142 6839 U32 i;
ada6e8a9 6840 for (i = 1; i <= rx->nparens; i++) {
1df70142 6841 char digits[TYPE_CHARS(long)];
d9fad198 6842 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6843 GV *const *const gvp
6844 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6845
b37c2d43
AL
6846 if (gvp) {
6847 GV * const gv = *gvp;
6848 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6849 save_scalar(gv);
49f27e4b 6850 }
ada6e8a9
AMS
6851 }
6852 }
6853 }
a0ed51b3 6854}
76234dfb 6855#endif
51371543 6856
51371543 6857static void
acfe0abc 6858clear_re(pTHX_ void *r)
51371543 6859{
97aff369 6860 dVAR;
51371543
GS
6861 ReREFCNT_dec((regexp *)r);
6862}
ffbc6a93 6863
a28509cc
AL
6864#ifdef DEBUGGING
6865
6866STATIC void
6867S_put_byte(pTHX_ SV *sv, int c)
6868{
6869 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6870 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6871 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6872 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6873 else
6874 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6875}
6876
3dab1dad
YO
6877#define CLEAR_OPTSTART \
6878 if (optstart) STMT_START { \
07be1b83 6879 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
6880 optstart=NULL; \
6881 } STMT_END
6882
6883#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6884
b5a2f8d8
NC
6885STATIC const regnode *
6886S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6887 const regnode *last, SV* sv, I32 l)
a28509cc 6888{
97aff369 6889 dVAR;
a28509cc 6890 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6891 register const regnode *next;
3dab1dad
YO
6892 const regnode *optstart= NULL;
6893 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6894
6895 while (op != END && (!last || node < last)) {
6896 /* While that wasn't END last time... */
6897
6898 NODE_ALIGN(node);
6899 op = OP(node);
6900 if (op == CLOSE)
6901 l--;
b5a2f8d8 6902 next = regnext((regnode *)node);
07be1b83 6903
a28509cc 6904 /* Where, what. */
8e11feef 6905 if (OP(node) == OPTIMIZED) {
be8e71aa 6906 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 6907 optstart = node;
3dab1dad 6908 else
8e11feef 6909 goto after_print;
3dab1dad
YO
6910 } else
6911 CLEAR_OPTSTART;
07be1b83 6912
32fc9b6a 6913 regprop(r, sv, node);
a28509cc
AL
6914 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6915 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6916
6917 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
6918 if (next == NULL) /* Next ptr. */
6919 PerlIO_printf(Perl_debug_log, "(0)");
6920 else
6921 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6922 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6923 }
6924
a28509cc
AL
6925 after_print:
6926 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
6927 assert(next);
6928 {
6929 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
6930 ? regnext((regnode *)next)
6931 : next);
be8e71aa
YO
6932 if (last && nnode > last)
6933 nnode = last;
6934 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6935 }
a28509cc
AL
6936 }
6937 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 6938 assert(next);
3dab1dad 6939 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6940 }
6941 else if ( PL_regkind[(U8)op] == TRIE ) {
6942 const I32 n = ARG(node);
4f639d21 6943 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6944 const I32 arry_len = av_len(trie->words)+1;
6945 I32 word_idx;
6946 PerlIO_printf(Perl_debug_log,
07be1b83 6947 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
8e11feef
RGS
6948 (int)(2*(l+3)),
6949 "",
6950 trie->startstate,
6951 TRIE_WORDCOUNT(trie),
6952 (int)TRIE_CHARCOUNT(trie),
6953 trie->uniquecharcount,
6954 (IV)TRIE_LASTSTATE(trie)-1,
be8e71aa
YO
6955 (int)trie->minlen,
6956 (int)trie->maxlen
8e11feef 6957 );
3dab1dad
YO
6958 if (trie->bitmap) {
6959 int i;
6960 int rangestart= -1;
6961 sv_setpvn(sv, "", 0);
6962 for (i = 0; i <= 256; i++) {
6963 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6964 if (rangestart == -1)
6965 rangestart = i;
6966 } else if (rangestart != -1) {
6967 if (i <= rangestart + 3)
6968 for (; rangestart < i; rangestart++)
6969 put_byte(sv, rangestart);
6970 else {
6971 put_byte(sv, rangestart);
6972 sv_catpvs(sv, "-");
6973 put_byte(sv, i - 1);
6974 }
6975 rangestart = -1;
6976 }
6977 }
07be1b83 6978 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
3dab1dad 6979 } else
07be1b83 6980 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
a28509cc
AL
6981
6982 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6983 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
6984 if (elem_ptr) {
6985 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6986 (int)(2*(l+4)), "",
6987 PL_colors[0],
cfd0369c 6988 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6989 PL_colors[1]
6990 );
a28509cc 6991 }
a28509cc
AL
6992 }
6993
6994 node = NEXTOPER(node);
6995 node += regarglen[(U8)op];
6996
6997 }
6998 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 6999 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7000 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7001 }
7002 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7003 assert(next);
3dab1dad 7004 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7005 next, sv, l + 1);
7006 }
7007 else if ( op == PLUS || op == STAR) {
3dab1dad 7008 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
7009 }
7010 else if (op == ANYOF) {
7011 /* arglen 1 + class block */
7012 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7013 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7014 node = NEXTOPER(node);
7015 }
7016 else if (PL_regkind[(U8)op] == EXACT) {
7017 /* Literal string, where present. */
7018 node += NODE_SZ_STR(node) - 1;
7019 node = NEXTOPER(node);
7020 }
7021 else {
7022 node = NEXTOPER(node);
7023 node += regarglen[(U8)op];
7024 }
7025 if (op == CURLYX || op == OPEN)
7026 l++;
7027 else if (op == WHILEM)
7028 l--;
7029 }
3dab1dad 7030 CLEAR_OPTSTART;
a28509cc
AL
7031 return node;
7032}
7033
7034#endif /* DEBUGGING */
7035
241d1a3b
NC
7036/*
7037 * Local variables:
7038 * c-indentation-style: bsd
7039 * c-basic-offset: 4
7040 * indent-tabs-mode: t
7041 * End:
7042 *
37442d52
RGS
7043 * ex: set ts=8 sts=4 sw=4 noet:
7044 */