This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Adding Module::Build::Version missed in upgrade.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
174#define TRIE_STCLASS
175#endif
2c2d71f5
JH
176/* Length of a variant. */
177
178typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
cb434fcc 195 I32 *last_closep;
653099ff 196 struct regnode_charclass_class *start_class;
2c2d71f5
JH
197} scan_data_t;
198
a687059c 199/*
e50aee73 200 * Forward declarations for pregcomp()'s friends.
a687059c 201 */
a0d0e21e 202
27da23d5
JH
203static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
205
206#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
207#define SF_BEFORE_SEOL 0x0001
208#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
209#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
09b7f37c
CB
212#ifdef NO_UNARY_PLUS
213# define SF_FIX_SHIFT_EOL (0+2)
214# define SF_FL_SHIFT_EOL (0+4)
215#else
216# define SF_FIX_SHIFT_EOL (+2)
217# define SF_FL_SHIFT_EOL (+4)
218#endif
c277df42
IZ
219
220#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
225#define SF_IS_INF 0x0040
226#define SF_HAS_PAR 0x0080
227#define SF_IN_PAR 0x0100
228#define SF_HAS_EVAL 0x0200
229#define SCF_DO_SUBSTR 0x0400
653099ff
GS
230#define SCF_DO_STCLASS_AND 0x0800
231#define SCF_DO_STCLASS_OR 0x1000
232#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 233#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 234
07be1b83
YO
235#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
07be1b83 393 * Position is 1 indexed.
fac92740
MJD
394 */
395
ccb2c380
MP
396#define Set_Node_Offset_To_R(node,byte) STMT_START { \
397 if (! SIZE_ONLY) { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 399 __LINE__, (node), (int)(byte))); \
ccb2c380 400 if((node) < 0) { \
551405c4 401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
402 } else { \
403 RExC_offsets[2*(node)-1] = (byte); \
404 } \
405 } \
406} STMT_END
407
408#define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
411
412#define Set_Node_Length_To_R(node,len) STMT_START { \
413 if (! SIZE_ONLY) { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 415 __LINE__, (int)(node), (int)(len))); \
ccb2c380 416 if((node) < 0) { \
551405c4 417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
418 } else { \
419 RExC_offsets[2*(node)] = (len); \
420 } \
421 } \
422} STMT_END
423
424#define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427#define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
429
430/* Get offsets and lengths */
431#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
433
07be1b83
YO
434#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
437} STMT_END
438
439
440#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441#define EXPERIMENTAL_INPLACESCAN
442#endif
443
acfe0abc 444static void clear_re(pTHX_ void *r);
4327152a 445
653099ff
GS
446/* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
449
4327152a 450STATIC void
097eb12c 451S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 452{
e1ec3a88
AL
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 455
c277df42 456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 457 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
b81d288d 461 data->flags
c277df42
IZ
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
463 else
464 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
465 }
466 else {
c277df42 467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
468 data->offset_float_max = (l
469 ? data->last_start_max
c277df42 470 : data->pos_min + data->pos_delta);
9051bda5
HS
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
c277df42 473 if (data->flags & SF_BEFORE_EOL)
b81d288d 474 data->flags
c277df42
IZ
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
476 else
477 data->flags &= ~SF_FL_BEFORE_EOL;
478 }
479 }
480 SvCUR_set(data->last_found, 0);
0eda9292 481 {
a28509cc 482 SV * const sv = data->last_found;
097eb12c
AL
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
485 if (mg)
486 mg->mg_len = 0;
487 }
0eda9292 488 }
c277df42
IZ
489 data->last_end = -1;
490 data->flags &= ~SF_BEFORE_EOL;
491}
492
653099ff
GS
493/* Can match anything (initialization) */
494STATIC void
097eb12c 495S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 496{
653099ff 497 ANYOF_CLASS_ZERO(cl);
f8bef550 498 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
500 if (LOC)
501 cl->flags |= ANYOF_LOCALE;
502}
503
504/* Can match anything (initialization) */
505STATIC int
5f66b61c 506S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
507{
508 int value;
509
aaa51d5e 510 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
512 return 1;
1aa99e6b
IH
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
514 return 0;
f8bef550
NC
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 return 0;
653099ff
GS
517 return 1;
518}
519
520/* Can match anything (initialization) */
521STATIC void
097eb12c 522S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
8ecf7187 524 Zero(cl, 1, struct regnode_charclass_class);
653099ff 525 cl->type = ANYOF;
830247a4 526 cl_anything(pRExC_state, cl);
653099ff
GS
527}
528
529STATIC void
097eb12c 530S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 531{
8ecf7187 532 Zero(cl, 1, struct regnode_charclass_class);
653099ff 533 cl->type = ANYOF;
830247a4 534 cl_anything(pRExC_state, cl);
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* 'And' a given class with another one. Can create false positives */
540/* We assume that cl is not inverted */
541STATIC void
5f66b61c 542S_cl_and(struct regnode_charclass_class *cl,
a28509cc 543 const struct regnode_charclass_class *and_with)
653099ff 544{
653099ff
GS
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
550 int i;
551
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
555 else
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
1aa99e6b 561
14ebb1a2
JH
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
567 }
14ebb1a2
JH
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 570 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 573 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
574}
575
576/* 'OR' a given class with another one. Can create false positives */
577/* We assume that cl is not inverted */
578STATIC void
097eb12c 579S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 580{
653099ff
GS
581 if (or_with->flags & ANYOF_INVERT) {
582 /* We do not use
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
590 */
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
594 int i;
595
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
599 else {
830247a4 600 cl_anything(pRExC_state, cl);
653099ff
GS
601 }
602 } else {
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 605 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
606 || (cl->flags & ANYOF_FOLD)) ) {
607 int i;
608
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
616 }
617 }
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 619 cl_anything(pRExC_state, cl);
653099ff
GS
620 }
621 }
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
624
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
633 }
653099ff
GS
634}
635
5d1c421c 636/*
a3621e74 637
3dab1dad 638 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 646 depth : indent depth
a3621e74
YO
647
648Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
649
650A trie is an N'ary tree where the branches are determined by digital
651decomposition of the key. IE, at the root node you look up the 1st character and
652follow that branch repeat until you find the end of the branches. Nodes can be
653marked as "accepting" meaning they represent a complete word. Eg:
654
655 /he|she|his|hers/
656
657would convert into the following structure. Numbers represent states, letters
658following numbers represent valid transitions on the letter from that state, if
659the number is in square brackets it represents an accepting state, otherwise it
660will be in parenthesis.
661
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
663 | |
664 | (2)
665 | |
666 (1) +-i->(6)-+-s->[7]
667 |
668 +-s->(3)-+-h->(4)-+-e->[5]
669
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
671
672This shows that when matching against the string 'hers' we will begin at state 1
673read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676single traverse. We store a mapping from accepting to state to which word was
677matched, and then when we have multiple possibilities we try to complete the
678rest of the regex in the order in which they occured in the alternation.
679
680The only prior NFA like behaviour that would be changed by the TRIE support is
681the silent ignoring of duplicate alternations which are of the form:
682
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
684
685Thus EVAL blocks follwing a trie may be called a different number of times with
686and without the optimisation. With the optimisations dupes will be silently
687ignored. This inconsistant behaviour of EVAL type nodes is well established as
688the following demonstrates:
689
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
691
692which prints out 'word' three times, but
693
694 'words'=~/(word|word|word)(?{ print $1 })S/
695
696which doesnt print it out at all. This is due to other optimisations kicking in.
697
698Example of what happens on a structural level:
699
700The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: BRANCH(8)
704 6: EXACT <ac>(16)
705 8: BRANCH(11)
706 9: EXACT <ad>(16)
707 11: BRANCH(14)
708 12: EXACT <ab>(16)
709 16: SUCCEED(0)
710 17: NOTHING(18)
711 18: END(0)
712
713This would be optimizable with startbranch=5, first=5, last=16, tail=16
714and should turn into:
715
716 1: CURLYM[1] {1,32767}(18)
717 5: TRIE(16)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
719 <ac>
720 <ad>
721 <ab>
722 16: SUCCEED(0)
723 17: NOTHING(18)
724 18: END(0)
725
726Cases where tail != last would be like /(?foo|bar)baz/:
727
728 1: BRANCH(4)
729 2: EXACT <foo>(8)
730 4: BRANCH(7)
731 5: EXACT <bar>(8)
732 7: TAIL(8)
733 8: EXACT <baz>(10)
734 10: END(0)
735
736which would be optimizable with startbranch=1, first=1, last=7, tail=8
737and would end up looking like:
738
739 1: TRIE(8)
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
741 <foo>
742 <bar>
743 7: TAIL(8)
744 8: EXACT <baz>(10)
745 10: END(0)
746
3dab1dad
YO
747 d = uvuni_to_utf8_flags(d, uv, 0);
748
749is the recommended Unicode-aware way of saying
750
751 *(d++) = uv;
a3621e74
YO
752*/
753
3dab1dad
YO
754#define TRIE_STORE_REVCHAR \
755 STMT_START { \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
758 } STMT_END
a3621e74
YO
759
760#define TRIE_READ_CHAR STMT_START { \
07be1b83 761 wordlen++; \
a3621e74
YO
762 if ( UTF ) { \
763 if ( folder ) { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
766 foldlen -= len; \
767 scan += len; \
768 len = 0; \
769 } else { \
e1ec3a88 770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
774 } \
775 } else { \
e1ec3a88 776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
777 } \
778 } else { \
779 uvc = (U32)*uc; \
780 len = 1; \
781 } \
782} STMT_END
783
784
785#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
789
790#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
795 } \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
799} STMT_END
800
801#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 802 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
806} STMT_END
807
07be1b83
YO
808#define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
811 if (trie->wordlen) \
812 trie->wordlen[ curword ] = wordlen; \
3dab1dad
YO
813 trie->states[ state ].wordnum = ++curword; \
814 DEBUG_r({ \
815 /* store the word for dumping */ \
816 SV* tmp; \
07be1b83
YO
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
3dab1dad 819 else \
07be1b83 820 tmp = newSVpvn( "", 0 ); \
3dab1dad
YO
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
823 }); \
824 } else { \
825 NOOP; /* It's a dupe. So ignore it. */ \
826 } \
827} STMT_END
828
829#ifdef DEBUGGING
07be1b83 830/*
3dab1dad
YO
831 dump_trie(trie)
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
834
835 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
839
3dab1dad
YO
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
07be1b83
YO
842
843*/
3dab1dad
YO
844
845/*
846 dump_trie(trie)
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
849*/
850
851STATIC void
852S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
853{
854 U32 state;
855 GET_RE_DEBUG_FLAGS_DECL;
856
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
860
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad
YO
863 if ( tmp ) {
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
873
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
904 } else {
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
906 }
907 }
908
909 PerlIO_printf( Perl_debug_log, "]");
910
911 }
912 PerlIO_printf( Perl_debug_log, "\n" );
913 }
914}
915/*
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
924{
925 U32 state;
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
931
932 for( state=1 ; state < next_alloc ; state ++ ) {
933 U16 charid;
934
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
939 } else {
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
942 );
943 }
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
3dab1dad
YO
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
950 );
951 }
952
953 }
954}
955
956/*
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
962*/
963STATIC void
964S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
965{
966 U32 state;
967 U16 charid;
968 GET_RE_DEBUG_FLAGS_DECL;
969
970 /*
971 print out the table precompression so that we can do a visual check
972 that they are identical.
973 */
974
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
976
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad
YO
979 if ( tmp ) {
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1001 }
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1004 } else {
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1007 }
1008 }
07be1b83 1009}
3dab1dad
YO
1010
1011#endif
1012
07be1b83
YO
1013#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1020 )
3dab1dad 1021
07be1b83
YO
1022STATIC void
1023S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1024{
1025/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1026
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1029 ISBN 0-201-10088-6
1030
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1036 Consider
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1041 */
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1044 U32 *q;
be8e71aa
YO
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
07be1b83
YO
1048 U32 q_read = 0;
1049 U32 q_write = 0;
1050 U32 charid;
1051 U32 base = trie->states[ 1 ].trans.base;
07be1b83
YO
1052 U32 *fail;
1053 reg_ac_data *aho;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
72f13be8 1055 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
1056#ifndef DEBUGGING
1057 PERL_UNUSED_ARG(depth);
1058#endif
72f13be8 1059
07be1b83
YO
1060
1061 ARG_SET( stclass, data_slot );
1062 Newxz( aho, 1, reg_ac_data );
1063 RExC_rx->data->data[ data_slot ] = (void*)aho;
1064 aho->trie=trie;
1065 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1066 (trie->laststate+1)*sizeof(reg_trie_state));
1067 Newxz( q, numstates, U32);
1068 Newxz( aho->fail, numstates, U32 );
283d8f99
YO
1069 aho->refcount = 1;
1070 fail = aho->fail;
07be1b83
YO
1071 fail[ 0 ] = fail[ 1 ] = 1;
1072
1073 for ( charid = 0; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1074 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1075 if ( newstate ) {
07be1b83
YO
1076 q[ q_write ] = newstate;
1077 /* set to point at the root */
1078 fail[ q[ q_write++ ] ]=1;
1079 }
1080 }
1081 while ( q_read < q_write) {
be8e71aa 1082 const U32 cur = q[ q_read++ % numstates ];
07be1b83
YO
1083 base = trie->states[ cur ].trans.base;
1084
1085 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1086 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1087 if (ch_state) {
07be1b83
YO
1088 U32 fail_state = cur;
1089 U32 fail_base;
1090 do {
1091 fail_state = fail[ fail_state ];
1092 fail_base = aho->states[ fail_state ].trans.base;
1093 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1094
1095 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1096 fail[ ch_state ] = fail_state;
1097 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1098 {
1099 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1100 }
1101 q[ q_write++ % numstates] = ch_state;
1102 }
1103 }
1104 }
1105
1106 DEBUG_TRIE_COMPILE_MORE_r({
1107 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1108 for( q_read=2; q_read<numstates; q_read++ ) {
1109 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1110 }
1111 PerlIO_printf(Perl_debug_log, "\n");
1112 });
1113 Safefree(q);
1114 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1115}
3dab1dad
YO
1116
1117
1118
a3621e74 1119STATIC I32
3dab1dad 1120S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1121{
27da23d5 1122 dVAR;
a3621e74
YO
1123 /* first pass, loop through and scan words */
1124 reg_trie_data *trie;
1125 regnode *cur;
9f7f3913 1126 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1127 STRLEN len = 0;
1128 UV uvc = 0;
1129 U16 curword = 0;
1130 U32 next_alloc = 0;
1131 /* we just use folder as a flag in utf8 */
e1ec3a88 1132 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1133 ? PL_fold
1134 : ( flags == EXACTFL
1135 ? PL_fold_locale
1136 : NULL
1137 )
1138 );
1139
e1ec3a88 1140 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1141 SV *re_trie_maxbuff;
3dab1dad
YO
1142#ifndef DEBUGGING
1143 /* these are only used during construction but are useful during
8e11feef
RGS
1144 * debugging so we store them in the struct when debugging.
1145 * Wordcount is actually superfluous in debugging as we have
1146 * (AV*)trie->words to use for it, but that's not available when
1147 * not debugging... We could make the macro use the AV during
1148 * debugging though...
1149 */
3dab1dad
YO
1150 U16 trie_wordcount=0;
1151 STRLEN trie_charcount=0;
07be1b83 1152 /*U32 trie_laststate=0;*/
3dab1dad
YO
1153 AV *trie_revcharmap;
1154#endif
a3621e74 1155 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1156#ifndef DEBUGGING
1157 PERL_UNUSED_ARG(depth);
1158#endif
a3621e74 1159
a02a5408 1160 Newxz( trie, 1, reg_trie_data );
a3621e74 1161 trie->refcount = 1;
3dab1dad 1162 trie->startstate = 1;
a3621e74 1163 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1164 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1165 if (!(UTF && folder))
1166 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1167 DEBUG_r({
1168 trie->words = newAV();
a3621e74 1169 });
3dab1dad 1170 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1171
0111c4fd 1172 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1173 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1174 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1175 }
3dab1dad
YO
1176 DEBUG_OPTIMISE_r({
1177 PerlIO_printf( Perl_debug_log,
1178 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1179 (int)depth * 2 + 2, "",
1180 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1181 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1182 });
a3621e74
YO
1183 /* -- First loop and Setup --
1184
1185 We first traverse the branches and scan each word to determine if it
1186 contains widechars, and how many unique chars there are, this is
1187 important as we have to build a table with at least as many columns as we
1188 have unique chars.
1189
1190 We use an array of integers to represent the character codes 0..255
1191 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1192 native representation of the character value as the key and IV's for the
1193 coded index.
1194
1195 *TODO* If we keep track of how many times each character is used we can
1196 remap the columns so that the table compression later on is more
1197 efficient in terms of memory by ensuring most common value is in the
1198 middle and the least common are on the outside. IMO this would be better
1199 than a most to least common mapping as theres a decent chance the most
1200 common letter will share a node with the least common, meaning the node
1201 will not be compressable. With a middle is most common approach the worst
1202 case is when we have the least common nodes twice.
1203
1204 */
1205
a3621e74 1206 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1207 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1208 const U8 *uc = (U8*)STRING( noper );
a28509cc 1209 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1210 STRLEN foldlen = 0;
1211 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1212 const U8 *scan = (U8*)NULL;
07be1b83 1213 U32 wordlen = 0; /* required init */
3dab1dad 1214 STRLEN chars=0;
a3621e74 1215
3dab1dad
YO
1216 TRIE_WORDCOUNT(trie)++;
1217 if (OP(noper) == NOTHING) {
1218 trie->minlen= 0;
1219 continue;
1220 }
1221 if (trie->bitmap) {
1222 TRIE_BITMAP_SET(trie,*uc);
1223 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1224 }
a3621e74 1225 for ( ; uc < e ; uc += len ) {
3dab1dad 1226 TRIE_CHARCOUNT(trie)++;
a3621e74 1227 TRIE_READ_CHAR;
3dab1dad 1228 chars++;
a3621e74
YO
1229 if ( uvc < 256 ) {
1230 if ( !trie->charmap[ uvc ] ) {
1231 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1232 if ( folder )
1233 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1234 TRIE_STORE_REVCHAR;
a3621e74
YO
1235 }
1236 } else {
1237 SV** svpp;
1238 if ( !trie->widecharmap )
1239 trie->widecharmap = newHV();
1240
1241 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1242
1243 if ( !svpp )
e4584336 1244 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1245
1246 if ( !SvTRUE( *svpp ) ) {
1247 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1248 TRIE_STORE_REVCHAR;
a3621e74
YO
1249 }
1250 }
1251 }
3dab1dad
YO
1252 if( cur == first ) {
1253 trie->minlen=chars;
1254 trie->maxlen=chars;
1255 } else if (chars < trie->minlen) {
1256 trie->minlen=chars;
1257 } else if (chars > trie->maxlen) {
1258 trie->maxlen=chars;
1259 }
1260
a3621e74
YO
1261 } /* end first pass */
1262 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1263 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1264 (int)depth * 2 + 2,"",
1265 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
be8e71aa
YO
1266 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1267 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1268 );
07be1b83 1269 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
a3621e74
YO
1270
1271 /*
1272 We now know what we are dealing with in terms of unique chars and
1273 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1274 representation using a flat table will take. If it's over a reasonable
1275 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1276 conservative but potentially much slower representation using an array
1277 of lists.
1278
1279 At the end we convert both representations into the same compressed
1280 form that will be used in regexec.c for matching with. The latter
1281 is a form that cannot be used to construct with but has memory
1282 properties similar to the list form and access properties similar
1283 to the table form making it both suitable for fast searches and
1284 small enough that its feasable to store for the duration of a program.
1285
1286 See the comment in the code where the compressed table is produced
1287 inplace from the flat tabe representation for an explanation of how
1288 the compression works.
1289
1290 */
1291
1292
3dab1dad 1293 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1294 /*
1295 Second Pass -- Array Of Lists Representation
1296
1297 Each state will be represented by a list of charid:state records
1298 (reg_trie_trans_le) the first such element holds the CUR and LEN
1299 points of the allocated array. (See defines above).
1300
1301 We build the initial structure using the lists, and then convert
1302 it into the compressed table form which allows faster lookups
1303 (but cant be modified once converted).
a3621e74
YO
1304 */
1305
a3621e74
YO
1306 STRLEN transcount = 1;
1307
3dab1dad 1308 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1309 TRIE_LIST_NEW(1);
1310 next_alloc = 2;
1311
1312 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1313
c445ea15
AL
1314 regnode * const noper = NEXTOPER( cur );
1315 U8 *uc = (U8*)STRING( noper );
1316 const U8 * const e = uc + STR_LEN( noper );
1317 U32 state = 1; /* required init */
1318 U16 charid = 0; /* sanity init */
1319 U8 *scan = (U8*)NULL; /* sanity init */
1320 STRLEN foldlen = 0; /* required init */
07be1b83 1321 U32 wordlen = 0; /* required init */
c445ea15
AL
1322 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1323
3dab1dad 1324 if (OP(noper) != NOTHING) {
c445ea15
AL
1325 for ( ; uc < e ; uc += len ) {
1326
1327 TRIE_READ_CHAR;
1328
1329 if ( uvc < 256 ) {
1330 charid = trie->charmap[ uvc ];
1331 } else {
1332 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1333 if ( !svpp ) {
1334 charid = 0;
1335 } else {
1336 charid=(U16)SvIV( *svpp );
1337 }
1338 }
1339 if ( charid ) {
a3621e74 1340
c445ea15
AL
1341 U16 check;
1342 U32 newstate = 0;
a3621e74 1343
c445ea15
AL
1344 charid--;
1345 if ( !trie->states[ state ].trans.list ) {
1346 TRIE_LIST_NEW( state );
1347 }
1348 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1349 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1350 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1351 break;
1352 }
1353 }
1354 if ( ! newstate ) {
1355 newstate = next_alloc++;
1356 TRIE_LIST_PUSH( state, charid, newstate );
1357 transcount++;
1358 }
1359 state = newstate;
1360 } else {
1361 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1362 }
c445ea15
AL
1363 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1364 }
c445ea15 1365 }
3dab1dad 1366 TRIE_HANDLE_WORD(state);
a3621e74
YO
1367
1368 } /* end second pass */
1369
3dab1dad 1370 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1371 Renew( trie->states, next_alloc, reg_trie_state );
1372
3dab1dad
YO
1373 /* and now dump it out before we compress it */
1374 DEBUG_TRIE_COMPILE_MORE_r(
1375 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1376 );
a3621e74 1377
a02a5408 1378 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1379 {
1380 U32 state;
a3621e74
YO
1381 U32 tp = 0;
1382 U32 zp = 0;
1383
1384
1385 for( state=1 ; state < next_alloc ; state ++ ) {
1386 U32 base=0;
1387
1388 /*
1389 DEBUG_TRIE_COMPILE_MORE_r(
1390 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1391 );
1392 */
1393
1394 if (trie->states[state].trans.list) {
1395 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1396 U16 maxid=minid;
a28509cc 1397 U16 idx;
a3621e74
YO
1398
1399 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1400 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1401 if ( forid < minid ) {
1402 minid=forid;
1403 } else if ( forid > maxid ) {
1404 maxid=forid;
1405 }
a3621e74
YO
1406 }
1407 if ( transcount < tp + maxid - minid + 1) {
1408 transcount *= 2;
1409 Renew( trie->trans, transcount, reg_trie_trans );
1410 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1411 }
1412 base = trie->uniquecharcount + tp - minid;
1413 if ( maxid == minid ) {
1414 U32 set = 0;
1415 for ( ; zp < tp ; zp++ ) {
1416 if ( ! trie->trans[ zp ].next ) {
1417 base = trie->uniquecharcount + zp - minid;
1418 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ zp ].check = state;
1420 set = 1;
1421 break;
1422 }
1423 }
1424 if ( !set ) {
1425 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1426 trie->trans[ tp ].check = state;
1427 tp++;
1428 zp = tp;
1429 }
1430 } else {
1431 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1432 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1433 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1434 trie->trans[ tid ].check = state;
1435 }
1436 tp += ( maxid - minid + 1 );
1437 }
1438 Safefree(trie->states[ state ].trans.list);
1439 }
1440 /*
1441 DEBUG_TRIE_COMPILE_MORE_r(
1442 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1443 );
1444 */
1445 trie->states[ state ].trans.base=base;
1446 }
cc601c31 1447 trie->lasttrans = tp + 1;
a3621e74
YO
1448 }
1449 } else {
1450 /*
1451 Second Pass -- Flat Table Representation.
1452
1453 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1454 We know that we will need Charcount+1 trans at most to store the data
1455 (one row per char at worst case) So we preallocate both structures
1456 assuming worst case.
1457
1458 We then construct the trie using only the .next slots of the entry
1459 structs.
1460
1461 We use the .check field of the first entry of the node temporarily to
1462 make compression both faster and easier by keeping track of how many non
1463 zero fields are in the node.
1464
1465 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1466 transition.
1467
1468 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1469 number representing the first entry of the node, and state as a
1470 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1471 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1472 are 2 entrys per node. eg:
1473
1474 A B A B
1475 1. 2 4 1. 3 7
1476 2. 0 3 3. 0 5
1477 3. 0 0 5. 0 0
1478 4. 0 0 7. 0 0
1479
1480 The table is internally in the right hand, idx form. However as we also
1481 have to deal with the states array which is indexed by nodenum we have to
1482 use TRIE_NODENUM() to convert.
1483
1484 */
1485
3dab1dad
YO
1486
1487 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1488 reg_trie_trans );
3dab1dad 1489 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1490 next_alloc = trie->uniquecharcount + 1;
1491
3dab1dad 1492
a3621e74
YO
1493 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1494
c445ea15 1495 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1496 const U8 *uc = (U8*)STRING( noper );
1497 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1498
1499 U32 state = 1; /* required init */
1500
1501 U16 charid = 0; /* sanity init */
1502 U32 accept_state = 0; /* sanity init */
1503 U8 *scan = (U8*)NULL; /* sanity init */
1504
1505 STRLEN foldlen = 0; /* required init */
07be1b83 1506 U32 wordlen = 0; /* required init */
a3621e74
YO
1507 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1508
3dab1dad 1509 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1510 for ( ; uc < e ; uc += len ) {
1511
1512 TRIE_READ_CHAR;
1513
1514 if ( uvc < 256 ) {
1515 charid = trie->charmap[ uvc ];
1516 } else {
c445ea15
AL
1517 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1518 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1519 }
1520 if ( charid ) {
1521 charid--;
1522 if ( !trie->trans[ state + charid ].next ) {
1523 trie->trans[ state + charid ].next = next_alloc;
1524 trie->trans[ state ].check++;
1525 next_alloc += trie->uniquecharcount;
1526 }
1527 state = trie->trans[ state + charid ].next;
1528 } else {
e4584336 1529 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1530 }
1531 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1532 }
a3621e74 1533 }
3dab1dad
YO
1534 accept_state = TRIE_NODENUM( state );
1535 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1536
1537 } /* end second pass */
1538
3dab1dad
YO
1539 /* and now dump it out before we compress it */
1540 DEBUG_TRIE_COMPILE_MORE_r(
1541 dump_trie_interim_table(trie,next_alloc,depth+1)
1542 );
a3621e74 1543
a3621e74
YO
1544 {
1545 /*
1546 * Inplace compress the table.*
1547
1548 For sparse data sets the table constructed by the trie algorithm will
1549 be mostly 0/FAIL transitions or to put it another way mostly empty.
1550 (Note that leaf nodes will not contain any transitions.)
1551
1552 This algorithm compresses the tables by eliminating most such
1553 transitions, at the cost of a modest bit of extra work during lookup:
1554
1555 - Each states[] entry contains a .base field which indicates the
1556 index in the state[] array wheres its transition data is stored.
1557
1558 - If .base is 0 there are no valid transitions from that node.
1559
1560 - If .base is nonzero then charid is added to it to find an entry in
1561 the trans array.
1562
1563 -If trans[states[state].base+charid].check!=state then the
1564 transition is taken to be a 0/Fail transition. Thus if there are fail
1565 transitions at the front of the node then the .base offset will point
1566 somewhere inside the previous nodes data (or maybe even into a node
1567 even earlier), but the .check field determines if the transition is
1568 valid.
1569
1570 The following process inplace converts the table to the compressed
1571 table: We first do not compress the root node 1,and mark its all its
1572 .check pointers as 1 and set its .base pointer as 1 as well. This
1573 allows to do a DFA construction from the compressed table later, and
1574 ensures that any .base pointers we calculate later are greater than
1575 0.
1576
1577 - We set 'pos' to indicate the first entry of the second node.
1578
1579 - We then iterate over the columns of the node, finding the first and
1580 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1581 and set the .check pointers accordingly, and advance pos
1582 appropriately and repreat for the next node. Note that when we copy
1583 the next pointers we have to convert them from the original
1584 NODEIDX form to NODENUM form as the former is not valid post
1585 compression.
1586
1587 - If a node has no transitions used we mark its base as 0 and do not
1588 advance the pos pointer.
1589
1590 - If a node only has one transition we use a second pointer into the
1591 structure to fill in allocated fail transitions from other states.
1592 This pointer is independent of the main pointer and scans forward
1593 looking for null transitions that are allocated to a state. When it
1594 finds one it writes the single transition into the "hole". If the
1595 pointer doesnt find one the single transition is appeneded as normal.
1596
1597 - Once compressed we can Renew/realloc the structures to release the
1598 excess space.
1599
1600 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1601 specifically Fig 3.47 and the associated pseudocode.
1602
1603 demq
1604 */
a3b680e6 1605 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1606 U32 state, charid;
a3621e74 1607 U32 pos = 0, zp=0;
3dab1dad 1608 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1609
1610 for ( state = 1 ; state < laststate ; state++ ) {
1611 U8 flag = 0;
a28509cc
AL
1612 const U32 stateidx = TRIE_NODEIDX( state );
1613 const U32 o_used = trie->trans[ stateidx ].check;
1614 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1615 trie->trans[ stateidx ].check = 0;
1616
1617 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1618 if ( flag || trie->trans[ stateidx + charid ].next ) {
1619 if ( trie->trans[ stateidx + charid ].next ) {
1620 if (o_used == 1) {
1621 for ( ; zp < pos ; zp++ ) {
1622 if ( ! trie->trans[ zp ].next ) {
1623 break;
1624 }
1625 }
1626 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1627 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1628 trie->trans[ zp ].check = state;
1629 if ( ++zp > pos ) pos = zp;
1630 break;
1631 }
1632 used--;
1633 }
1634 if ( !flag ) {
1635 flag = 1;
1636 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1637 }
1638 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1639 trie->trans[ pos ].check = state;
1640 pos++;
1641 }
1642 }
1643 }
cc601c31 1644 trie->lasttrans = pos + 1;
a3621e74
YO
1645 Renew( trie->states, laststate + 1, reg_trie_state);
1646 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1647 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1648 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1649 (int)depth * 2 + 2,"",
1650 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1651 (IV)next_alloc,
1652 (IV)pos,
a3621e74
YO
1653 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1654 );
1655
1656 } /* end table compress */
1657 }
cc601c31
YO
1658 /* resize the trans array to remove unused space */
1659 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1660
3dab1dad
YO
1661 /* and now dump out the compressed format */
1662 DEBUG_TRIE_COMPILE_r(
1663 dump_trie(trie,depth+1)
1664 );
07be1b83 1665
3dab1dad
YO
1666 { /* Modify the program and insert the new TRIE node*/
1667 regnode *convert;
1668 U8 nodetype =(U8)(flags & 0xFF);
1669 char *str=NULL;
07be1b83
YO
1670#ifdef DEBUGGING
1671 U32 mjd_offset;
1672 U32 mjd_nodelen;
1673#endif
a3621e74 1674 /*
3dab1dad
YO
1675 This means we convert either the first branch or the first Exact,
1676 depending on whether the thing following (in 'last') is a branch
1677 or not and whther first is the startbranch (ie is it a sub part of
1678 the alternation or is it the whole thing.)
1679 Assuming its a sub part we conver the EXACT otherwise we convert
1680 the whole branch sequence, including the first.
a3621e74 1681 */
3dab1dad
YO
1682 /* Find the node we are going to overwrite */
1683 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1684 /* whole branch chain */
3dab1dad 1685 convert = first;
07be1b83
YO
1686 DEBUG_r({
1687 const regnode *nop = NEXTOPER( convert );
1688 mjd_offset= Node_Offset((nop));
1689 mjd_nodelen= Node_Length((nop));
1690 });
1691 } else {
1692 /* branch sub-chain */
3dab1dad
YO
1693 convert = NEXTOPER( first );
1694 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1695 DEBUG_r({
1696 mjd_offset= Node_Offset((convert));
1697 mjd_nodelen= Node_Length((convert));
1698 });
1699 }
1700 DEBUG_OPTIMISE_r(
1701 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1702 (int)depth * 2 + 2, "",
1703 mjd_offset,mjd_nodelen)
1704 );
a3621e74 1705
3dab1dad
YO
1706 /* But first we check to see if there is a common prefix we can
1707 split out as an EXACT and put in front of the TRIE node. */
1708 trie->startstate= 1;
1709 if ( trie->bitmap && !trie->widecharmap ) {
1710 U32 state;
1711 DEBUG_OPTIMISE_r(
8e11feef
RGS
1712 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1713 (int)depth * 2 + 2, "",
1714 TRIE_LASTSTATE(trie))
1715 );
1716 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1717 U32 ofs = 0;
8e11feef
RGS
1718 I32 idx = -1;
1719 U32 count = 0;
1720 const U32 base = trie->states[ state ].trans.base;
a3621e74 1721
3dab1dad 1722 if ( trie->states[state].wordnum )
8e11feef 1723 count = 1;
a3621e74 1724
8e11feef 1725 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1726 if ( ( base + ofs >= trie->uniquecharcount ) &&
1727 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1728 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1729 {
3dab1dad 1730 if ( ++count > 1 ) {
8e11feef 1731 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1732 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1733 if ( state == 1 ) break;
3dab1dad
YO
1734 if ( count == 2 ) {
1735 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1736 DEBUG_OPTIMISE_r(
8e11feef
RGS
1737 PerlIO_printf(Perl_debug_log,
1738 "%*sNew Start State=%"UVuf" Class: [",
1739 (int)depth * 2 + 2, "",
3dab1dad 1740 state));
be8e71aa
YO
1741 if (idx >= 0) {
1742 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1743 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1744
3dab1dad 1745 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1746 if ( folder )
1747 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1748 DEBUG_OPTIMISE_r(
07be1b83 1749 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1750 );
8e11feef
RGS
1751 }
1752 }
1753 TRIE_BITMAP_SET(trie,*ch);
1754 if ( folder )
1755 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1756 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1757 }
1758 idx = ofs;
1759 }
3dab1dad
YO
1760 }
1761 if ( count == 1 ) {
1762 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1763 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1764 DEBUG_OPTIMISE_r(
8e11feef
RGS
1765 PerlIO_printf( Perl_debug_log,
1766 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1767 (int)depth * 2 + 2, "",
1768 state, idx, ch)
3dab1dad
YO
1769 );
1770 if ( state==1 ) {
1771 OP( convert ) = nodetype;
1772 str=STRING(convert);
1773 STR_LEN(convert)=0;
1774 }
1775 *str++=*ch;
1776 STR_LEN(convert)++;
a3621e74 1777
8e11feef 1778 } else {
f9049ba1 1779#ifdef DEBUGGING
8e11feef
RGS
1780 if (state>1)
1781 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1782#endif
8e11feef
RGS
1783 break;
1784 }
1785 }
3dab1dad 1786 if (str) {
8e11feef 1787 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1788 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1789 trie->startstate = state;
07be1b83
YO
1790 trie->minlen -= (state - 1);
1791 trie->maxlen -= (state - 1);
1792 DEBUG_r({
1793 regnode *fix = convert;
1794 mjd_nodelen++;
1795 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1796 while( ++fix < n ) {
1797 Set_Node_Offset_Length(fix, 0, 0);
1798 }
1799 });
8e11feef
RGS
1800 if (trie->maxlen) {
1801 convert = n;
1802 } else {
3dab1dad
YO
1803 NEXT_OFF(convert) = (U16)(tail - convert);
1804 }
1805 }
1806 }
1807 if ( trie->maxlen ) {
1808 OP( convert ) = TRIE;
8e11feef
RGS
1809 NEXT_OFF( convert ) = (U16)(tail - convert);
1810 ARG_SET( convert, data_slot );
a3621e74 1811
3dab1dad
YO
1812 /* store the type in the flags */
1813 convert->flags = nodetype;
1814 /* XXX We really should free up the resource in trie now, as we wont use them */
1815 }
a3621e74
YO
1816 /* needed for dumping*/
1817 DEBUG_r({
1818 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
07be1b83
YO
1819 regnode *opt = convert;
1820 while (++opt<optimize) {
1821 Set_Node_Offset_Length(opt,0,0);
1822 }
a3621e74
YO
1823 /* We now need to mark all of the space originally used by the
1824 branches as optimized away. This keeps the dumpuntil from
1825 throwing a wobbly as it doesnt use regnext() to traverse the
1826 opcodes.
07be1b83 1827 We also "fix" the offsets
a3621e74
YO
1828 */
1829 while( optimize < last ) {
07be1b83 1830 mjd_nodelen += Node_Length((optimize));
a3621e74 1831 OP( optimize ) = OPTIMIZED;
07be1b83 1832 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1833 optimize++;
1834 }
07be1b83 1835 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1836 });
1837 } /* end node insert */
07be1b83 1838#ifndef DEBUGGING
6e8b4190 1839 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1840#endif
a3621e74
YO
1841 return 1;
1842}
1843
a3621e74 1844/*
5d1c421c
JH
1845 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1846 * These need to be revisited when a newer toolchain becomes available.
1847 */
1848#if defined(__sparc64__) && defined(__GNUC__)
1849# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1850# undef SPARC64_GCC_WORKAROUND
1851# define SPARC64_GCC_WORKAROUND 1
1852# endif
1853#endif
1854
07be1b83
YO
1855#define DEBUG_PEEP(str,scan,depth) \
1856 DEBUG_OPTIMISE_r({ \
1857 SV * const mysv=sv_newmortal(); \
1858 regnode *Next = regnext(scan); \
1859 regprop(RExC_rx, mysv, scan); \
1860 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1861 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1862 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1863 });
1864
1865#define JOIN_EXACT(scan,min,flags) \
1866 if (PL_regkind[OP(scan)] == EXACT) \
1867 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1868
be8e71aa 1869STATIC U32
07be1b83
YO
1870S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1871 /* Merge several consecutive EXACTish nodes into one. */
1872 regnode *n = regnext(scan);
1873 U32 stringok = 1;
1874 regnode *next = scan + NODE_SZ_STR(scan);
1875 U32 merged = 0;
1876 U32 stopnow = 0;
1877#ifdef DEBUGGING
1878 regnode *stop = scan;
72f13be8 1879 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 1880#else
d47053eb
RGS
1881 PERL_UNUSED_ARG(depth);
1882#endif
1883#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
1884 PERL_UNUSED_ARG(flags);
1885 PERL_UNUSED_ARG(val);
07be1b83 1886#endif
07be1b83
YO
1887 DEBUG_PEEP("join",scan,depth);
1888
1889 /* Skip NOTHING, merge EXACT*. */
1890 while (n &&
1891 ( PL_regkind[OP(n)] == NOTHING ||
1892 (stringok && (OP(n) == OP(scan))))
1893 && NEXT_OFF(n)
1894 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1895
1896 if (OP(n) == TAIL || n > next)
1897 stringok = 0;
1898 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
1899 DEBUG_PEEP("skip:",n,depth);
1900 NEXT_OFF(scan) += NEXT_OFF(n);
1901 next = n + NODE_STEP_REGNODE;
1902#ifdef DEBUGGING
1903 if (stringok)
1904 stop = n;
1905#endif
1906 n = regnext(n);
1907 }
1908 else if (stringok) {
1909 const int oldl = STR_LEN(scan);
1910 regnode * const nnext = regnext(n);
1911
1912 DEBUG_PEEP("merg",n,depth);
1913
1914 merged++;
1915 if (oldl + STR_LEN(n) > U8_MAX)
1916 break;
1917 NEXT_OFF(scan) += NEXT_OFF(n);
1918 STR_LEN(scan) += STR_LEN(n);
1919 next = n + NODE_SZ_STR(n);
1920 /* Now we can overwrite *n : */
1921 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1922#ifdef DEBUGGING
1923 stop = next - 1;
1924#endif
1925 n = nnext;
1926 if (stopnow) break;
1927 }
1928
d47053eb
RGS
1929#ifdef EXPERIMENTAL_INPLACESCAN
1930 if (flags && !NEXT_OFF(n)) {
1931 DEBUG_PEEP("atch", val, depth);
1932 if (reg_off_by_arg[OP(n)]) {
1933 ARG_SET(n, val - n);
1934 }
1935 else {
1936 NEXT_OFF(n) = val - n;
1937 }
1938 stopnow = 1;
1939 }
07be1b83
YO
1940#endif
1941 }
1942
1943 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1944 /*
1945 Two problematic code points in Unicode casefolding of EXACT nodes:
1946
1947 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1948 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1949
1950 which casefold to
1951
1952 Unicode UTF-8
1953
1954 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1955 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1956
1957 This means that in case-insensitive matching (or "loose matching",
1958 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1959 length of the above casefolded versions) can match a target string
1960 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1961 This would rather mess up the minimum length computation.
1962
1963 What we'll do is to look for the tail four bytes, and then peek
1964 at the preceding two bytes to see whether we need to decrease
1965 the minimum length by four (six minus two).
1966
1967 Thanks to the design of UTF-8, there cannot be false matches:
1968 A sequence of valid UTF-8 bytes cannot be a subsequence of
1969 another valid sequence of UTF-8 bytes.
1970
1971 */
1972 char * const s0 = STRING(scan), *s, *t;
1973 char * const s1 = s0 + STR_LEN(scan) - 1;
1974 char * const s2 = s1 - 4;
1975 const char t0[] = "\xcc\x88\xcc\x81";
1976 const char * const t1 = t0 + 3;
1977
1978 for (s = s0 + 2;
1979 s < s2 && (t = ninstr(s, s1, t0, t1));
1980 s = t + 4) {
1981 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1982 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1983 *min -= 4;
1984 }
1985 }
1986
1987#ifdef DEBUGGING
1988 /* Allow dumping */
1989 n = scan + NODE_SZ_STR(scan);
1990 while (n <= stop) {
1991 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1992 OP(n) = OPTIMIZED;
1993 NEXT_OFF(n) = 0;
1994 }
1995 n++;
1996 }
1997#endif
1998 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1999 return stopnow;
2000}
2001
653099ff
GS
2002/* REx optimizer. Converts nodes into quickier variants "in place".
2003 Finds fixed substrings. */
2004
a0288114 2005/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2006 to the position after last scanned or to NULL. */
2007
07be1b83
YO
2008
2009
76e3520e 2010STATIC I32
9a957fbc
AL
2011S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2012 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2013 /* scanp: Start here (read-write). */
2014 /* deltap: Write maxlen-minlen here. */
2015 /* last: Stop before this one. */
2016{
97aff369 2017 dVAR;
c277df42
IZ
2018 I32 min = 0, pars = 0, code;
2019 regnode *scan = *scanp, *next;
2020 I32 delta = 0;
2021 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2022 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2023 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2024 scan_data_t data_fake;
653099ff 2025 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
2026 SV *re_trie_maxbuff = NULL;
2027
2028 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2029#ifdef DEBUGGING
2030 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2031#endif
b81d288d 2032
c277df42
IZ
2033 while (scan && OP(scan) != END && scan < last) {
2034 /* Peephole optimizer: */
07be1b83 2035 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2036
07be1b83 2037 JOIN_EXACT(scan,&min,0);
a3621e74 2038
653099ff
GS
2039 /* Follow the next-chain of the current node and optimize
2040 away all the NOTHINGs from it. */
c277df42 2041 if (OP(scan) != CURLYX) {
a3b680e6 2042 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2043 ? I32_MAX
2044 /* I32 may be smaller than U16 on CRAYs! */
2045 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2046 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2047 int noff;
2048 regnode *n = scan;
b81d288d 2049
c277df42
IZ
2050 /* Skip NOTHING and LONGJMP. */
2051 while ((n = regnext(n))
3dab1dad 2052 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2053 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2054 && off + noff < max)
2055 off += noff;
2056 if (reg_off_by_arg[OP(scan)])
2057 ARG(scan) = off;
b81d288d 2058 else
c277df42
IZ
2059 NEXT_OFF(scan) = off;
2060 }
a3621e74 2061
07be1b83 2062
3dab1dad 2063
653099ff
GS
2064 /* The principal pseudo-switch. Cannot be a switch, since we
2065 look into several different things. */
b81d288d 2066 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2067 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2068 next = regnext(scan);
2069 code = OP(scan);
a3621e74 2070 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2071
2072 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 2073 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2074 struct regnode_charclass_class accum;
d4c19fe8 2075 regnode * const startbranch=scan;
c277df42 2076
653099ff 2077 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2078 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2079 if (flags & SCF_DO_STCLASS)
830247a4 2080 cl_init_zero(pRExC_state, &accum);
a3621e74 2081
c277df42 2082 while (OP(scan) == code) {
830247a4 2083 I32 deltanext, minnext, f = 0, fake;
653099ff 2084 struct regnode_charclass_class this_class;
c277df42
IZ
2085
2086 num++;
2087 data_fake.flags = 0;
b81d288d 2088 if (data) {
2c2d71f5 2089 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2090 data_fake.last_closep = data->last_closep;
2091 }
2092 else
2093 data_fake.last_closep = &fake;
c277df42
IZ
2094 next = regnext(scan);
2095 scan = NEXTOPER(scan);
2096 if (code != BRANCH)
2097 scan = NEXTOPER(scan);
653099ff 2098 if (flags & SCF_DO_STCLASS) {
830247a4 2099 cl_init(pRExC_state, &this_class);
653099ff
GS
2100 data_fake.start_class = &this_class;
2101 f = SCF_DO_STCLASS_AND;
b81d288d 2102 }
e1901655
IZ
2103 if (flags & SCF_WHILEM_VISITED_POS)
2104 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2105
653099ff 2106 /* we suppose the run is continuous, last=next...*/
830247a4 2107 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2108 next, &data_fake, f,depth+1);
b81d288d 2109 if (min1 > minnext)
c277df42
IZ
2110 min1 = minnext;
2111 if (max1 < minnext + deltanext)
2112 max1 = minnext + deltanext;
2113 if (deltanext == I32_MAX)
aca2d497 2114 is_inf = is_inf_internal = 1;
c277df42
IZ
2115 scan = next;
2116 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2117 pars++;
3dab1dad
YO
2118 if (data) {
2119 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2120 data->flags |= SF_HAS_EVAL;
2c2d71f5 2121 data->whilem_c = data_fake.whilem_c;
3dab1dad 2122 }
653099ff 2123 if (flags & SCF_DO_STCLASS)
830247a4 2124 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2125 if (code == SUSPEND)
c277df42
IZ
2126 break;
2127 }
2128 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2129 min1 = 0;
2130 if (flags & SCF_DO_SUBSTR) {
2131 data->pos_min += min1;
2132 data->pos_delta += max1 - min1;
2133 if (max1 != min1 || is_inf)
2134 data->longest = &(data->longest_float);
2135 }
2136 min += min1;
2137 delta += max1 - min1;
653099ff 2138 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2139 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2140 if (min1) {
2141 cl_and(data->start_class, &and_with);
2142 flags &= ~SCF_DO_STCLASS;
2143 }
2144 }
2145 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2146 if (min1) {
2147 cl_and(data->start_class, &accum);
653099ff 2148 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2149 }
2150 else {
b81d288d 2151 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2152 * data->start_class */
2153 StructCopy(data->start_class, &and_with,
2154 struct regnode_charclass_class);
2155 flags &= ~SCF_DO_STCLASS_AND;
2156 StructCopy(&accum, data->start_class,
2157 struct regnode_charclass_class);
2158 flags |= SCF_DO_STCLASS_OR;
2159 data->start_class->flags |= ANYOF_EOS;
2160 }
653099ff 2161 }
a3621e74
YO
2162
2163 /* demq.
2164
2165 Assuming this was/is a branch we are dealing with: 'scan' now
2166 points at the item that follows the branch sequence, whatever
2167 it is. We now start at the beginning of the sequence and look
2168 for subsequences of
2169
2170 BRANCH->EXACT=>X
2171 BRANCH->EXACT=>X
2172
2173 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2174
2175 If we can find such a subseqence we need to turn the first
2176 element into a trie and then add the subsequent branch exact
2177 strings to the trie.
2178
2179 We have two cases
2180
2181 1. patterns where the whole set of branch can be converted to a trie,
2182
2183 2. patterns where only a subset of the alternations can be
2184 converted to a trie.
2185
2186 In case 1 we can replace the whole set with a single regop
2187 for the trie. In case 2 we need to keep the start and end
2188 branchs so
2189
2190 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2191 becomes BRANCH TRIE; BRANCH X;
2192
2193 Hypthetically when we know the regex isnt anchored we can
2194 turn a case 1 into a DFA and let it rip... Every time it finds a match
2195 it would just call its tail, no WHILEM/CURLY needed.
2196
2197 */
07be1b83 2198 if (PERL_ENABLE_TRIE_OPTIMISATION) {
3dab1dad 2199 int made=0;
0111c4fd
RGS
2200 if (!re_trie_maxbuff) {
2201 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2202 if (!SvIOK(re_trie_maxbuff))
2203 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2204 }
a3621e74
YO
2205 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2206 regnode *cur;
2207 regnode *first = (regnode *)NULL;
2208 regnode *last = (regnode *)NULL;
2209 regnode *tail = scan;
2210 U8 optype = 0;
2211 U32 count=0;
2212
2213#ifdef DEBUGGING
c445ea15 2214 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2215#endif
2216 /* var tail is used because there may be a TAIL
2217 regop in the way. Ie, the exacts will point to the
2218 thing following the TAIL, but the last branch will
2219 point at the TAIL. So we advance tail. If we
2220 have nested (?:) we may have to move through several
2221 tails.
2222 */
2223
2224 while ( OP( tail ) == TAIL ) {
2225 /* this is the TAIL generated by (?:) */
2226 tail = regnext( tail );
2227 }
2228
3dab1dad 2229
a3621e74 2230 DEBUG_OPTIMISE_r({
32fc9b6a 2231 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2232 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2233 (int)depth * 2 + 2, "",
2234 "Looking for TRIE'able sequences. Tail node is: ",
2235 SvPV_nolen_const( mysv )
a3621e74
YO
2236 );
2237 });
3dab1dad 2238
a3621e74
YO
2239 /*
2240
2241 step through the branches, cur represents each
2242 branch, noper is the first thing to be matched
2243 as part of that branch and noper_next is the
2244 regnext() of that node. if noper is an EXACT
2245 and noper_next is the same as scan (our current
2246 position in the regex) then the EXACT branch is
2247 a possible optimization target. Once we have
2248 two or more consequetive such branches we can
2249 create a trie of the EXACT's contents and stich
2250 it in place. If the sequence represents all of
2251 the branches we eliminate the whole thing and
2252 replace it with a single TRIE. If it is a
2253 subsequence then we need to stitch it in. This
2254 means the first branch has to remain, and needs
2255 to be repointed at the item on the branch chain
2256 following the last branch optimized. This could
2257 be either a BRANCH, in which case the
2258 subsequence is internal, or it could be the
2259 item following the branch sequence in which
2260 case the subsequence is at the end.
2261
2262 */
2263
2264 /* dont use tail as the end marker for this traverse */
2265 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2266 regnode * const noper = NEXTOPER( cur );
2267 regnode * const noper_next = regnext( noper );
a3621e74 2268
a3621e74 2269 DEBUG_OPTIMISE_r({
32fc9b6a 2270 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2271 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2272 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2273
32fc9b6a 2274 regprop(RExC_rx, mysv, noper);
a3621e74 2275 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2276 SvPV_nolen_const(mysv));
a3621e74
YO
2277
2278 if ( noper_next ) {
32fc9b6a 2279 regprop(RExC_rx, mysv, noper_next );
a3621e74 2280 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2281 SvPV_nolen_const(mysv));
a3621e74 2282 }
3dab1dad
YO
2283 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2284 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2285 });
3dab1dad
YO
2286 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2287 : PL_regkind[ OP( noper ) ] == EXACT )
2288 || OP(noper) == NOTHING )
a3621e74
YO
2289 && noper_next == tail && count<U16_MAX)
2290 {
2291 count++;
3dab1dad
YO
2292 if ( !first || optype == NOTHING ) {
2293 if (!first) first = cur;
a3621e74
YO
2294 optype = OP( noper );
2295 } else {
a3621e74 2296 last = cur;
a3621e74
YO
2297 }
2298 } else {
2299 if ( last ) {
3dab1dad 2300 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2301 }
3dab1dad 2302 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2303 && noper_next == tail )
2304 {
2305 count = 1;
2306 first = cur;
2307 optype = OP( noper );
2308 } else {
2309 count = 0;
2310 first = NULL;
2311 optype = 0;
2312 }
2313 last = NULL;
2314 }
2315 }
2316 DEBUG_OPTIMISE_r({
32fc9b6a 2317 regprop(RExC_rx, mysv, cur);
a3621e74 2318 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2319 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2320 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2321
2322 });
2323 if ( last ) {
3dab1dad
YO
2324 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2325#ifdef TRIE_STUDY_OPT
07be1b83
YO
2326 if ( made && startbranch == first ) {
2327 if ( OP(first)!=TRIE )
2328 flags |= SCF_EXACT_TRIE;
2329 else {
2330 regnode *chk=*scanp;
2331 while ( OP( chk ) == OPEN )
2332 chk = regnext( chk );
2333 if (chk==first)
2334 flags |= SCF_EXACT_TRIE;
2335 }
2336 }
3dab1dad 2337#endif
07be1b83 2338 }
a3621e74 2339 }
3dab1dad
YO
2340
2341 } /* do trie */
a0ed51b3 2342 }
a3621e74 2343 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2344 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2345 } else /* single branch is optimized. */
c277df42
IZ
2346 scan = NEXTOPER(scan);
2347 continue;
a0ed51b3
LW
2348 }
2349 else if (OP(scan) == EXACT) {
cd439c50 2350 I32 l = STR_LEN(scan);
c445ea15 2351 UV uc;
a0ed51b3 2352 if (UTF) {
a3b680e6 2353 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2354 l = utf8_length(s, s + l);
9041c2e3 2355 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2356 } else {
2357 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2358 }
2359 min += l;
c277df42 2360 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2361 /* The code below prefers earlier match for fixed
2362 offset, later match for variable offset. */
2363 if (data->last_end == -1) { /* Update the start info. */
2364 data->last_start_min = data->pos_min;
2365 data->last_start_max = is_inf
b81d288d 2366 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2367 }
cd439c50 2368 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2369 if (UTF)
2370 SvUTF8_on(data->last_found);
0eda9292 2371 {
9a957fbc 2372 SV * const sv = data->last_found;
a28509cc 2373 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2374 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2375 if (mg && mg->mg_len >= 0)
5e43f467
JH
2376 mg->mg_len += utf8_length((U8*)STRING(scan),
2377 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2378 }
c277df42
IZ
2379 data->last_end = data->pos_min + l;
2380 data->pos_min += l; /* As in the first entry. */
2381 data->flags &= ~SF_BEFORE_EOL;
2382 }
653099ff
GS
2383 if (flags & SCF_DO_STCLASS_AND) {
2384 /* Check whether it is compatible with what we know already! */
2385 int compat = 1;
2386
1aa99e6b 2387 if (uc >= 0x100 ||
516a5887 2388 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2389 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2390 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2391 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2392 )
653099ff
GS
2393 compat = 0;
2394 ANYOF_CLASS_ZERO(data->start_class);
2395 ANYOF_BITMAP_ZERO(data->start_class);
2396 if (compat)
1aa99e6b 2397 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2398 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2399 if (uc < 0x100)
2400 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2401 }
2402 else if (flags & SCF_DO_STCLASS_OR) {
2403 /* false positive possible if the class is case-folded */
1aa99e6b 2404 if (uc < 0x100)
9b877dbb
IH
2405 ANYOF_BITMAP_SET(data->start_class, uc);
2406 else
2407 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2408 data->start_class->flags &= ~ANYOF_EOS;
2409 cl_and(data->start_class, &and_with);
2410 }
2411 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2412 }
3dab1dad 2413 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2414 I32 l = STR_LEN(scan);
1aa99e6b 2415 UV uc = *((U8*)STRING(scan));
653099ff
GS
2416
2417 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2418 if (flags & SCF_DO_SUBSTR) {
2419 assert(data);
830247a4 2420 scan_commit(pRExC_state, data);
ecaa9b9c 2421 }
a0ed51b3 2422 if (UTF) {
6136c704 2423 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2424 l = utf8_length(s, s + l);
9041c2e3 2425 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2426 }
2427 min += l;
ecaa9b9c 2428 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2429 data->pos_min += l;
653099ff
GS
2430 if (flags & SCF_DO_STCLASS_AND) {
2431 /* Check whether it is compatible with what we know already! */
2432 int compat = 1;
2433
1aa99e6b 2434 if (uc >= 0x100 ||
516a5887 2435 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2436 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2437 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2438 compat = 0;
2439 ANYOF_CLASS_ZERO(data->start_class);
2440 ANYOF_BITMAP_ZERO(data->start_class);
2441 if (compat) {
1aa99e6b 2442 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2443 data->start_class->flags &= ~ANYOF_EOS;
2444 data->start_class->flags |= ANYOF_FOLD;
2445 if (OP(scan) == EXACTFL)
2446 data->start_class->flags |= ANYOF_LOCALE;
2447 }
2448 }
2449 else if (flags & SCF_DO_STCLASS_OR) {
2450 if (data->start_class->flags & ANYOF_FOLD) {
2451 /* false positive possible if the class is case-folded.
2452 Assume that the locale settings are the same... */
1aa99e6b
IH
2453 if (uc < 0x100)
2454 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2455 data->start_class->flags &= ~ANYOF_EOS;
2456 }
2457 cl_and(data->start_class, &and_with);
2458 }
2459 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2460 }
3dab1dad
YO
2461#ifdef TRIE_STUDY_OPT
2462 else if (OP(scan) == TRIE) {
2463 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2464 min += trie->minlen;
07be1b83 2465 delta += (trie->maxlen - trie->minlen);
3dab1dad
YO
2466 flags &= ~SCF_DO_STCLASS; /* xxx */
2467 if (flags & SCF_DO_SUBSTR) {
2468 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2469 data->pos_min += trie->minlen;
07be1b83
YO
2470 data->pos_delta += (trie->maxlen - trie->minlen);
2471 if (trie->maxlen != trie->minlen)
2472 data->longest = &(data->longest_float);
3dab1dad
YO
2473 }
2474 }
2475#endif
bfed75c6 2476 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2477 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2478 I32 f = flags, pos_before = 0;
d4c19fe8 2479 regnode * const oscan = scan;
653099ff
GS
2480 struct regnode_charclass_class this_class;
2481 struct regnode_charclass_class *oclass = NULL;
727f22e3 2482 I32 next_is_eval = 0;
653099ff 2483
3dab1dad 2484 switch (PL_regkind[OP(scan)]) {
653099ff 2485 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2486 scan = NEXTOPER(scan);
2487 goto finish;
2488 case PLUS:
653099ff 2489 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2490 next = NEXTOPER(scan);
653099ff 2491 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2492 mincount = 1;
2493 maxcount = REG_INFTY;
c277df42
IZ
2494 next = regnext(scan);
2495 scan = NEXTOPER(scan);
2496 goto do_curly;
2497 }
2498 }
2499 if (flags & SCF_DO_SUBSTR)
2500 data->pos_min++;
2501 min++;
2502 /* Fall through. */
2503 case STAR:
653099ff
GS
2504 if (flags & SCF_DO_STCLASS) {
2505 mincount = 0;
b81d288d 2506 maxcount = REG_INFTY;
653099ff
GS
2507 next = regnext(scan);
2508 scan = NEXTOPER(scan);
2509 goto do_curly;
2510 }
b81d288d 2511 is_inf = is_inf_internal = 1;
c277df42
IZ
2512 scan = regnext(scan);
2513 if (flags & SCF_DO_SUBSTR) {
830247a4 2514 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2515 data->longest = &(data->longest_float);
2516 }
2517 goto optimize_curly_tail;
2518 case CURLY:
b81d288d 2519 mincount = ARG1(scan);
c277df42
IZ
2520 maxcount = ARG2(scan);
2521 next = regnext(scan);
cb434fcc
IZ
2522 if (OP(scan) == CURLYX) {
2523 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2524 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2525 }
c277df42 2526 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2527 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2528 do_curly:
2529 if (flags & SCF_DO_SUBSTR) {
830247a4 2530 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2531 pos_before = data->pos_min;
2532 }
2533 if (data) {
2534 fl = data->flags;
2535 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2536 if (is_inf)
2537 data->flags |= SF_IS_INF;
2538 }
653099ff 2539 if (flags & SCF_DO_STCLASS) {
830247a4 2540 cl_init(pRExC_state, &this_class);
653099ff
GS
2541 oclass = data->start_class;
2542 data->start_class = &this_class;
2543 f |= SCF_DO_STCLASS_AND;
2544 f &= ~SCF_DO_STCLASS_OR;
2545 }
e1901655
IZ
2546 /* These are the cases when once a subexpression
2547 fails at a particular position, it cannot succeed
2548 even after backtracking at the enclosing scope.
b81d288d 2549
e1901655
IZ
2550 XXXX what if minimal match and we are at the
2551 initial run of {n,m}? */
2552 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2553 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2554
c277df42 2555 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2556 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2557 (mincount == 0
2558 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2559
2560 if (flags & SCF_DO_STCLASS)
2561 data->start_class = oclass;
2562 if (mincount == 0 || minnext == 0) {
2563 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2564 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2565 }
2566 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2567 /* Switch to OR mode: cache the old value of
653099ff
GS
2568 * data->start_class */
2569 StructCopy(data->start_class, &and_with,
2570 struct regnode_charclass_class);
2571 flags &= ~SCF_DO_STCLASS_AND;
2572 StructCopy(&this_class, data->start_class,
2573 struct regnode_charclass_class);
2574 flags |= SCF_DO_STCLASS_OR;
2575 data->start_class->flags |= ANYOF_EOS;
2576 }
2577 } else { /* Non-zero len */
2578 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2579 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2580 cl_and(data->start_class, &and_with);
2581 }
2582 else if (flags & SCF_DO_STCLASS_AND)
2583 cl_and(data->start_class, &this_class);
2584 flags &= ~SCF_DO_STCLASS;
2585 }
c277df42
IZ
2586 if (!scan) /* It was not CURLYX, but CURLY. */
2587 scan = next;
041457d9
DM
2588 if ( /* ? quantifier ok, except for (?{ ... }) */
2589 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2590 && (minnext == 0) && (deltanext == 0)
99799961 2591 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2592 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2593 && ckWARN(WARN_REGEXP))
b45f050a 2594 {
830247a4 2595 vWARN(RExC_parse,
b45f050a
JF
2596 "Quantifier unexpected on zero-length expression");
2597 }
2598
c277df42 2599 min += minnext * mincount;
b81d288d 2600 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2601 && (minnext + deltanext) > 0)
2602 || deltanext == I32_MAX);
aca2d497 2603 is_inf |= is_inf_internal;
c277df42
IZ
2604 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2605
2606 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2607 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2608 && data->flags & SF_IN_PAR
2609 && !(data->flags & SF_HAS_EVAL)
2610 && !deltanext && minnext == 1 ) {
2611 /* Try to optimize to CURLYN. */
2612 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2613 regnode * const nxt1 = nxt;
497b47a8
JH
2614#ifdef DEBUGGING
2615 regnode *nxt2;
2616#endif
c277df42
IZ
2617
2618 /* Skip open. */
2619 nxt = regnext(nxt);
bfed75c6 2620 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2621 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2622 && STR_LEN(nxt) == 1))
c277df42 2623 goto nogo;
497b47a8 2624#ifdef DEBUGGING
c277df42 2625 nxt2 = nxt;
497b47a8 2626#endif
c277df42 2627 nxt = regnext(nxt);
b81d288d 2628 if (OP(nxt) != CLOSE)
c277df42
IZ
2629 goto nogo;
2630 /* Now we know that nxt2 is the only contents: */
eb160463 2631 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2632 OP(oscan) = CURLYN;
2633 OP(nxt1) = NOTHING; /* was OPEN. */
2634#ifdef DEBUGGING
2635 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2636 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2637 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2638 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2639 OP(nxt + 1) = OPTIMIZED; /* was count. */
2640 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2641#endif
c277df42 2642 }
c277df42
IZ
2643 nogo:
2644
2645 /* Try optimization CURLYX => CURLYM. */
b81d288d 2646 if ( OP(oscan) == CURLYX && data
c277df42 2647 && !(data->flags & SF_HAS_PAR)
c277df42 2648 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2649 && !deltanext /* atom is fixed width */
2650 && minnext != 0 /* CURLYM can't handle zero width */
2651 ) {
c277df42
IZ
2652 /* XXXX How to optimize if data == 0? */
2653 /* Optimize to a simpler form. */
2654 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2655 regnode *nxt2;
2656
2657 OP(oscan) = CURLYM;
2658 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2659 && (OP(nxt2) != WHILEM))
c277df42
IZ
2660 nxt = nxt2;
2661 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2662 /* Need to optimize away parenths. */
2663 if (data->flags & SF_IN_PAR) {
2664 /* Set the parenth number. */
2665 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2666
b81d288d 2667 if (OP(nxt) != CLOSE)
b45f050a 2668 FAIL("Panic opt close");
eb160463 2669 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2670 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2671 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2672#ifdef DEBUGGING
2673 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2674 OP(nxt + 1) = OPTIMIZED; /* was count. */
2675 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2676 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2677#endif
c277df42
IZ
2678#if 0
2679 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2680 regnode *nnxt = regnext(nxt1);
b81d288d 2681
c277df42
IZ
2682 if (nnxt == nxt) {
2683 if (reg_off_by_arg[OP(nxt1)])
2684 ARG_SET(nxt1, nxt2 - nxt1);
2685 else if (nxt2 - nxt1 < U16_MAX)
2686 NEXT_OFF(nxt1) = nxt2 - nxt1;
2687 else
2688 OP(nxt) = NOTHING; /* Cannot beautify */
2689 }
2690 nxt1 = nnxt;
2691 }
2692#endif
2693 /* Optimize again: */
b81d288d 2694 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2695 NULL, 0,depth+1);
a0ed51b3
LW
2696 }
2697 else
c277df42 2698 oscan->flags = 0;
c277df42 2699 }
e1901655
IZ
2700 else if ((OP(oscan) == CURLYX)
2701 && (flags & SCF_WHILEM_VISITED_POS)
2702 /* See the comment on a similar expression above.
2703 However, this time it not a subexpression
2704 we care about, but the expression itself. */
2705 && (maxcount == REG_INFTY)
2706 && data && ++data->whilem_c < 16) {
2707 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2708 /* Find WHILEM (as in regexec.c) */
2709 regnode *nxt = oscan + NEXT_OFF(oscan);
2710
2711 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2712 nxt += ARG(nxt);
eb160463
GS
2713 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2714 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2715 }
b81d288d 2716 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2717 pars++;
2718 if (flags & SCF_DO_SUBSTR) {
c445ea15 2719 SV *last_str = NULL;
c277df42
IZ
2720 int counted = mincount != 0;
2721
2722 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2723#if defined(SPARC64_GCC_WORKAROUND)
2724 I32 b = 0;
2725 STRLEN l = 0;
cfd0369c 2726 const char *s = NULL;
5d1c421c
JH
2727 I32 old = 0;
2728
2729 if (pos_before >= data->last_start_min)
2730 b = pos_before;
2731 else
2732 b = data->last_start_min;
2733
2734 l = 0;
cfd0369c 2735 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2736 old = b - data->last_start_min;
2737
2738#else
b81d288d 2739 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2740 ? pos_before : data->last_start_min;
2741 STRLEN l;
d4c19fe8 2742 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2743 I32 old = b - data->last_start_min;
5d1c421c 2744#endif
a0ed51b3
LW
2745
2746 if (UTF)
2747 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2748
a0ed51b3 2749 l -= old;
c277df42 2750 /* Get the added string: */
79cb57f6 2751 last_str = newSVpvn(s + old, l);
0e933229
IH
2752 if (UTF)
2753 SvUTF8_on(last_str);
c277df42
IZ
2754 if (deltanext == 0 && pos_before == b) {
2755 /* What was added is a constant string */
2756 if (mincount > 1) {
2757 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2758 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2759 SvPVX_const(last_str), l, mincount - 1);
b162af07 2760 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2761 /* Add additional parts. */
b81d288d 2762 SvCUR_set(data->last_found,
c277df42
IZ
2763 SvCUR(data->last_found) - l);
2764 sv_catsv(data->last_found, last_str);
0eda9292
JH
2765 {
2766 SV * sv = data->last_found;
2767 MAGIC *mg =
2768 SvUTF8(sv) && SvMAGICAL(sv) ?
2769 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2770 if (mg && mg->mg_len >= 0)
2771 mg->mg_len += CHR_SVLEN(last_str);
2772 }
c277df42
IZ
2773 data->last_end += l * (mincount - 1);
2774 }
2a8d9689
HS
2775 } else {
2776 /* start offset must point into the last copy */
2777 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2778 data->last_start_max += is_inf ? I32_MAX
2779 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2780 }
2781 }
2782 /* It is counted once already... */
2783 data->pos_min += minnext * (mincount - counted);
2784 data->pos_delta += - counted * deltanext +
2785 (minnext + deltanext) * maxcount - minnext * mincount;
2786 if (mincount != maxcount) {
653099ff
GS
2787 /* Cannot extend fixed substrings found inside
2788 the group. */
830247a4 2789 scan_commit(pRExC_state,data);
c277df42 2790 if (mincount && last_str) {
d4c19fe8
AL
2791 SV * const sv = data->last_found;
2792 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2793 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2794
2795 if (mg)
2796 mg->mg_len = -1;
2797 sv_setsv(sv, last_str);
c277df42 2798 data->last_end = data->pos_min;
b81d288d 2799 data->last_start_min =
a0ed51b3 2800 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2801 data->last_start_max = is_inf
2802 ? I32_MAX
c277df42 2803 : data->pos_min + data->pos_delta
a0ed51b3 2804 - CHR_SVLEN(last_str);
c277df42
IZ
2805 }
2806 data->longest = &(data->longest_float);
2807 }
aca2d497 2808 SvREFCNT_dec(last_str);
c277df42 2809 }
405ff068 2810 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2811 data->flags |= SF_HAS_EVAL;
2812 optimize_curly_tail:
c277df42 2813 if (OP(oscan) != CURLYX) {
3dab1dad 2814 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2815 && NEXT_OFF(next))
2816 NEXT_OFF(oscan) += NEXT_OFF(next);
2817 }
c277df42 2818 continue;
653099ff 2819 default: /* REF and CLUMP only? */
c277df42 2820 if (flags & SCF_DO_SUBSTR) {
830247a4 2821 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2822 data->longest = &(data->longest_float);
2823 }
aca2d497 2824 is_inf = is_inf_internal = 1;
653099ff 2825 if (flags & SCF_DO_STCLASS_OR)
830247a4 2826 cl_anything(pRExC_state, data->start_class);
653099ff 2827 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2828 break;
2829 }
a0ed51b3 2830 }
bfed75c6 2831 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2832 int value = 0;
653099ff 2833
c277df42 2834 if (flags & SCF_DO_SUBSTR) {
830247a4 2835 scan_commit(pRExC_state,data);
c277df42
IZ
2836 data->pos_min++;
2837 }
2838 min++;
653099ff
GS
2839 if (flags & SCF_DO_STCLASS) {
2840 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2841
2842 /* Some of the logic below assumes that switching
2843 locale on will only add false positives. */
3dab1dad 2844 switch (PL_regkind[OP(scan)]) {
653099ff 2845 case SANY:
653099ff
GS
2846 default:
2847 do_default:
2848 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2849 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2850 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2851 break;
2852 case REG_ANY:
2853 if (OP(scan) == SANY)
2854 goto do_default;
2855 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2856 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2857 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2858 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2859 }
2860 if (flags & SCF_DO_STCLASS_AND || !value)
2861 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2862 break;
2863 case ANYOF:
2864 if (flags & SCF_DO_STCLASS_AND)
2865 cl_and(data->start_class,
2866 (struct regnode_charclass_class*)scan);
2867 else
830247a4 2868 cl_or(pRExC_state, data->start_class,
653099ff
GS
2869 (struct regnode_charclass_class*)scan);
2870 break;
2871 case ALNUM:
2872 if (flags & SCF_DO_STCLASS_AND) {
2873 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2874 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2875 for (value = 0; value < 256; value++)
2876 if (!isALNUM(value))
2877 ANYOF_BITMAP_CLEAR(data->start_class, value);
2878 }
2879 }
2880 else {
2881 if (data->start_class->flags & ANYOF_LOCALE)
2882 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2883 else {
2884 for (value = 0; value < 256; value++)
2885 if (isALNUM(value))
b81d288d 2886 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2887 }
2888 }
2889 break;
2890 case ALNUML:
2891 if (flags & SCF_DO_STCLASS_AND) {
2892 if (data->start_class->flags & ANYOF_LOCALE)
2893 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2894 }
2895 else {
2896 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2897 data->start_class->flags |= ANYOF_LOCALE;
2898 }
2899 break;
2900 case NALNUM:
2901 if (flags & SCF_DO_STCLASS_AND) {
2902 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2903 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2904 for (value = 0; value < 256; value++)
2905 if (isALNUM(value))
2906 ANYOF_BITMAP_CLEAR(data->start_class, value);
2907 }
2908 }
2909 else {
2910 if (data->start_class->flags & ANYOF_LOCALE)
2911 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2912 else {
2913 for (value = 0; value < 256; value++)
2914 if (!isALNUM(value))
b81d288d 2915 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2916 }
2917 }
2918 break;
2919 case NALNUML:
2920 if (flags & SCF_DO_STCLASS_AND) {
2921 if (data->start_class->flags & ANYOF_LOCALE)
2922 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2923 }
2924 else {
2925 data->start_class->flags |= ANYOF_LOCALE;
2926 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2927 }
2928 break;
2929 case SPACE:
2930 if (flags & SCF_DO_STCLASS_AND) {
2931 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2932 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2933 for (value = 0; value < 256; value++)
2934 if (!isSPACE(value))
2935 ANYOF_BITMAP_CLEAR(data->start_class, value);
2936 }
2937 }
2938 else {
2939 if (data->start_class->flags & ANYOF_LOCALE)
2940 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2941 else {
2942 for (value = 0; value < 256; value++)
2943 if (isSPACE(value))
b81d288d 2944 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2945 }
2946 }
2947 break;
2948 case SPACEL:
2949 if (flags & SCF_DO_STCLASS_AND) {
2950 if (data->start_class->flags & ANYOF_LOCALE)
2951 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2952 }
2953 else {
2954 data->start_class->flags |= ANYOF_LOCALE;
2955 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2956 }
2957 break;
2958 case NSPACE:
2959 if (flags & SCF_DO_STCLASS_AND) {
2960 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2961 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2962 for (value = 0; value < 256; value++)
2963 if (isSPACE(value))
2964 ANYOF_BITMAP_CLEAR(data->start_class, value);
2965 }
2966 }
2967 else {
2968 if (data->start_class->flags & ANYOF_LOCALE)
2969 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2970 else {
2971 for (value = 0; value < 256; value++)
2972 if (!isSPACE(value))
b81d288d 2973 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2974 }
2975 }
2976 break;
2977 case NSPACEL:
2978 if (flags & SCF_DO_STCLASS_AND) {
2979 if (data->start_class->flags & ANYOF_LOCALE) {
2980 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2981 for (value = 0; value < 256; value++)
2982 if (!isSPACE(value))
2983 ANYOF_BITMAP_CLEAR(data->start_class, value);
2984 }
2985 }
2986 else {
2987 data->start_class->flags |= ANYOF_LOCALE;
2988 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2989 }
2990 break;
2991 case DIGIT:
2992 if (flags & SCF_DO_STCLASS_AND) {
2993 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2994 for (value = 0; value < 256; value++)
2995 if (!isDIGIT(value))
2996 ANYOF_BITMAP_CLEAR(data->start_class, value);
2997 }
2998 else {
2999 if (data->start_class->flags & ANYOF_LOCALE)
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3001 else {
3002 for (value = 0; value < 256; value++)
3003 if (isDIGIT(value))
b81d288d 3004 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3005 }
3006 }
3007 break;
3008 case NDIGIT:
3009 if (flags & SCF_DO_STCLASS_AND) {
3010 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3011 for (value = 0; value < 256; value++)
3012 if (isDIGIT(value))
3013 ANYOF_BITMAP_CLEAR(data->start_class, value);
3014 }
3015 else {
3016 if (data->start_class->flags & ANYOF_LOCALE)
3017 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3018 else {
3019 for (value = 0; value < 256; value++)
3020 if (!isDIGIT(value))
b81d288d 3021 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3022 }
3023 }
3024 break;
3025 }
3026 if (flags & SCF_DO_STCLASS_OR)
3027 cl_and(data->start_class, &and_with);
3028 flags &= ~SCF_DO_STCLASS;
3029 }
a0ed51b3 3030 }
3dab1dad 3031 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3032 data->flags |= (OP(scan) == MEOL
3033 ? SF_BEFORE_MEOL
3034 : SF_BEFORE_SEOL);
a0ed51b3 3035 }
3dab1dad 3036 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3037 /* Lookbehind, or need to calculate parens/evals/stclass: */
3038 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3039 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3040 /* Lookahead/lookbehind */
cb434fcc 3041 I32 deltanext, minnext, fake = 0;
c277df42 3042 regnode *nscan;
653099ff
GS
3043 struct regnode_charclass_class intrnl;
3044 int f = 0;
c277df42
IZ
3045
3046 data_fake.flags = 0;
b81d288d 3047 if (data) {
2c2d71f5 3048 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3049 data_fake.last_closep = data->last_closep;
3050 }
3051 else
3052 data_fake.last_closep = &fake;
653099ff
GS
3053 if ( flags & SCF_DO_STCLASS && !scan->flags
3054 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3055 cl_init(pRExC_state, &intrnl);
653099ff 3056 data_fake.start_class = &intrnl;
e1901655 3057 f |= SCF_DO_STCLASS_AND;
653099ff 3058 }
e1901655
IZ
3059 if (flags & SCF_WHILEM_VISITED_POS)
3060 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3061 next = regnext(scan);
3062 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3063 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3064 if (scan->flags) {
3065 if (deltanext) {
9baa0206 3066 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
3067 }
3068 else if (minnext > U8_MAX) {
9baa0206 3069 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3070 }
eb160463 3071 scan->flags = (U8)minnext;
c277df42 3072 }
be8e71aa
YO
3073 if (data) {
3074 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3075 pars++;
3076 if (data_fake.flags & SF_HAS_EVAL)
3077 data->flags |= SF_HAS_EVAL;
2c2d71f5 3078 data->whilem_c = data_fake.whilem_c;
be8e71aa 3079 }
e1901655 3080 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3081 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3082
3083 cl_and(data->start_class, &intrnl);
3084 if (was)
3085 data->start_class->flags |= ANYOF_EOS;
3086 }
a0ed51b3
LW
3087 }
3088 else if (OP(scan) == OPEN) {
c277df42 3089 pars++;
a0ed51b3 3090 }
cb434fcc 3091 else if (OP(scan) == CLOSE) {
eb160463 3092 if ((I32)ARG(scan) == is_par) {
cb434fcc 3093 next = regnext(scan);
c277df42 3094
cb434fcc
IZ
3095 if ( next && (OP(next) != WHILEM) && next < last)
3096 is_par = 0; /* Disable optimization */
3097 }
3098 if (data)
3099 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3100 }
3101 else if (OP(scan) == EVAL) {
c277df42
IZ
3102 if (data)
3103 data->flags |= SF_HAS_EVAL;
3104 }
96776eda 3105 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3106 if (flags & SCF_DO_SUBSTR) {
830247a4 3107 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3108 data->longest = &(data->longest_float);
3109 }
3110 is_inf = is_inf_internal = 1;
653099ff 3111 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3112 cl_anything(pRExC_state, data->start_class);
96776eda 3113 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3114 }
c277df42
IZ
3115 /* Else: zero-length, ignore. */
3116 scan = regnext(scan);
3117 }
3118
3119 finish:
3120 *scanp = scan;
aca2d497 3121 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3122 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
3123 data->pos_delta = I32_MAX - data->pos_min;
3124 if (is_par > U8_MAX)
3125 is_par = 0;
3126 if (is_par && pars==1 && data) {
3127 data->flags |= SF_IN_PAR;
3128 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3129 }
3130 else if (pars && data) {
c277df42
IZ
3131 data->flags |= SF_HAS_PAR;
3132 data->flags &= ~SF_IN_PAR;
3133 }
653099ff
GS
3134 if (flags & SCF_DO_STCLASS_OR)
3135 cl_and(data->start_class, &and_with);
07be1b83
YO
3136 if (flags & SCF_EXACT_TRIE)
3137 data->flags |= SCF_EXACT_TRIE;
c277df42
IZ
3138 return min;
3139}
3140
76e3520e 3141STATIC I32
5f66b61c 3142S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3143{
830247a4 3144 if (RExC_rx->data) {
b81d288d
AB
3145 Renewc(RExC_rx->data,
3146 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3147 char, struct reg_data);
830247a4
IZ
3148 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3149 RExC_rx->data->count += n;
a0ed51b3
LW
3150 }
3151 else {
a02a5408 3152 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3153 char, struct reg_data);
a02a5408 3154 Newx(RExC_rx->data->what, n, U8);
830247a4 3155 RExC_rx->data->count = n;
c277df42 3156 }
830247a4
IZ
3157 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3158 return RExC_rx->data->count - n;
c277df42
IZ
3159}
3160
76234dfb 3161#ifndef PERL_IN_XSUB_RE
d88dccdf 3162void
864dbfa3 3163Perl_reginitcolors(pTHX)
d88dccdf 3164{
97aff369 3165 dVAR;
1df70142 3166 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3167 if (s) {
1df70142
AL
3168 char *t = savepv(s);
3169 int i = 0;
3170 PL_colors[0] = t;
d88dccdf 3171 while (++i < 6) {
1df70142
AL
3172 t = strchr(t, '\t');
3173 if (t) {
3174 *t = '\0';
3175 PL_colors[i] = ++t;
d88dccdf
IZ
3176 }
3177 else
1df70142 3178 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3179 }
3180 } else {
1df70142 3181 int i = 0;
b81d288d 3182 while (i < 6)
06b5626a 3183 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3184 }
3185 PL_colorset = 1;
3186}
76234dfb 3187#endif
8615cb43 3188
07be1b83 3189
a687059c 3190/*
e50aee73 3191 - pregcomp - compile a regular expression into internal code
a687059c
LW
3192 *
3193 * We can't allocate space until we know how big the compiled form will be,
3194 * but we can't compile it (and thus know how big it is) until we've got a
3195 * place to put the code. So we cheat: we compile it twice, once with code
3196 * generation turned off and size counting turned on, and once "for real".
3197 * This also means that we don't allocate space until we are sure that the
3198 * thing really will compile successfully, and we never have to move the
3199 * code and thus invalidate pointers into it. (Note that it has to be in
3200 * one piece because free() must be able to free it all.) [NB: not true in perl]
3201 *
3202 * Beware that the optimization-preparation code in here knows about some
3203 * of the structure of the compiled regexp. [I'll say.]
3204 */
3205regexp *
864dbfa3 3206Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3207{
97aff369 3208 dVAR;
a0d0e21e 3209 register regexp *r;
c277df42 3210 regnode *scan;
c277df42 3211 regnode *first;
a0d0e21e 3212 I32 flags;
a0d0e21e
LW
3213 I32 minlen = 0;
3214 I32 sawplus = 0;
3215 I32 sawopen = 0;
2c2d71f5 3216 scan_data_t data;
830247a4 3217 RExC_state_t RExC_state;
be8e71aa 3218 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3219#ifdef TRIE_STUDY_OPT
3220 int restudied= 0;
3221 RExC_state_t copyRExC_state;
3222#endif
a0d0e21e 3223
a3621e74
YO
3224 GET_RE_DEBUG_FLAGS_DECL;
3225
a0d0e21e 3226 if (exp == NULL)
c277df42 3227 FAIL("NULL regexp argument");
a0d0e21e 3228
a5961de5 3229 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3230
5cfc7842 3231 RExC_precomp = exp;
a3621e74
YO
3232 DEBUG_r(if (!PL_colorset) reginitcolors());
3233 DEBUG_COMPILE_r({
3234 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3235 PL_colors[4],PL_colors[5],PL_colors[0],
3236 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3237 });
e2509266 3238 RExC_flags = pm->op_pmflags;
830247a4 3239 RExC_sawback = 0;
bbce6d69 3240
830247a4
IZ
3241 RExC_seen = 0;
3242 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3243 RExC_seen_evals = 0;
3244 RExC_extralen = 0;
c277df42 3245
bbce6d69 3246 /* First pass: determine size, legality. */
830247a4 3247 RExC_parse = exp;
fac92740 3248 RExC_start = exp;
830247a4
IZ
3249 RExC_end = xend;
3250 RExC_naughty = 0;
3251 RExC_npar = 1;
3252 RExC_size = 0L;
3253 RExC_emit = &PL_regdummy;
3254 RExC_whilem_seen = 0;
85ddcde9
JH
3255#if 0 /* REGC() is (currently) a NOP at the first pass.
3256 * Clever compilers notice this and complain. --jhi */
830247a4 3257 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3258#endif
3dab1dad
YO
3259 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3260 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3261 RExC_precomp = NULL;
a0d0e21e
LW
3262 return(NULL);
3263 }
3dab1dad
YO
3264 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3265 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3266 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3267 DEBUG_PARSE_r({
3268 RExC_lastnum=0;
3269 RExC_lastparse=NULL;
3270 });
c277df42 3271
07be1b83 3272
c277df42
IZ
3273 /* Small enough for pointer-storage convention?
3274 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3275 if (RExC_size >= 0x10000L && RExC_extralen)
3276 RExC_size += RExC_extralen;
c277df42 3277 else
830247a4
IZ
3278 RExC_extralen = 0;
3279 if (RExC_whilem_seen > 15)
3280 RExC_whilem_seen = 15;
a0d0e21e 3281
bbce6d69 3282 /* Allocate space and initialize. */
a02a5408 3283 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3284 char, regexp);
a0d0e21e 3285 if (r == NULL)
b45f050a
JF
3286 FAIL("Regexp out of space");
3287
0f79a09d
GS
3288#ifdef DEBUGGING
3289 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3290 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3291#endif
c277df42 3292 r->refcnt = 1;
bbce6d69 3293 r->prelen = xend - exp;
5cfc7842 3294 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3295 r->subbeg = NULL;
f8c7b90f 3296#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3297 r->saved_copy = NULL;
ed252734 3298#endif
cf93c79d 3299 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3300 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3301 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3302
3303 r->substrs = 0; /* Useful during FAIL. */
3304 r->startp = 0; /* Useful during FAIL. */
3305 r->endp = 0; /* Useful during FAIL. */
3306
a02a5408 3307 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3308 if (r->offsets) {
2af232bd 3309 r->offsets[0] = RExC_size;
fac92740 3310 }
a3621e74 3311 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3312 "%s %"UVuf" bytes for offset annotations.\n",
3313 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3314 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3315
830247a4 3316 RExC_rx = r;
bbce6d69 3317
3318 /* Second pass: emit code. */
e2509266 3319 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3320 RExC_parse = exp;
3321 RExC_end = xend;
3322 RExC_naughty = 0;
3323 RExC_npar = 1;
fac92740 3324 RExC_emit_start = r->program;
830247a4 3325 RExC_emit = r->program;
2cd61cdb 3326 /* Store the count of eval-groups for security checks: */
eb160463 3327 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3328 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3329 r->data = 0;
3dab1dad 3330 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3331 return(NULL);
07be1b83
YO
3332 /* XXXX To minimize changes to RE engine we always allocate
3333 3-units-long substrs field. */
3334 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3335
07be1b83
YO
3336reStudy:
3337 Zero(r->substrs, 1, struct reg_substr_data);
3338 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3339
07be1b83
YO
3340#ifdef TRIE_STUDY_OPT
3341 if ( restudied ) {
3342 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3343 RExC_state=copyRExC_state;
3344 if (data.longest_fixed)
3345 SvREFCNT_dec(data.longest_fixed);
3346 if (data.longest_float)
3347 SvREFCNT_dec(data.longest_float);
3348 if (data.last_found)
3349 SvREFCNT_dec(data.last_found);
3350 } else {
3351 copyRExC_state=RExC_state;
3352 }
3353#endif
a0d0e21e 3354 /* Dig out information for optimizations. */
cf93c79d 3355 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3356 pm->op_pmflags = RExC_flags;
a0ed51b3 3357 if (UTF)
5ff6fc6d 3358 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3359 r->regstclass = NULL;
830247a4 3360 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3361 r->reganch |= ROPT_NAUGHTY;
c277df42 3362 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3363
653099ff 3364 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3365 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3366 I32 fake;
c5254dd6 3367 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3368 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3369 int stclass_flag;
07be1b83 3370 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3371
3372 first = scan;
c277df42 3373 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3374 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3375 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3376 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3377 /* for now we can't handle lookbehind IFMATCH*/
3378 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3379 (OP(first) == PLUS) ||
3380 (OP(first) == MINMOD) ||
653099ff 3381 /* An {n,m} with n>0 */
07be1b83
YO
3382 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3383 {
3384 DEBUG_PEEP("first:",first,0);
a0d0e21e
LW
3385 if (OP(first) == PLUS)
3386 sawplus = 1;
3387 else
3dab1dad 3388 first += regarglen[OP(first)];
07be1b83
YO
3389 if (OP(first) == IFMATCH) {
3390 first = NEXTOPER(first);
3391 first += EXTRA_STEP_2ARGS;
7c167cea 3392 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3393 first = NEXTOPER(first);
a687059c
LW
3394 }
3395
a0d0e21e
LW
3396 /* Starting-point info. */
3397 again:
07be1b83 3398 /* Ignore EXACT as we deal with it later. */
3dab1dad 3399 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3400 if (OP(first) == EXACT)
6f207bd3 3401 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3402 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3403 r->regstclass = first;
3404 }
07be1b83
YO
3405#ifdef TRIE_STCLASS
3406 else if (OP(first) == TRIE &&
3407 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3408 {
3409 /* this can happen only on restudy */
3410 struct regnode_1 *trie_op;
3411 Newxz(trie_op,1,struct regnode_1);
3412 StructCopy(first,trie_op,struct regnode_1);
3413 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3414 r->regstclass = (regnode *)trie_op;
3415 }
3416#endif
bfed75c6 3417 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3418 r->regstclass = first;
3dab1dad
YO
3419 else if (PL_regkind[OP(first)] == BOUND ||
3420 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3421 r->regstclass = first;
3dab1dad 3422 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3423 r->reganch |= (OP(first) == MBOL
3424 ? ROPT_ANCH_MBOL
3425 : (OP(first) == SBOL
3426 ? ROPT_ANCH_SBOL
3427 : ROPT_ANCH_BOL));
a0d0e21e 3428 first = NEXTOPER(first);
774d564b 3429 goto again;
3430 }
3431 else if (OP(first) == GPOS) {
3432 r->reganch |= ROPT_ANCH_GPOS;
3433 first = NEXTOPER(first);
3434 goto again;
a0d0e21e 3435 }
e09294f4 3436 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3437 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3438 !(r->reganch & ROPT_ANCH) )
3439 {
3440 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3441 const int type =
3442 (OP(NEXTOPER(first)) == REG_ANY)
3443 ? ROPT_ANCH_MBOL
3444 : ROPT_ANCH_SBOL;
cad2e5aa 3445 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3446 first = NEXTOPER(first);
774d564b 3447 goto again;
a0d0e21e 3448 }
b81d288d 3449 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3450 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3451 /* x+ must match at the 1st pos of run of x's */
3452 r->reganch |= ROPT_SKIP;
a0d0e21e 3453
c277df42 3454 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3455#ifdef TRIE_STUDY_OPT
3456 DEBUG_COMPILE_r(
3457 if (!restudied)
3458 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3459 (IV)(first - scan + 1))
3460 );
3461#else
3462 DEBUG_COMPILE_r(
3463 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3464 (IV)(first - scan + 1))
3465 );
3466#endif
3467
3468
a0d0e21e
LW
3469 /*
3470 * If there's something expensive in the r.e., find the
3471 * longest literal string that must appear and make it the
3472 * regmust. Resolve ties in favor of later strings, since
3473 * the regstart check works with the beginning of the r.e.
3474 * and avoiding duplication strengthens checking. Not a
3475 * strong reason, but sufficient in the absence of others.
3476 * [Now we resolve ties in favor of the earlier string if
c277df42 3477 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3478 * earlier string may buy us something the later one won't.]
3479 */
a0d0e21e 3480 minlen = 0;
a687059c 3481
396482e1
GA
3482 data.longest_fixed = newSVpvs("");
3483 data.longest_float = newSVpvs("");
3484 data.last_found = newSVpvs("");
c277df42
IZ
3485 data.longest = &(data.longest_fixed);
3486 first = scan;
653099ff 3487 if (!r->regstclass) {
830247a4 3488 cl_init(pRExC_state, &ch_class);
653099ff
GS
3489 data.start_class = &ch_class;
3490 stclass_flag = SCF_DO_STCLASS_AND;
3491 } else /* XXXX Check for BOUND? */
3492 stclass_flag = 0;
cb434fcc 3493 data.last_closep = &last_close;
653099ff 3494
830247a4 3495 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3496 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83
YO
3497
3498#ifdef TRIE_STUDY_OPT
3499 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3500 goto reStudy;
3501 }
3502#endif
3503
830247a4 3504 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3505 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3506 && !RExC_seen_zerolen
3507 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3508 r->reganch |= ROPT_CHECK_ALL;
830247a4 3509 scan_commit(pRExC_state, &data);
c277df42
IZ
3510 SvREFCNT_dec(data.last_found);
3511
a0ed51b3 3512 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3513 if (longest_float_length
c277df42
IZ
3514 || (data.flags & SF_FL_BEFORE_EOL
3515 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3516 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3517 int t;
3518
a0ed51b3 3519 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3520 && data.offset_fixed == data.offset_float_min
3521 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3522 goto remove_float; /* As in (a)+. */
3523
33b8afdf
JH
3524 if (SvUTF8(data.longest_float)) {
3525 r->float_utf8 = data.longest_float;
c445ea15 3526 r->float_substr = NULL;
33b8afdf
JH
3527 } else {
3528 r->float_substr = data.longest_float;
c445ea15 3529 r->float_utf8 = NULL;
33b8afdf 3530 }
c277df42
IZ
3531 r->float_min_offset = data.offset_float_min;
3532 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3533 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3534 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3535 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3536 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3537 }
3538 else {
aca2d497 3539 remove_float:
c445ea15 3540 r->float_substr = r->float_utf8 = NULL;
c277df42 3541 SvREFCNT_dec(data.longest_float);
c5254dd6 3542 longest_float_length = 0;
a0d0e21e 3543 }
c277df42 3544
a0ed51b3 3545 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3546 if (longest_fixed_length
c277df42
IZ
3547 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3548 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3549 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3550 int t;
3551
33b8afdf
JH
3552 if (SvUTF8(data.longest_fixed)) {
3553 r->anchored_utf8 = data.longest_fixed;
c445ea15 3554 r->anchored_substr = NULL;
33b8afdf
JH
3555 } else {
3556 r->anchored_substr = data.longest_fixed;
c445ea15 3557 r->anchored_utf8 = NULL;
33b8afdf 3558 }
c277df42 3559 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3560 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3561 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3562 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3563 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3564 }
3565 else {
c445ea15 3566 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3567 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3568 longest_fixed_length = 0;
a0d0e21e 3569 }
b81d288d 3570 if (r->regstclass
ffc61ed2 3571 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3572 r->regstclass = NULL;
33b8afdf
JH
3573 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3574 && stclass_flag
653099ff 3575 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3576 && !cl_is_anything(data.start_class))
3577 {
1df70142 3578 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3579
a02a5408 3580 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3581 struct regnode_charclass_class);
3582 StructCopy(data.start_class,
830247a4 3583 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3584 struct regnode_charclass_class);
830247a4 3585 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3586 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3587 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3588 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3589 PerlIO_printf(Perl_debug_log,
a0288114 3590 "synthetic stclass \"%s\".\n",
3f7c398e 3591 SvPVX_const(sv));});
653099ff 3592 }
c277df42
IZ
3593
3594 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3595 if (longest_fixed_length > longest_float_length) {
c277df42 3596 r->check_substr = r->anchored_substr;
33b8afdf 3597 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3598 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3599 if (r->reganch & ROPT_ANCH_SINGLE)
3600 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3601 }
3602 else {
c277df42 3603 r->check_substr = r->float_substr;
33b8afdf 3604 r->check_utf8 = r->float_utf8;
c277df42
IZ
3605 r->check_offset_min = data.offset_float_min;
3606 r->check_offset_max = data.offset_float_max;
a0d0e21e 3607 }
30382c73
IZ
3608 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3609 This should be changed ASAP! */
33b8afdf 3610 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3611 r->reganch |= RE_USE_INTUIT;
33b8afdf 3612 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3613 r->reganch |= RE_INTUIT_TAIL;
3614 }
a0ed51b3
LW
3615 }
3616 else {
c277df42
IZ
3617 /* Several toplevels. Best we can is to set minlen. */
3618 I32 fake;
653099ff 3619 struct regnode_charclass_class ch_class;
cb434fcc 3620 I32 last_close = 0;
c277df42 3621
a3621e74 3622 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3623
c277df42 3624 scan = r->program + 1;
830247a4 3625 cl_init(pRExC_state, &ch_class);
653099ff 3626 data.start_class = &ch_class;
cb434fcc 3627 data.last_closep = &last_close;
07be1b83
YO
3628
3629 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3630 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3631
3632#ifdef TRIE_STUDY_OPT
3633 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3634 goto reStudy;
3635 }
3636#endif
3637
33b8afdf 3638 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3639 = r->float_substr = r->float_utf8 = NULL;
653099ff 3640 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3641 && !cl_is_anything(data.start_class))
3642 {
1df70142 3643 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3644
a02a5408 3645 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3646 struct regnode_charclass_class);
3647 StructCopy(data.start_class,
830247a4 3648 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3649 struct regnode_charclass_class);
830247a4 3650 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3651 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3652 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3653 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3654 PerlIO_printf(Perl_debug_log,
a0288114 3655 "synthetic stclass \"%s\".\n",
3f7c398e 3656 SvPVX_const(sv));});
653099ff 3657 }
a0d0e21e
LW
3658 }
3659
a0d0e21e 3660 r->minlen = minlen;
b81d288d 3661 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3662 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3663 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3664 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3665 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3666 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3667 if (RExC_seen & REG_SEEN_CANY)
3668 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3669 Newxz(r->startp, RExC_npar, I32);
3670 Newxz(r->endp, RExC_npar, I32);
07be1b83 3671
f2278c82 3672 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
3673 DEBUG_DUMP_r({
3674 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
3675 regdump(r);
3676 });
8e9a8a48
YO
3677 DEBUG_OFFSETS_r(if (r->offsets) {
3678 const U32 len = r->offsets[0];
3679 U32 i;
3680 GET_RE_DEBUG_FLAGS_DECL;
3681 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3682 for (i = 1; i <= len; i++) {
3683 if (r->offsets[i*2-1] || r->offsets[i*2])
3684 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3685 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3686 }
3687 PerlIO_printf(Perl_debug_log, "\n");
3688 });
a0d0e21e 3689 return(r);
a687059c
LW
3690}
3691
3dab1dad
YO
3692
3693#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3694 int rem=(int)(RExC_end - RExC_parse); \
3695 int cut; \
3696 int num; \
3697 int iscut=0; \
3698 if (rem>10) { \
3699 rem=10; \
3700 iscut=1; \
3701 } \
3702 cut=10-rem; \
3703 if (RExC_lastparse!=RExC_parse) \
3704 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3705 rem, RExC_parse, \
3706 cut + 4, \
3707 iscut ? "..." : "<" \
3708 ); \
3709 else \
3710 PerlIO_printf(Perl_debug_log,"%16s",""); \
3711 \
3712 if (SIZE_ONLY) \
3713 num=RExC_size; \
3714 else \
3715 num=REG_NODE_NUM(RExC_emit); \
3716 if (RExC_lastnum!=num) \
be8e71aa 3717 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 3718 else \
be8e71aa
YO
3719 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3720 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3721 (int)((depth*2)), "", \
3dab1dad
YO
3722 (funcname) \
3723 ); \
3724 RExC_lastnum=num; \
3725 RExC_lastparse=RExC_parse; \
3726})
3727
07be1b83
YO
3728
3729
3dab1dad
YO
3730#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3731 DEBUG_PARSE_MSG((funcname)); \
3732 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3733})
a687059c
LW
3734/*
3735 - reg - regular expression, i.e. main body or parenthesized thing
3736 *
3737 * Caller must absorb opening parenthesis.
3738 *
3739 * Combining parenthesis handling with the base level of regular expression
3740 * is a trifle forced, but the need to tie the tails of the branches to what
3741 * follows makes it hard to avoid.
3742 */
07be1b83
YO
3743#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3744#ifdef DEBUGGING
3745#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3746#else
3747#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3748#endif
3dab1dad 3749
76e3520e 3750STATIC regnode *
3dab1dad 3751S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3752 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3753{
27da23d5 3754 dVAR;
c277df42
IZ
3755 register regnode *ret; /* Will be the head of the group. */
3756 register regnode *br;
3757 register regnode *lastbr;
cbbf8932 3758 register regnode *ender = NULL;
a0d0e21e 3759 register I32 parno = 0;
cbbf8932
AL
3760 I32 flags;
3761 const I32 oregflags = RExC_flags;
6136c704
AL
3762 bool have_branch = 0;
3763 bool is_open = 0;
9d1d55b5
JP
3764
3765 /* for (?g), (?gc), and (?o) warnings; warning
3766 about (?c) will warn about (?g) -- japhy */
3767
6136c704
AL
3768#define WASTED_O 0x01
3769#define WASTED_G 0x02
3770#define WASTED_C 0x04
3771#define WASTED_GC (0x02|0x04)
cbbf8932 3772 I32 wastedflags = 0x00;
9d1d55b5 3773
fac92740 3774 char * parse_start = RExC_parse; /* MJD */
a28509cc 3775 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3776
3dab1dad
YO
3777 GET_RE_DEBUG_FLAGS_DECL;
3778 DEBUG_PARSE("reg ");
3779
3780
821b33a5 3781 *flagp = 0; /* Tentatively. */
a0d0e21e 3782
9d1d55b5 3783
a0d0e21e
LW
3784 /* Make an OPEN node, if parenthesized. */
3785 if (paren) {
fac92740 3786 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3787 U32 posflags = 0, negflags = 0;
3788 U32 *flagsp = &posflags;
6136c704 3789 bool is_logical = 0;
a28509cc 3790 const char * const seqstart = RExC_parse;
ca9dfc88 3791
830247a4
IZ
3792 RExC_parse++;
3793 paren = *RExC_parse++;
c277df42 3794 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3795 switch (paren) {
fac92740 3796 case '<': /* (?<...) */
830247a4 3797 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3798 if (*RExC_parse == '!')
c277df42 3799 paren = ',';
b81d288d 3800 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3801 goto unknown;
830247a4 3802 RExC_parse++;
fac92740
MJD
3803 case '=': /* (?=...) */
3804 case '!': /* (?!...) */
830247a4 3805 RExC_seen_zerolen++;
fac92740
MJD
3806 case ':': /* (?:...) */
3807 case '>': /* (?>...) */
a0d0e21e 3808 break;
fac92740
MJD
3809 case '$': /* (?$...) */
3810 case '@': /* (?@...) */
8615cb43 3811 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3812 break;
fac92740 3813 case '#': /* (?#...) */
830247a4
IZ
3814 while (*RExC_parse && *RExC_parse != ')')
3815 RExC_parse++;
3816 if (*RExC_parse != ')')
c277df42 3817 FAIL("Sequence (?#... not terminated");
830247a4 3818 nextchar(pRExC_state);
a0d0e21e
LW
3819 *flagp = TRYAGAIN;
3820 return NULL;
fac92740 3821 case 'p': /* (?p...) */
9014280d 3822 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3823 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3824 /* FALL THROUGH*/
fac92740 3825 case '?': /* (??...) */
6136c704 3826 is_logical = 1;
438a3801
YST
3827 if (*RExC_parse != '{')
3828 goto unknown;
830247a4 3829 paren = *RExC_parse++;
0f5d15d6 3830 /* FALL THROUGH */
fac92740 3831 case '{': /* (?{...}) */
c277df42 3832 {
c277df42
IZ
3833 I32 count = 1, n = 0;
3834 char c;
830247a4 3835 char *s = RExC_parse;
c277df42 3836
830247a4
IZ
3837 RExC_seen_zerolen++;
3838 RExC_seen |= REG_SEEN_EVAL;
3839 while (count && (c = *RExC_parse)) {
6136c704
AL
3840 if (c == '\\') {
3841 if (RExC_parse[1])
3842 RExC_parse++;
3843 }
b81d288d 3844 else if (c == '{')
c277df42 3845 count++;
b81d288d 3846 else if (c == '}')
c277df42 3847 count--;
830247a4 3848 RExC_parse++;
c277df42 3849 }
6136c704 3850 if (*RExC_parse != ')') {
b81d288d 3851 RExC_parse = s;
b45f050a
JF
3852 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3853 }
c277df42 3854 if (!SIZE_ONLY) {
f3548bdc 3855 PAD *pad;
6136c704
AL
3856 OP_4tree *sop, *rop;
3857 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3858
569233ed
SB
3859 ENTER;
3860 Perl_save_re_context(aTHX);
f3548bdc 3861 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3862 sop->op_private |= OPpREFCOUNTED;
3863 /* re_dup will OpREFCNT_inc */
3864 OpREFCNT_set(sop, 1);
569233ed 3865 LEAVE;
c277df42 3866
830247a4
IZ
3867 n = add_data(pRExC_state, 3, "nop");
3868 RExC_rx->data->data[n] = (void*)rop;
3869 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3870 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3871 SvREFCNT_dec(sv);
a0ed51b3 3872 }
e24b16f9 3873 else { /* First pass */
830247a4 3874 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3875 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3876 /* No compiled RE interpolated, has runtime
3877 components ===> unsafe. */
3878 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3879 if (PL_tainting && PL_tainted)
cc6b7395 3880 FAIL("Eval-group in insecure regular expression");
54df2634 3881#if PERL_VERSION > 8
923e4eb5 3882 if (IN_PERL_COMPILETIME)
b5c19bd7 3883 PL_cv_has_eval = 1;
54df2634 3884#endif
c277df42 3885 }
b5c19bd7 3886
830247a4 3887 nextchar(pRExC_state);
6136c704 3888 if (is_logical) {
830247a4 3889 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3890 if (!SIZE_ONLY)
3891 ret->flags = 2;
3dab1dad 3892 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3893 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3894 return ret;
3895 }
ccb2c380
MP
3896 ret = reganode(pRExC_state, EVAL, n);
3897 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3898 Set_Node_Offset(ret, parse_start);
3899 return ret;
c277df42 3900 }
fac92740 3901 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3902 {
fac92740 3903 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3904 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3905 || RExC_parse[1] == '<'
830247a4 3906 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3907 I32 flag;
3908
830247a4 3909 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3910 if (!SIZE_ONLY)
3911 ret->flags = 1;
3dab1dad 3912 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3913 goto insert_if;
b81d288d 3914 }
a0ed51b3 3915 }
830247a4 3916 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3917 /* (?(1)...) */
6136c704 3918 char c;
830247a4 3919 parno = atoi(RExC_parse++);
c277df42 3920
830247a4
IZ
3921 while (isDIGIT(*RExC_parse))
3922 RExC_parse++;
fac92740 3923 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3924
830247a4 3925 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3926 vFAIL("Switch condition not recognized");
c277df42 3927 insert_if:
3dab1dad
YO
3928 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3929 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3930 if (br == NULL)
830247a4 3931 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3932 else
3dab1dad 3933 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3934 c = *nextchar(pRExC_state);
d1b80229
IZ
3935 if (flags&HASWIDTH)
3936 *flagp |= HASWIDTH;
c277df42 3937 if (c == '|') {
830247a4 3938 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3939 regbranch(pRExC_state, &flags, 1,depth+1);
3940 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3941 if (flags&HASWIDTH)
3942 *flagp |= HASWIDTH;
830247a4 3943 c = *nextchar(pRExC_state);
a0ed51b3
LW
3944 }
3945 else
c277df42
IZ
3946 lastbr = NULL;
3947 if (c != ')')
8615cb43 3948 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3949 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3950 REGTAIL(pRExC_state, br, ender);
c277df42 3951 if (lastbr) {
3dab1dad
YO
3952 REGTAIL(pRExC_state, lastbr, ender);
3953 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3954 }
3955 else
3dab1dad 3956 REGTAIL(pRExC_state, ret, ender);
c277df42 3957 return ret;
a0ed51b3
LW
3958 }
3959 else {
830247a4 3960 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3961 }
3962 }
1b1626e4 3963 case 0:
830247a4 3964 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3965 vFAIL("Sequence (? incomplete");
1b1626e4 3966 break;
a0d0e21e 3967 default:
830247a4 3968 --RExC_parse;
fac92740 3969 parse_flags: /* (?i) */
830247a4 3970 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3971 /* (?g), (?gc) and (?o) are useless here
3972 and must be globally applied -- japhy */
3973
3974 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3975 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3976 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3977 if (! (wastedflags & wflagbit) ) {
3978 wastedflags |= wflagbit;
3979 vWARN5(
3980 RExC_parse + 1,
3981 "Useless (%s%c) - %suse /%c modifier",
3982 flagsp == &negflags ? "?-" : "?",
3983 *RExC_parse,
3984 flagsp == &negflags ? "don't " : "",
3985 *RExC_parse
3986 );
3987 }
3988 }
3989 }
3990 else if (*RExC_parse == 'c') {
3991 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3992 if (! (wastedflags & WASTED_C) ) {
3993 wastedflags |= WASTED_GC;
9d1d55b5
JP
3994 vWARN3(
3995 RExC_parse + 1,
3996 "Useless (%sc) - %suse /gc modifier",
3997 flagsp == &negflags ? "?-" : "?",
3998 flagsp == &negflags ? "don't " : ""
3999 );
4000 }
4001 }
4002 }
4003 else { pmflag(flagsp, *RExC_parse); }
4004
830247a4 4005 ++RExC_parse;
ca9dfc88 4006 }
830247a4 4007 if (*RExC_parse == '-') {
ca9dfc88 4008 flagsp = &negflags;
9d1d55b5 4009 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4010 ++RExC_parse;
ca9dfc88 4011 goto parse_flags;
48c036b1 4012 }
e2509266
JH
4013 RExC_flags |= posflags;
4014 RExC_flags &= ~negflags;
830247a4
IZ
4015 if (*RExC_parse == ':') {
4016 RExC_parse++;
ca9dfc88
IZ
4017 paren = ':';
4018 break;
4019 }
c277df42 4020 unknown:
830247a4
IZ
4021 if (*RExC_parse != ')') {
4022 RExC_parse++;
4023 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4024 }
830247a4 4025 nextchar(pRExC_state);
a0d0e21e
LW
4026 *flagp = TRYAGAIN;
4027 return NULL;
4028 }
4029 }
fac92740 4030 else { /* (...) */
830247a4
IZ
4031 parno = RExC_npar;
4032 RExC_npar++;
4033 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4034 Set_Node_Length(ret, 1); /* MJD */
4035 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4036 is_open = 1;
a0d0e21e 4037 }
a0ed51b3 4038 }
fac92740 4039 else /* ! paren */
a0d0e21e
LW
4040 ret = NULL;
4041
4042 /* Pick up the branches, linking them together. */
fac92740 4043 parse_start = RExC_parse; /* MJD */
3dab1dad 4044 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4045 /* branch_len = (paren != 0); */
2af232bd 4046
a0d0e21e
LW
4047 if (br == NULL)
4048 return(NULL);
830247a4
IZ
4049 if (*RExC_parse == '|') {
4050 if (!SIZE_ONLY && RExC_extralen) {
4051 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4052 }
fac92740 4053 else { /* MJD */
830247a4 4054 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4055 Set_Node_Length(br, paren != 0);
4056 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4057 }
c277df42
IZ
4058 have_branch = 1;
4059 if (SIZE_ONLY)
830247a4 4060 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4061 }
4062 else if (paren == ':') {
c277df42
IZ
4063 *flagp |= flags&SIMPLE;
4064 }
6136c704 4065 if (is_open) { /* Starts with OPEN. */
3dab1dad 4066 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4067 }
4068 else if (paren != '?') /* Not Conditional */
a0d0e21e 4069 ret = br;
32a0ca98 4070 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4071 lastbr = br;
830247a4
IZ
4072 while (*RExC_parse == '|') {
4073 if (!SIZE_ONLY && RExC_extralen) {
4074 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4075 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4076 }
4077 if (SIZE_ONLY)
830247a4
IZ
4078 RExC_extralen += 2; /* Account for LONGJMP. */
4079 nextchar(pRExC_state);
3dab1dad 4080 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4081
a687059c 4082 if (br == NULL)
a0d0e21e 4083 return(NULL);
3dab1dad 4084 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4085 lastbr = br;
821b33a5
IZ
4086 if (flags&HASWIDTH)
4087 *flagp |= HASWIDTH;
a687059c 4088 *flagp |= flags&SPSTART;
a0d0e21e
LW
4089 }
4090
c277df42
IZ
4091 if (have_branch || paren != ':') {
4092 /* Make a closing node, and hook it on the end. */
4093 switch (paren) {
4094 case ':':
830247a4 4095 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4096 break;
4097 case 1:
830247a4 4098 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4099 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4100 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4101 break;
4102 case '<':
c277df42
IZ
4103 case ',':
4104 case '=':
4105 case '!':
c277df42 4106 *flagp &= ~HASWIDTH;
821b33a5
IZ
4107 /* FALL THROUGH */
4108 case '>':
830247a4 4109 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4110 break;
4111 case 0:
830247a4 4112 ender = reg_node(pRExC_state, END);
c277df42
IZ
4113 break;
4114 }
07be1b83 4115 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4116
9674d46a 4117 if (have_branch && !SIZE_ONLY) {
c277df42 4118 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4119 for (br = ret; br; br = regnext(br)) {
4120 const U8 op = PL_regkind[OP(br)];
4121 if (op == BRANCH) {
07be1b83 4122 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4123 }
4124 else if (op == BRANCHJ) {
07be1b83 4125 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4126 }
c277df42
IZ
4127 }
4128 }
a0d0e21e 4129 }
c277df42
IZ
4130
4131 {
e1ec3a88
AL
4132 const char *p;
4133 static const char parens[] = "=!<,>";
c277df42
IZ
4134
4135 if (paren && (p = strchr(parens, paren))) {
eb160463 4136 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4137 int flag = (p - parens) > 1;
4138
4139 if (paren == '>')
4140 node = SUSPEND, flag = 0;
830247a4 4141 reginsert(pRExC_state, node,ret);
45948336
EP
4142 Set_Node_Cur_Length(ret);
4143 Set_Node_Offset(ret, parse_start + 1);
c277df42 4144 ret->flags = flag;
07be1b83 4145 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4146 }
a0d0e21e
LW
4147 }
4148
4149 /* Check for proper termination. */
ce3e6498 4150 if (paren) {
e2509266 4151 RExC_flags = oregflags;
830247a4
IZ
4152 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4153 RExC_parse = oregcomp_parse;
380a0633 4154 vFAIL("Unmatched (");
ce3e6498 4155 }
a0ed51b3 4156 }
830247a4
IZ
4157 else if (!paren && RExC_parse < RExC_end) {
4158 if (*RExC_parse == ')') {
4159 RExC_parse++;
380a0633 4160 vFAIL("Unmatched )");
a0ed51b3
LW
4161 }
4162 else
b45f050a 4163 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4164 /* NOTREACHED */
4165 }
a687059c 4166
a0d0e21e 4167 return(ret);
a687059c
LW
4168}
4169
4170/*
4171 - regbranch - one alternative of an | operator
4172 *
4173 * Implements the concatenation operator.
4174 */
76e3520e 4175STATIC regnode *
3dab1dad 4176S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4177{
97aff369 4178 dVAR;
c277df42
IZ
4179 register regnode *ret;
4180 register regnode *chain = NULL;
4181 register regnode *latest;
4182 I32 flags = 0, c = 0;
3dab1dad
YO
4183 GET_RE_DEBUG_FLAGS_DECL;
4184 DEBUG_PARSE("brnc");
b81d288d 4185 if (first)
c277df42
IZ
4186 ret = NULL;
4187 else {
b81d288d 4188 if (!SIZE_ONLY && RExC_extralen)
830247a4 4189 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4190 else {
830247a4 4191 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4192 Set_Node_Length(ret, 1);
4193 }
c277df42
IZ
4194 }
4195
b81d288d 4196 if (!first && SIZE_ONLY)
830247a4 4197 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4198
c277df42 4199 *flagp = WORST; /* Tentatively. */
a0d0e21e 4200
830247a4
IZ
4201 RExC_parse--;
4202 nextchar(pRExC_state);
4203 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4204 flags &= ~TRYAGAIN;
3dab1dad 4205 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4206 if (latest == NULL) {
4207 if (flags & TRYAGAIN)
4208 continue;
4209 return(NULL);
a0ed51b3
LW
4210 }
4211 else if (ret == NULL)
c277df42 4212 ret = latest;
a0d0e21e 4213 *flagp |= flags&HASWIDTH;
c277df42 4214 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4215 *flagp |= flags&SPSTART;
4216 else {
830247a4 4217 RExC_naughty++;
3dab1dad 4218 REGTAIL(pRExC_state, chain, latest);
a687059c 4219 }
a0d0e21e 4220 chain = latest;
c277df42
IZ
4221 c++;
4222 }
4223 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4224 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4225 if (ret == NULL)
4226 ret = chain;
4227 }
4228 if (c == 1) {
4229 *flagp |= flags&SIMPLE;
a0d0e21e 4230 }
a687059c 4231
d4c19fe8 4232 return ret;
a687059c
LW
4233}
4234
4235/*
4236 - regpiece - something followed by possible [*+?]
4237 *
4238 * Note that the branching code sequences used for ? and the general cases
4239 * of * and + are somewhat optimized: they use the same NOTHING node as
4240 * both the endmarker for their branch list and the body of the last branch.
4241 * It might seem that this node could be dispensed with entirely, but the
4242 * endmarker role is not redundant.
4243 */
76e3520e 4244STATIC regnode *
3dab1dad 4245S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4246{
97aff369 4247 dVAR;
c277df42 4248 register regnode *ret;
a0d0e21e
LW
4249 register char op;
4250 register char *next;
4251 I32 flags;
1df70142 4252 const char * const origparse = RExC_parse;
a0d0e21e 4253 I32 min;
c277df42 4254 I32 max = REG_INFTY;
fac92740 4255 char *parse_start;
3dab1dad
YO
4256 GET_RE_DEBUG_FLAGS_DECL;
4257 DEBUG_PARSE("piec");
a0d0e21e 4258
3dab1dad 4259 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4260 if (ret == NULL) {
4261 if (flags & TRYAGAIN)
4262 *flagp |= TRYAGAIN;
4263 return(NULL);
4264 }
4265
830247a4 4266 op = *RExC_parse;
a0d0e21e 4267
830247a4 4268 if (op == '{' && regcurly(RExC_parse)) {
3dab1dad 4269 const char *maxpos = NULL;
fac92740 4270 parse_start = RExC_parse; /* MJD */
830247a4 4271 next = RExC_parse + 1;
a0d0e21e
LW
4272 while (isDIGIT(*next) || *next == ',') {
4273 if (*next == ',') {
4274 if (maxpos)
4275 break;
4276 else
4277 maxpos = next;
a687059c 4278 }
a0d0e21e
LW
4279 next++;
4280 }
4281 if (*next == '}') { /* got one */
4282 if (!maxpos)
4283 maxpos = next;
830247a4
IZ
4284 RExC_parse++;
4285 min = atoi(RExC_parse);
a0d0e21e
LW
4286 if (*maxpos == ',')
4287 maxpos++;
4288 else
830247a4 4289 maxpos = RExC_parse;
a0d0e21e
LW
4290 max = atoi(maxpos);
4291 if (!max && *maxpos != '0')
c277df42
IZ
4292 max = REG_INFTY; /* meaning "infinity" */
4293 else if (max >= REG_INFTY)
8615cb43 4294 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4295 RExC_parse = next;
4296 nextchar(pRExC_state);
a0d0e21e
LW
4297
4298 do_curly:
4299 if ((flags&SIMPLE)) {
830247a4
IZ
4300 RExC_naughty += 2 + RExC_naughty / 2;
4301 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4302 Set_Node_Offset(ret, parse_start+1); /* MJD */
4303 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4304 }
4305 else {
3dab1dad 4306 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4307
4308 w->flags = 0;
3dab1dad 4309 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4310 if (!SIZE_ONLY && RExC_extralen) {
4311 reginsert(pRExC_state, LONGJMP,ret);
4312 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4313 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4314 }
830247a4 4315 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4316 /* MJD hk */
4317 Set_Node_Offset(ret, parse_start+1);
2af232bd 4318 Set_Node_Length(ret,
fac92740 4319 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4320
830247a4 4321 if (!SIZE_ONLY && RExC_extralen)
c277df42 4322 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4323 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4324 if (SIZE_ONLY)
830247a4
IZ
4325 RExC_whilem_seen++, RExC_extralen += 3;
4326 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4327 }
c277df42 4328 ret->flags = 0;
a0d0e21e
LW
4329
4330 if (min > 0)
821b33a5
IZ
4331 *flagp = WORST;
4332 if (max > 0)
4333 *flagp |= HASWIDTH;
a0d0e21e 4334 if (max && max < min)
8615cb43 4335 vFAIL("Can't do {n,m} with n > m");
c277df42 4336 if (!SIZE_ONLY) {
eb160463
GS
4337 ARG1_SET(ret, (U16)min);
4338 ARG2_SET(ret, (U16)max);
a687059c 4339 }
a687059c 4340
a0d0e21e 4341 goto nest_check;
a687059c 4342 }
a0d0e21e 4343 }
a687059c 4344
a0d0e21e
LW
4345 if (!ISMULT1(op)) {
4346 *flagp = flags;
a687059c 4347 return(ret);
a0d0e21e 4348 }
bb20fd44 4349
c277df42 4350#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4351
4352 /* if this is reinstated, don't forget to put this back into perldiag:
4353
4354 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4355
4356 (F) The part of the regexp subject to either the * or + quantifier
4357 could match an empty string. The {#} shows in the regular
4358 expression about where the problem was discovered.
4359
4360 */
4361
bb20fd44 4362 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4363 vFAIL("Regexp *+ operand could be empty");
b81d288d 4364#endif
bb20fd44 4365
fac92740 4366 parse_start = RExC_parse;
830247a4 4367 nextchar(pRExC_state);
a0d0e21e 4368
821b33a5 4369 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4370
4371 if (op == '*' && (flags&SIMPLE)) {
830247a4 4372 reginsert(pRExC_state, STAR, ret);
c277df42 4373 ret->flags = 0;
830247a4 4374 RExC_naughty += 4;
a0d0e21e
LW
4375 }
4376 else if (op == '*') {
4377 min = 0;
4378 goto do_curly;
a0ed51b3
LW
4379 }
4380 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4381 reginsert(pRExC_state, PLUS, ret);
c277df42 4382 ret->flags = 0;
830247a4 4383 RExC_naughty += 3;
a0d0e21e
LW
4384 }
4385 else if (op == '+') {
4386 min = 1;
4387 goto do_curly;
a0ed51b3
LW
4388 }
4389 else if (op == '?') {
a0d0e21e
LW
4390 min = 0; max = 1;
4391 goto do_curly;
4392 }
4393 nest_check:
041457d9 4394 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4395 vWARN3(RExC_parse,
b45f050a 4396 "%.*s matches null string many times",
afd78fd5 4397 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4398 origparse);
a0d0e21e
LW
4399 }
4400
830247a4
IZ
4401 if (*RExC_parse == '?') {
4402 nextchar(pRExC_state);
4403 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4404 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4405 }
830247a4
IZ
4406 if (ISMULT2(RExC_parse)) {
4407 RExC_parse++;
b45f050a
JF
4408 vFAIL("Nested quantifiers");
4409 }
a0d0e21e
LW
4410
4411 return(ret);
a687059c
LW
4412}
4413
4414/*
4415 - regatom - the lowest level
4416 *
4417 * Optimization: gobbles an entire sequence of ordinary characters so that
4418 * it can turn them into a single node, which is smaller to store and
4419 * faster to run. Backslashed characters are exceptions, each becoming a
4420 * separate node; the code is simpler that way and it's not worth fixing.
4421 *
7f6f358c
YO
4422 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4423 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4424 */
76e3520e 4425STATIC regnode *
3dab1dad 4426S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4427{
97aff369 4428 dVAR;
cbbf8932 4429 register regnode *ret = NULL;
a0d0e21e 4430 I32 flags;
45948336 4431 char *parse_start = RExC_parse;
3dab1dad
YO
4432 GET_RE_DEBUG_FLAGS_DECL;
4433 DEBUG_PARSE("atom");
a0d0e21e
LW
4434 *flagp = WORST; /* Tentatively. */
4435
4436tryagain:
830247a4 4437 switch (*RExC_parse) {
a0d0e21e 4438 case '^':
830247a4
IZ
4439 RExC_seen_zerolen++;
4440 nextchar(pRExC_state);
e2509266 4441 if (RExC_flags & PMf_MULTILINE)
830247a4 4442 ret = reg_node(pRExC_state, MBOL);
e2509266 4443 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4444 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4445 else
830247a4 4446 ret = reg_node(pRExC_state, BOL);
fac92740 4447 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4448 break;
4449 case '$':
830247a4 4450 nextchar(pRExC_state);
b81d288d 4451 if (*RExC_parse)
830247a4 4452 RExC_seen_zerolen++;
e2509266 4453 if (RExC_flags & PMf_MULTILINE)
830247a4 4454 ret = reg_node(pRExC_state, MEOL);
e2509266 4455 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4456 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4457 else
830247a4 4458 ret = reg_node(pRExC_state, EOL);
fac92740 4459 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4460 break;
4461 case '.':
830247a4 4462 nextchar(pRExC_state);
e2509266 4463 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4464 ret = reg_node(pRExC_state, SANY);
4465 else
4466 ret = reg_node(pRExC_state, REG_ANY);
4467 *flagp |= HASWIDTH|SIMPLE;
830247a4 4468 RExC_naughty++;
fac92740 4469 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4470 break;
4471 case '[':
b45f050a 4472 {
3dab1dad
YO
4473 char * const oregcomp_parse = ++RExC_parse;
4474 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4475 if (*RExC_parse != ']') {
4476 RExC_parse = oregcomp_parse;
b45f050a
JF
4477 vFAIL("Unmatched [");
4478 }
830247a4 4479 nextchar(pRExC_state);
a0d0e21e 4480 *flagp |= HASWIDTH|SIMPLE;
fac92740 4481 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4482 break;
b45f050a 4483 }
a0d0e21e 4484 case '(':
830247a4 4485 nextchar(pRExC_state);
3dab1dad 4486 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4487 if (ret == NULL) {
bf93d4cc 4488 if (flags & TRYAGAIN) {
830247a4 4489 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4490 /* Make parent create an empty node if needed. */
4491 *flagp |= TRYAGAIN;
4492 return(NULL);
4493 }
a0d0e21e 4494 goto tryagain;
bf93d4cc 4495 }
a0d0e21e
LW
4496 return(NULL);
4497 }
c277df42 4498 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4499 break;
4500 case '|':
4501 case ')':
4502 if (flags & TRYAGAIN) {
4503 *flagp |= TRYAGAIN;
4504 return NULL;
4505 }
b45f050a 4506 vFAIL("Internal urp");
a0d0e21e
LW
4507 /* Supposed to be caught earlier. */
4508 break;
85afd4ae 4509 case '{':
830247a4
IZ
4510 if (!regcurly(RExC_parse)) {
4511 RExC_parse++;
85afd4ae
CS
4512 goto defchar;
4513 }
4514 /* FALL THROUGH */
a0d0e21e
LW
4515 case '?':
4516 case '+':
4517 case '*':
830247a4 4518 RExC_parse++;
b45f050a 4519 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4520 break;
4521 case '\\':
830247a4 4522 switch (*++RExC_parse) {
a0d0e21e 4523 case 'A':
830247a4
IZ
4524 RExC_seen_zerolen++;
4525 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4526 *flagp |= SIMPLE;
830247a4 4527 nextchar(pRExC_state);
fac92740 4528 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4529 break;
4530 case 'G':
830247a4
IZ
4531 ret = reg_node(pRExC_state, GPOS);
4532 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4533 *flagp |= SIMPLE;
830247a4 4534 nextchar(pRExC_state);
fac92740 4535 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4536 break;
4537 case 'Z':
830247a4 4538 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4539 *flagp |= SIMPLE;
a1917ab9 4540 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4541 nextchar(pRExC_state);
a0d0e21e 4542 break;
b85d18e9 4543 case 'z':
830247a4 4544 ret = reg_node(pRExC_state, EOS);
b85d18e9 4545 *flagp |= SIMPLE;
830247a4
IZ
4546 RExC_seen_zerolen++; /* Do not optimize RE away */
4547 nextchar(pRExC_state);
fac92740 4548 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4549 break;
4a2d328f 4550 case 'C':
f33976b4
DB
4551 ret = reg_node(pRExC_state, CANY);
4552 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4553 *flagp |= HASWIDTH|SIMPLE;
830247a4 4554 nextchar(pRExC_state);
fac92740 4555 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4556 break;
4557 case 'X':
830247a4 4558 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4559 *flagp |= HASWIDTH;
830247a4 4560 nextchar(pRExC_state);
fac92740 4561 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4562 break;
a0d0e21e 4563 case 'w':
eb160463 4564 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4565 *flagp |= HASWIDTH|SIMPLE;
830247a4 4566 nextchar(pRExC_state);
fac92740 4567 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4568 break;
4569 case 'W':
eb160463 4570 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4571 *flagp |= HASWIDTH|SIMPLE;
830247a4 4572 nextchar(pRExC_state);
fac92740 4573 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4574 break;
4575 case 'b':
830247a4
IZ
4576 RExC_seen_zerolen++;
4577 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4578 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4579 *flagp |= SIMPLE;
830247a4 4580 nextchar(pRExC_state);
fac92740 4581 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4582 break;
4583 case 'B':
830247a4
IZ
4584 RExC_seen_zerolen++;
4585 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4586 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4587 *flagp |= SIMPLE;
830247a4 4588 nextchar(pRExC_state);
fac92740 4589 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4590 break;
4591 case 's':
eb160463 4592 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4593 *flagp |= HASWIDTH|SIMPLE;
830247a4 4594 nextchar(pRExC_state);
fac92740 4595 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4596 break;
4597 case 'S':
eb160463 4598 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4599 *flagp |= HASWIDTH|SIMPLE;
830247a4 4600 nextchar(pRExC_state);
fac92740 4601 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4602 break;
4603 case 'd':
ffc61ed2 4604 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4605 *flagp |= HASWIDTH|SIMPLE;
830247a4 4606 nextchar(pRExC_state);
fac92740 4607 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4608 break;
4609 case 'D':
ffc61ed2 4610 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4611 *flagp |= HASWIDTH|SIMPLE;
830247a4 4612 nextchar(pRExC_state);
fac92740 4613 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4614 break;
a14b48bc
LW
4615 case 'p':
4616 case 'P':
3568d838 4617 {
3dab1dad 4618 char* const oldregxend = RExC_end;
ccb2c380 4619 char* parse_start = RExC_parse - 2;
a14b48bc 4620
830247a4 4621 if (RExC_parse[1] == '{') {
3568d838 4622 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4623 RExC_end = strchr(RExC_parse, '}');
4624 if (!RExC_end) {
3dab1dad 4625 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4626 RExC_parse += 2;
4627 RExC_end = oldregxend;
0da60cf5 4628 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4629 }
830247a4 4630 RExC_end++;
a14b48bc 4631 }
af6f566e 4632 else {
830247a4 4633 RExC_end = RExC_parse + 2;
af6f566e
HS
4634 if (RExC_end > oldregxend)
4635 RExC_end = oldregxend;
4636 }
830247a4 4637 RExC_parse--;
a14b48bc 4638
3dab1dad 4639 ret = regclass(pRExC_state,depth+1);
a14b48bc 4640
830247a4
IZ
4641 RExC_end = oldregxend;
4642 RExC_parse--;
ccb2c380
MP
4643
4644 Set_Node_Offset(ret, parse_start + 2);
4645 Set_Node_Cur_Length(ret);
830247a4 4646 nextchar(pRExC_state);
a14b48bc
LW
4647 *flagp |= HASWIDTH|SIMPLE;
4648 }
4649 break;
a0d0e21e
LW
4650 case 'n':
4651 case 'r':
4652 case 't':
4653 case 'f':
4654 case 'e':
4655 case 'a':
4656 case 'x':
4657 case 'c':
4658 case '0':
4659 goto defchar;
4660 case '1': case '2': case '3': case '4':
4661 case '5': case '6': case '7': case '8': case '9':
4662 {
1df70142 4663 const I32 num = atoi(RExC_parse);
a0d0e21e 4664
830247a4 4665 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4666 goto defchar;
4667 else {
3dab1dad 4668 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4669 while (isDIGIT(*RExC_parse))
4670 RExC_parse++;
b45f050a 4671
eb160463 4672 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4673 vFAIL("Reference to nonexistent group");
830247a4 4674 RExC_sawback = 1;
eb160463
GS
4675 ret = reganode(pRExC_state,
4676 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4677 num);
a0d0e21e 4678 *flagp |= HASWIDTH;
2af232bd 4679
fac92740 4680 /* override incorrect value set in reganode MJD */
2af232bd 4681 Set_Node_Offset(ret, parse_start+1);
fac92740 4682 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4683 RExC_parse--;
4684 nextchar(pRExC_state);
a0d0e21e
LW
4685 }
4686 }
4687 break;
4688 case '\0':
830247a4 4689 if (RExC_parse >= RExC_end)
b45f050a 4690 FAIL("Trailing \\");
a0d0e21e
LW
4691 /* FALL THROUGH */
4692 default:
a0288114 4693 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4694 back into the quick-grab loop below */
45948336 4695 parse_start--;
a0d0e21e
LW
4696 goto defchar;
4697 }
4698 break;
4633a7c4
LW
4699
4700 case '#':
e2509266 4701 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4702 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4703 RExC_parse++;
830247a4 4704 if (RExC_parse < RExC_end)
4633a7c4
LW
4705 goto tryagain;
4706 }
4707 /* FALL THROUGH */
4708
a0d0e21e 4709 default: {
ba210ebe 4710 register STRLEN len;
58ae7d3f 4711 register UV ender;
a0d0e21e 4712 register char *p;
3dab1dad 4713 char *s;
80aecb99 4714 STRLEN foldlen;
89ebb4a3 4715 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4716
4717 parse_start = RExC_parse - 1;
a0d0e21e 4718
830247a4 4719 RExC_parse++;
a0d0e21e
LW
4720
4721 defchar:
58ae7d3f 4722 ender = 0;
eb160463
GS
4723 ret = reg_node(pRExC_state,
4724 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4725 s = STRING(ret);
830247a4
IZ
4726 for (len = 0, p = RExC_parse - 1;
4727 len < 127 && p < RExC_end;
a0d0e21e
LW
4728 len++)
4729 {
3dab1dad 4730 char * const oldp = p;
5b5a24f7 4731
e2509266 4732 if (RExC_flags & PMf_EXTENDED)
830247a4 4733 p = regwhite(p, RExC_end);
a0d0e21e
LW
4734 switch (*p) {
4735 case '^':
4736 case '$':
4737 case '.':
4738 case '[':
4739 case '(':
4740 case ')':
4741 case '|':
4742 goto loopdone;
4743 case '\\':
4744 switch (*++p) {
4745 case 'A':
1ed8eac0
JF
4746 case 'C':
4747 case 'X':
a0d0e21e
LW
4748 case 'G':
4749 case 'Z':
b85d18e9 4750 case 'z':
a0d0e21e
LW
4751 case 'w':
4752 case 'W':
4753 case 'b':
4754 case 'B':
4755 case 's':
4756 case 'S':
4757 case 'd':
4758 case 'D':
a14b48bc
LW
4759 case 'p':
4760 case 'P':
a0d0e21e
LW
4761 --p;
4762 goto loopdone;
4763 case 'n':
4764 ender = '\n';
4765 p++;
a687059c 4766 break;
a0d0e21e
LW
4767 case 'r':
4768 ender = '\r';
4769 p++;
a687059c 4770 break;
a0d0e21e
LW
4771 case 't':
4772 ender = '\t';
4773 p++;
a687059c 4774 break;
a0d0e21e
LW
4775 case 'f':
4776 ender = '\f';
4777 p++;
a687059c 4778 break;
a0d0e21e 4779 case 'e':
c7f1f016 4780 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4781 p++;
a687059c 4782 break;
a0d0e21e 4783 case 'a':
c7f1f016 4784 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4785 p++;
a687059c 4786 break;
a0d0e21e 4787 case 'x':
a0ed51b3 4788 if (*++p == '{') {
1df70142 4789 char* const e = strchr(p, '}');
b81d288d 4790
b45f050a 4791 if (!e) {
830247a4 4792 RExC_parse = p + 1;
b45f050a
JF
4793 vFAIL("Missing right brace on \\x{}");
4794 }
de5f0749 4795 else {
a4c04bdc
NC
4796 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4797 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4798 STRLEN numlen = e - p - 1;
53305cf1 4799 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4800 if (ender > 0xff)
4801 RExC_utf8 = 1;
a0ed51b3
LW
4802 p = e + 1;
4803 }
a0ed51b3
LW
4804 }
4805 else {
a4c04bdc 4806 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4807 STRLEN numlen = 2;
53305cf1 4808 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4809 p += numlen;
4810 }
a687059c 4811 break;
a0d0e21e
LW
4812 case 'c':
4813 p++;
bbce6d69 4814 ender = UCHARAT(p++);
4815 ender = toCTRL(ender);
a687059c 4816 break;
a0d0e21e
LW
4817 case '0': case '1': case '2': case '3':case '4':
4818 case '5': case '6': case '7': case '8':case '9':
4819 if (*p == '0' ||
830247a4 4820 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4821 I32 flags = 0;
1df70142 4822 STRLEN numlen = 3;
53305cf1 4823 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4824 p += numlen;
4825 }
4826 else {
4827 --p;
4828 goto loopdone;
a687059c
LW
4829 }
4830 break;
a0d0e21e 4831 case '\0':
830247a4 4832 if (p >= RExC_end)
b45f050a 4833 FAIL("Trailing \\");
a687059c 4834 /* FALL THROUGH */
a0d0e21e 4835 default:
041457d9 4836 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4837 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4838 goto normal_default;
a0d0e21e
LW
4839 }
4840 break;
a687059c 4841 default:
a0ed51b3 4842 normal_default:
fd400ab9 4843 if (UTF8_IS_START(*p) && UTF) {
1df70142 4844 STRLEN numlen;
5e12f4fb 4845 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4846 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4847 p += numlen;
4848 }
4849 else
4850 ender = *p++;
a0d0e21e 4851 break;
a687059c 4852 }
e2509266 4853 if (RExC_flags & PMf_EXTENDED)
830247a4 4854 p = regwhite(p, RExC_end);
60a8b682
JH
4855 if (UTF && FOLD) {
4856 /* Prime the casefolded buffer. */
ac7e0132 4857 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4858 }
a0d0e21e
LW
4859 if (ISMULT2(p)) { /* Back off on ?+*. */
4860 if (len)
4861 p = oldp;
16ea2a2e 4862 else if (UTF) {
80aecb99 4863 if (FOLD) {
60a8b682 4864 /* Emit all the Unicode characters. */
1df70142 4865 STRLEN numlen;
80aecb99
JH
4866 for (foldbuf = tmpbuf;
4867 foldlen;
4868 foldlen -= numlen) {
4869 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4870 if (numlen > 0) {
71207a34 4871 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4872 s += unilen;
4873 len += unilen;
4874 /* In EBCDIC the numlen
4875 * and unilen can differ. */
9dc45d57 4876 foldbuf += numlen;
47654450
JH
4877 if (numlen >= foldlen)
4878 break;
9dc45d57
JH
4879 }
4880 else
4881 break; /* "Can't happen." */
80aecb99
JH
4882 }
4883 }
4884 else {
71207a34 4885 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4886 if (unilen > 0) {
0ebc6274
JH
4887 s += unilen;
4888 len += unilen;
9dc45d57 4889 }
80aecb99 4890 }
a0ed51b3 4891 }
a0d0e21e
LW
4892 else {
4893 len++;
eb160463 4894 REGC((char)ender, s++);
a0d0e21e
LW
4895 }
4896 break;
a687059c 4897 }
16ea2a2e 4898 if (UTF) {
80aecb99 4899 if (FOLD) {
60a8b682 4900 /* Emit all the Unicode characters. */
1df70142 4901 STRLEN numlen;
80aecb99
JH
4902 for (foldbuf = tmpbuf;
4903 foldlen;
4904 foldlen -= numlen) {
4905 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4906 if (numlen > 0) {
71207a34 4907 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4908 len += unilen;
4909 s += unilen;
4910 /* In EBCDIC the numlen
4911 * and unilen can differ. */
9dc45d57 4912 foldbuf += numlen;
47654450
JH
4913 if (numlen >= foldlen)
4914 break;
9dc45d57
JH
4915 }
4916 else
4917 break;
80aecb99
JH
4918 }
4919 }
4920 else {
71207a34 4921 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4922 if (unilen > 0) {
0ebc6274
JH
4923 s += unilen;
4924 len += unilen;
9dc45d57 4925 }
80aecb99
JH
4926 }
4927 len--;
a0ed51b3
LW
4928 }
4929 else
eb160463 4930 REGC((char)ender, s++);
a0d0e21e
LW
4931 }
4932 loopdone:
830247a4 4933 RExC_parse = p - 1;
fac92740 4934 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4935 nextchar(pRExC_state);
793db0cb
JH
4936 {
4937 /* len is STRLEN which is unsigned, need to copy to signed */
4938 IV iv = len;
4939 if (iv < 0)
4940 vFAIL("Internal disaster");
4941 }
a0d0e21e
LW
4942 if (len > 0)
4943 *flagp |= HASWIDTH;
090f7165 4944 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4945 *flagp |= SIMPLE;
3dab1dad 4946
cd439c50 4947 if (SIZE_ONLY)
830247a4 4948 RExC_size += STR_SZ(len);
3dab1dad
YO
4949 else {
4950 STR_LEN(ret) = len;
830247a4 4951 RExC_emit += STR_SZ(len);
07be1b83 4952 }
3dab1dad 4953 }
a0d0e21e
LW
4954 break;
4955 }
a687059c 4956
60a8b682
JH
4957 /* If the encoding pragma is in effect recode the text of
4958 * any EXACT-kind nodes. */
3dab1dad
YO
4959 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4960 const STRLEN oldlen = STR_LEN(ret);
4961 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
4962
4963 if (RExC_utf8)
4964 SvUTF8_on(sv);
4965 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4966 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4967 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4968
4969 if (SvUTF8(sv))
4970 RExC_utf8 = 1;
4971 if (!SIZE_ONLY) {
a3621e74
YO
4972 GET_RE_DEBUG_FLAGS_DECL;
4973 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4974 (int)oldlen, STRING(ret),
4975 (int)newlen, s));
4976 Copy(s, STRING(ret), newlen, char);
4977 STR_LEN(ret) += newlen - oldlen;
4978 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4979 } else
4980 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4981 }
a72c7584
JH
4982 }
4983
a0d0e21e 4984 return(ret);
a687059c
LW
4985}
4986
873ef191 4987STATIC char *
5f66b61c 4988S_regwhite(char *p, const char *e)
5b5a24f7
CS
4989{
4990 while (p < e) {
4991 if (isSPACE(*p))
4992 ++p;
4993 else if (*p == '#') {
4994 do {
4995 p++;
4996 } while (p < e && *p != '\n');
4997 }
4998 else
4999 break;
5000 }
5001 return p;
5002}
5003
b8c5462f
JH
5004/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5005 Character classes ([:foo:]) can also be negated ([:^foo:]).
5006 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5007 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5008 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5009
5010#define POSIXCC_DONE(c) ((c) == ':')
5011#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5012#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5013
b8c5462f 5014STATIC I32
830247a4 5015S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5016{
97aff369 5017 dVAR;
936ed897 5018 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5019
830247a4 5020 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5021 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5022 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5023 const char c = UCHARAT(RExC_parse);
097eb12c 5024 char* const s = RExC_parse++;
b81d288d 5025
9a86a77b 5026 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5027 RExC_parse++;
5028 if (RExC_parse == RExC_end)
620e46c5 5029 /* Grandfather lone [:, [=, [. */
830247a4 5030 RExC_parse = s;
620e46c5 5031 else {
3dab1dad 5032 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5033 assert(*t == c);
5034
9a86a77b 5035 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5036 const char *posixcc = s + 1;
830247a4 5037 RExC_parse++; /* skip over the ending ] */
3dab1dad 5038
b8c5462f 5039 if (*s == ':') {
1df70142
AL
5040 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5041 const I32 skip = t - posixcc;
80916619
NC
5042
5043 /* Initially switch on the length of the name. */
5044 switch (skip) {
5045 case 4:
3dab1dad
YO
5046 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5047 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5048 break;
80916619
NC
5049 case 5:
5050 /* Names all of length 5. */
5051 /* alnum alpha ascii blank cntrl digit graph lower
5052 print punct space upper */
5053 /* Offset 4 gives the best switch position. */
5054 switch (posixcc[4]) {
5055 case 'a':
3dab1dad
YO
5056 if (memEQ(posixcc, "alph", 4)) /* alpha */
5057 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5058 break;
5059 case 'e':
3dab1dad
YO
5060 if (memEQ(posixcc, "spac", 4)) /* space */
5061 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5062 break;
5063 case 'h':
3dab1dad
YO
5064 if (memEQ(posixcc, "grap", 4)) /* graph */
5065 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5066 break;
5067 case 'i':
3dab1dad
YO
5068 if (memEQ(posixcc, "asci", 4)) /* ascii */
5069 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5070 break;
5071 case 'k':
3dab1dad
YO
5072 if (memEQ(posixcc, "blan", 4)) /* blank */
5073 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5074 break;
5075 case 'l':
3dab1dad
YO
5076 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5077 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5078 break;
5079 case 'm':
3dab1dad
YO
5080 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5081 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5082 break;
5083 case 'r':
3dab1dad
YO
5084 if (memEQ(posixcc, "lowe", 4)) /* lower */
5085 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5086 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5087 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5088 break;
5089 case 't':
3dab1dad
YO
5090 if (memEQ(posixcc, "digi", 4)) /* digit */
5091 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5092 else if (memEQ(posixcc, "prin", 4)) /* print */
5093 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5094 else if (memEQ(posixcc, "punc", 4)) /* punct */
5095 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5096 break;
b8c5462f
JH
5097 }
5098 break;
80916619 5099 case 6:
3dab1dad
YO
5100 if (memEQ(posixcc, "xdigit", 6))
5101 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5102 break;
5103 }
80916619
NC
5104
5105 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5106 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5107 t - s - 1, s + 1);
80916619
NC
5108 assert (posixcc[skip] == ':');
5109 assert (posixcc[skip+1] == ']');
b45f050a 5110 } else if (!SIZE_ONLY) {
b8c5462f 5111 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5112
830247a4 5113 /* adjust RExC_parse so the warning shows after
b45f050a 5114 the class closes */
9a86a77b 5115 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5116 RExC_parse++;
b45f050a
JF
5117 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5118 }
b8c5462f
JH
5119 } else {
5120 /* Maternal grandfather:
5121 * "[:" ending in ":" but not in ":]" */
830247a4 5122 RExC_parse = s;
767d463e 5123 }
620e46c5
JH
5124 }
5125 }
5126
b8c5462f
JH
5127 return namedclass;
5128}
5129
5130STATIC void
830247a4 5131S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5132{
97aff369 5133 dVAR;
3dab1dad 5134 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5135 const char *s = RExC_parse;
5136 const char c = *s++;
b8c5462f 5137
3dab1dad 5138 while (isALNUM(*s))
b8c5462f
JH
5139 s++;
5140 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5141 if (ckWARN(WARN_REGEXP))
5142 vWARN3(s+2,
5143 "POSIX syntax [%c %c] belongs inside character classes",
5144 c, c);
b45f050a
JF
5145
5146 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5147 if (POSIXCC_NOTYET(c)) {
830247a4 5148 /* adjust RExC_parse so the error shows after
b45f050a 5149 the class closes */
9a86a77b 5150 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5151 NOOP;
b45f050a
JF
5152 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5153 }
b8c5462f
JH
5154 }
5155 }
620e46c5
JH
5156}
5157
7f6f358c
YO
5158
5159/*
5160 parse a class specification and produce either an ANYOF node that
5161 matches the pattern. If the pattern matches a single char only and
5162 that char is < 256 then we produce an EXACT node instead.
5163*/
76e3520e 5164STATIC regnode *
3dab1dad 5165S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5166{
97aff369 5167 dVAR;
07be1b83 5168 register UV value;
9a86a77b 5169 register UV nextvalue;
3568d838 5170 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5171 register IV range = 0;
c277df42 5172 register regnode *ret;
ba210ebe 5173 STRLEN numlen;
ffc61ed2 5174 IV namedclass;
cbbf8932 5175 char *rangebegin = NULL;
936ed897 5176 bool need_class = 0;
c445ea15 5177 SV *listsv = NULL;
ffc61ed2 5178 UV n;
9e55ce06 5179 bool optimize_invert = TRUE;
cbbf8932 5180 AV* unicode_alternate = NULL;
1b2d223b
JH
5181#ifdef EBCDIC
5182 UV literal_endpoint = 0;
5183#endif
7f6f358c 5184 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5185
3dab1dad 5186 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5187 case we need to change the emitted regop to an EXACT. */
07be1b83 5188 const char * orig_parse = RExC_parse;
72f13be8 5189 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
5190#ifndef DEBUGGING
5191 PERL_UNUSED_ARG(depth);
5192#endif
72f13be8 5193
3dab1dad 5194 DEBUG_PARSE("clas");
7f6f358c
YO
5195
5196 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5197 ret = reganode(pRExC_state, ANYOF, 0);
5198
5199 if (!SIZE_ONLY)
5200 ANYOF_FLAGS(ret) = 0;
5201
9a86a77b 5202 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5203 RExC_naughty++;
5204 RExC_parse++;
5205 if (!SIZE_ONLY)
5206 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5207 }
a0d0e21e 5208
73060fc4 5209 if (SIZE_ONLY) {
830247a4 5210 RExC_size += ANYOF_SKIP;
73060fc4
JH
5211 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5212 }
936ed897 5213 else {
830247a4 5214 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5215 if (FOLD)
5216 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5217 if (LOC)
5218 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5219 ANYOF_BITMAP_ZERO(ret);
396482e1 5220 listsv = newSVpvs("# comment\n");
a0d0e21e 5221 }
b8c5462f 5222
9a86a77b
JH
5223 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5224
b938889d 5225 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5226 checkposixcc(pRExC_state);
b8c5462f 5227
f064b6ad
HS
5228 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5229 if (UCHARAT(RExC_parse) == ']')
5230 goto charclassloop;
ffc61ed2 5231
9a86a77b 5232 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5233
5234 charclassloop:
5235
5236 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5237
73b437c8 5238 if (!range)
830247a4 5239 rangebegin = RExC_parse;
ffc61ed2 5240 if (UTF) {
5e12f4fb 5241 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5242 RExC_end - RExC_parse,
9f7f3913 5243 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5244 RExC_parse += numlen;
5245 }
5246 else
5247 value = UCHARAT(RExC_parse++);
7f6f358c 5248
9a86a77b
JH
5249 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5250 if (value == '[' && POSIXCC(nextvalue))
830247a4 5251 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5252 else if (value == '\\') {
ffc61ed2 5253 if (UTF) {
5e12f4fb 5254 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5255 RExC_end - RExC_parse,
9f7f3913 5256 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5257 RExC_parse += numlen;
5258 }
5259 else
5260 value = UCHARAT(RExC_parse++);
470c3474 5261 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5262 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5263 * be a problem later if we want switch on Unicode.
5264 * A similar issue a little bit later when switching on
5265 * namedclass. --jhi */
ffc61ed2 5266 switch ((I32)value) {
b8c5462f
JH
5267 case 'w': namedclass = ANYOF_ALNUM; break;
5268 case 'W': namedclass = ANYOF_NALNUM; break;
5269 case 's': namedclass = ANYOF_SPACE; break;
5270 case 'S': namedclass = ANYOF_NSPACE; break;
5271 case 'd': namedclass = ANYOF_DIGIT; break;
5272 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5273 case 'p':
5274 case 'P':
3dab1dad
YO
5275 {
5276 char *e;
af6f566e 5277 if (RExC_parse >= RExC_end)
2a4859cd 5278 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5279 if (*RExC_parse == '{') {
1df70142 5280 const U8 c = (U8)value;
ffc61ed2
JH
5281 e = strchr(RExC_parse++, '}');
5282 if (!e)
0da60cf5 5283 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5284 while (isSPACE(UCHARAT(RExC_parse)))
5285 RExC_parse++;
5286 if (e == RExC_parse)
0da60cf5 5287 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5288 n = e - RExC_parse;
ab13f0c7
JH
5289 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5290 n--;
ffc61ed2
JH
5291 }
5292 else {
5293 e = RExC_parse;
5294 n = 1;
5295 }
5296 if (!SIZE_ONLY) {
ab13f0c7
JH
5297 if (UCHARAT(RExC_parse) == '^') {
5298 RExC_parse++;
5299 n--;
5300 value = value == 'p' ? 'P' : 'p'; /* toggle */
5301 while (isSPACE(UCHARAT(RExC_parse))) {
5302 RExC_parse++;
5303 n--;
5304 }
5305 }
097eb12c
AL
5306 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5307 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5308 }
5309 RExC_parse = e + 1;
5310 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5311 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5312 }
f81125e2 5313 break;
b8c5462f
JH
5314 case 'n': value = '\n'; break;
5315 case 'r': value = '\r'; break;
5316 case 't': value = '\t'; break;
5317 case 'f': value = '\f'; break;
5318 case 'b': value = '\b'; break;
c7f1f016
NIS
5319 case 'e': value = ASCII_TO_NATIVE('\033');break;
5320 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5321 case 'x':
ffc61ed2 5322 if (*RExC_parse == '{') {
a4c04bdc
NC
5323 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5324 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5325 char * const e = strchr(RExC_parse++, '}');
b81d288d 5326 if (!e)
ffc61ed2 5327 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5328
5329 numlen = e - RExC_parse;
5330 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5331 RExC_parse = e + 1;
5332 }
5333 else {
a4c04bdc 5334 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5335 numlen = 2;
5336 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5337 RExC_parse += numlen;
5338 }
b8c5462f
JH
5339 break;
5340 case 'c':
830247a4 5341 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5342 value = toCTRL(value);
5343 break;
5344 case '0': case '1': case '2': case '3': case '4':
5345 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5346 {
5347 I32 flags = 0;
5348 numlen = 3;
5349 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5350 RExC_parse += numlen;
b8c5462f 5351 break;
53305cf1 5352 }
1028017a 5353 default:
041457d9 5354 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5355 vWARN2(RExC_parse,
5356 "Unrecognized escape \\%c in character class passed through",
5357 (int)value);
1028017a 5358 break;
b8c5462f 5359 }
ffc61ed2 5360 } /* end of \blah */
1b2d223b
JH
5361#ifdef EBCDIC
5362 else
5363 literal_endpoint++;
5364#endif
ffc61ed2
JH
5365
5366 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5367
5368 if (!SIZE_ONLY && !need_class)
936ed897 5369 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5370
936ed897 5371 need_class = 1;
ffc61ed2
JH
5372
5373 /* a bad range like a-\d, a-[:digit:] ? */
5374 if (range) {
73b437c8 5375 if (!SIZE_ONLY) {
afd78fd5 5376 if (ckWARN(WARN_REGEXP)) {
097eb12c 5377 const int w =
afd78fd5
JH
5378 RExC_parse >= rangebegin ?
5379 RExC_parse - rangebegin : 0;
830247a4 5380 vWARN4(RExC_parse,
b45f050a 5381 "False [] range \"%*.*s\"",
097eb12c 5382 w, w, rangebegin);
afd78fd5 5383 }
3568d838
JH
5384 if (prevvalue < 256) {
5385 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5386 ANYOF_BITMAP_SET(ret, '-');
5387 }
5388 else {
5389 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5390 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5391 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5392 }
b8c5462f 5393 }
ffc61ed2
JH
5394
5395 range = 0; /* this was not a true range */
73b437c8 5396 }
ffc61ed2 5397
73b437c8 5398 if (!SIZE_ONLY) {
c49a72a9
NC
5399 const char *what = NULL;
5400 char yesno = 0;
5401
3568d838
JH
5402 if (namedclass > OOB_NAMEDCLASS)
5403 optimize_invert = FALSE;
e2962f66
JH
5404 /* Possible truncation here but in some 64-bit environments
5405 * the compiler gets heartburn about switch on 64-bit values.
5406 * A similar issue a little earlier when switching on value.
98f323fa 5407 * --jhi */
e2962f66 5408 switch ((I32)namedclass) {
73b437c8
JH
5409 case ANYOF_ALNUM:
5410 if (LOC)
936ed897 5411 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5412 else {
5413 for (value = 0; value < 256; value++)
5414 if (isALNUM(value))
936ed897 5415 ANYOF_BITMAP_SET(ret, value);
73b437c8 5416 }
c49a72a9
NC
5417 yesno = '+';
5418 what = "Word";
73b437c8
JH
5419 break;
5420 case ANYOF_NALNUM:
5421 if (LOC)
936ed897 5422 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5423 else {
5424 for (value = 0; value < 256; value++)
5425 if (!isALNUM(value))
936ed897 5426 ANYOF_BITMAP_SET(ret, value);
73b437c8 5427 }
c49a72a9
NC
5428 yesno = '!';
5429 what = "Word";
73b437c8 5430 break;
ffc61ed2 5431 case ANYOF_ALNUMC:
73b437c8 5432 if (LOC)
ffc61ed2 5433 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5434 else {
5435 for (value = 0; value < 256; value++)
ffc61ed2 5436 if (isALNUMC(value))
936ed897 5437 ANYOF_BITMAP_SET(ret, value);
73b437c8 5438 }
c49a72a9
NC
5439 yesno = '+';
5440 what = "Alnum";
73b437c8
JH
5441 break;
5442 case ANYOF_NALNUMC:
5443 if (LOC)
936ed897 5444 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5445 else {
5446 for (value = 0; value < 256; value++)
5447 if (!isALNUMC(value))
936ed897 5448 ANYOF_BITMAP_SET(ret, value);
73b437c8 5449 }
c49a72a9
NC
5450 yesno = '!';
5451 what = "Alnum";
73b437c8
JH
5452 break;
5453 case ANYOF_ALPHA:
5454 if (LOC)
936ed897 5455 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5456 else {
5457 for (value = 0; value < 256; value++)
5458 if (isALPHA(value))
936ed897 5459 ANYOF_BITMAP_SET(ret, value);
73b437c8 5460 }
c49a72a9
NC
5461 yesno = '+';
5462 what = "Alpha";
73b437c8
JH
5463 break;
5464 case ANYOF_NALPHA:
5465 if (LOC)
936ed897 5466 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5467 else {
5468 for (value = 0; value < 256; value++)
5469 if (!isALPHA(value))
936ed897 5470 ANYOF_BITMAP_SET(ret, value);
73b437c8 5471 }
c49a72a9
NC
5472 yesno = '!';
5473 what = "Alpha";
73b437c8
JH
5474 break;
5475 case ANYOF_ASCII:
5476 if (LOC)
936ed897 5477 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5478 else {
c7f1f016 5479#ifndef EBCDIC
1ba5c669
JH
5480 for (value = 0; value < 128; value++)
5481 ANYOF_BITMAP_SET(ret, value);
5482#else /* EBCDIC */
ffbc6a93 5483 for (value = 0; value < 256; value++) {
3a3c4447
JH
5484 if (isASCII(value))
5485 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5486 }
1ba5c669 5487#endif /* EBCDIC */
73b437c8 5488 }
c49a72a9
NC
5489 yesno = '+';
5490 what = "ASCII";
73b437c8
JH
5491 break;
5492 case ANYOF_NASCII:
5493 if (LOC)
936ed897 5494 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5495 else {
c7f1f016 5496#ifndef EBCDIC
1ba5c669
JH
5497 for (value = 128; value < 256; value++)
5498 ANYOF_BITMAP_SET(ret, value);
5499#else /* EBCDIC */
ffbc6a93 5500 for (value = 0; value < 256; value++) {
3a3c4447
JH
5501 if (!isASCII(value))
5502 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5503 }
1ba5c669 5504#endif /* EBCDIC */
73b437c8 5505 }
c49a72a9
NC
5506 yesno = '!';
5507 what = "ASCII";
73b437c8 5508 break;
aaa51d5e
JF
5509 case ANYOF_BLANK:
5510 if (LOC)
5511 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5512 else {
5513 for (value = 0; value < 256; value++)
5514 if (isBLANK(value))
5515 ANYOF_BITMAP_SET(ret, value);
5516 }
c49a72a9
NC
5517 yesno = '+';
5518 what = "Blank";
aaa51d5e
JF
5519 break;
5520 case ANYOF_NBLANK:
5521 if (LOC)
5522 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5523 else {
5524 for (value = 0; value < 256; value++)
5525 if (!isBLANK(value))
5526 ANYOF_BITMAP_SET(ret, value);
5527 }
c49a72a9
NC
5528 yesno = '!';
5529 what = "Blank";
aaa51d5e 5530 break;
73b437c8
JH
5531 case ANYOF_CNTRL:
5532 if (LOC)
936ed897 5533 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5534 else {
5535 for (value = 0; value < 256; value++)
5536 if (isCNTRL(value))
936ed897 5537 ANYOF_BITMAP_SET(ret, value);
73b437c8 5538 }
c49a72a9
NC
5539 yesno = '+';
5540 what = "Cntrl";
73b437c8
JH
5541 break;
5542 case ANYOF_NCNTRL:
5543 if (LOC)
936ed897 5544 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5545 else {
5546 for (value = 0; value < 256; value++)
5547 if (!isCNTRL(value))
936ed897 5548 ANYOF_BITMAP_SET(ret, value);
73b437c8 5549 }
c49a72a9
NC
5550 yesno = '!';
5551 what = "Cntrl";
ffc61ed2
JH
5552 break;
5553 case ANYOF_DIGIT:
5554 if (LOC)
5555 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5556 else {
5557 /* consecutive digits assumed */
5558 for (value = '0'; value <= '9'; value++)
5559 ANYOF_BITMAP_SET(ret, value);
5560 }
c49a72a9
NC
5561 yesno = '+';
5562 what = "Digit";
ffc61ed2
JH
5563 break;
5564 case ANYOF_NDIGIT:
5565 if (LOC)
5566 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5567 else {
5568 /* consecutive digits assumed */
5569 for (value = 0; value < '0'; value++)
5570 ANYOF_BITMAP_SET(ret, value);
5571 for (value = '9' + 1; value < 256; value++)
5572 ANYOF_BITMAP_SET(ret, value);
5573 }
c49a72a9
NC
5574 yesno = '!';
5575 what = "Digit";
73b437c8
JH
5576 break;
5577 case ANYOF_GRAPH:
5578 if (LOC)
936ed897 5579 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5580 else {
5581 for (value = 0; value < 256; value++)
5582 if (isGRAPH(value))
936ed897 5583 ANYOF_BITMAP_SET(ret, value);
73b437c8 5584 }
c49a72a9
NC
5585 yesno = '+';
5586 what = "Graph";
73b437c8
JH
5587 break;
5588 case ANYOF_NGRAPH:
5589 if (LOC)
936ed897 5590 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5591 else {
5592 for (value = 0; value < 256; value++)
5593 if (!isGRAPH(value))
936ed897 5594 ANYOF_BITMAP_SET(ret, value);
73b437c8 5595 }
c49a72a9
NC
5596 yesno = '!';
5597 what = "Graph";
73b437c8
JH
5598 break;
5599 case ANYOF_LOWER:
5600 if (LOC)
936ed897 5601 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5602 else {
5603 for (value = 0; value < 256; value++)
5604 if (isLOWER(value))
936ed897 5605 ANYOF_BITMAP_SET(ret, value);
73b437c8 5606 }
c49a72a9
NC
5607 yesno = '+';
5608 what = "Lower";
73b437c8
JH
5609 break;
5610 case ANYOF_NLOWER:
5611 if (LOC)
936ed897 5612 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5613 else {
5614 for (value = 0; value < 256; value++)
5615 if (!isLOWER(value))
936ed897 5616 ANYOF_BITMAP_SET(ret, value);
73b437c8 5617 }
c49a72a9
NC
5618 yesno = '!';
5619 what = "Lower";
73b437c8
JH
5620 break;
5621 case ANYOF_PRINT:
5622 if (LOC)
936ed897 5623 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5624 else {
5625 for (value = 0; value < 256; value++)
5626 if (isPRINT(value))
936ed897 5627 ANYOF_BITMAP_SET(ret, value);
73b437c8 5628 }
c49a72a9
NC
5629 yesno = '+';
5630 what = "Print";
73b437c8
JH
5631 break;
5632 case ANYOF_NPRINT:
5633 if (LOC)
936ed897 5634 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5635 else {
5636 for (value = 0; value < 256; value++)
5637 if (!isPRINT(value))
936ed897 5638 ANYOF_BITMAP_SET(ret, value);
73b437c8 5639 }
c49a72a9
NC
5640 yesno = '!';
5641 what = "Print";
73b437c8 5642 break;
aaa51d5e
JF
5643 case ANYOF_PSXSPC:
5644 if (LOC)
5645 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5646 else {
5647 for (value = 0; value < 256; value++)
5648 if (isPSXSPC(value))
5649 ANYOF_BITMAP_SET(ret, value);
5650 }
c49a72a9
NC
5651 yesno = '+';
5652 what = "Space";
aaa51d5e
JF
5653 break;
5654 case ANYOF_NPSXSPC:
5655 if (LOC)
5656 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5657 else {
5658 for (value = 0; value < 256; value++)
5659 if (!isPSXSPC(value))
5660 ANYOF_BITMAP_SET(ret, value);
5661 }
c49a72a9
NC
5662 yesno = '!';
5663 what = "Space";
aaa51d5e 5664 break;
73b437c8
JH
5665 case ANYOF_PUNCT:
5666 if (LOC)
936ed897 5667 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5668 else {
5669 for (value = 0; value < 256; value++)
5670 if (isPUNCT(value))
936ed897 5671 ANYOF_BITMAP_SET(ret, value);
73b437c8 5672 }
c49a72a9
NC
5673 yesno = '+';
5674 what = "Punct";
73b437c8
JH
5675 break;
5676 case ANYOF_NPUNCT:
5677 if (LOC)
936ed897 5678 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5679 else {
5680 for (value = 0; value < 256; value++)
5681 if (!isPUNCT(value))
936ed897 5682 ANYOF_BITMAP_SET(ret, value);
73b437c8 5683 }
c49a72a9
NC
5684 yesno = '!';
5685 what = "Punct";
ffc61ed2
JH
5686 break;
5687 case ANYOF_SPACE:
5688 if (LOC)
5689 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5690 else {
5691 for (value = 0; value < 256; value++)
5692 if (isSPACE(value))
5693 ANYOF_BITMAP_SET(ret, value);
5694 }
c49a72a9
NC
5695 yesno = '+';
5696 what = "SpacePerl";
ffc61ed2
JH
5697 break;
5698 case ANYOF_NSPACE:
5699 if (LOC)
5700 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5701 else {
5702 for (value = 0; value < 256; value++)
5703 if (!isSPACE(value))
5704 ANYOF_BITMAP_SET(ret, value);
5705 }
c49a72a9
NC
5706 yesno = '!';
5707 what = "SpacePerl";
73b437c8
JH
5708 break;
5709 case ANYOF_UPPER:
5710 if (LOC)
936ed897 5711 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5712 else {
5713 for (value = 0; value < 256; value++)
5714 if (isUPPER(value))
936ed897 5715 ANYOF_BITMAP_SET(ret, value);
73b437c8 5716 }
c49a72a9
NC
5717 yesno = '+';
5718 what = "Upper";
73b437c8
JH
5719 break;
5720 case ANYOF_NUPPER:
5721 if (LOC)
936ed897 5722 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5723 else {
5724 for (value = 0; value < 256; value++)
5725 if (!isUPPER(value))
936ed897 5726 ANYOF_BITMAP_SET(ret, value);
73b437c8 5727 }
c49a72a9
NC
5728 yesno = '!';
5729 what = "Upper";
73b437c8
JH
5730 break;
5731 case ANYOF_XDIGIT:
5732 if (LOC)
936ed897 5733 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5734 else {
5735 for (value = 0; value < 256; value++)
5736 if (isXDIGIT(value))
936ed897 5737 ANYOF_BITMAP_SET(ret, value);
73b437c8 5738 }
c49a72a9
NC
5739 yesno = '+';
5740 what = "XDigit";
73b437c8
JH
5741 break;
5742 case ANYOF_NXDIGIT:
5743 if (LOC)
936ed897 5744 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5745 else {
5746 for (value = 0; value < 256; value++)
5747 if (!isXDIGIT(value))
936ed897 5748 ANYOF_BITMAP_SET(ret, value);
73b437c8 5749 }
c49a72a9
NC
5750 yesno = '!';
5751 what = "XDigit";
73b437c8 5752 break;
f81125e2
JP
5753 case ANYOF_MAX:
5754 /* this is to handle \p and \P */
5755 break;
73b437c8 5756 default:
b45f050a 5757 vFAIL("Invalid [::] class");
73b437c8 5758 break;
b8c5462f 5759 }
c49a72a9
NC
5760 if (what) {
5761 /* Strings such as "+utf8::isWord\n" */
5762 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5763 }
b8c5462f 5764 if (LOC)
936ed897 5765 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5766 continue;
a0d0e21e 5767 }
ffc61ed2
JH
5768 } /* end of namedclass \blah */
5769
a0d0e21e 5770 if (range) {
eb160463 5771 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5772 const int w = RExC_parse - rangebegin;
5773 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5774 range = 0; /* not a valid range */
73b437c8 5775 }
a0d0e21e
LW
5776 }
5777 else {
3568d838 5778 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5779 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5780 RExC_parse[1] != ']') {
5781 RExC_parse++;
ffc61ed2
JH
5782
5783 /* a bad range like \w-, [:word:]- ? */
5784 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5785 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5786 const int w =
afd78fd5
JH
5787 RExC_parse >= rangebegin ?
5788 RExC_parse - rangebegin : 0;
830247a4 5789 vWARN4(RExC_parse,
b45f050a 5790 "False [] range \"%*.*s\"",
097eb12c 5791 w, w, rangebegin);
afd78fd5 5792 }
73b437c8 5793 if (!SIZE_ONLY)
936ed897 5794 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5795 } else
ffc61ed2
JH
5796 range = 1; /* yeah, it's a range! */
5797 continue; /* but do it the next time */
a0d0e21e 5798 }
a687059c 5799 }
ffc61ed2 5800
93733859 5801 /* now is the next time */
07be1b83 5802 /*stored += (value - prevvalue + 1);*/
ae5c130c 5803 if (!SIZE_ONLY) {
3568d838 5804 if (prevvalue < 256) {
1df70142 5805 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5806 IV i;
3568d838 5807#ifdef EBCDIC
1b2d223b
JH
5808 /* In EBCDIC [\x89-\x91] should include
5809 * the \x8e but [i-j] should not. */
5810 if (literal_endpoint == 2 &&
5811 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5812 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5813 {
3568d838
JH
5814 if (isLOWER(prevvalue)) {
5815 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5816 if (isLOWER(i))
5817 ANYOF_BITMAP_SET(ret, i);
5818 } else {
3568d838 5819 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5820 if (isUPPER(i))
5821 ANYOF_BITMAP_SET(ret, i);
5822 }
8ada0baa 5823 }
ffc61ed2 5824 else
8ada0baa 5825#endif
07be1b83
YO
5826 for (i = prevvalue; i <= ceilvalue; i++) {
5827 if (!ANYOF_BITMAP_TEST(ret,i)) {
5828 stored++;
5829 ANYOF_BITMAP_SET(ret, i);
5830 }
5831 }
3568d838 5832 }
a5961de5 5833 if (value > 255 || UTF) {
1df70142
AL
5834 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5835 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 5836 stored+=2; /* can't optimize this class */
ffc61ed2 5837 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5838 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5839 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5840 prevnatvalue, natvalue);
5841 }
5842 else if (prevnatvalue == natvalue) {
5843 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5844 if (FOLD) {
89ebb4a3 5845 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5846 STRLEN foldlen;
1df70142 5847 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5848
c840d2a2
JH
5849 /* If folding and foldable and a single
5850 * character, insert also the folded version
5851 * to the charclass. */
9e55ce06 5852 if (f != value) {
eb160463 5853 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5854 Perl_sv_catpvf(aTHX_ listsv,
5855 "%04"UVxf"\n", f);
5856 else {
5857 /* Any multicharacter foldings
5858 * require the following transform:
5859 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5860 * where E folds into "pq" and F folds
5861 * into "rst", all other characters
5862 * fold to single characters. We save
5863 * away these multicharacter foldings,
5864 * to be later saved as part of the
5865 * additional "s" data. */
5866 SV *sv;
5867
5868 if (!unicode_alternate)
5869 unicode_alternate = newAV();
5870 sv = newSVpvn((char*)foldbuf, foldlen);
5871 SvUTF8_on(sv);
5872 av_push(unicode_alternate, sv);
5873 }
5874 }
254ba52a 5875
60a8b682
JH
5876 /* If folding and the value is one of the Greek
5877 * sigmas insert a few more sigmas to make the
5878 * folding rules of the sigmas to work right.
5879 * Note that not all the possible combinations
5880 * are handled here: some of them are handled
9e55ce06
JH
5881 * by the standard folding rules, and some of
5882 * them (literal or EXACTF cases) are handled
5883 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5884 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5885 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5886 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5887 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5888 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5889 }
5890 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5891 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5892 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5893 }
5894 }
ffc61ed2 5895 }
1b2d223b
JH
5896#ifdef EBCDIC
5897 literal_endpoint = 0;
5898#endif
8ada0baa 5899 }
ffc61ed2
JH
5900
5901 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5902 }
ffc61ed2 5903
936ed897 5904 if (need_class) {
4f66b38d 5905 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5906 if (SIZE_ONLY)
830247a4 5907 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5908 else
830247a4 5909 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5910 }
ffc61ed2 5911
7f6f358c
YO
5912
5913 if (SIZE_ONLY)
5914 return ret;
5915 /****** !SIZE_ONLY AFTER HERE *********/
5916
5917 if( stored == 1 && value < 256
5918 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5919 ) {
5920 /* optimize single char class to an EXACT node
5921 but *only* when its not a UTF/high char */
07be1b83
YO
5922 const char * cur_parse= RExC_parse;
5923 RExC_emit = (regnode *)orig_emit;
5924 RExC_parse = (char *)orig_parse;
7f6f358c
YO
5925 ret = reg_node(pRExC_state,
5926 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 5927 RExC_parse = (char *)cur_parse;
7f6f358c
YO
5928 *STRING(ret)= (char)value;
5929 STR_LEN(ret)= 1;
5930 RExC_emit += STR_SZ(1);
5931 return ret;
5932 }
ae5c130c 5933 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5934 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5935 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5936 ) {
a0ed51b3 5937 for (value = 0; value < 256; ++value) {
936ed897 5938 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5939 UV fold = PL_fold[value];
ffc61ed2
JH
5940
5941 if (fold != value)
5942 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5943 }
5944 }
936ed897 5945 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5946 }
ffc61ed2 5947
ae5c130c 5948 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5949 if (optimize_invert &&
ffc61ed2
JH
5950 /* If the only flag is inversion. */
5951 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5952 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5953 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5954 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5955 }
7f6f358c 5956 {
097eb12c 5957 AV * const av = newAV();
ffc61ed2 5958 SV *rv;
9e55ce06 5959 /* The 0th element stores the character class description
6a0407ee 5960 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5961 * to initialize the appropriate swash (which gets stored in
5962 * the 1st element), and also useful for dumping the regnode.
5963 * The 2nd element stores the multicharacter foldings,
6a0407ee 5964 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5965 av_store(av, 0, listsv);
5966 av_store(av, 1, NULL);
9e55ce06 5967 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5968 rv = newRV_noinc((SV*)av);
19860706 5969 n = add_data(pRExC_state, 1, "s");
830247a4 5970 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5971 ARG_SET(ret, n);
a0ed51b3 5972 }
a0ed51b3
LW
5973 return ret;
5974}
5975
76e3520e 5976STATIC char*
830247a4 5977S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5978{
097eb12c 5979 char* const retval = RExC_parse++;
a0d0e21e 5980
4633a7c4 5981 for (;;) {
830247a4
IZ
5982 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5983 RExC_parse[2] == '#') {
e994fd66
AE
5984 while (*RExC_parse != ')') {
5985 if (RExC_parse == RExC_end)
5986 FAIL("Sequence (?#... not terminated");
830247a4 5987 RExC_parse++;
e994fd66 5988 }
830247a4 5989 RExC_parse++;
4633a7c4
LW
5990 continue;
5991 }
e2509266 5992 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5993 if (isSPACE(*RExC_parse)) {
5994 RExC_parse++;
748a9306
LW
5995 continue;
5996 }
830247a4 5997 else if (*RExC_parse == '#') {
e994fd66
AE
5998 while (RExC_parse < RExC_end)
5999 if (*RExC_parse++ == '\n') break;
748a9306
LW
6000 continue;
6001 }
748a9306 6002 }
4633a7c4 6003 return retval;
a0d0e21e 6004 }
a687059c
LW
6005}
6006
6007/*
c277df42 6008- reg_node - emit a node
a0d0e21e 6009*/
76e3520e 6010STATIC regnode * /* Location. */
830247a4 6011S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6012{
97aff369 6013 dVAR;
c277df42 6014 register regnode *ptr;
504618e9 6015 regnode * const ret = RExC_emit;
07be1b83 6016 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6017
c277df42 6018 if (SIZE_ONLY) {
830247a4
IZ
6019 SIZE_ALIGN(RExC_size);
6020 RExC_size += 1;
a0d0e21e
LW
6021 return(ret);
6022 }
c277df42 6023 NODE_ALIGN_FILL(ret);
a0d0e21e 6024 ptr = ret;
c277df42 6025 FILL_ADVANCE_NODE(ptr, op);
fac92740 6026 if (RExC_offsets) { /* MJD */
07be1b83 6027 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6028 "reg_node", __LINE__,
6029 reg_name[op],
07be1b83
YO
6030 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6031 ? "Overwriting end of array!\n" : "OK",
6032 (UV)(RExC_emit - RExC_emit_start),
6033 (UV)(RExC_parse - RExC_start),
6034 (UV)RExC_offsets[0]));
ccb2c380 6035 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6036 }
07be1b83 6037
830247a4 6038 RExC_emit = ptr;
a687059c 6039
a0d0e21e 6040 return(ret);
a687059c
LW
6041}
6042
6043/*
a0d0e21e
LW
6044- reganode - emit a node with an argument
6045*/
76e3520e 6046STATIC regnode * /* Location. */
830247a4 6047S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6048{
97aff369 6049 dVAR;
c277df42 6050 register regnode *ptr;
504618e9 6051 regnode * const ret = RExC_emit;
07be1b83 6052 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6053
c277df42 6054 if (SIZE_ONLY) {
830247a4
IZ
6055 SIZE_ALIGN(RExC_size);
6056 RExC_size += 2;
a0d0e21e
LW
6057 return(ret);
6058 }
fe14fcc3 6059
c277df42 6060 NODE_ALIGN_FILL(ret);
a0d0e21e 6061 ptr = ret;
c277df42 6062 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6063 if (RExC_offsets) { /* MJD */
07be1b83 6064 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6065 "reganode",
ccb2c380
MP
6066 __LINE__,
6067 reg_name[op],
07be1b83 6068 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6069 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6070 (UV)(RExC_emit - RExC_emit_start),
6071 (UV)(RExC_parse - RExC_start),
6072 (UV)RExC_offsets[0]));
ccb2c380 6073 Set_Cur_Node_Offset;
fac92740
MJD
6074 }
6075
830247a4 6076 RExC_emit = ptr;
fe14fcc3 6077
a0d0e21e 6078 return(ret);
fe14fcc3
LW
6079}
6080
6081/*
cd439c50 6082- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6083*/
71207a34
AL
6084STATIC STRLEN
6085S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6086{
97aff369 6087 dVAR;
71207a34 6088 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6089}
6090
6091/*
a0d0e21e
LW
6092- reginsert - insert an operator in front of already-emitted operand
6093*
6094* Means relocating the operand.
6095*/
76e3520e 6096STATIC void
830247a4 6097S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6098{
97aff369 6099 dVAR;
c277df42
IZ
6100 register regnode *src;
6101 register regnode *dst;
6102 register regnode *place;
504618e9 6103 const int offset = regarglen[(U8)op];
07be1b83 6104 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6105/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6106
6107 if (SIZE_ONLY) {
830247a4 6108 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6109 return;
6110 }
a687059c 6111
830247a4
IZ
6112 src = RExC_emit;
6113 RExC_emit += NODE_STEP_REGNODE + offset;
6114 dst = RExC_emit;
fac92740 6115 while (src > opnd) {
c277df42 6116 StructCopy(--src, --dst, regnode);
fac92740 6117 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6118 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6119 "reg_insert",
ccb2c380
MP
6120 __LINE__,
6121 reg_name[op],
07be1b83
YO
6122 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6123 ? "Overwriting end of array!\n" : "OK",
6124 (UV)(src - RExC_emit_start),
6125 (UV)(dst - RExC_emit_start),
6126 (UV)RExC_offsets[0]));
ccb2c380
MP
6127 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6128 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6129 }
6130 }
6131
a0d0e21e
LW
6132
6133 place = opnd; /* Op node, where operand used to be. */
fac92740 6134 if (RExC_offsets) { /* MJD */
07be1b83 6135 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6136 "reginsert",
ccb2c380
MP
6137 __LINE__,
6138 reg_name[op],
07be1b83 6139 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6140 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6141 (UV)(place - RExC_emit_start),
6142 (UV)(RExC_parse - RExC_start),
fac92740 6143 RExC_offsets[0]));
ccb2c380 6144 Set_Node_Offset(place, RExC_parse);
45948336 6145 Set_Node_Length(place, 1);
fac92740 6146 }
c277df42
IZ
6147 src = NEXTOPER(place);
6148 FILL_ADVANCE_NODE(place, op);
6149 Zero(src, offset, regnode);
a687059c
LW
6150}
6151
6152/*
c277df42 6153- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6154- SEE ALSO: regtail_study
a0d0e21e 6155*/
097eb12c 6156/* TODO: All three parms should be const */
76e3520e 6157STATIC void
3dab1dad 6158S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6159{
97aff369 6160 dVAR;
c277df42 6161 register regnode *scan;
72f13be8 6162 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
6163#ifndef DEBUGGING
6164 PERL_UNUSED_ARG(depth);
6165#endif
a0d0e21e 6166
c277df42 6167 if (SIZE_ONLY)
a0d0e21e
LW
6168 return;
6169
6170 /* Find last node. */
6171 scan = p;
6172 for (;;) {
504618e9 6173 regnode * const temp = regnext(scan);
3dab1dad
YO
6174 DEBUG_PARSE_r({
6175 SV * const mysv=sv_newmortal();
6176 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6177 regprop(RExC_rx, mysv, scan);
6178 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6179 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6180 });
6181 if (temp == NULL)
6182 break;
6183 scan = temp;
6184 }
6185
6186 if (reg_off_by_arg[OP(scan)]) {
6187 ARG_SET(scan, val - scan);
6188 }
6189 else {
6190 NEXT_OFF(scan) = val - scan;
6191 }
6192}
6193
07be1b83 6194#ifdef DEBUGGING
3dab1dad
YO
6195/*
6196- regtail_study - set the next-pointer at the end of a node chain of p to val.
6197- Look for optimizable sequences at the same time.
6198- currently only looks for EXACT chains.
07be1b83
YO
6199
6200This is expermental code. The idea is to use this routine to perform
6201in place optimizations on branches and groups as they are constructed,
6202with the long term intention of removing optimization from study_chunk so
6203that it is purely analytical.
6204
6205Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6206to control which is which.
6207
3dab1dad
YO
6208*/
6209/* TODO: All four parms should be const */
07be1b83 6210
3dab1dad
YO
6211STATIC U8
6212S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6213{
6214 dVAR;
6215 register regnode *scan;
07be1b83
YO
6216 U8 exact = PSEUDO;
6217#ifdef EXPERIMENTAL_INPLACESCAN
6218 I32 min = 0;
6219#endif
6220
3dab1dad
YO
6221 GET_RE_DEBUG_FLAGS_DECL;
6222
07be1b83 6223
3dab1dad
YO
6224 if (SIZE_ONLY)
6225 return exact;
6226
6227 /* Find last node. */
6228
6229 scan = p;
6230 for (;;) {
6231 regnode * const temp = regnext(scan);
07be1b83
YO
6232#ifdef EXPERIMENTAL_INPLACESCAN
6233 if (PL_regkind[OP(scan)] == EXACT)
6234 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6235 return EXACT;
6236#endif
3dab1dad
YO
6237 if ( exact ) {
6238 switch (OP(scan)) {
6239 case EXACT:
6240 case EXACTF:
6241 case EXACTFL:
6242 if( exact == PSEUDO )
6243 exact= OP(scan);
07be1b83
YO
6244 else if ( exact != OP(scan) )
6245 exact= 0;
3dab1dad
YO
6246 case NOTHING:
6247 break;
6248 default:
6249 exact= 0;
6250 }
6251 }
6252 DEBUG_PARSE_r({
6253 SV * const mysv=sv_newmortal();
6254 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6255 regprop(RExC_rx, mysv, scan);
6256 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6257 SvPV_nolen_const(mysv),
6258 reg_name[exact],
6259 REG_NODE_NUM(scan));
6260 });
a0d0e21e
LW
6261 if (temp == NULL)
6262 break;
6263 scan = temp;
6264 }
07be1b83
YO
6265 DEBUG_PARSE_r({
6266 SV * const mysv_val=sv_newmortal();
6267 DEBUG_PARSE_MSG("");
6268 regprop(RExC_rx, mysv_val, val);
6269 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6270 SvPV_nolen_const(mysv_val),
6271 REG_NODE_NUM(val),
6272 val - scan
6273 );
6274 });
c277df42
IZ
6275 if (reg_off_by_arg[OP(scan)]) {
6276 ARG_SET(scan, val - scan);
a0ed51b3
LW
6277 }
6278 else {
c277df42
IZ
6279 NEXT_OFF(scan) = val - scan;
6280 }
3dab1dad
YO
6281
6282 return exact;
a687059c 6283}
07be1b83 6284#endif
a687059c
LW
6285
6286/*
a687059c
LW
6287 - regcurly - a little FSA that accepts {\d+,?\d*}
6288 */
79072805 6289STATIC I32
5f66b61c 6290S_regcurly(register const char *s)
a687059c
LW
6291{
6292 if (*s++ != '{')
6293 return FALSE;
f0fcb552 6294 if (!isDIGIT(*s))
a687059c 6295 return FALSE;
f0fcb552 6296 while (isDIGIT(*s))
a687059c
LW
6297 s++;
6298 if (*s == ',')
6299 s++;
f0fcb552 6300 while (isDIGIT(*s))
a687059c
LW
6301 s++;
6302 if (*s != '}')
6303 return FALSE;
6304 return TRUE;
6305}
6306
a687059c
LW
6307
6308/*
fd181c75 6309 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6310 */
6311void
097eb12c 6312Perl_regdump(pTHX_ const regexp *r)
a687059c 6313{
35ff7856 6314#ifdef DEBUGGING
97aff369 6315 dVAR;
c445ea15 6316 SV * const sv = sv_newmortal();
a687059c 6317
4f639d21 6318 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
6319
6320 /* Header fields of interest. */
c277df42 6321 if (r->anchored_substr)
7b0972df 6322 PerlIO_printf(Perl_debug_log,
a0288114 6323 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 6324 PL_colors[0],
7b0972df 6325 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 6326 SvPVX_const(r->anchored_substr),
3280af22 6327 PL_colors[1],
c277df42 6328 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 6329 (IV)r->anchored_offset);
33b8afdf
JH
6330 else if (r->anchored_utf8)
6331 PerlIO_printf(Perl_debug_log,
a0288114 6332 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
6333 PL_colors[0],
6334 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 6335 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
6336 PL_colors[1],
6337 SvTAIL(r->anchored_utf8) ? "$" : "",
6338 (IV)r->anchored_offset);
c277df42 6339 if (r->float_substr)
7b0972df 6340 PerlIO_printf(Perl_debug_log,
a0288114 6341 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 6342 PL_colors[0],
b81d288d 6343 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 6344 SvPVX_const(r->float_substr),
3280af22 6345 PL_colors[1],
c277df42 6346 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 6347 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
6348 else if (r->float_utf8)
6349 PerlIO_printf(Perl_debug_log,
a0288114 6350 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
6351 PL_colors[0],
6352 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 6353 SvPVX_const(r->float_utf8),
33b8afdf
JH
6354 PL_colors[1],
6355 SvTAIL(r->float_utf8) ? "$" : "",
6356 (IV)r->float_min_offset, (UV)r->float_max_offset);
6357 if (r->check_substr || r->check_utf8)
b81d288d
AB
6358 PerlIO_printf(Perl_debug_log,
6359 r->check_substr == r->float_substr
33b8afdf 6360 && r->check_utf8 == r->float_utf8
c277df42
IZ
6361 ? "(checking floating" : "(checking anchored");
6362 if (r->reganch & ROPT_NOSCAN)
6363 PerlIO_printf(Perl_debug_log, " noscan");
6364 if (r->reganch & ROPT_CHECK_ALL)
6365 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6366 if (r->check_substr || r->check_utf8)
c277df42
IZ
6367 PerlIO_printf(Perl_debug_log, ") ");
6368
46fc3d4c 6369 if (r->regstclass) {
32fc9b6a 6370 regprop(r, sv, r->regstclass);
3f7c398e 6371 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6372 }
774d564b 6373 if (r->reganch & ROPT_ANCH) {
6374 PerlIO_printf(Perl_debug_log, "anchored");
6375 if (r->reganch & ROPT_ANCH_BOL)
6376 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6377 if (r->reganch & ROPT_ANCH_MBOL)
6378 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6379 if (r->reganch & ROPT_ANCH_SBOL)
6380 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6381 if (r->reganch & ROPT_ANCH_GPOS)
6382 PerlIO_printf(Perl_debug_log, "(GPOS)");
6383 PerlIO_putc(Perl_debug_log, ' ');
6384 }
c277df42
IZ
6385 if (r->reganch & ROPT_GPOS_SEEN)
6386 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6387 if (r->reganch & ROPT_SKIP)
760ac839 6388 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6389 if (r->reganch & ROPT_IMPLICIT)
760ac839 6390 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6391 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6392 if (r->reganch & ROPT_EVAL_SEEN)
6393 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6394 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6395#else
96a5add6 6396 PERL_UNUSED_CONTEXT;
65e66c80 6397 PERL_UNUSED_ARG(r);
17c3b450 6398#endif /* DEBUGGING */
a687059c
LW
6399}
6400
6401/*
a0d0e21e
LW
6402- regprop - printable representation of opcode
6403*/
46fc3d4c 6404void
32fc9b6a 6405Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6406{
35ff7856 6407#ifdef DEBUGGING
97aff369 6408 dVAR;
9b155405 6409 register int k;
a0d0e21e 6410
54dc92de 6411 sv_setpvn(sv, "", 0);
9b155405 6412 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6413 /* It would be nice to FAIL() here, but this may be called from
6414 regexec.c, and it would be hard to supply pRExC_state. */
6415 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6416 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6417
3dab1dad 6418 k = PL_regkind[OP(o)];
9b155405 6419
2a782b5b 6420 if (k == EXACT) {
396482e1 6421 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
6422 /* Using is_utf8_string() is a crude hack but it may
6423 * be the best for now since we have no flag "this EXACTish
6424 * node was UTF-8" --jhi */
1df70142 6425 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
0df25f3d
YO
6426 RE_PV_DISPLAY_DECL(s, len, do_utf8, dsv, STRING(o), STR_LEN(o), 60);
6427
2a782b5b
JH
6428 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6429 PL_colors[0],
6430 len, s,
6431 PL_colors[1]);
bb263b4e 6432 } else if (k == TRIE) {
3dab1dad
YO
6433 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6434 /* print the details of the trie in dumpuntil instead, as
4f639d21 6435 * prog->data isn't available here */
a3621e74 6436 } else if (k == CURLY) {
cb434fcc 6437 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6438 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6439 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6440 }
2c2d71f5
JH
6441 else if (k == WHILEM && o->flags) /* Ordinal/of */
6442 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6443 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6444 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6445 else if (k == LOGICAL)
04ebc1ab 6446 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6447 else if (k == ANYOF) {
6448 int i, rangestart = -1;
2d03de9c 6449 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6450
6451 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6452 static const char * const anyofs[] = {
653099ff
GS
6453 "\\w",
6454 "\\W",
6455 "\\s",
6456 "\\S",
6457 "\\d",
6458 "\\D",
6459 "[:alnum:]",
6460 "[:^alnum:]",
6461 "[:alpha:]",
6462 "[:^alpha:]",
6463 "[:ascii:]",
6464 "[:^ascii:]",
6465 "[:ctrl:]",
6466 "[:^ctrl:]",
6467 "[:graph:]",
6468 "[:^graph:]",
6469 "[:lower:]",
6470 "[:^lower:]",
6471 "[:print:]",
6472 "[:^print:]",
6473 "[:punct:]",
6474 "[:^punct:]",
6475 "[:upper:]",
aaa51d5e 6476 "[:^upper:]",
653099ff 6477 "[:xdigit:]",
aaa51d5e
JF
6478 "[:^xdigit:]",
6479 "[:space:]",
6480 "[:^space:]",
6481 "[:blank:]",
6482 "[:^blank:]"
653099ff
GS
6483 };
6484
19860706 6485 if (flags & ANYOF_LOCALE)
396482e1 6486 sv_catpvs(sv, "{loc}");
19860706 6487 if (flags & ANYOF_FOLD)
396482e1 6488 sv_catpvs(sv, "{i}");
653099ff 6489 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6490 if (flags & ANYOF_INVERT)
396482e1 6491 sv_catpvs(sv, "^");
ffc61ed2
JH
6492 for (i = 0; i <= 256; i++) {
6493 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6494 if (rangestart == -1)
6495 rangestart = i;
6496 } else if (rangestart != -1) {
6497 if (i <= rangestart + 3)
6498 for (; rangestart < i; rangestart++)
653099ff 6499 put_byte(sv, rangestart);
ffc61ed2
JH
6500 else {
6501 put_byte(sv, rangestart);
396482e1 6502 sv_catpvs(sv, "-");
ffc61ed2 6503 put_byte(sv, i - 1);
653099ff 6504 }
ffc61ed2 6505 rangestart = -1;
653099ff 6506 }
847a199f 6507 }
ffc61ed2
JH
6508
6509 if (o->flags & ANYOF_CLASS)
bb7a0f54 6510 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6511 if (ANYOF_CLASS_TEST(o,i))
6512 sv_catpv(sv, anyofs[i]);
6513
6514 if (flags & ANYOF_UNICODE)
396482e1 6515 sv_catpvs(sv, "{unicode}");
1aa99e6b 6516 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6517 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6518
6519 {
6520 SV *lv;
32fc9b6a 6521 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6522
ffc61ed2
JH
6523 if (lv) {
6524 if (sw) {
89ebb4a3 6525 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6526
ffc61ed2 6527 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6528 uvchr_to_utf8(s, i);
ffc61ed2 6529
3568d838 6530 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6531 if (rangestart == -1)
6532 rangestart = i;
6533 } else if (rangestart != -1) {
ffc61ed2
JH
6534 if (i <= rangestart + 3)
6535 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6536 const U8 * const e = uvchr_to_utf8(s,rangestart);
6537 U8 *p;
6538 for(p = s; p < e; p++)
ffc61ed2
JH
6539 put_byte(sv, *p);
6540 }
6541 else {
2d03de9c
AL
6542 const U8 *e = uvchr_to_utf8(s,rangestart);
6543 U8 *p;
6544 for (p = s; p < e; p++)
ffc61ed2 6545 put_byte(sv, *p);
396482e1 6546 sv_catpvs(sv, "-");
2d03de9c
AL
6547 e = uvchr_to_utf8(s, i-1);
6548 for (p = s; p < e; p++)
1df70142 6549 put_byte(sv, *p);
ffc61ed2
JH
6550 }
6551 rangestart = -1;
6552 }
19860706 6553 }
ffc61ed2 6554
396482e1 6555 sv_catpvs(sv, "..."); /* et cetera */
19860706 6556 }
fde631ed 6557
ffc61ed2 6558 {
2e0de35c 6559 char *s = savesvpv(lv);
c445ea15 6560 char * const origs = s;
b81d288d 6561
3dab1dad
YO
6562 while (*s && *s != '\n')
6563 s++;
b81d288d 6564
ffc61ed2 6565 if (*s == '\n') {
2d03de9c 6566 const char * const t = ++s;
ffc61ed2
JH
6567
6568 while (*s) {
6569 if (*s == '\n')
6570 *s = ' ';
6571 s++;
6572 }
6573 if (s[-1] == ' ')
6574 s[-1] = 0;
6575
6576 sv_catpv(sv, t);
fde631ed 6577 }
b81d288d 6578
ffc61ed2 6579 Safefree(origs);
fde631ed
JH
6580 }
6581 }
653099ff 6582 }
ffc61ed2 6583
653099ff
GS
6584 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6585 }
9b155405 6586 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 6587 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 6588#else
96a5add6 6589 PERL_UNUSED_CONTEXT;
65e66c80
SP
6590 PERL_UNUSED_ARG(sv);
6591 PERL_UNUSED_ARG(o);
f9049ba1 6592 PERL_UNUSED_ARG(prog);
17c3b450 6593#endif /* DEBUGGING */
35ff7856 6594}
a687059c 6595
cad2e5aa
JH
6596SV *
6597Perl_re_intuit_string(pTHX_ regexp *prog)
6598{ /* Assume that RE_INTUIT is set */
97aff369 6599 dVAR;
a3621e74 6600 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6601 PERL_UNUSED_CONTEXT;
6602
a3621e74 6603 DEBUG_COMPILE_r(
cfd0369c 6604 {
2d03de9c 6605 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6606 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6607
6608 if (!PL_colorset) reginitcolors();
6609 PerlIO_printf(Perl_debug_log,
a0288114 6610 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6611 PL_colors[4],
6612 prog->check_substr ? "" : "utf8 ",
6613 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6614 s,
6615 PL_colors[1],
6616 (strlen(s) > 60 ? "..." : ""));
6617 } );
6618
33b8afdf 6619 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6620}
6621
2b69d0c2 6622void
864dbfa3 6623Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6624{
27da23d5 6625 dVAR;
0df25f3d
YO
6626
6627
6628
fc32ee4a 6629 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6630
7821416a
IZ
6631 if (!r || (--r->refcnt > 0))
6632 return;
be8e71aa 6633 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
0df25f3d
YO
6634 RE_PV_DISPLAY_DECL(s, len, (r->reganch & ROPT_UTF8),
6635 PERL_DEBUG_PAD_ZERO(0), r->precomp, r->prelen, 60);
6636
6637 if (!PL_colorset)
6638 reginitcolors();
6639 PerlIO_printf(Perl_debug_log,
6640 "%sFreeing REx:%s %s%*.*s%s%s\n",
6641 PL_colors[4],PL_colors[5],PL_colors[0],
6642 len, len, s,
6643 PL_colors[1],
6644 len > 60 ? "..." : "");
9e55ce06 6645 });
cad2e5aa 6646
43c5f42d
NC
6647 /* gcov results gave these as non-null 100% of the time, so there's no
6648 optimisation in checking them before calling Safefree */
6649 Safefree(r->precomp);
6650 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6651 RX_MATCH_COPY_FREE(r);
f8c7b90f 6652#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6653 if (r->saved_copy)
6654 SvREFCNT_dec(r->saved_copy);
6655#endif
a193d654
GS
6656 if (r->substrs) {
6657 if (r->anchored_substr)
6658 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6659 if (r->anchored_utf8)
6660 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6661 if (r->float_substr)
6662 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6663 if (r->float_utf8)
6664 SvREFCNT_dec(r->float_utf8);
2779dcf1 6665 Safefree(r->substrs);
a193d654 6666 }
c277df42
IZ
6667 if (r->data) {
6668 int n = r->data->count;
f3548bdc
DM
6669 PAD* new_comppad = NULL;
6670 PAD* old_comppad;
4026c95a 6671 PADOFFSET refcnt;
dfad63ad 6672
c277df42 6673 while (--n >= 0) {
261faec3 6674 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6675 switch (r->data->what[n]) {
6676 case 's':
6677 SvREFCNT_dec((SV*)r->data->data[n]);
6678 break;
653099ff
GS
6679 case 'f':
6680 Safefree(r->data->data[n]);
6681 break;
dfad63ad
HS
6682 case 'p':
6683 new_comppad = (AV*)r->data->data[n];
6684 break;
c277df42 6685 case 'o':
dfad63ad 6686 if (new_comppad == NULL)
cea2e8a9 6687 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6688 PAD_SAVE_LOCAL(old_comppad,
6689 /* Watch out for global destruction's random ordering. */
c445ea15 6690 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6691 );
b34c0dd4 6692 OP_REFCNT_LOCK;
4026c95a
SH
6693 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6694 OP_REFCNT_UNLOCK;
6695 if (!refcnt)
9b978d73 6696 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6697
f3548bdc 6698 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6699 SvREFCNT_dec((SV*)new_comppad);
6700 new_comppad = NULL;
c277df42
IZ
6701 break;
6702 case 'n':
9e55ce06 6703 break;
07be1b83 6704 case 'T':
be8e71aa
YO
6705 { /* Aho Corasick add-on structure for a trie node.
6706 Used in stclass optimization only */
07be1b83
YO
6707 U32 refcount;
6708 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6709 OP_REFCNT_LOCK;
6710 refcount = --aho->refcount;
6711 OP_REFCNT_UNLOCK;
6712 if ( !refcount ) {
6713 Safefree(aho->states);
6714 Safefree(aho->fail);
6715 aho->trie=NULL; /* not necessary to free this as it is
6716 handled by the 't' case */
6717 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 6718 Safefree(r->regstclass);
07be1b83
YO
6719 }
6720 }
6721 break;
a3621e74 6722 case 't':
07be1b83 6723 {
be8e71aa 6724 /* trie structure. */
07be1b83
YO
6725 U32 refcount;
6726 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6727 OP_REFCNT_LOCK;
6728 refcount = --trie->refcount;
6729 OP_REFCNT_UNLOCK;
6730 if ( !refcount ) {
6731 Safefree(trie->charmap);
6732 if (trie->widecharmap)
6733 SvREFCNT_dec((SV*)trie->widecharmap);
6734 Safefree(trie->states);
6735 Safefree(trie->trans);
6736 if (trie->bitmap)
6737 Safefree(trie->bitmap);
6738 if (trie->wordlen)
6739 Safefree(trie->wordlen);
a3621e74 6740#ifdef DEBUGGING
be8e71aa
YO
6741 if (RX_DEBUG(r)) {
6742 if (trie->words)
6743 SvREFCNT_dec((SV*)trie->words);
6744 if (trie->revcharmap)
6745 SvREFCNT_dec((SV*)trie->revcharmap);
6746 }
a3621e74 6747#endif
07be1b83 6748 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 6749 }
07be1b83
YO
6750 }
6751 break;
c277df42 6752 default:
830247a4 6753 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6754 }
6755 }
6756 Safefree(r->data->what);
6757 Safefree(r->data);
a0d0e21e
LW
6758 }
6759 Safefree(r->startp);
6760 Safefree(r->endp);
6761 Safefree(r);
a687059c 6762}
c277df42 6763
76234dfb 6764#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6765/*
6766 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6767 */
6768regnode *
864dbfa3 6769Perl_regnext(pTHX_ register regnode *p)
c277df42 6770{
97aff369 6771 dVAR;
c277df42
IZ
6772 register I32 offset;
6773
3280af22 6774 if (p == &PL_regdummy)
c277df42
IZ
6775 return(NULL);
6776
6777 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6778 if (offset == 0)
6779 return(NULL);
6780
c277df42 6781 return(p+offset);
c277df42 6782}
76234dfb 6783#endif
c277df42 6784
01f988be 6785STATIC void
cea2e8a9 6786S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6787{
6788 va_list args;
6789 STRLEN l1 = strlen(pat1);
6790 STRLEN l2 = strlen(pat2);
6791 char buf[512];
06bf62c7 6792 SV *msv;
73d840c0 6793 const char *message;
c277df42
IZ
6794
6795 if (l1 > 510)
6796 l1 = 510;
6797 if (l1 + l2 > 510)
6798 l2 = 510 - l1;
6799 Copy(pat1, buf, l1 , char);
6800 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6801 buf[l1 + l2] = '\n';
6802 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6803#ifdef I_STDARG
6804 /* ANSI variant takes additional second argument */
c277df42 6805 va_start(args, pat2);
8736538c
AS
6806#else
6807 va_start(args);
6808#endif
5a844595 6809 msv = vmess(buf, &args);
c277df42 6810 va_end(args);
cfd0369c 6811 message = SvPV_const(msv,l1);
c277df42
IZ
6812 if (l1 > 512)
6813 l1 = 512;
6814 Copy(message, buf, l1 , char);
197cf9b9 6815 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6816 Perl_croak(aTHX_ "%s", buf);
c277df42 6817}
a0ed51b3
LW
6818
6819/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6820
76234dfb 6821#ifndef PERL_IN_XSUB_RE
a0ed51b3 6822void
864dbfa3 6823Perl_save_re_context(pTHX)
b81d288d 6824{
97aff369 6825 dVAR;
1ade1aa1
NC
6826
6827 struct re_save_state *state;
6828
6829 SAVEVPTR(PL_curcop);
6830 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6831
6832 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6833 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6834 SSPUSHINT(SAVEt_RE_STATE);
6835
46ab3289 6836 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6837
a0ed51b3 6838 PL_reg_start_tmp = 0;
a0ed51b3 6839 PL_reg_start_tmpl = 0;
c445ea15 6840 PL_reg_oldsaved = NULL;
a5db57d6 6841 PL_reg_oldsavedlen = 0;
a5db57d6 6842 PL_reg_maxiter = 0;
a5db57d6 6843 PL_reg_leftiter = 0;
c445ea15 6844 PL_reg_poscache = NULL;
a5db57d6 6845 PL_reg_poscache_size = 0;
1ade1aa1
NC
6846#ifdef PERL_OLD_COPY_ON_WRITE
6847 PL_nrs = NULL;
6848#endif
ada6e8a9 6849
c445ea15
AL
6850 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6851 if (PL_curpm) {
6852 const REGEXP * const rx = PM_GETRE(PL_curpm);
6853 if (rx) {
1df70142 6854 U32 i;
ada6e8a9 6855 for (i = 1; i <= rx->nparens; i++) {
1df70142 6856 char digits[TYPE_CHARS(long)];
d9fad198 6857 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6858 GV *const *const gvp
6859 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6860
b37c2d43
AL
6861 if (gvp) {
6862 GV * const gv = *gvp;
6863 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6864 save_scalar(gv);
49f27e4b 6865 }
ada6e8a9
AMS
6866 }
6867 }
6868 }
a0ed51b3 6869}
76234dfb 6870#endif
51371543 6871
51371543 6872static void
acfe0abc 6873clear_re(pTHX_ void *r)
51371543 6874{
97aff369 6875 dVAR;
51371543
GS
6876 ReREFCNT_dec((regexp *)r);
6877}
ffbc6a93 6878
a28509cc
AL
6879#ifdef DEBUGGING
6880
6881STATIC void
6882S_put_byte(pTHX_ SV *sv, int c)
6883{
6884 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6885 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6886 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6887 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6888 else
6889 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6890}
6891
3dab1dad
YO
6892#define CLEAR_OPTSTART \
6893 if (optstart) STMT_START { \
07be1b83 6894 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
6895 optstart=NULL; \
6896 } STMT_END
6897
6898#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6899
b5a2f8d8
NC
6900STATIC const regnode *
6901S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6902 const regnode *last, SV* sv, I32 l)
a28509cc 6903{
97aff369 6904 dVAR;
a28509cc 6905 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6906 register const regnode *next;
3dab1dad
YO
6907 const regnode *optstart= NULL;
6908 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6909
6910 while (op != END && (!last || node < last)) {
6911 /* While that wasn't END last time... */
6912
6913 NODE_ALIGN(node);
6914 op = OP(node);
6915 if (op == CLOSE)
6916 l--;
b5a2f8d8 6917 next = regnext((regnode *)node);
07be1b83 6918
a28509cc 6919 /* Where, what. */
8e11feef 6920 if (OP(node) == OPTIMIZED) {
e68ec53f 6921 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 6922 optstart = node;
3dab1dad 6923 else
8e11feef 6924 goto after_print;
3dab1dad
YO
6925 } else
6926 CLEAR_OPTSTART;
07be1b83 6927
32fc9b6a 6928 regprop(r, sv, node);
a28509cc
AL
6929 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6930 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6931
6932 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
6933 if (next == NULL) /* Next ptr. */
6934 PerlIO_printf(Perl_debug_log, "(0)");
6935 else
6936 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6937 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6938 }
6939
a28509cc
AL
6940 after_print:
6941 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
6942 assert(next);
6943 {
6944 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
6945 ? regnext((regnode *)next)
6946 : next);
be8e71aa
YO
6947 if (last && nnode > last)
6948 nnode = last;
6949 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6950 }
a28509cc
AL
6951 }
6952 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 6953 assert(next);
3dab1dad 6954 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6955 }
6956 else if ( PL_regkind[(U8)op] == TRIE ) {
6957 const I32 n = ARG(node);
4f639d21 6958 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6959 const I32 arry_len = av_len(trie->words)+1;
6960 I32 word_idx;
6961 PerlIO_printf(Perl_debug_log,
07be1b83 6962 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
8e11feef
RGS
6963 (int)(2*(l+3)),
6964 "",
6965 trie->startstate,
6966 TRIE_WORDCOUNT(trie),
6967 (int)TRIE_CHARCOUNT(trie),
6968 trie->uniquecharcount,
6969 (IV)TRIE_LASTSTATE(trie)-1,
be8e71aa
YO
6970 (int)trie->minlen,
6971 (int)trie->maxlen
8e11feef 6972 );
3dab1dad
YO
6973 if (trie->bitmap) {
6974 int i;
6975 int rangestart= -1;
6976 sv_setpvn(sv, "", 0);
6977 for (i = 0; i <= 256; i++) {
6978 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6979 if (rangestart == -1)
6980 rangestart = i;
6981 } else if (rangestart != -1) {
6982 if (i <= rangestart + 3)
6983 for (; rangestart < i; rangestart++)
6984 put_byte(sv, rangestart);
6985 else {
6986 put_byte(sv, rangestart);
6987 sv_catpvs(sv, "-");
6988 put_byte(sv, i - 1);
6989 }
6990 rangestart = -1;
6991 }
6992 }
07be1b83 6993 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
3dab1dad 6994 } else
07be1b83 6995 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
a28509cc
AL
6996
6997 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6998 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
6999 if (elem_ptr) {
7000 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
7001 (int)(2*(l+4)), "",
7002 PL_colors[0],
cfd0369c 7003 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
7004 PL_colors[1]
7005 );
a28509cc 7006 }
a28509cc
AL
7007 }
7008
7009 node = NEXTOPER(node);
7010 node += regarglen[(U8)op];
7011
7012 }
7013 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 7014 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7015 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7016 }
7017 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7018 assert(next);
3dab1dad 7019 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7020 next, sv, l + 1);
7021 }
7022 else if ( op == PLUS || op == STAR) {
3dab1dad 7023 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
7024 }
7025 else if (op == ANYOF) {
7026 /* arglen 1 + class block */
7027 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7028 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7029 node = NEXTOPER(node);
7030 }
7031 else if (PL_regkind[(U8)op] == EXACT) {
7032 /* Literal string, where present. */
7033 node += NODE_SZ_STR(node) - 1;
7034 node = NEXTOPER(node);
7035 }
7036 else {
7037 node = NEXTOPER(node);
7038 node += regarglen[(U8)op];
7039 }
7040 if (op == CURLYX || op == OPEN)
7041 l++;
7042 else if (op == WHILEM)
7043 l--;
7044 }
3dab1dad 7045 CLEAR_OPTSTART;
a28509cc
AL
7046 return node;
7047}
7048
7049#endif /* DEBUGGING */
7050
241d1a3b
NC
7051/*
7052 * Local variables:
7053 * c-indentation-style: bsd
7054 * c-basic-offset: 4
7055 * indent-tabs-mode: t
7056 * End:
7057 *
37442d52
RGS
7058 * ex: set ts=8 sts=4 sw=4 noet:
7059 */