This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
migrate CURLY/PLUS/STAR branch in regmatch() to new FSM-esque paradigm
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
174#define TRIE_STCLASS
175#endif
2c2d71f5
JH
176/* Length of a variant. */
177
178typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
cb434fcc 195 I32 *last_closep;
653099ff 196 struct regnode_charclass_class *start_class;
2c2d71f5
JH
197} scan_data_t;
198
a687059c 199/*
e50aee73 200 * Forward declarations for pregcomp()'s friends.
a687059c 201 */
a0d0e21e 202
27da23d5
JH
203static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
205
206#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
207#define SF_BEFORE_SEOL 0x0001
208#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
209#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
09b7f37c
CB
212#ifdef NO_UNARY_PLUS
213# define SF_FIX_SHIFT_EOL (0+2)
214# define SF_FL_SHIFT_EOL (0+4)
215#else
216# define SF_FIX_SHIFT_EOL (+2)
217# define SF_FL_SHIFT_EOL (+4)
218#endif
c277df42
IZ
219
220#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
225#define SF_IS_INF 0x0040
226#define SF_HAS_PAR 0x0080
227#define SF_IN_PAR 0x0100
228#define SF_HAS_EVAL 0x0200
229#define SCF_DO_SUBSTR 0x0400
653099ff
GS
230#define SCF_DO_STCLASS_AND 0x0800
231#define SCF_DO_STCLASS_OR 0x1000
232#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 233#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 234
07be1b83
YO
235#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
07be1b83 393 * Position is 1 indexed.
fac92740
MJD
394 */
395
ccb2c380
MP
396#define Set_Node_Offset_To_R(node,byte) STMT_START { \
397 if (! SIZE_ONLY) { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 399 __LINE__, (node), (int)(byte))); \
ccb2c380 400 if((node) < 0) { \
551405c4 401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
402 } else { \
403 RExC_offsets[2*(node)-1] = (byte); \
404 } \
405 } \
406} STMT_END
407
408#define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
411
412#define Set_Node_Length_To_R(node,len) STMT_START { \
413 if (! SIZE_ONLY) { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 415 __LINE__, (int)(node), (int)(len))); \
ccb2c380 416 if((node) < 0) { \
551405c4 417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
418 } else { \
419 RExC_offsets[2*(node)] = (len); \
420 } \
421 } \
422} STMT_END
423
424#define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427#define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
429
430/* Get offsets and lengths */
431#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
433
07be1b83
YO
434#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
437} STMT_END
438
439
440#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441#define EXPERIMENTAL_INPLACESCAN
442#endif
443
acfe0abc 444static void clear_re(pTHX_ void *r);
4327152a 445
653099ff
GS
446/* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
449
4327152a 450STATIC void
097eb12c 451S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 452{
e1ec3a88
AL
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 455
c277df42 456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 457 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
b81d288d 461 data->flags
c277df42
IZ
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
463 else
464 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
465 }
466 else {
c277df42 467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
468 data->offset_float_max = (l
469 ? data->last_start_max
c277df42 470 : data->pos_min + data->pos_delta);
9051bda5
HS
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
c277df42 473 if (data->flags & SF_BEFORE_EOL)
b81d288d 474 data->flags
c277df42
IZ
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
476 else
477 data->flags &= ~SF_FL_BEFORE_EOL;
478 }
479 }
480 SvCUR_set(data->last_found, 0);
0eda9292 481 {
a28509cc 482 SV * const sv = data->last_found;
097eb12c
AL
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
485 if (mg)
486 mg->mg_len = 0;
487 }
0eda9292 488 }
c277df42
IZ
489 data->last_end = -1;
490 data->flags &= ~SF_BEFORE_EOL;
491}
492
653099ff
GS
493/* Can match anything (initialization) */
494STATIC void
097eb12c 495S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 496{
653099ff 497 ANYOF_CLASS_ZERO(cl);
f8bef550 498 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
500 if (LOC)
501 cl->flags |= ANYOF_LOCALE;
502}
503
504/* Can match anything (initialization) */
505STATIC int
5f66b61c 506S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
507{
508 int value;
509
aaa51d5e 510 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
512 return 1;
1aa99e6b
IH
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
514 return 0;
f8bef550
NC
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 return 0;
653099ff
GS
517 return 1;
518}
519
520/* Can match anything (initialization) */
521STATIC void
097eb12c 522S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
8ecf7187 524 Zero(cl, 1, struct regnode_charclass_class);
653099ff 525 cl->type = ANYOF;
830247a4 526 cl_anything(pRExC_state, cl);
653099ff
GS
527}
528
529STATIC void
097eb12c 530S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 531{
8ecf7187 532 Zero(cl, 1, struct regnode_charclass_class);
653099ff 533 cl->type = ANYOF;
830247a4 534 cl_anything(pRExC_state, cl);
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* 'And' a given class with another one. Can create false positives */
540/* We assume that cl is not inverted */
541STATIC void
5f66b61c 542S_cl_and(struct regnode_charclass_class *cl,
a28509cc 543 const struct regnode_charclass_class *and_with)
653099ff 544{
653099ff
GS
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
550 int i;
551
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
555 else
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
1aa99e6b 561
14ebb1a2
JH
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
567 }
14ebb1a2
JH
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 570 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 573 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
574}
575
576/* 'OR' a given class with another one. Can create false positives */
577/* We assume that cl is not inverted */
578STATIC void
097eb12c 579S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 580{
653099ff
GS
581 if (or_with->flags & ANYOF_INVERT) {
582 /* We do not use
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
590 */
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
594 int i;
595
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
599 else {
830247a4 600 cl_anything(pRExC_state, cl);
653099ff
GS
601 }
602 } else {
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 605 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
606 || (cl->flags & ANYOF_FOLD)) ) {
607 int i;
608
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
616 }
617 }
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 619 cl_anything(pRExC_state, cl);
653099ff
GS
620 }
621 }
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
624
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
633 }
653099ff
GS
634}
635
5d1c421c 636/*
a3621e74 637
3dab1dad 638 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 646 depth : indent depth
a3621e74
YO
647
648Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
649
650A trie is an N'ary tree where the branches are determined by digital
651decomposition of the key. IE, at the root node you look up the 1st character and
652follow that branch repeat until you find the end of the branches. Nodes can be
653marked as "accepting" meaning they represent a complete word. Eg:
654
655 /he|she|his|hers/
656
657would convert into the following structure. Numbers represent states, letters
658following numbers represent valid transitions on the letter from that state, if
659the number is in square brackets it represents an accepting state, otherwise it
660will be in parenthesis.
661
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
663 | |
664 | (2)
665 | |
666 (1) +-i->(6)-+-s->[7]
667 |
668 +-s->(3)-+-h->(4)-+-e->[5]
669
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
671
672This shows that when matching against the string 'hers' we will begin at state 1
673read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676single traverse. We store a mapping from accepting to state to which word was
677matched, and then when we have multiple possibilities we try to complete the
678rest of the regex in the order in which they occured in the alternation.
679
680The only prior NFA like behaviour that would be changed by the TRIE support is
681the silent ignoring of duplicate alternations which are of the form:
682
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
684
685Thus EVAL blocks follwing a trie may be called a different number of times with
686and without the optimisation. With the optimisations dupes will be silently
687ignored. This inconsistant behaviour of EVAL type nodes is well established as
688the following demonstrates:
689
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
691
692which prints out 'word' three times, but
693
694 'words'=~/(word|word|word)(?{ print $1 })S/
695
696which doesnt print it out at all. This is due to other optimisations kicking in.
697
698Example of what happens on a structural level:
699
700The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: BRANCH(8)
704 6: EXACT <ac>(16)
705 8: BRANCH(11)
706 9: EXACT <ad>(16)
707 11: BRANCH(14)
708 12: EXACT <ab>(16)
709 16: SUCCEED(0)
710 17: NOTHING(18)
711 18: END(0)
712
713This would be optimizable with startbranch=5, first=5, last=16, tail=16
714and should turn into:
715
716 1: CURLYM[1] {1,32767}(18)
717 5: TRIE(16)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
719 <ac>
720 <ad>
721 <ab>
722 16: SUCCEED(0)
723 17: NOTHING(18)
724 18: END(0)
725
726Cases where tail != last would be like /(?foo|bar)baz/:
727
728 1: BRANCH(4)
729 2: EXACT <foo>(8)
730 4: BRANCH(7)
731 5: EXACT <bar>(8)
732 7: TAIL(8)
733 8: EXACT <baz>(10)
734 10: END(0)
735
736which would be optimizable with startbranch=1, first=1, last=7, tail=8
737and would end up looking like:
738
739 1: TRIE(8)
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
741 <foo>
742 <bar>
743 7: TAIL(8)
744 8: EXACT <baz>(10)
745 10: END(0)
746
3dab1dad
YO
747 d = uvuni_to_utf8_flags(d, uv, 0);
748
749is the recommended Unicode-aware way of saying
750
751 *(d++) = uv;
a3621e74
YO
752*/
753
3dab1dad
YO
754#define TRIE_STORE_REVCHAR \
755 STMT_START { \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
758 } STMT_END
a3621e74
YO
759
760#define TRIE_READ_CHAR STMT_START { \
07be1b83 761 wordlen++; \
a3621e74
YO
762 if ( UTF ) { \
763 if ( folder ) { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
766 foldlen -= len; \
767 scan += len; \
768 len = 0; \
769 } else { \
e1ec3a88 770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
774 } \
775 } else { \
e1ec3a88 776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
777 } \
778 } else { \
779 uvc = (U32)*uc; \
780 len = 1; \
781 } \
782} STMT_END
783
784
785#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
789
790#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
795 } \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
799} STMT_END
800
801#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 802 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
806} STMT_END
807
07be1b83
YO
808#define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
811 if (trie->wordlen) \
812 trie->wordlen[ curword ] = wordlen; \
3dab1dad
YO
813 trie->states[ state ].wordnum = ++curword; \
814 DEBUG_r({ \
815 /* store the word for dumping */ \
816 SV* tmp; \
07be1b83
YO
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
3dab1dad 819 else \
07be1b83 820 tmp = newSVpvn( "", 0 ); \
3dab1dad
YO
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
823 }); \
824 } else { \
825 NOOP; /* It's a dupe. So ignore it. */ \
826 } \
827} STMT_END
828
829#ifdef DEBUGGING
07be1b83 830/*
3dab1dad
YO
831 dump_trie(trie)
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
834
835 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
839
3dab1dad
YO
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
07be1b83
YO
842
843*/
3dab1dad
YO
844
845/*
846 dump_trie(trie)
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
849*/
850
851STATIC void
852S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
853{
854 U32 state;
855 GET_RE_DEBUG_FLAGS_DECL;
856
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
860
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad
YO
863 if ( tmp ) {
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
873
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
904 } else {
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
906 }
907 }
908
909 PerlIO_printf( Perl_debug_log, "]");
910
911 }
912 PerlIO_printf( Perl_debug_log, "\n" );
913 }
914}
915/*
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
924{
925 U32 state;
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
931
932 for( state=1 ; state < next_alloc ; state ++ ) {
933 U16 charid;
934
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
939 } else {
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
942 );
943 }
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
3dab1dad
YO
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
950 );
951 }
952
953 }
954}
955
956/*
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
962*/
963STATIC void
964S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
965{
966 U32 state;
967 U16 charid;
968 GET_RE_DEBUG_FLAGS_DECL;
969
970 /*
971 print out the table precompression so that we can do a visual check
972 that they are identical.
973 */
974
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
976
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad
YO
979 if ( tmp ) {
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1001 }
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1004 } else {
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1007 }
1008 }
07be1b83 1009}
3dab1dad
YO
1010
1011#endif
1012
07be1b83
YO
1013#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1020 )
3dab1dad 1021
07be1b83
YO
1022STATIC void
1023S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1024{
1025/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1026
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1029 ISBN 0-201-10088-6
1030
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1036 Consider
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1041 */
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1044 U32 *q;
be8e71aa
YO
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
07be1b83
YO
1048 U32 q_read = 0;
1049 U32 q_write = 0;
1050 U32 charid;
1051 U32 base = trie->states[ 1 ].trans.base;
07be1b83
YO
1052 U32 *fail;
1053 reg_ac_data *aho;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
72f13be8 1055 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
1056#ifndef DEBUGGING
1057 PERL_UNUSED_ARG(depth);
1058#endif
72f13be8 1059
07be1b83
YO
1060
1061 ARG_SET( stclass, data_slot );
1062 Newxz( aho, 1, reg_ac_data );
1063 RExC_rx->data->data[ data_slot ] = (void*)aho;
1064 aho->trie=trie;
1065 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1066 (trie->laststate+1)*sizeof(reg_trie_state));
1067 Newxz( q, numstates, U32);
1068 Newxz( aho->fail, numstates, U32 );
283d8f99
YO
1069 aho->refcount = 1;
1070 fail = aho->fail;
07be1b83
YO
1071 fail[ 0 ] = fail[ 1 ] = 1;
1072
1073 for ( charid = 0; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1074 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1075 if ( newstate ) {
07be1b83
YO
1076 q[ q_write ] = newstate;
1077 /* set to point at the root */
1078 fail[ q[ q_write++ ] ]=1;
1079 }
1080 }
1081 while ( q_read < q_write) {
be8e71aa 1082 const U32 cur = q[ q_read++ % numstates ];
07be1b83
YO
1083 base = trie->states[ cur ].trans.base;
1084
1085 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1086 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1087 if (ch_state) {
07be1b83
YO
1088 U32 fail_state = cur;
1089 U32 fail_base;
1090 do {
1091 fail_state = fail[ fail_state ];
1092 fail_base = aho->states[ fail_state ].trans.base;
1093 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1094
1095 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1096 fail[ ch_state ] = fail_state;
1097 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1098 {
1099 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1100 }
1101 q[ q_write++ % numstates] = ch_state;
1102 }
1103 }
1104 }
1105
1106 DEBUG_TRIE_COMPILE_MORE_r({
1107 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1108 for( q_read=2; q_read<numstates; q_read++ ) {
1109 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1110 }
1111 PerlIO_printf(Perl_debug_log, "\n");
1112 });
1113 Safefree(q);
1114 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1115}
3dab1dad
YO
1116
1117
1118
a3621e74 1119STATIC I32
3dab1dad 1120S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1121{
27da23d5 1122 dVAR;
a3621e74
YO
1123 /* first pass, loop through and scan words */
1124 reg_trie_data *trie;
1125 regnode *cur;
9f7f3913 1126 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1127 STRLEN len = 0;
1128 UV uvc = 0;
1129 U16 curword = 0;
1130 U32 next_alloc = 0;
1131 /* we just use folder as a flag in utf8 */
e1ec3a88 1132 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1133 ? PL_fold
1134 : ( flags == EXACTFL
1135 ? PL_fold_locale
1136 : NULL
1137 )
1138 );
1139
e1ec3a88 1140 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1141 SV *re_trie_maxbuff;
3dab1dad
YO
1142#ifndef DEBUGGING
1143 /* these are only used during construction but are useful during
8e11feef
RGS
1144 * debugging so we store them in the struct when debugging.
1145 * Wordcount is actually superfluous in debugging as we have
1146 * (AV*)trie->words to use for it, but that's not available when
1147 * not debugging... We could make the macro use the AV during
1148 * debugging though...
1149 */
3dab1dad
YO
1150 U16 trie_wordcount=0;
1151 STRLEN trie_charcount=0;
07be1b83 1152 /*U32 trie_laststate=0;*/
3dab1dad
YO
1153 AV *trie_revcharmap;
1154#endif
a3621e74 1155 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1156#ifndef DEBUGGING
1157 PERL_UNUSED_ARG(depth);
1158#endif
a3621e74 1159
a02a5408 1160 Newxz( trie, 1, reg_trie_data );
a3621e74 1161 trie->refcount = 1;
3dab1dad 1162 trie->startstate = 1;
a3621e74 1163 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1164 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1165 if (!(UTF && folder))
1166 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1167 DEBUG_r({
1168 trie->words = newAV();
a3621e74 1169 });
3dab1dad 1170 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1171
0111c4fd 1172 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1173 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1174 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1175 }
3dab1dad
YO
1176 DEBUG_OPTIMISE_r({
1177 PerlIO_printf( Perl_debug_log,
1178 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1179 (int)depth * 2 + 2, "",
1180 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1181 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1182 });
a3621e74
YO
1183 /* -- First loop and Setup --
1184
1185 We first traverse the branches and scan each word to determine if it
1186 contains widechars, and how many unique chars there are, this is
1187 important as we have to build a table with at least as many columns as we
1188 have unique chars.
1189
1190 We use an array of integers to represent the character codes 0..255
1191 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1192 native representation of the character value as the key and IV's for the
1193 coded index.
1194
1195 *TODO* If we keep track of how many times each character is used we can
1196 remap the columns so that the table compression later on is more
1197 efficient in terms of memory by ensuring most common value is in the
1198 middle and the least common are on the outside. IMO this would be better
1199 than a most to least common mapping as theres a decent chance the most
1200 common letter will share a node with the least common, meaning the node
1201 will not be compressable. With a middle is most common approach the worst
1202 case is when we have the least common nodes twice.
1203
1204 */
1205
a3621e74 1206 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1207 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1208 const U8 *uc = (U8*)STRING( noper );
a28509cc 1209 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1210 STRLEN foldlen = 0;
1211 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1212 const U8 *scan = (U8*)NULL;
07be1b83 1213 U32 wordlen = 0; /* required init */
3dab1dad 1214 STRLEN chars=0;
a3621e74 1215
3dab1dad
YO
1216 TRIE_WORDCOUNT(trie)++;
1217 if (OP(noper) == NOTHING) {
1218 trie->minlen= 0;
1219 continue;
1220 }
1221 if (trie->bitmap) {
1222 TRIE_BITMAP_SET(trie,*uc);
1223 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1224 }
a3621e74 1225 for ( ; uc < e ; uc += len ) {
3dab1dad 1226 TRIE_CHARCOUNT(trie)++;
a3621e74 1227 TRIE_READ_CHAR;
3dab1dad 1228 chars++;
a3621e74
YO
1229 if ( uvc < 256 ) {
1230 if ( !trie->charmap[ uvc ] ) {
1231 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1232 if ( folder )
1233 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1234 TRIE_STORE_REVCHAR;
a3621e74
YO
1235 }
1236 } else {
1237 SV** svpp;
1238 if ( !trie->widecharmap )
1239 trie->widecharmap = newHV();
1240
1241 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1242
1243 if ( !svpp )
e4584336 1244 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1245
1246 if ( !SvTRUE( *svpp ) ) {
1247 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1248 TRIE_STORE_REVCHAR;
a3621e74
YO
1249 }
1250 }
1251 }
3dab1dad
YO
1252 if( cur == first ) {
1253 trie->minlen=chars;
1254 trie->maxlen=chars;
1255 } else if (chars < trie->minlen) {
1256 trie->minlen=chars;
1257 } else if (chars > trie->maxlen) {
1258 trie->maxlen=chars;
1259 }
1260
a3621e74
YO
1261 } /* end first pass */
1262 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1263 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1264 (int)depth * 2 + 2,"",
1265 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
be8e71aa
YO
1266 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1267 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1268 );
07be1b83 1269 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
a3621e74
YO
1270
1271 /*
1272 We now know what we are dealing with in terms of unique chars and
1273 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1274 representation using a flat table will take. If it's over a reasonable
1275 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1276 conservative but potentially much slower representation using an array
1277 of lists.
1278
1279 At the end we convert both representations into the same compressed
1280 form that will be used in regexec.c for matching with. The latter
1281 is a form that cannot be used to construct with but has memory
1282 properties similar to the list form and access properties similar
1283 to the table form making it both suitable for fast searches and
1284 small enough that its feasable to store for the duration of a program.
1285
1286 See the comment in the code where the compressed table is produced
1287 inplace from the flat tabe representation for an explanation of how
1288 the compression works.
1289
1290 */
1291
1292
3dab1dad 1293 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1294 /*
1295 Second Pass -- Array Of Lists Representation
1296
1297 Each state will be represented by a list of charid:state records
1298 (reg_trie_trans_le) the first such element holds the CUR and LEN
1299 points of the allocated array. (See defines above).
1300
1301 We build the initial structure using the lists, and then convert
1302 it into the compressed table form which allows faster lookups
1303 (but cant be modified once converted).
a3621e74
YO
1304 */
1305
a3621e74
YO
1306 STRLEN transcount = 1;
1307
3dab1dad 1308 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1309 TRIE_LIST_NEW(1);
1310 next_alloc = 2;
1311
1312 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1313
c445ea15
AL
1314 regnode * const noper = NEXTOPER( cur );
1315 U8 *uc = (U8*)STRING( noper );
1316 const U8 * const e = uc + STR_LEN( noper );
1317 U32 state = 1; /* required init */
1318 U16 charid = 0; /* sanity init */
1319 U8 *scan = (U8*)NULL; /* sanity init */
1320 STRLEN foldlen = 0; /* required init */
07be1b83 1321 U32 wordlen = 0; /* required init */
c445ea15
AL
1322 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1323
3dab1dad 1324 if (OP(noper) != NOTHING) {
c445ea15
AL
1325 for ( ; uc < e ; uc += len ) {
1326
1327 TRIE_READ_CHAR;
1328
1329 if ( uvc < 256 ) {
1330 charid = trie->charmap[ uvc ];
1331 } else {
1332 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1333 if ( !svpp ) {
1334 charid = 0;
1335 } else {
1336 charid=(U16)SvIV( *svpp );
1337 }
1338 }
1339 if ( charid ) {
a3621e74 1340
c445ea15
AL
1341 U16 check;
1342 U32 newstate = 0;
a3621e74 1343
c445ea15
AL
1344 charid--;
1345 if ( !trie->states[ state ].trans.list ) {
1346 TRIE_LIST_NEW( state );
1347 }
1348 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1349 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1350 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1351 break;
1352 }
1353 }
1354 if ( ! newstate ) {
1355 newstate = next_alloc++;
1356 TRIE_LIST_PUSH( state, charid, newstate );
1357 transcount++;
1358 }
1359 state = newstate;
1360 } else {
1361 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1362 }
c445ea15
AL
1363 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1364 }
c445ea15 1365 }
3dab1dad 1366 TRIE_HANDLE_WORD(state);
a3621e74
YO
1367
1368 } /* end second pass */
1369
3dab1dad 1370 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1371 Renew( trie->states, next_alloc, reg_trie_state );
1372
3dab1dad
YO
1373 /* and now dump it out before we compress it */
1374 DEBUG_TRIE_COMPILE_MORE_r(
1375 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1376 );
a3621e74 1377
a02a5408 1378 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1379 {
1380 U32 state;
a3621e74
YO
1381 U32 tp = 0;
1382 U32 zp = 0;
1383
1384
1385 for( state=1 ; state < next_alloc ; state ++ ) {
1386 U32 base=0;
1387
1388 /*
1389 DEBUG_TRIE_COMPILE_MORE_r(
1390 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1391 );
1392 */
1393
1394 if (trie->states[state].trans.list) {
1395 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1396 U16 maxid=minid;
a28509cc 1397 U16 idx;
a3621e74
YO
1398
1399 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1400 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1401 if ( forid < minid ) {
1402 minid=forid;
1403 } else if ( forid > maxid ) {
1404 maxid=forid;
1405 }
a3621e74
YO
1406 }
1407 if ( transcount < tp + maxid - minid + 1) {
1408 transcount *= 2;
1409 Renew( trie->trans, transcount, reg_trie_trans );
1410 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1411 }
1412 base = trie->uniquecharcount + tp - minid;
1413 if ( maxid == minid ) {
1414 U32 set = 0;
1415 for ( ; zp < tp ; zp++ ) {
1416 if ( ! trie->trans[ zp ].next ) {
1417 base = trie->uniquecharcount + zp - minid;
1418 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ zp ].check = state;
1420 set = 1;
1421 break;
1422 }
1423 }
1424 if ( !set ) {
1425 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1426 trie->trans[ tp ].check = state;
1427 tp++;
1428 zp = tp;
1429 }
1430 } else {
1431 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1432 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1433 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1434 trie->trans[ tid ].check = state;
1435 }
1436 tp += ( maxid - minid + 1 );
1437 }
1438 Safefree(trie->states[ state ].trans.list);
1439 }
1440 /*
1441 DEBUG_TRIE_COMPILE_MORE_r(
1442 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1443 );
1444 */
1445 trie->states[ state ].trans.base=base;
1446 }
cc601c31 1447 trie->lasttrans = tp + 1;
a3621e74
YO
1448 }
1449 } else {
1450 /*
1451 Second Pass -- Flat Table Representation.
1452
1453 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1454 We know that we will need Charcount+1 trans at most to store the data
1455 (one row per char at worst case) So we preallocate both structures
1456 assuming worst case.
1457
1458 We then construct the trie using only the .next slots of the entry
1459 structs.
1460
1461 We use the .check field of the first entry of the node temporarily to
1462 make compression both faster and easier by keeping track of how many non
1463 zero fields are in the node.
1464
1465 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1466 transition.
1467
1468 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1469 number representing the first entry of the node, and state as a
1470 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1471 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1472 are 2 entrys per node. eg:
1473
1474 A B A B
1475 1. 2 4 1. 3 7
1476 2. 0 3 3. 0 5
1477 3. 0 0 5. 0 0
1478 4. 0 0 7. 0 0
1479
1480 The table is internally in the right hand, idx form. However as we also
1481 have to deal with the states array which is indexed by nodenum we have to
1482 use TRIE_NODENUM() to convert.
1483
1484 */
1485
3dab1dad
YO
1486
1487 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1488 reg_trie_trans );
3dab1dad 1489 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1490 next_alloc = trie->uniquecharcount + 1;
1491
3dab1dad 1492
a3621e74
YO
1493 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1494
c445ea15 1495 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1496 const U8 *uc = (U8*)STRING( noper );
1497 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1498
1499 U32 state = 1; /* required init */
1500
1501 U16 charid = 0; /* sanity init */
1502 U32 accept_state = 0; /* sanity init */
1503 U8 *scan = (U8*)NULL; /* sanity init */
1504
1505 STRLEN foldlen = 0; /* required init */
07be1b83 1506 U32 wordlen = 0; /* required init */
a3621e74
YO
1507 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1508
3dab1dad 1509 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1510 for ( ; uc < e ; uc += len ) {
1511
1512 TRIE_READ_CHAR;
1513
1514 if ( uvc < 256 ) {
1515 charid = trie->charmap[ uvc ];
1516 } else {
c445ea15
AL
1517 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1518 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1519 }
1520 if ( charid ) {
1521 charid--;
1522 if ( !trie->trans[ state + charid ].next ) {
1523 trie->trans[ state + charid ].next = next_alloc;
1524 trie->trans[ state ].check++;
1525 next_alloc += trie->uniquecharcount;
1526 }
1527 state = trie->trans[ state + charid ].next;
1528 } else {
e4584336 1529 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1530 }
1531 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1532 }
a3621e74 1533 }
3dab1dad
YO
1534 accept_state = TRIE_NODENUM( state );
1535 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1536
1537 } /* end second pass */
1538
3dab1dad
YO
1539 /* and now dump it out before we compress it */
1540 DEBUG_TRIE_COMPILE_MORE_r(
1541 dump_trie_interim_table(trie,next_alloc,depth+1)
1542 );
a3621e74 1543
a3621e74
YO
1544 {
1545 /*
1546 * Inplace compress the table.*
1547
1548 For sparse data sets the table constructed by the trie algorithm will
1549 be mostly 0/FAIL transitions or to put it another way mostly empty.
1550 (Note that leaf nodes will not contain any transitions.)
1551
1552 This algorithm compresses the tables by eliminating most such
1553 transitions, at the cost of a modest bit of extra work during lookup:
1554
1555 - Each states[] entry contains a .base field which indicates the
1556 index in the state[] array wheres its transition data is stored.
1557
1558 - If .base is 0 there are no valid transitions from that node.
1559
1560 - If .base is nonzero then charid is added to it to find an entry in
1561 the trans array.
1562
1563 -If trans[states[state].base+charid].check!=state then the
1564 transition is taken to be a 0/Fail transition. Thus if there are fail
1565 transitions at the front of the node then the .base offset will point
1566 somewhere inside the previous nodes data (or maybe even into a node
1567 even earlier), but the .check field determines if the transition is
1568 valid.
1569
1570 The following process inplace converts the table to the compressed
1571 table: We first do not compress the root node 1,and mark its all its
1572 .check pointers as 1 and set its .base pointer as 1 as well. This
1573 allows to do a DFA construction from the compressed table later, and
1574 ensures that any .base pointers we calculate later are greater than
1575 0.
1576
1577 - We set 'pos' to indicate the first entry of the second node.
1578
1579 - We then iterate over the columns of the node, finding the first and
1580 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1581 and set the .check pointers accordingly, and advance pos
1582 appropriately and repreat for the next node. Note that when we copy
1583 the next pointers we have to convert them from the original
1584 NODEIDX form to NODENUM form as the former is not valid post
1585 compression.
1586
1587 - If a node has no transitions used we mark its base as 0 and do not
1588 advance the pos pointer.
1589
1590 - If a node only has one transition we use a second pointer into the
1591 structure to fill in allocated fail transitions from other states.
1592 This pointer is independent of the main pointer and scans forward
1593 looking for null transitions that are allocated to a state. When it
1594 finds one it writes the single transition into the "hole". If the
1595 pointer doesnt find one the single transition is appeneded as normal.
1596
1597 - Once compressed we can Renew/realloc the structures to release the
1598 excess space.
1599
1600 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1601 specifically Fig 3.47 and the associated pseudocode.
1602
1603 demq
1604 */
a3b680e6 1605 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1606 U32 state, charid;
a3621e74 1607 U32 pos = 0, zp=0;
3dab1dad 1608 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1609
1610 for ( state = 1 ; state < laststate ; state++ ) {
1611 U8 flag = 0;
a28509cc
AL
1612 const U32 stateidx = TRIE_NODEIDX( state );
1613 const U32 o_used = trie->trans[ stateidx ].check;
1614 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1615 trie->trans[ stateidx ].check = 0;
1616
1617 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1618 if ( flag || trie->trans[ stateidx + charid ].next ) {
1619 if ( trie->trans[ stateidx + charid ].next ) {
1620 if (o_used == 1) {
1621 for ( ; zp < pos ; zp++ ) {
1622 if ( ! trie->trans[ zp ].next ) {
1623 break;
1624 }
1625 }
1626 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1627 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1628 trie->trans[ zp ].check = state;
1629 if ( ++zp > pos ) pos = zp;
1630 break;
1631 }
1632 used--;
1633 }
1634 if ( !flag ) {
1635 flag = 1;
1636 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1637 }
1638 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1639 trie->trans[ pos ].check = state;
1640 pos++;
1641 }
1642 }
1643 }
cc601c31 1644 trie->lasttrans = pos + 1;
a3621e74
YO
1645 Renew( trie->states, laststate + 1, reg_trie_state);
1646 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1647 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1648 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1649 (int)depth * 2 + 2,"",
1650 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1651 (IV)next_alloc,
1652 (IV)pos,
a3621e74
YO
1653 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1654 );
1655
1656 } /* end table compress */
1657 }
cc601c31
YO
1658 /* resize the trans array to remove unused space */
1659 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1660
3dab1dad
YO
1661 /* and now dump out the compressed format */
1662 DEBUG_TRIE_COMPILE_r(
1663 dump_trie(trie,depth+1)
1664 );
07be1b83 1665
3dab1dad
YO
1666 { /* Modify the program and insert the new TRIE node*/
1667 regnode *convert;
1668 U8 nodetype =(U8)(flags & 0xFF);
1669 char *str=NULL;
07be1b83
YO
1670#ifdef DEBUGGING
1671 U32 mjd_offset;
1672 U32 mjd_nodelen;
1673#endif
a3621e74 1674 /*
3dab1dad
YO
1675 This means we convert either the first branch or the first Exact,
1676 depending on whether the thing following (in 'last') is a branch
1677 or not and whther first is the startbranch (ie is it a sub part of
1678 the alternation or is it the whole thing.)
1679 Assuming its a sub part we conver the EXACT otherwise we convert
1680 the whole branch sequence, including the first.
a3621e74 1681 */
3dab1dad
YO
1682 /* Find the node we are going to overwrite */
1683 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1684 /* whole branch chain */
3dab1dad 1685 convert = first;
07be1b83
YO
1686 DEBUG_r({
1687 const regnode *nop = NEXTOPER( convert );
1688 mjd_offset= Node_Offset((nop));
1689 mjd_nodelen= Node_Length((nop));
1690 });
1691 } else {
1692 /* branch sub-chain */
3dab1dad
YO
1693 convert = NEXTOPER( first );
1694 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1695 DEBUG_r({
1696 mjd_offset= Node_Offset((convert));
1697 mjd_nodelen= Node_Length((convert));
1698 });
1699 }
1700 DEBUG_OPTIMISE_r(
1701 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1702 (int)depth * 2 + 2, "",
1703 mjd_offset,mjd_nodelen)
1704 );
a3621e74 1705
3dab1dad
YO
1706 /* But first we check to see if there is a common prefix we can
1707 split out as an EXACT and put in front of the TRIE node. */
1708 trie->startstate= 1;
1709 if ( trie->bitmap && !trie->widecharmap ) {
1710 U32 state;
1711 DEBUG_OPTIMISE_r(
8e11feef
RGS
1712 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1713 (int)depth * 2 + 2, "",
1714 TRIE_LASTSTATE(trie))
1715 );
1716 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1717 U32 ofs = 0;
8e11feef
RGS
1718 I32 idx = -1;
1719 U32 count = 0;
1720 const U32 base = trie->states[ state ].trans.base;
a3621e74 1721
3dab1dad 1722 if ( trie->states[state].wordnum )
8e11feef 1723 count = 1;
a3621e74 1724
8e11feef 1725 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1726 if ( ( base + ofs >= trie->uniquecharcount ) &&
1727 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1728 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1729 {
3dab1dad 1730 if ( ++count > 1 ) {
8e11feef 1731 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1732 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1733 if ( state == 1 ) break;
3dab1dad
YO
1734 if ( count == 2 ) {
1735 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1736 DEBUG_OPTIMISE_r(
8e11feef
RGS
1737 PerlIO_printf(Perl_debug_log,
1738 "%*sNew Start State=%"UVuf" Class: [",
1739 (int)depth * 2 + 2, "",
3dab1dad 1740 state));
be8e71aa
YO
1741 if (idx >= 0) {
1742 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1743 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1744
3dab1dad 1745 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1746 if ( folder )
1747 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1748 DEBUG_OPTIMISE_r(
07be1b83 1749 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1750 );
8e11feef
RGS
1751 }
1752 }
1753 TRIE_BITMAP_SET(trie,*ch);
1754 if ( folder )
1755 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1756 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1757 }
1758 idx = ofs;
1759 }
3dab1dad
YO
1760 }
1761 if ( count == 1 ) {
1762 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1763 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1764 DEBUG_OPTIMISE_r(
8e11feef
RGS
1765 PerlIO_printf( Perl_debug_log,
1766 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1767 (int)depth * 2 + 2, "",
1768 state, idx, ch)
3dab1dad
YO
1769 );
1770 if ( state==1 ) {
1771 OP( convert ) = nodetype;
1772 str=STRING(convert);
1773 STR_LEN(convert)=0;
1774 }
1775 *str++=*ch;
1776 STR_LEN(convert)++;
a3621e74 1777
8e11feef 1778 } else {
f9049ba1 1779#ifdef DEBUGGING
8e11feef
RGS
1780 if (state>1)
1781 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1782#endif
8e11feef
RGS
1783 break;
1784 }
1785 }
3dab1dad 1786 if (str) {
8e11feef 1787 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1788 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1789 trie->startstate = state;
07be1b83
YO
1790 trie->minlen -= (state - 1);
1791 trie->maxlen -= (state - 1);
1792 DEBUG_r({
1793 regnode *fix = convert;
1794 mjd_nodelen++;
1795 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1796 while( ++fix < n ) {
1797 Set_Node_Offset_Length(fix, 0, 0);
1798 }
1799 });
8e11feef
RGS
1800 if (trie->maxlen) {
1801 convert = n;
1802 } else {
3dab1dad
YO
1803 NEXT_OFF(convert) = (U16)(tail - convert);
1804 }
1805 }
1806 }
1807 if ( trie->maxlen ) {
1808 OP( convert ) = TRIE;
8e11feef
RGS
1809 NEXT_OFF( convert ) = (U16)(tail - convert);
1810 ARG_SET( convert, data_slot );
a3621e74 1811
3dab1dad
YO
1812 /* store the type in the flags */
1813 convert->flags = nodetype;
1814 /* XXX We really should free up the resource in trie now, as we wont use them */
1815 }
a3621e74
YO
1816 /* needed for dumping*/
1817 DEBUG_r({
1818 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
07be1b83
YO
1819 regnode *opt = convert;
1820 while (++opt<optimize) {
1821 Set_Node_Offset_Length(opt,0,0);
1822 }
a3621e74
YO
1823 /* We now need to mark all of the space originally used by the
1824 branches as optimized away. This keeps the dumpuntil from
1825 throwing a wobbly as it doesnt use regnext() to traverse the
1826 opcodes.
07be1b83 1827 We also "fix" the offsets
a3621e74
YO
1828 */
1829 while( optimize < last ) {
07be1b83 1830 mjd_nodelen += Node_Length((optimize));
a3621e74 1831 OP( optimize ) = OPTIMIZED;
07be1b83 1832 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1833 optimize++;
1834 }
07be1b83 1835 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1836 });
1837 } /* end node insert */
07be1b83 1838#ifndef DEBUGGING
6e8b4190 1839 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1840#endif
a3621e74
YO
1841 return 1;
1842}
1843
a3621e74 1844/*
5d1c421c
JH
1845 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1846 * These need to be revisited when a newer toolchain becomes available.
1847 */
1848#if defined(__sparc64__) && defined(__GNUC__)
1849# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1850# undef SPARC64_GCC_WORKAROUND
1851# define SPARC64_GCC_WORKAROUND 1
1852# endif
1853#endif
1854
07be1b83
YO
1855#define DEBUG_PEEP(str,scan,depth) \
1856 DEBUG_OPTIMISE_r({ \
1857 SV * const mysv=sv_newmortal(); \
1858 regnode *Next = regnext(scan); \
1859 regprop(RExC_rx, mysv, scan); \
1860 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1861 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1862 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1863 });
1864
1865#define JOIN_EXACT(scan,min,flags) \
1866 if (PL_regkind[OP(scan)] == EXACT) \
1867 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1868
be8e71aa 1869STATIC U32
07be1b83
YO
1870S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1871 /* Merge several consecutive EXACTish nodes into one. */
1872 regnode *n = regnext(scan);
1873 U32 stringok = 1;
1874 regnode *next = scan + NODE_SZ_STR(scan);
1875 U32 merged = 0;
1876 U32 stopnow = 0;
1877#ifdef DEBUGGING
1878 regnode *stop = scan;
72f13be8 1879 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
1880#else
1881 PERL_UNUSED_ARG(flags);
1882 PERL_UNUSED_ARG(val);
1883 PERL_UNUSED_ARG(depth);
07be1b83 1884#endif
07be1b83
YO
1885 DEBUG_PEEP("join",scan,depth);
1886
1887 /* Skip NOTHING, merge EXACT*. */
1888 while (n &&
1889 ( PL_regkind[OP(n)] == NOTHING ||
1890 (stringok && (OP(n) == OP(scan))))
1891 && NEXT_OFF(n)
1892 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1893
1894 if (OP(n) == TAIL || n > next)
1895 stringok = 0;
1896 if (PL_regkind[OP(n)] == NOTHING) {
1897
1898 DEBUG_PEEP("skip:",n,depth);
1899 NEXT_OFF(scan) += NEXT_OFF(n);
1900 next = n + NODE_STEP_REGNODE;
1901#ifdef DEBUGGING
1902 if (stringok)
1903 stop = n;
1904#endif
1905 n = regnext(n);
1906 }
1907 else if (stringok) {
1908 const int oldl = STR_LEN(scan);
1909 regnode * const nnext = regnext(n);
1910
1911 DEBUG_PEEP("merg",n,depth);
1912
1913 merged++;
1914 if (oldl + STR_LEN(n) > U8_MAX)
1915 break;
1916 NEXT_OFF(scan) += NEXT_OFF(n);
1917 STR_LEN(scan) += STR_LEN(n);
1918 next = n + NODE_SZ_STR(n);
1919 /* Now we can overwrite *n : */
1920 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1921#ifdef DEBUGGING
1922 stop = next - 1;
1923#endif
1924 n = nnext;
1925 if (stopnow) break;
1926 }
1927
1928#ifdef EXPERIMENTAL_INPLACESCAN
1929 if (flags && !NEXT_OFF(n)) {
1930 DEBUG_PEEP("atch",val,depth);
1931 if (reg_off_by_arg[OP(n)]) {
1932 ARG_SET(n, val - n);
1933 }
1934 else {
1935 NEXT_OFF(n) = val - n;
1936 }
1937 stopnow=1;
1938 }
1939#endif
1940 }
1941
1942 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1943 /*
1944 Two problematic code points in Unicode casefolding of EXACT nodes:
1945
1946 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1947 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1948
1949 which casefold to
1950
1951 Unicode UTF-8
1952
1953 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1954 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1955
1956 This means that in case-insensitive matching (or "loose matching",
1957 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1958 length of the above casefolded versions) can match a target string
1959 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1960 This would rather mess up the minimum length computation.
1961
1962 What we'll do is to look for the tail four bytes, and then peek
1963 at the preceding two bytes to see whether we need to decrease
1964 the minimum length by four (six minus two).
1965
1966 Thanks to the design of UTF-8, there cannot be false matches:
1967 A sequence of valid UTF-8 bytes cannot be a subsequence of
1968 another valid sequence of UTF-8 bytes.
1969
1970 */
1971 char * const s0 = STRING(scan), *s, *t;
1972 char * const s1 = s0 + STR_LEN(scan) - 1;
1973 char * const s2 = s1 - 4;
1974 const char t0[] = "\xcc\x88\xcc\x81";
1975 const char * const t1 = t0 + 3;
1976
1977 for (s = s0 + 2;
1978 s < s2 && (t = ninstr(s, s1, t0, t1));
1979 s = t + 4) {
1980 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1981 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1982 *min -= 4;
1983 }
1984 }
1985
1986#ifdef DEBUGGING
1987 /* Allow dumping */
1988 n = scan + NODE_SZ_STR(scan);
1989 while (n <= stop) {
1990 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1991 OP(n) = OPTIMIZED;
1992 NEXT_OFF(n) = 0;
1993 }
1994 n++;
1995 }
1996#endif
1997 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1998 return stopnow;
1999}
2000
653099ff
GS
2001/* REx optimizer. Converts nodes into quickier variants "in place".
2002 Finds fixed substrings. */
2003
a0288114 2004/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2005 to the position after last scanned or to NULL. */
2006
07be1b83
YO
2007
2008
76e3520e 2009STATIC I32
9a957fbc
AL
2010S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2011 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2012 /* scanp: Start here (read-write). */
2013 /* deltap: Write maxlen-minlen here. */
2014 /* last: Stop before this one. */
2015{
97aff369 2016 dVAR;
c277df42
IZ
2017 I32 min = 0, pars = 0, code;
2018 regnode *scan = *scanp, *next;
2019 I32 delta = 0;
2020 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2021 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2022 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2023 scan_data_t data_fake;
653099ff 2024 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
2025 SV *re_trie_maxbuff = NULL;
2026
2027 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2028#ifdef DEBUGGING
2029 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2030#endif
b81d288d 2031
c277df42
IZ
2032 while (scan && OP(scan) != END && scan < last) {
2033 /* Peephole optimizer: */
07be1b83 2034 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2035
07be1b83 2036 JOIN_EXACT(scan,&min,0);
a3621e74 2037
653099ff
GS
2038 /* Follow the next-chain of the current node and optimize
2039 away all the NOTHINGs from it. */
c277df42 2040 if (OP(scan) != CURLYX) {
a3b680e6 2041 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2042 ? I32_MAX
2043 /* I32 may be smaller than U16 on CRAYs! */
2044 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2045 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2046 int noff;
2047 regnode *n = scan;
b81d288d 2048
c277df42
IZ
2049 /* Skip NOTHING and LONGJMP. */
2050 while ((n = regnext(n))
3dab1dad 2051 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2052 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2053 && off + noff < max)
2054 off += noff;
2055 if (reg_off_by_arg[OP(scan)])
2056 ARG(scan) = off;
b81d288d 2057 else
c277df42
IZ
2058 NEXT_OFF(scan) = off;
2059 }
a3621e74 2060
07be1b83 2061
3dab1dad 2062
653099ff
GS
2063 /* The principal pseudo-switch. Cannot be a switch, since we
2064 look into several different things. */
b81d288d 2065 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2066 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2067 next = regnext(scan);
2068 code = OP(scan);
a3621e74 2069 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2070
2071 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 2072 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2073 struct regnode_charclass_class accum;
d4c19fe8 2074 regnode * const startbranch=scan;
c277df42 2075
653099ff 2076 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2077 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2078 if (flags & SCF_DO_STCLASS)
830247a4 2079 cl_init_zero(pRExC_state, &accum);
a3621e74 2080
c277df42 2081 while (OP(scan) == code) {
830247a4 2082 I32 deltanext, minnext, f = 0, fake;
653099ff 2083 struct regnode_charclass_class this_class;
c277df42
IZ
2084
2085 num++;
2086 data_fake.flags = 0;
b81d288d 2087 if (data) {
2c2d71f5 2088 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2089 data_fake.last_closep = data->last_closep;
2090 }
2091 else
2092 data_fake.last_closep = &fake;
c277df42
IZ
2093 next = regnext(scan);
2094 scan = NEXTOPER(scan);
2095 if (code != BRANCH)
2096 scan = NEXTOPER(scan);
653099ff 2097 if (flags & SCF_DO_STCLASS) {
830247a4 2098 cl_init(pRExC_state, &this_class);
653099ff
GS
2099 data_fake.start_class = &this_class;
2100 f = SCF_DO_STCLASS_AND;
b81d288d 2101 }
e1901655
IZ
2102 if (flags & SCF_WHILEM_VISITED_POS)
2103 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2104
653099ff 2105 /* we suppose the run is continuous, last=next...*/
830247a4 2106 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2107 next, &data_fake, f,depth+1);
b81d288d 2108 if (min1 > minnext)
c277df42
IZ
2109 min1 = minnext;
2110 if (max1 < minnext + deltanext)
2111 max1 = minnext + deltanext;
2112 if (deltanext == I32_MAX)
aca2d497 2113 is_inf = is_inf_internal = 1;
c277df42
IZ
2114 scan = next;
2115 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2116 pars++;
3dab1dad
YO
2117 if (data) {
2118 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2119 data->flags |= SF_HAS_EVAL;
2c2d71f5 2120 data->whilem_c = data_fake.whilem_c;
3dab1dad 2121 }
653099ff 2122 if (flags & SCF_DO_STCLASS)
830247a4 2123 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2124 if (code == SUSPEND)
c277df42
IZ
2125 break;
2126 }
2127 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2128 min1 = 0;
2129 if (flags & SCF_DO_SUBSTR) {
2130 data->pos_min += min1;
2131 data->pos_delta += max1 - min1;
2132 if (max1 != min1 || is_inf)
2133 data->longest = &(data->longest_float);
2134 }
2135 min += min1;
2136 delta += max1 - min1;
653099ff 2137 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2138 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2139 if (min1) {
2140 cl_and(data->start_class, &and_with);
2141 flags &= ~SCF_DO_STCLASS;
2142 }
2143 }
2144 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2145 if (min1) {
2146 cl_and(data->start_class, &accum);
653099ff 2147 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2148 }
2149 else {
b81d288d 2150 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2151 * data->start_class */
2152 StructCopy(data->start_class, &and_with,
2153 struct regnode_charclass_class);
2154 flags &= ~SCF_DO_STCLASS_AND;
2155 StructCopy(&accum, data->start_class,
2156 struct regnode_charclass_class);
2157 flags |= SCF_DO_STCLASS_OR;
2158 data->start_class->flags |= ANYOF_EOS;
2159 }
653099ff 2160 }
a3621e74
YO
2161
2162 /* demq.
2163
2164 Assuming this was/is a branch we are dealing with: 'scan' now
2165 points at the item that follows the branch sequence, whatever
2166 it is. We now start at the beginning of the sequence and look
2167 for subsequences of
2168
2169 BRANCH->EXACT=>X
2170 BRANCH->EXACT=>X
2171
2172 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2173
2174 If we can find such a subseqence we need to turn the first
2175 element into a trie and then add the subsequent branch exact
2176 strings to the trie.
2177
2178 We have two cases
2179
2180 1. patterns where the whole set of branch can be converted to a trie,
2181
2182 2. patterns where only a subset of the alternations can be
2183 converted to a trie.
2184
2185 In case 1 we can replace the whole set with a single regop
2186 for the trie. In case 2 we need to keep the start and end
2187 branchs so
2188
2189 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2190 becomes BRANCH TRIE; BRANCH X;
2191
2192 Hypthetically when we know the regex isnt anchored we can
2193 turn a case 1 into a DFA and let it rip... Every time it finds a match
2194 it would just call its tail, no WHILEM/CURLY needed.
2195
2196 */
07be1b83 2197 if (PERL_ENABLE_TRIE_OPTIMISATION) {
3dab1dad 2198 int made=0;
0111c4fd
RGS
2199 if (!re_trie_maxbuff) {
2200 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2201 if (!SvIOK(re_trie_maxbuff))
2202 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2203 }
a3621e74
YO
2204 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2205 regnode *cur;
2206 regnode *first = (regnode *)NULL;
2207 regnode *last = (regnode *)NULL;
2208 regnode *tail = scan;
2209 U8 optype = 0;
2210 U32 count=0;
2211
2212#ifdef DEBUGGING
c445ea15 2213 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2214#endif
2215 /* var tail is used because there may be a TAIL
2216 regop in the way. Ie, the exacts will point to the
2217 thing following the TAIL, but the last branch will
2218 point at the TAIL. So we advance tail. If we
2219 have nested (?:) we may have to move through several
2220 tails.
2221 */
2222
2223 while ( OP( tail ) == TAIL ) {
2224 /* this is the TAIL generated by (?:) */
2225 tail = regnext( tail );
2226 }
2227
3dab1dad 2228
a3621e74 2229 DEBUG_OPTIMISE_r({
32fc9b6a 2230 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2231 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2232 (int)depth * 2 + 2, "",
2233 "Looking for TRIE'able sequences. Tail node is: ",
2234 SvPV_nolen_const( mysv )
a3621e74
YO
2235 );
2236 });
3dab1dad 2237
a3621e74
YO
2238 /*
2239
2240 step through the branches, cur represents each
2241 branch, noper is the first thing to be matched
2242 as part of that branch and noper_next is the
2243 regnext() of that node. if noper is an EXACT
2244 and noper_next is the same as scan (our current
2245 position in the regex) then the EXACT branch is
2246 a possible optimization target. Once we have
2247 two or more consequetive such branches we can
2248 create a trie of the EXACT's contents and stich
2249 it in place. If the sequence represents all of
2250 the branches we eliminate the whole thing and
2251 replace it with a single TRIE. If it is a
2252 subsequence then we need to stitch it in. This
2253 means the first branch has to remain, and needs
2254 to be repointed at the item on the branch chain
2255 following the last branch optimized. This could
2256 be either a BRANCH, in which case the
2257 subsequence is internal, or it could be the
2258 item following the branch sequence in which
2259 case the subsequence is at the end.
2260
2261 */
2262
2263 /* dont use tail as the end marker for this traverse */
2264 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2265 regnode * const noper = NEXTOPER( cur );
2266 regnode * const noper_next = regnext( noper );
a3621e74 2267
a3621e74 2268 DEBUG_OPTIMISE_r({
32fc9b6a 2269 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2270 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2271 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2272
32fc9b6a 2273 regprop(RExC_rx, mysv, noper);
a3621e74 2274 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2275 SvPV_nolen_const(mysv));
a3621e74
YO
2276
2277 if ( noper_next ) {
32fc9b6a 2278 regprop(RExC_rx, mysv, noper_next );
a3621e74 2279 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2280 SvPV_nolen_const(mysv));
a3621e74 2281 }
3dab1dad
YO
2282 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2283 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2284 });
3dab1dad
YO
2285 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2286 : PL_regkind[ OP( noper ) ] == EXACT )
2287 || OP(noper) == NOTHING )
a3621e74
YO
2288 && noper_next == tail && count<U16_MAX)
2289 {
2290 count++;
3dab1dad
YO
2291 if ( !first || optype == NOTHING ) {
2292 if (!first) first = cur;
a3621e74
YO
2293 optype = OP( noper );
2294 } else {
a3621e74 2295 last = cur;
a3621e74
YO
2296 }
2297 } else {
2298 if ( last ) {
3dab1dad 2299 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2300 }
3dab1dad 2301 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2302 && noper_next == tail )
2303 {
2304 count = 1;
2305 first = cur;
2306 optype = OP( noper );
2307 } else {
2308 count = 0;
2309 first = NULL;
2310 optype = 0;
2311 }
2312 last = NULL;
2313 }
2314 }
2315 DEBUG_OPTIMISE_r({
32fc9b6a 2316 regprop(RExC_rx, mysv, cur);
a3621e74 2317 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2318 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2319 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2320
2321 });
2322 if ( last ) {
3dab1dad
YO
2323 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2324#ifdef TRIE_STUDY_OPT
07be1b83
YO
2325 if ( made && startbranch == first ) {
2326 if ( OP(first)!=TRIE )
2327 flags |= SCF_EXACT_TRIE;
2328 else {
2329 regnode *chk=*scanp;
2330 while ( OP( chk ) == OPEN )
2331 chk = regnext( chk );
2332 if (chk==first)
2333 flags |= SCF_EXACT_TRIE;
2334 }
2335 }
3dab1dad 2336#endif
07be1b83 2337 }
a3621e74 2338 }
3dab1dad
YO
2339
2340 } /* do trie */
a0ed51b3 2341 }
a3621e74 2342 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2343 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2344 } else /* single branch is optimized. */
c277df42
IZ
2345 scan = NEXTOPER(scan);
2346 continue;
a0ed51b3
LW
2347 }
2348 else if (OP(scan) == EXACT) {
cd439c50 2349 I32 l = STR_LEN(scan);
c445ea15 2350 UV uc;
a0ed51b3 2351 if (UTF) {
a3b680e6 2352 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2353 l = utf8_length(s, s + l);
9041c2e3 2354 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2355 } else {
2356 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2357 }
2358 min += l;
c277df42 2359 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2360 /* The code below prefers earlier match for fixed
2361 offset, later match for variable offset. */
2362 if (data->last_end == -1) { /* Update the start info. */
2363 data->last_start_min = data->pos_min;
2364 data->last_start_max = is_inf
b81d288d 2365 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2366 }
cd439c50 2367 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2368 if (UTF)
2369 SvUTF8_on(data->last_found);
0eda9292 2370 {
9a957fbc 2371 SV * const sv = data->last_found;
a28509cc 2372 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2373 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2374 if (mg && mg->mg_len >= 0)
5e43f467
JH
2375 mg->mg_len += utf8_length((U8*)STRING(scan),
2376 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2377 }
c277df42
IZ
2378 data->last_end = data->pos_min + l;
2379 data->pos_min += l; /* As in the first entry. */
2380 data->flags &= ~SF_BEFORE_EOL;
2381 }
653099ff
GS
2382 if (flags & SCF_DO_STCLASS_AND) {
2383 /* Check whether it is compatible with what we know already! */
2384 int compat = 1;
2385
1aa99e6b 2386 if (uc >= 0x100 ||
516a5887 2387 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2388 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2389 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2390 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2391 )
653099ff
GS
2392 compat = 0;
2393 ANYOF_CLASS_ZERO(data->start_class);
2394 ANYOF_BITMAP_ZERO(data->start_class);
2395 if (compat)
1aa99e6b 2396 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2397 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2398 if (uc < 0x100)
2399 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2400 }
2401 else if (flags & SCF_DO_STCLASS_OR) {
2402 /* false positive possible if the class is case-folded */
1aa99e6b 2403 if (uc < 0x100)
9b877dbb
IH
2404 ANYOF_BITMAP_SET(data->start_class, uc);
2405 else
2406 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2407 data->start_class->flags &= ~ANYOF_EOS;
2408 cl_and(data->start_class, &and_with);
2409 }
2410 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2411 }
3dab1dad 2412 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2413 I32 l = STR_LEN(scan);
1aa99e6b 2414 UV uc = *((U8*)STRING(scan));
653099ff
GS
2415
2416 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2417 if (flags & SCF_DO_SUBSTR) {
2418 assert(data);
830247a4 2419 scan_commit(pRExC_state, data);
ecaa9b9c 2420 }
a0ed51b3 2421 if (UTF) {
6136c704 2422 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2423 l = utf8_length(s, s + l);
9041c2e3 2424 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2425 }
2426 min += l;
ecaa9b9c 2427 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2428 data->pos_min += l;
653099ff
GS
2429 if (flags & SCF_DO_STCLASS_AND) {
2430 /* Check whether it is compatible with what we know already! */
2431 int compat = 1;
2432
1aa99e6b 2433 if (uc >= 0x100 ||
516a5887 2434 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2435 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2436 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2437 compat = 0;
2438 ANYOF_CLASS_ZERO(data->start_class);
2439 ANYOF_BITMAP_ZERO(data->start_class);
2440 if (compat) {
1aa99e6b 2441 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2442 data->start_class->flags &= ~ANYOF_EOS;
2443 data->start_class->flags |= ANYOF_FOLD;
2444 if (OP(scan) == EXACTFL)
2445 data->start_class->flags |= ANYOF_LOCALE;
2446 }
2447 }
2448 else if (flags & SCF_DO_STCLASS_OR) {
2449 if (data->start_class->flags & ANYOF_FOLD) {
2450 /* false positive possible if the class is case-folded.
2451 Assume that the locale settings are the same... */
1aa99e6b
IH
2452 if (uc < 0x100)
2453 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2454 data->start_class->flags &= ~ANYOF_EOS;
2455 }
2456 cl_and(data->start_class, &and_with);
2457 }
2458 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2459 }
3dab1dad
YO
2460#ifdef TRIE_STUDY_OPT
2461 else if (OP(scan) == TRIE) {
2462 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2463 min += trie->minlen;
07be1b83 2464 delta += (trie->maxlen - trie->minlen);
3dab1dad
YO
2465 flags &= ~SCF_DO_STCLASS; /* xxx */
2466 if (flags & SCF_DO_SUBSTR) {
2467 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2468 data->pos_min += trie->minlen;
07be1b83
YO
2469 data->pos_delta += (trie->maxlen - trie->minlen);
2470 if (trie->maxlen != trie->minlen)
2471 data->longest = &(data->longest_float);
3dab1dad
YO
2472 }
2473 }
2474#endif
bfed75c6 2475 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2476 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2477 I32 f = flags, pos_before = 0;
d4c19fe8 2478 regnode * const oscan = scan;
653099ff
GS
2479 struct regnode_charclass_class this_class;
2480 struct regnode_charclass_class *oclass = NULL;
727f22e3 2481 I32 next_is_eval = 0;
653099ff 2482
3dab1dad 2483 switch (PL_regkind[OP(scan)]) {
653099ff 2484 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2485 scan = NEXTOPER(scan);
2486 goto finish;
2487 case PLUS:
653099ff 2488 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2489 next = NEXTOPER(scan);
653099ff 2490 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2491 mincount = 1;
2492 maxcount = REG_INFTY;
c277df42
IZ
2493 next = regnext(scan);
2494 scan = NEXTOPER(scan);
2495 goto do_curly;
2496 }
2497 }
2498 if (flags & SCF_DO_SUBSTR)
2499 data->pos_min++;
2500 min++;
2501 /* Fall through. */
2502 case STAR:
653099ff
GS
2503 if (flags & SCF_DO_STCLASS) {
2504 mincount = 0;
b81d288d 2505 maxcount = REG_INFTY;
653099ff
GS
2506 next = regnext(scan);
2507 scan = NEXTOPER(scan);
2508 goto do_curly;
2509 }
b81d288d 2510 is_inf = is_inf_internal = 1;
c277df42
IZ
2511 scan = regnext(scan);
2512 if (flags & SCF_DO_SUBSTR) {
830247a4 2513 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2514 data->longest = &(data->longest_float);
2515 }
2516 goto optimize_curly_tail;
2517 case CURLY:
b81d288d 2518 mincount = ARG1(scan);
c277df42
IZ
2519 maxcount = ARG2(scan);
2520 next = regnext(scan);
cb434fcc
IZ
2521 if (OP(scan) == CURLYX) {
2522 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2523 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2524 }
c277df42 2525 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2526 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2527 do_curly:
2528 if (flags & SCF_DO_SUBSTR) {
830247a4 2529 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2530 pos_before = data->pos_min;
2531 }
2532 if (data) {
2533 fl = data->flags;
2534 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2535 if (is_inf)
2536 data->flags |= SF_IS_INF;
2537 }
653099ff 2538 if (flags & SCF_DO_STCLASS) {
830247a4 2539 cl_init(pRExC_state, &this_class);
653099ff
GS
2540 oclass = data->start_class;
2541 data->start_class = &this_class;
2542 f |= SCF_DO_STCLASS_AND;
2543 f &= ~SCF_DO_STCLASS_OR;
2544 }
e1901655
IZ
2545 /* These are the cases when once a subexpression
2546 fails at a particular position, it cannot succeed
2547 even after backtracking at the enclosing scope.
b81d288d 2548
e1901655
IZ
2549 XXXX what if minimal match and we are at the
2550 initial run of {n,m}? */
2551 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2552 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2553
c277df42 2554 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2555 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2556 (mincount == 0
2557 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2558
2559 if (flags & SCF_DO_STCLASS)
2560 data->start_class = oclass;
2561 if (mincount == 0 || minnext == 0) {
2562 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2563 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2564 }
2565 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2566 /* Switch to OR mode: cache the old value of
653099ff
GS
2567 * data->start_class */
2568 StructCopy(data->start_class, &and_with,
2569 struct regnode_charclass_class);
2570 flags &= ~SCF_DO_STCLASS_AND;
2571 StructCopy(&this_class, data->start_class,
2572 struct regnode_charclass_class);
2573 flags |= SCF_DO_STCLASS_OR;
2574 data->start_class->flags |= ANYOF_EOS;
2575 }
2576 } else { /* Non-zero len */
2577 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2578 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2579 cl_and(data->start_class, &and_with);
2580 }
2581 else if (flags & SCF_DO_STCLASS_AND)
2582 cl_and(data->start_class, &this_class);
2583 flags &= ~SCF_DO_STCLASS;
2584 }
c277df42
IZ
2585 if (!scan) /* It was not CURLYX, but CURLY. */
2586 scan = next;
041457d9
DM
2587 if ( /* ? quantifier ok, except for (?{ ... }) */
2588 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2589 && (minnext == 0) && (deltanext == 0)
99799961 2590 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2591 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2592 && ckWARN(WARN_REGEXP))
b45f050a 2593 {
830247a4 2594 vWARN(RExC_parse,
b45f050a
JF
2595 "Quantifier unexpected on zero-length expression");
2596 }
2597
c277df42 2598 min += minnext * mincount;
b81d288d 2599 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2600 && (minnext + deltanext) > 0)
2601 || deltanext == I32_MAX);
aca2d497 2602 is_inf |= is_inf_internal;
c277df42
IZ
2603 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2604
2605 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2606 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2607 && data->flags & SF_IN_PAR
2608 && !(data->flags & SF_HAS_EVAL)
2609 && !deltanext && minnext == 1 ) {
2610 /* Try to optimize to CURLYN. */
2611 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2612 regnode * const nxt1 = nxt;
497b47a8
JH
2613#ifdef DEBUGGING
2614 regnode *nxt2;
2615#endif
c277df42
IZ
2616
2617 /* Skip open. */
2618 nxt = regnext(nxt);
bfed75c6 2619 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2620 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2621 && STR_LEN(nxt) == 1))
c277df42 2622 goto nogo;
497b47a8 2623#ifdef DEBUGGING
c277df42 2624 nxt2 = nxt;
497b47a8 2625#endif
c277df42 2626 nxt = regnext(nxt);
b81d288d 2627 if (OP(nxt) != CLOSE)
c277df42
IZ
2628 goto nogo;
2629 /* Now we know that nxt2 is the only contents: */
eb160463 2630 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2631 OP(oscan) = CURLYN;
2632 OP(nxt1) = NOTHING; /* was OPEN. */
2633#ifdef DEBUGGING
2634 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2635 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2636 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2637 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2638 OP(nxt + 1) = OPTIMIZED; /* was count. */
2639 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2640#endif
c277df42 2641 }
c277df42
IZ
2642 nogo:
2643
2644 /* Try optimization CURLYX => CURLYM. */
b81d288d 2645 if ( OP(oscan) == CURLYX && data
c277df42 2646 && !(data->flags & SF_HAS_PAR)
c277df42 2647 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2648 && !deltanext /* atom is fixed width */
2649 && minnext != 0 /* CURLYM can't handle zero width */
2650 ) {
c277df42
IZ
2651 /* XXXX How to optimize if data == 0? */
2652 /* Optimize to a simpler form. */
2653 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2654 regnode *nxt2;
2655
2656 OP(oscan) = CURLYM;
2657 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2658 && (OP(nxt2) != WHILEM))
c277df42
IZ
2659 nxt = nxt2;
2660 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2661 /* Need to optimize away parenths. */
2662 if (data->flags & SF_IN_PAR) {
2663 /* Set the parenth number. */
2664 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2665
b81d288d 2666 if (OP(nxt) != CLOSE)
b45f050a 2667 FAIL("Panic opt close");
eb160463 2668 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2669 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2670 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2671#ifdef DEBUGGING
2672 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2673 OP(nxt + 1) = OPTIMIZED; /* was count. */
2674 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2675 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2676#endif
c277df42
IZ
2677#if 0
2678 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2679 regnode *nnxt = regnext(nxt1);
b81d288d 2680
c277df42
IZ
2681 if (nnxt == nxt) {
2682 if (reg_off_by_arg[OP(nxt1)])
2683 ARG_SET(nxt1, nxt2 - nxt1);
2684 else if (nxt2 - nxt1 < U16_MAX)
2685 NEXT_OFF(nxt1) = nxt2 - nxt1;
2686 else
2687 OP(nxt) = NOTHING; /* Cannot beautify */
2688 }
2689 nxt1 = nnxt;
2690 }
2691#endif
2692 /* Optimize again: */
b81d288d 2693 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2694 NULL, 0,depth+1);
a0ed51b3
LW
2695 }
2696 else
c277df42 2697 oscan->flags = 0;
c277df42 2698 }
e1901655
IZ
2699 else if ((OP(oscan) == CURLYX)
2700 && (flags & SCF_WHILEM_VISITED_POS)
2701 /* See the comment on a similar expression above.
2702 However, this time it not a subexpression
2703 we care about, but the expression itself. */
2704 && (maxcount == REG_INFTY)
2705 && data && ++data->whilem_c < 16) {
2706 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2707 /* Find WHILEM (as in regexec.c) */
2708 regnode *nxt = oscan + NEXT_OFF(oscan);
2709
2710 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2711 nxt += ARG(nxt);
eb160463
GS
2712 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2713 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2714 }
b81d288d 2715 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2716 pars++;
2717 if (flags & SCF_DO_SUBSTR) {
c445ea15 2718 SV *last_str = NULL;
c277df42
IZ
2719 int counted = mincount != 0;
2720
2721 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2722#if defined(SPARC64_GCC_WORKAROUND)
2723 I32 b = 0;
2724 STRLEN l = 0;
cfd0369c 2725 const char *s = NULL;
5d1c421c
JH
2726 I32 old = 0;
2727
2728 if (pos_before >= data->last_start_min)
2729 b = pos_before;
2730 else
2731 b = data->last_start_min;
2732
2733 l = 0;
cfd0369c 2734 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2735 old = b - data->last_start_min;
2736
2737#else
b81d288d 2738 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2739 ? pos_before : data->last_start_min;
2740 STRLEN l;
d4c19fe8 2741 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2742 I32 old = b - data->last_start_min;
5d1c421c 2743#endif
a0ed51b3
LW
2744
2745 if (UTF)
2746 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2747
a0ed51b3 2748 l -= old;
c277df42 2749 /* Get the added string: */
79cb57f6 2750 last_str = newSVpvn(s + old, l);
0e933229
IH
2751 if (UTF)
2752 SvUTF8_on(last_str);
c277df42
IZ
2753 if (deltanext == 0 && pos_before == b) {
2754 /* What was added is a constant string */
2755 if (mincount > 1) {
2756 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2757 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2758 SvPVX_const(last_str), l, mincount - 1);
b162af07 2759 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2760 /* Add additional parts. */
b81d288d 2761 SvCUR_set(data->last_found,
c277df42
IZ
2762 SvCUR(data->last_found) - l);
2763 sv_catsv(data->last_found, last_str);
0eda9292
JH
2764 {
2765 SV * sv = data->last_found;
2766 MAGIC *mg =
2767 SvUTF8(sv) && SvMAGICAL(sv) ?
2768 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2769 if (mg && mg->mg_len >= 0)
2770 mg->mg_len += CHR_SVLEN(last_str);
2771 }
c277df42
IZ
2772 data->last_end += l * (mincount - 1);
2773 }
2a8d9689
HS
2774 } else {
2775 /* start offset must point into the last copy */
2776 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2777 data->last_start_max += is_inf ? I32_MAX
2778 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2779 }
2780 }
2781 /* It is counted once already... */
2782 data->pos_min += minnext * (mincount - counted);
2783 data->pos_delta += - counted * deltanext +
2784 (minnext + deltanext) * maxcount - minnext * mincount;
2785 if (mincount != maxcount) {
653099ff
GS
2786 /* Cannot extend fixed substrings found inside
2787 the group. */
830247a4 2788 scan_commit(pRExC_state,data);
c277df42 2789 if (mincount && last_str) {
d4c19fe8
AL
2790 SV * const sv = data->last_found;
2791 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2792 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2793
2794 if (mg)
2795 mg->mg_len = -1;
2796 sv_setsv(sv, last_str);
c277df42 2797 data->last_end = data->pos_min;
b81d288d 2798 data->last_start_min =
a0ed51b3 2799 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2800 data->last_start_max = is_inf
2801 ? I32_MAX
c277df42 2802 : data->pos_min + data->pos_delta
a0ed51b3 2803 - CHR_SVLEN(last_str);
c277df42
IZ
2804 }
2805 data->longest = &(data->longest_float);
2806 }
aca2d497 2807 SvREFCNT_dec(last_str);
c277df42 2808 }
405ff068 2809 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2810 data->flags |= SF_HAS_EVAL;
2811 optimize_curly_tail:
c277df42 2812 if (OP(oscan) != CURLYX) {
3dab1dad 2813 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2814 && NEXT_OFF(next))
2815 NEXT_OFF(oscan) += NEXT_OFF(next);
2816 }
c277df42 2817 continue;
653099ff 2818 default: /* REF and CLUMP only? */
c277df42 2819 if (flags & SCF_DO_SUBSTR) {
830247a4 2820 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2821 data->longest = &(data->longest_float);
2822 }
aca2d497 2823 is_inf = is_inf_internal = 1;
653099ff 2824 if (flags & SCF_DO_STCLASS_OR)
830247a4 2825 cl_anything(pRExC_state, data->start_class);
653099ff 2826 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2827 break;
2828 }
a0ed51b3 2829 }
bfed75c6 2830 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2831 int value = 0;
653099ff 2832
c277df42 2833 if (flags & SCF_DO_SUBSTR) {
830247a4 2834 scan_commit(pRExC_state,data);
c277df42
IZ
2835 data->pos_min++;
2836 }
2837 min++;
653099ff
GS
2838 if (flags & SCF_DO_STCLASS) {
2839 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2840
2841 /* Some of the logic below assumes that switching
2842 locale on will only add false positives. */
3dab1dad 2843 switch (PL_regkind[OP(scan)]) {
653099ff 2844 case SANY:
653099ff
GS
2845 default:
2846 do_default:
2847 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2848 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2849 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2850 break;
2851 case REG_ANY:
2852 if (OP(scan) == SANY)
2853 goto do_default;
2854 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2855 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2856 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2857 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2858 }
2859 if (flags & SCF_DO_STCLASS_AND || !value)
2860 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2861 break;
2862 case ANYOF:
2863 if (flags & SCF_DO_STCLASS_AND)
2864 cl_and(data->start_class,
2865 (struct regnode_charclass_class*)scan);
2866 else
830247a4 2867 cl_or(pRExC_state, data->start_class,
653099ff
GS
2868 (struct regnode_charclass_class*)scan);
2869 break;
2870 case ALNUM:
2871 if (flags & SCF_DO_STCLASS_AND) {
2872 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2873 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2874 for (value = 0; value < 256; value++)
2875 if (!isALNUM(value))
2876 ANYOF_BITMAP_CLEAR(data->start_class, value);
2877 }
2878 }
2879 else {
2880 if (data->start_class->flags & ANYOF_LOCALE)
2881 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2882 else {
2883 for (value = 0; value < 256; value++)
2884 if (isALNUM(value))
b81d288d 2885 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2886 }
2887 }
2888 break;
2889 case ALNUML:
2890 if (flags & SCF_DO_STCLASS_AND) {
2891 if (data->start_class->flags & ANYOF_LOCALE)
2892 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2893 }
2894 else {
2895 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2896 data->start_class->flags |= ANYOF_LOCALE;
2897 }
2898 break;
2899 case NALNUM:
2900 if (flags & SCF_DO_STCLASS_AND) {
2901 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2902 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2903 for (value = 0; value < 256; value++)
2904 if (isALNUM(value))
2905 ANYOF_BITMAP_CLEAR(data->start_class, value);
2906 }
2907 }
2908 else {
2909 if (data->start_class->flags & ANYOF_LOCALE)
2910 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2911 else {
2912 for (value = 0; value < 256; value++)
2913 if (!isALNUM(value))
b81d288d 2914 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2915 }
2916 }
2917 break;
2918 case NALNUML:
2919 if (flags & SCF_DO_STCLASS_AND) {
2920 if (data->start_class->flags & ANYOF_LOCALE)
2921 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2922 }
2923 else {
2924 data->start_class->flags |= ANYOF_LOCALE;
2925 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2926 }
2927 break;
2928 case SPACE:
2929 if (flags & SCF_DO_STCLASS_AND) {
2930 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2931 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2932 for (value = 0; value < 256; value++)
2933 if (!isSPACE(value))
2934 ANYOF_BITMAP_CLEAR(data->start_class, value);
2935 }
2936 }
2937 else {
2938 if (data->start_class->flags & ANYOF_LOCALE)
2939 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2940 else {
2941 for (value = 0; value < 256; value++)
2942 if (isSPACE(value))
b81d288d 2943 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2944 }
2945 }
2946 break;
2947 case SPACEL:
2948 if (flags & SCF_DO_STCLASS_AND) {
2949 if (data->start_class->flags & ANYOF_LOCALE)
2950 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2951 }
2952 else {
2953 data->start_class->flags |= ANYOF_LOCALE;
2954 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2955 }
2956 break;
2957 case NSPACE:
2958 if (flags & SCF_DO_STCLASS_AND) {
2959 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2960 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2961 for (value = 0; value < 256; value++)
2962 if (isSPACE(value))
2963 ANYOF_BITMAP_CLEAR(data->start_class, value);
2964 }
2965 }
2966 else {
2967 if (data->start_class->flags & ANYOF_LOCALE)
2968 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2969 else {
2970 for (value = 0; value < 256; value++)
2971 if (!isSPACE(value))
b81d288d 2972 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2973 }
2974 }
2975 break;
2976 case NSPACEL:
2977 if (flags & SCF_DO_STCLASS_AND) {
2978 if (data->start_class->flags & ANYOF_LOCALE) {
2979 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2980 for (value = 0; value < 256; value++)
2981 if (!isSPACE(value))
2982 ANYOF_BITMAP_CLEAR(data->start_class, value);
2983 }
2984 }
2985 else {
2986 data->start_class->flags |= ANYOF_LOCALE;
2987 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2988 }
2989 break;
2990 case DIGIT:
2991 if (flags & SCF_DO_STCLASS_AND) {
2992 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2993 for (value = 0; value < 256; value++)
2994 if (!isDIGIT(value))
2995 ANYOF_BITMAP_CLEAR(data->start_class, value);
2996 }
2997 else {
2998 if (data->start_class->flags & ANYOF_LOCALE)
2999 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3000 else {
3001 for (value = 0; value < 256; value++)
3002 if (isDIGIT(value))
b81d288d 3003 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3004 }
3005 }
3006 break;
3007 case NDIGIT:
3008 if (flags & SCF_DO_STCLASS_AND) {
3009 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3010 for (value = 0; value < 256; value++)
3011 if (isDIGIT(value))
3012 ANYOF_BITMAP_CLEAR(data->start_class, value);
3013 }
3014 else {
3015 if (data->start_class->flags & ANYOF_LOCALE)
3016 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3017 else {
3018 for (value = 0; value < 256; value++)
3019 if (!isDIGIT(value))
b81d288d 3020 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3021 }
3022 }
3023 break;
3024 }
3025 if (flags & SCF_DO_STCLASS_OR)
3026 cl_and(data->start_class, &and_with);
3027 flags &= ~SCF_DO_STCLASS;
3028 }
a0ed51b3 3029 }
3dab1dad 3030 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3031 data->flags |= (OP(scan) == MEOL
3032 ? SF_BEFORE_MEOL
3033 : SF_BEFORE_SEOL);
a0ed51b3 3034 }
3dab1dad 3035 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3036 /* Lookbehind, or need to calculate parens/evals/stclass: */
3037 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3038 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3039 /* Lookahead/lookbehind */
cb434fcc 3040 I32 deltanext, minnext, fake = 0;
c277df42 3041 regnode *nscan;
653099ff
GS
3042 struct regnode_charclass_class intrnl;
3043 int f = 0;
c277df42
IZ
3044
3045 data_fake.flags = 0;
b81d288d 3046 if (data) {
2c2d71f5 3047 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3048 data_fake.last_closep = data->last_closep;
3049 }
3050 else
3051 data_fake.last_closep = &fake;
653099ff
GS
3052 if ( flags & SCF_DO_STCLASS && !scan->flags
3053 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3054 cl_init(pRExC_state, &intrnl);
653099ff 3055 data_fake.start_class = &intrnl;
e1901655 3056 f |= SCF_DO_STCLASS_AND;
653099ff 3057 }
e1901655
IZ
3058 if (flags & SCF_WHILEM_VISITED_POS)
3059 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3060 next = regnext(scan);
3061 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3062 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3063 if (scan->flags) {
3064 if (deltanext) {
9baa0206 3065 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
3066 }
3067 else if (minnext > U8_MAX) {
9baa0206 3068 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3069 }
eb160463 3070 scan->flags = (U8)minnext;
c277df42 3071 }
be8e71aa
YO
3072 if (data) {
3073 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3074 pars++;
3075 if (data_fake.flags & SF_HAS_EVAL)
3076 data->flags |= SF_HAS_EVAL;
2c2d71f5 3077 data->whilem_c = data_fake.whilem_c;
be8e71aa 3078 }
e1901655 3079 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3080 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3081
3082 cl_and(data->start_class, &intrnl);
3083 if (was)
3084 data->start_class->flags |= ANYOF_EOS;
3085 }
a0ed51b3
LW
3086 }
3087 else if (OP(scan) == OPEN) {
c277df42 3088 pars++;
a0ed51b3 3089 }
cb434fcc 3090 else if (OP(scan) == CLOSE) {
eb160463 3091 if ((I32)ARG(scan) == is_par) {
cb434fcc 3092 next = regnext(scan);
c277df42 3093
cb434fcc
IZ
3094 if ( next && (OP(next) != WHILEM) && next < last)
3095 is_par = 0; /* Disable optimization */
3096 }
3097 if (data)
3098 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3099 }
3100 else if (OP(scan) == EVAL) {
c277df42
IZ
3101 if (data)
3102 data->flags |= SF_HAS_EVAL;
3103 }
96776eda 3104 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3105 if (flags & SCF_DO_SUBSTR) {
830247a4 3106 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3107 data->longest = &(data->longest_float);
3108 }
3109 is_inf = is_inf_internal = 1;
653099ff 3110 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3111 cl_anything(pRExC_state, data->start_class);
96776eda 3112 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3113 }
c277df42
IZ
3114 /* Else: zero-length, ignore. */
3115 scan = regnext(scan);
3116 }
3117
3118 finish:
3119 *scanp = scan;
aca2d497 3120 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3121 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
3122 data->pos_delta = I32_MAX - data->pos_min;
3123 if (is_par > U8_MAX)
3124 is_par = 0;
3125 if (is_par && pars==1 && data) {
3126 data->flags |= SF_IN_PAR;
3127 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3128 }
3129 else if (pars && data) {
c277df42
IZ
3130 data->flags |= SF_HAS_PAR;
3131 data->flags &= ~SF_IN_PAR;
3132 }
653099ff
GS
3133 if (flags & SCF_DO_STCLASS_OR)
3134 cl_and(data->start_class, &and_with);
07be1b83
YO
3135 if (flags & SCF_EXACT_TRIE)
3136 data->flags |= SCF_EXACT_TRIE;
c277df42
IZ
3137 return min;
3138}
3139
76e3520e 3140STATIC I32
5f66b61c 3141S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3142{
830247a4 3143 if (RExC_rx->data) {
b81d288d
AB
3144 Renewc(RExC_rx->data,
3145 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3146 char, struct reg_data);
830247a4
IZ
3147 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3148 RExC_rx->data->count += n;
a0ed51b3
LW
3149 }
3150 else {
a02a5408 3151 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3152 char, struct reg_data);
a02a5408 3153 Newx(RExC_rx->data->what, n, U8);
830247a4 3154 RExC_rx->data->count = n;
c277df42 3155 }
830247a4
IZ
3156 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3157 return RExC_rx->data->count - n;
c277df42
IZ
3158}
3159
76234dfb 3160#ifndef PERL_IN_XSUB_RE
d88dccdf 3161void
864dbfa3 3162Perl_reginitcolors(pTHX)
d88dccdf 3163{
97aff369 3164 dVAR;
1df70142 3165 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3166 if (s) {
1df70142
AL
3167 char *t = savepv(s);
3168 int i = 0;
3169 PL_colors[0] = t;
d88dccdf 3170 while (++i < 6) {
1df70142
AL
3171 t = strchr(t, '\t');
3172 if (t) {
3173 *t = '\0';
3174 PL_colors[i] = ++t;
d88dccdf
IZ
3175 }
3176 else
1df70142 3177 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3178 }
3179 } else {
1df70142 3180 int i = 0;
b81d288d 3181 while (i < 6)
06b5626a 3182 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3183 }
3184 PL_colorset = 1;
3185}
76234dfb 3186#endif
8615cb43 3187
07be1b83 3188
a687059c 3189/*
e50aee73 3190 - pregcomp - compile a regular expression into internal code
a687059c
LW
3191 *
3192 * We can't allocate space until we know how big the compiled form will be,
3193 * but we can't compile it (and thus know how big it is) until we've got a
3194 * place to put the code. So we cheat: we compile it twice, once with code
3195 * generation turned off and size counting turned on, and once "for real".
3196 * This also means that we don't allocate space until we are sure that the
3197 * thing really will compile successfully, and we never have to move the
3198 * code and thus invalidate pointers into it. (Note that it has to be in
3199 * one piece because free() must be able to free it all.) [NB: not true in perl]
3200 *
3201 * Beware that the optimization-preparation code in here knows about some
3202 * of the structure of the compiled regexp. [I'll say.]
3203 */
3204regexp *
864dbfa3 3205Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3206{
97aff369 3207 dVAR;
a0d0e21e 3208 register regexp *r;
c277df42 3209 regnode *scan;
c277df42 3210 regnode *first;
a0d0e21e 3211 I32 flags;
a0d0e21e
LW
3212 I32 minlen = 0;
3213 I32 sawplus = 0;
3214 I32 sawopen = 0;
2c2d71f5 3215 scan_data_t data;
830247a4 3216 RExC_state_t RExC_state;
be8e71aa 3217 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3218#ifdef TRIE_STUDY_OPT
3219 int restudied= 0;
3220 RExC_state_t copyRExC_state;
3221#endif
a0d0e21e 3222
a3621e74
YO
3223 GET_RE_DEBUG_FLAGS_DECL;
3224
a0d0e21e 3225 if (exp == NULL)
c277df42 3226 FAIL("NULL regexp argument");
a0d0e21e 3227
a5961de5 3228 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3229
5cfc7842 3230 RExC_precomp = exp;
a3621e74
YO
3231 DEBUG_r(if (!PL_colorset) reginitcolors());
3232 DEBUG_COMPILE_r({
3233 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3234 PL_colors[4],PL_colors[5],PL_colors[0],
3235 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3236 });
e2509266 3237 RExC_flags = pm->op_pmflags;
830247a4 3238 RExC_sawback = 0;
bbce6d69 3239
830247a4
IZ
3240 RExC_seen = 0;
3241 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3242 RExC_seen_evals = 0;
3243 RExC_extralen = 0;
c277df42 3244
bbce6d69 3245 /* First pass: determine size, legality. */
830247a4 3246 RExC_parse = exp;
fac92740 3247 RExC_start = exp;
830247a4
IZ
3248 RExC_end = xend;
3249 RExC_naughty = 0;
3250 RExC_npar = 1;
3251 RExC_size = 0L;
3252 RExC_emit = &PL_regdummy;
3253 RExC_whilem_seen = 0;
85ddcde9
JH
3254#if 0 /* REGC() is (currently) a NOP at the first pass.
3255 * Clever compilers notice this and complain. --jhi */
830247a4 3256 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3257#endif
3dab1dad
YO
3258 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3259 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3260 RExC_precomp = NULL;
a0d0e21e
LW
3261 return(NULL);
3262 }
3dab1dad
YO
3263 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3264 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3265 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3266 DEBUG_PARSE_r({
3267 RExC_lastnum=0;
3268 RExC_lastparse=NULL;
3269 });
c277df42 3270
07be1b83 3271
c277df42
IZ
3272 /* Small enough for pointer-storage convention?
3273 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3274 if (RExC_size >= 0x10000L && RExC_extralen)
3275 RExC_size += RExC_extralen;
c277df42 3276 else
830247a4
IZ
3277 RExC_extralen = 0;
3278 if (RExC_whilem_seen > 15)
3279 RExC_whilem_seen = 15;
a0d0e21e 3280
bbce6d69 3281 /* Allocate space and initialize. */
a02a5408 3282 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3283 char, regexp);
a0d0e21e 3284 if (r == NULL)
b45f050a
JF
3285 FAIL("Regexp out of space");
3286
0f79a09d
GS
3287#ifdef DEBUGGING
3288 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3289 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3290#endif
c277df42 3291 r->refcnt = 1;
bbce6d69 3292 r->prelen = xend - exp;
5cfc7842 3293 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3294 r->subbeg = NULL;
f8c7b90f 3295#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3296 r->saved_copy = NULL;
ed252734 3297#endif
cf93c79d 3298 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3299 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3300 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3301
3302 r->substrs = 0; /* Useful during FAIL. */
3303 r->startp = 0; /* Useful during FAIL. */
3304 r->endp = 0; /* Useful during FAIL. */
3305
a02a5408 3306 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3307 if (r->offsets) {
2af232bd 3308 r->offsets[0] = RExC_size;
fac92740 3309 }
a3621e74 3310 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3311 "%s %"UVuf" bytes for offset annotations.\n",
3312 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3313 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3314
830247a4 3315 RExC_rx = r;
bbce6d69 3316
3317 /* Second pass: emit code. */
e2509266 3318 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3319 RExC_parse = exp;
3320 RExC_end = xend;
3321 RExC_naughty = 0;
3322 RExC_npar = 1;
fac92740 3323 RExC_emit_start = r->program;
830247a4 3324 RExC_emit = r->program;
2cd61cdb 3325 /* Store the count of eval-groups for security checks: */
eb160463 3326 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3327 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3328 r->data = 0;
3dab1dad 3329 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3330 return(NULL);
07be1b83
YO
3331 /* XXXX To minimize changes to RE engine we always allocate
3332 3-units-long substrs field. */
3333 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3334
07be1b83
YO
3335reStudy:
3336 Zero(r->substrs, 1, struct reg_substr_data);
3337 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3338
07be1b83
YO
3339#ifdef TRIE_STUDY_OPT
3340 if ( restudied ) {
3341 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3342 RExC_state=copyRExC_state;
3343 if (data.longest_fixed)
3344 SvREFCNT_dec(data.longest_fixed);
3345 if (data.longest_float)
3346 SvREFCNT_dec(data.longest_float);
3347 if (data.last_found)
3348 SvREFCNT_dec(data.last_found);
3349 } else {
3350 copyRExC_state=RExC_state;
3351 }
3352#endif
a0d0e21e 3353 /* Dig out information for optimizations. */
cf93c79d 3354 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3355 pm->op_pmflags = RExC_flags;
a0ed51b3 3356 if (UTF)
5ff6fc6d 3357 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3358 r->regstclass = NULL;
830247a4 3359 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3360 r->reganch |= ROPT_NAUGHTY;
c277df42 3361 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3362
653099ff 3363 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3364 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3365 I32 fake;
c5254dd6 3366 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3367 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3368 int stclass_flag;
07be1b83 3369 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3370
3371 first = scan;
c277df42 3372 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3373 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3374 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3375 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3376 /* for now we can't handle lookbehind IFMATCH*/
3377 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3378 (OP(first) == PLUS) ||
3379 (OP(first) == MINMOD) ||
653099ff 3380 /* An {n,m} with n>0 */
07be1b83
YO
3381 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3382 {
3383 DEBUG_PEEP("first:",first,0);
a0d0e21e
LW
3384 if (OP(first) == PLUS)
3385 sawplus = 1;
3386 else
3dab1dad 3387 first += regarglen[OP(first)];
07be1b83
YO
3388 if (OP(first) == IFMATCH) {
3389 first = NEXTOPER(first);
3390 first += EXTRA_STEP_2ARGS;
7c167cea 3391 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3392 first = NEXTOPER(first);
a687059c
LW
3393 }
3394
a0d0e21e
LW
3395 /* Starting-point info. */
3396 again:
07be1b83 3397 /* Ignore EXACT as we deal with it later. */
3dab1dad 3398 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3399 if (OP(first) == EXACT)
6f207bd3 3400 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3401 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3402 r->regstclass = first;
3403 }
07be1b83
YO
3404#ifdef TRIE_STCLASS
3405 else if (OP(first) == TRIE &&
3406 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3407 {
3408 /* this can happen only on restudy */
3409 struct regnode_1 *trie_op;
3410 Newxz(trie_op,1,struct regnode_1);
3411 StructCopy(first,trie_op,struct regnode_1);
3412 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3413 r->regstclass = (regnode *)trie_op;
3414 }
3415#endif
bfed75c6 3416 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3417 r->regstclass = first;
3dab1dad
YO
3418 else if (PL_regkind[OP(first)] == BOUND ||
3419 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3420 r->regstclass = first;
3dab1dad 3421 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3422 r->reganch |= (OP(first) == MBOL
3423 ? ROPT_ANCH_MBOL
3424 : (OP(first) == SBOL
3425 ? ROPT_ANCH_SBOL
3426 : ROPT_ANCH_BOL));
a0d0e21e 3427 first = NEXTOPER(first);
774d564b 3428 goto again;
3429 }
3430 else if (OP(first) == GPOS) {
3431 r->reganch |= ROPT_ANCH_GPOS;
3432 first = NEXTOPER(first);
3433 goto again;
a0d0e21e 3434 }
e09294f4 3435 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3436 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3437 !(r->reganch & ROPT_ANCH) )
3438 {
3439 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3440 const int type =
3441 (OP(NEXTOPER(first)) == REG_ANY)
3442 ? ROPT_ANCH_MBOL
3443 : ROPT_ANCH_SBOL;
cad2e5aa 3444 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3445 first = NEXTOPER(first);
774d564b 3446 goto again;
a0d0e21e 3447 }
b81d288d 3448 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3449 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3450 /* x+ must match at the 1st pos of run of x's */
3451 r->reganch |= ROPT_SKIP;
a0d0e21e 3452
c277df42 3453 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3454#ifdef TRIE_STUDY_OPT
3455 DEBUG_COMPILE_r(
3456 if (!restudied)
3457 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3458 (IV)(first - scan + 1))
3459 );
3460#else
3461 DEBUG_COMPILE_r(
3462 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3463 (IV)(first - scan + 1))
3464 );
3465#endif
3466
3467
a0d0e21e
LW
3468 /*
3469 * If there's something expensive in the r.e., find the
3470 * longest literal string that must appear and make it the
3471 * regmust. Resolve ties in favor of later strings, since
3472 * the regstart check works with the beginning of the r.e.
3473 * and avoiding duplication strengthens checking. Not a
3474 * strong reason, but sufficient in the absence of others.
3475 * [Now we resolve ties in favor of the earlier string if
c277df42 3476 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3477 * earlier string may buy us something the later one won't.]
3478 */
a0d0e21e 3479 minlen = 0;
a687059c 3480
396482e1
GA
3481 data.longest_fixed = newSVpvs("");
3482 data.longest_float = newSVpvs("");
3483 data.last_found = newSVpvs("");
c277df42
IZ
3484 data.longest = &(data.longest_fixed);
3485 first = scan;
653099ff 3486 if (!r->regstclass) {
830247a4 3487 cl_init(pRExC_state, &ch_class);
653099ff
GS
3488 data.start_class = &ch_class;
3489 stclass_flag = SCF_DO_STCLASS_AND;
3490 } else /* XXXX Check for BOUND? */
3491 stclass_flag = 0;
cb434fcc 3492 data.last_closep = &last_close;
653099ff 3493
830247a4 3494 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3495 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83
YO
3496
3497#ifdef TRIE_STUDY_OPT
3498 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3499 goto reStudy;
3500 }
3501#endif
3502
830247a4 3503 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3504 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3505 && !RExC_seen_zerolen
3506 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3507 r->reganch |= ROPT_CHECK_ALL;
830247a4 3508 scan_commit(pRExC_state, &data);
c277df42
IZ
3509 SvREFCNT_dec(data.last_found);
3510
a0ed51b3 3511 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3512 if (longest_float_length
c277df42
IZ
3513 || (data.flags & SF_FL_BEFORE_EOL
3514 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3515 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3516 int t;
3517
a0ed51b3 3518 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3519 && data.offset_fixed == data.offset_float_min
3520 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3521 goto remove_float; /* As in (a)+. */
3522
33b8afdf
JH
3523 if (SvUTF8(data.longest_float)) {
3524 r->float_utf8 = data.longest_float;
c445ea15 3525 r->float_substr = NULL;
33b8afdf
JH
3526 } else {
3527 r->float_substr = data.longest_float;
c445ea15 3528 r->float_utf8 = NULL;
33b8afdf 3529 }
c277df42
IZ
3530 r->float_min_offset = data.offset_float_min;
3531 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3532 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3533 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3534 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3535 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3536 }
3537 else {
aca2d497 3538 remove_float:
c445ea15 3539 r->float_substr = r->float_utf8 = NULL;
c277df42 3540 SvREFCNT_dec(data.longest_float);
c5254dd6 3541 longest_float_length = 0;
a0d0e21e 3542 }
c277df42 3543
a0ed51b3 3544 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3545 if (longest_fixed_length
c277df42
IZ
3546 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3547 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3548 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3549 int t;
3550
33b8afdf
JH
3551 if (SvUTF8(data.longest_fixed)) {
3552 r->anchored_utf8 = data.longest_fixed;
c445ea15 3553 r->anchored_substr = NULL;
33b8afdf
JH
3554 } else {
3555 r->anchored_substr = data.longest_fixed;
c445ea15 3556 r->anchored_utf8 = NULL;
33b8afdf 3557 }
c277df42 3558 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3559 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3560 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3561 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3562 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3563 }
3564 else {
c445ea15 3565 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3566 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3567 longest_fixed_length = 0;
a0d0e21e 3568 }
b81d288d 3569 if (r->regstclass
ffc61ed2 3570 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3571 r->regstclass = NULL;
33b8afdf
JH
3572 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3573 && stclass_flag
653099ff 3574 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3575 && !cl_is_anything(data.start_class))
3576 {
1df70142 3577 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3578
a02a5408 3579 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3580 struct regnode_charclass_class);
3581 StructCopy(data.start_class,
830247a4 3582 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3583 struct regnode_charclass_class);
830247a4 3584 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3585 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3586 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3587 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3588 PerlIO_printf(Perl_debug_log,
a0288114 3589 "synthetic stclass \"%s\".\n",
3f7c398e 3590 SvPVX_const(sv));});
653099ff 3591 }
c277df42
IZ
3592
3593 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3594 if (longest_fixed_length > longest_float_length) {
c277df42 3595 r->check_substr = r->anchored_substr;
33b8afdf 3596 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3597 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3598 if (r->reganch & ROPT_ANCH_SINGLE)
3599 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3600 }
3601 else {
c277df42 3602 r->check_substr = r->float_substr;
33b8afdf 3603 r->check_utf8 = r->float_utf8;
c277df42
IZ
3604 r->check_offset_min = data.offset_float_min;
3605 r->check_offset_max = data.offset_float_max;
a0d0e21e 3606 }
30382c73
IZ
3607 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3608 This should be changed ASAP! */
33b8afdf 3609 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3610 r->reganch |= RE_USE_INTUIT;
33b8afdf 3611 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3612 r->reganch |= RE_INTUIT_TAIL;
3613 }
a0ed51b3
LW
3614 }
3615 else {
c277df42
IZ
3616 /* Several toplevels. Best we can is to set minlen. */
3617 I32 fake;
653099ff 3618 struct regnode_charclass_class ch_class;
cb434fcc 3619 I32 last_close = 0;
c277df42 3620
a3621e74 3621 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3622
c277df42 3623 scan = r->program + 1;
830247a4 3624 cl_init(pRExC_state, &ch_class);
653099ff 3625 data.start_class = &ch_class;
cb434fcc 3626 data.last_closep = &last_close;
07be1b83
YO
3627
3628 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3629 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3630
3631#ifdef TRIE_STUDY_OPT
3632 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3633 goto reStudy;
3634 }
3635#endif
3636
33b8afdf 3637 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3638 = r->float_substr = r->float_utf8 = NULL;
653099ff 3639 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3640 && !cl_is_anything(data.start_class))
3641 {
1df70142 3642 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3643
a02a5408 3644 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3645 struct regnode_charclass_class);
3646 StructCopy(data.start_class,
830247a4 3647 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3648 struct regnode_charclass_class);
830247a4 3649 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3650 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3651 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3652 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3653 PerlIO_printf(Perl_debug_log,
a0288114 3654 "synthetic stclass \"%s\".\n",
3f7c398e 3655 SvPVX_const(sv));});
653099ff 3656 }
a0d0e21e
LW
3657 }
3658
a0d0e21e 3659 r->minlen = minlen;
b81d288d 3660 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3661 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3662 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3663 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3664 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3665 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3666 if (RExC_seen & REG_SEEN_CANY)
3667 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3668 Newxz(r->startp, RExC_npar, I32);
3669 Newxz(r->endp, RExC_npar, I32);
07be1b83 3670
f2278c82 3671 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
3672 DEBUG_DUMP_r({
3673 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
3674 regdump(r);
3675 });
8e9a8a48
YO
3676 DEBUG_OFFSETS_r(if (r->offsets) {
3677 const U32 len = r->offsets[0];
3678 U32 i;
3679 GET_RE_DEBUG_FLAGS_DECL;
3680 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3681 for (i = 1; i <= len; i++) {
3682 if (r->offsets[i*2-1] || r->offsets[i*2])
3683 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3684 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3685 }
3686 PerlIO_printf(Perl_debug_log, "\n");
3687 });
a0d0e21e 3688 return(r);
a687059c
LW
3689}
3690
3dab1dad
YO
3691
3692#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3693 int rem=(int)(RExC_end - RExC_parse); \
3694 int cut; \
3695 int num; \
3696 int iscut=0; \
3697 if (rem>10) { \
3698 rem=10; \
3699 iscut=1; \
3700 } \
3701 cut=10-rem; \
3702 if (RExC_lastparse!=RExC_parse) \
3703 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3704 rem, RExC_parse, \
3705 cut + 4, \
3706 iscut ? "..." : "<" \
3707 ); \
3708 else \
3709 PerlIO_printf(Perl_debug_log,"%16s",""); \
3710 \
3711 if (SIZE_ONLY) \
3712 num=RExC_size; \
3713 else \
3714 num=REG_NODE_NUM(RExC_emit); \
3715 if (RExC_lastnum!=num) \
be8e71aa 3716 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 3717 else \
be8e71aa
YO
3718 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3719 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3720 (int)((depth*2)), "", \
3dab1dad
YO
3721 (funcname) \
3722 ); \
3723 RExC_lastnum=num; \
3724 RExC_lastparse=RExC_parse; \
3725})
3726
07be1b83
YO
3727
3728
3dab1dad
YO
3729#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3730 DEBUG_PARSE_MSG((funcname)); \
3731 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3732})
a687059c
LW
3733/*
3734 - reg - regular expression, i.e. main body or parenthesized thing
3735 *
3736 * Caller must absorb opening parenthesis.
3737 *
3738 * Combining parenthesis handling with the base level of regular expression
3739 * is a trifle forced, but the need to tie the tails of the branches to what
3740 * follows makes it hard to avoid.
3741 */
07be1b83
YO
3742#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3743#ifdef DEBUGGING
3744#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3745#else
3746#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3747#endif
3dab1dad 3748
76e3520e 3749STATIC regnode *
3dab1dad 3750S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3751 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3752{
27da23d5 3753 dVAR;
c277df42
IZ
3754 register regnode *ret; /* Will be the head of the group. */
3755 register regnode *br;
3756 register regnode *lastbr;
cbbf8932 3757 register regnode *ender = NULL;
a0d0e21e 3758 register I32 parno = 0;
cbbf8932
AL
3759 I32 flags;
3760 const I32 oregflags = RExC_flags;
6136c704
AL
3761 bool have_branch = 0;
3762 bool is_open = 0;
9d1d55b5
JP
3763
3764 /* for (?g), (?gc), and (?o) warnings; warning
3765 about (?c) will warn about (?g) -- japhy */
3766
6136c704
AL
3767#define WASTED_O 0x01
3768#define WASTED_G 0x02
3769#define WASTED_C 0x04
3770#define WASTED_GC (0x02|0x04)
cbbf8932 3771 I32 wastedflags = 0x00;
9d1d55b5 3772
fac92740 3773 char * parse_start = RExC_parse; /* MJD */
a28509cc 3774 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3775
3dab1dad
YO
3776 GET_RE_DEBUG_FLAGS_DECL;
3777 DEBUG_PARSE("reg ");
3778
3779
821b33a5 3780 *flagp = 0; /* Tentatively. */
a0d0e21e 3781
9d1d55b5 3782
a0d0e21e
LW
3783 /* Make an OPEN node, if parenthesized. */
3784 if (paren) {
fac92740 3785 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3786 U32 posflags = 0, negflags = 0;
3787 U32 *flagsp = &posflags;
6136c704 3788 bool is_logical = 0;
a28509cc 3789 const char * const seqstart = RExC_parse;
ca9dfc88 3790
830247a4
IZ
3791 RExC_parse++;
3792 paren = *RExC_parse++;
c277df42 3793 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3794 switch (paren) {
fac92740 3795 case '<': /* (?<...) */
830247a4 3796 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3797 if (*RExC_parse == '!')
c277df42 3798 paren = ',';
b81d288d 3799 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3800 goto unknown;
830247a4 3801 RExC_parse++;
fac92740
MJD
3802 case '=': /* (?=...) */
3803 case '!': /* (?!...) */
830247a4 3804 RExC_seen_zerolen++;
fac92740
MJD
3805 case ':': /* (?:...) */
3806 case '>': /* (?>...) */
a0d0e21e 3807 break;
fac92740
MJD
3808 case '$': /* (?$...) */
3809 case '@': /* (?@...) */
8615cb43 3810 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3811 break;
fac92740 3812 case '#': /* (?#...) */
830247a4
IZ
3813 while (*RExC_parse && *RExC_parse != ')')
3814 RExC_parse++;
3815 if (*RExC_parse != ')')
c277df42 3816 FAIL("Sequence (?#... not terminated");
830247a4 3817 nextchar(pRExC_state);
a0d0e21e
LW
3818 *flagp = TRYAGAIN;
3819 return NULL;
fac92740 3820 case 'p': /* (?p...) */
9014280d 3821 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3822 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3823 /* FALL THROUGH*/
fac92740 3824 case '?': /* (??...) */
6136c704 3825 is_logical = 1;
438a3801
YST
3826 if (*RExC_parse != '{')
3827 goto unknown;
830247a4 3828 paren = *RExC_parse++;
0f5d15d6 3829 /* FALL THROUGH */
fac92740 3830 case '{': /* (?{...}) */
c277df42 3831 {
c277df42
IZ
3832 I32 count = 1, n = 0;
3833 char c;
830247a4 3834 char *s = RExC_parse;
c277df42 3835
830247a4
IZ
3836 RExC_seen_zerolen++;
3837 RExC_seen |= REG_SEEN_EVAL;
3838 while (count && (c = *RExC_parse)) {
6136c704
AL
3839 if (c == '\\') {
3840 if (RExC_parse[1])
3841 RExC_parse++;
3842 }
b81d288d 3843 else if (c == '{')
c277df42 3844 count++;
b81d288d 3845 else if (c == '}')
c277df42 3846 count--;
830247a4 3847 RExC_parse++;
c277df42 3848 }
6136c704 3849 if (*RExC_parse != ')') {
b81d288d 3850 RExC_parse = s;
b45f050a
JF
3851 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3852 }
c277df42 3853 if (!SIZE_ONLY) {
f3548bdc 3854 PAD *pad;
6136c704
AL
3855 OP_4tree *sop, *rop;
3856 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3857
569233ed
SB
3858 ENTER;
3859 Perl_save_re_context(aTHX);
f3548bdc 3860 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3861 sop->op_private |= OPpREFCOUNTED;
3862 /* re_dup will OpREFCNT_inc */
3863 OpREFCNT_set(sop, 1);
569233ed 3864 LEAVE;
c277df42 3865
830247a4
IZ
3866 n = add_data(pRExC_state, 3, "nop");
3867 RExC_rx->data->data[n] = (void*)rop;
3868 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3869 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3870 SvREFCNT_dec(sv);
a0ed51b3 3871 }
e24b16f9 3872 else { /* First pass */
830247a4 3873 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3874 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3875 /* No compiled RE interpolated, has runtime
3876 components ===> unsafe. */
3877 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3878 if (PL_tainting && PL_tainted)
cc6b7395 3879 FAIL("Eval-group in insecure regular expression");
54df2634 3880#if PERL_VERSION > 8
923e4eb5 3881 if (IN_PERL_COMPILETIME)
b5c19bd7 3882 PL_cv_has_eval = 1;
54df2634 3883#endif
c277df42 3884 }
b5c19bd7 3885
830247a4 3886 nextchar(pRExC_state);
6136c704 3887 if (is_logical) {
830247a4 3888 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3889 if (!SIZE_ONLY)
3890 ret->flags = 2;
3dab1dad 3891 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3892 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3893 return ret;
3894 }
ccb2c380
MP
3895 ret = reganode(pRExC_state, EVAL, n);
3896 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3897 Set_Node_Offset(ret, parse_start);
3898 return ret;
c277df42 3899 }
fac92740 3900 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3901 {
fac92740 3902 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3903 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3904 || RExC_parse[1] == '<'
830247a4 3905 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3906 I32 flag;
3907
830247a4 3908 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3909 if (!SIZE_ONLY)
3910 ret->flags = 1;
3dab1dad 3911 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3912 goto insert_if;
b81d288d 3913 }
a0ed51b3 3914 }
830247a4 3915 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3916 /* (?(1)...) */
6136c704 3917 char c;
830247a4 3918 parno = atoi(RExC_parse++);
c277df42 3919
830247a4
IZ
3920 while (isDIGIT(*RExC_parse))
3921 RExC_parse++;
fac92740 3922 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3923
830247a4 3924 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3925 vFAIL("Switch condition not recognized");
c277df42 3926 insert_if:
3dab1dad
YO
3927 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3928 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3929 if (br == NULL)
830247a4 3930 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3931 else
3dab1dad 3932 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3933 c = *nextchar(pRExC_state);
d1b80229
IZ
3934 if (flags&HASWIDTH)
3935 *flagp |= HASWIDTH;
c277df42 3936 if (c == '|') {
830247a4 3937 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3938 regbranch(pRExC_state, &flags, 1,depth+1);
3939 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3940 if (flags&HASWIDTH)
3941 *flagp |= HASWIDTH;
830247a4 3942 c = *nextchar(pRExC_state);
a0ed51b3
LW
3943 }
3944 else
c277df42
IZ
3945 lastbr = NULL;
3946 if (c != ')')
8615cb43 3947 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3948 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3949 REGTAIL(pRExC_state, br, ender);
c277df42 3950 if (lastbr) {
3dab1dad
YO
3951 REGTAIL(pRExC_state, lastbr, ender);
3952 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3953 }
3954 else
3dab1dad 3955 REGTAIL(pRExC_state, ret, ender);
c277df42 3956 return ret;
a0ed51b3
LW
3957 }
3958 else {
830247a4 3959 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3960 }
3961 }
1b1626e4 3962 case 0:
830247a4 3963 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3964 vFAIL("Sequence (? incomplete");
1b1626e4 3965 break;
a0d0e21e 3966 default:
830247a4 3967 --RExC_parse;
fac92740 3968 parse_flags: /* (?i) */
830247a4 3969 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3970 /* (?g), (?gc) and (?o) are useless here
3971 and must be globally applied -- japhy */
3972
3973 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3974 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3975 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3976 if (! (wastedflags & wflagbit) ) {
3977 wastedflags |= wflagbit;
3978 vWARN5(
3979 RExC_parse + 1,
3980 "Useless (%s%c) - %suse /%c modifier",
3981 flagsp == &negflags ? "?-" : "?",
3982 *RExC_parse,
3983 flagsp == &negflags ? "don't " : "",
3984 *RExC_parse
3985 );
3986 }
3987 }
3988 }
3989 else if (*RExC_parse == 'c') {
3990 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3991 if (! (wastedflags & WASTED_C) ) {
3992 wastedflags |= WASTED_GC;
9d1d55b5
JP
3993 vWARN3(
3994 RExC_parse + 1,
3995 "Useless (%sc) - %suse /gc modifier",
3996 flagsp == &negflags ? "?-" : "?",
3997 flagsp == &negflags ? "don't " : ""
3998 );
3999 }
4000 }
4001 }
4002 else { pmflag(flagsp, *RExC_parse); }
4003
830247a4 4004 ++RExC_parse;
ca9dfc88 4005 }
830247a4 4006 if (*RExC_parse == '-') {
ca9dfc88 4007 flagsp = &negflags;
9d1d55b5 4008 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4009 ++RExC_parse;
ca9dfc88 4010 goto parse_flags;
48c036b1 4011 }
e2509266
JH
4012 RExC_flags |= posflags;
4013 RExC_flags &= ~negflags;
830247a4
IZ
4014 if (*RExC_parse == ':') {
4015 RExC_parse++;
ca9dfc88
IZ
4016 paren = ':';
4017 break;
4018 }
c277df42 4019 unknown:
830247a4
IZ
4020 if (*RExC_parse != ')') {
4021 RExC_parse++;
4022 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4023 }
830247a4 4024 nextchar(pRExC_state);
a0d0e21e
LW
4025 *flagp = TRYAGAIN;
4026 return NULL;
4027 }
4028 }
fac92740 4029 else { /* (...) */
830247a4
IZ
4030 parno = RExC_npar;
4031 RExC_npar++;
4032 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4033 Set_Node_Length(ret, 1); /* MJD */
4034 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4035 is_open = 1;
a0d0e21e 4036 }
a0ed51b3 4037 }
fac92740 4038 else /* ! paren */
a0d0e21e
LW
4039 ret = NULL;
4040
4041 /* Pick up the branches, linking them together. */
fac92740 4042 parse_start = RExC_parse; /* MJD */
3dab1dad 4043 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4044 /* branch_len = (paren != 0); */
2af232bd 4045
a0d0e21e
LW
4046 if (br == NULL)
4047 return(NULL);
830247a4
IZ
4048 if (*RExC_parse == '|') {
4049 if (!SIZE_ONLY && RExC_extralen) {
4050 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4051 }
fac92740 4052 else { /* MJD */
830247a4 4053 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4054 Set_Node_Length(br, paren != 0);
4055 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4056 }
c277df42
IZ
4057 have_branch = 1;
4058 if (SIZE_ONLY)
830247a4 4059 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4060 }
4061 else if (paren == ':') {
c277df42
IZ
4062 *flagp |= flags&SIMPLE;
4063 }
6136c704 4064 if (is_open) { /* Starts with OPEN. */
3dab1dad 4065 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4066 }
4067 else if (paren != '?') /* Not Conditional */
a0d0e21e 4068 ret = br;
32a0ca98 4069 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4070 lastbr = br;
830247a4
IZ
4071 while (*RExC_parse == '|') {
4072 if (!SIZE_ONLY && RExC_extralen) {
4073 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4074 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4075 }
4076 if (SIZE_ONLY)
830247a4
IZ
4077 RExC_extralen += 2; /* Account for LONGJMP. */
4078 nextchar(pRExC_state);
3dab1dad 4079 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4080
a687059c 4081 if (br == NULL)
a0d0e21e 4082 return(NULL);
3dab1dad 4083 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4084 lastbr = br;
821b33a5
IZ
4085 if (flags&HASWIDTH)
4086 *flagp |= HASWIDTH;
a687059c 4087 *flagp |= flags&SPSTART;
a0d0e21e
LW
4088 }
4089
c277df42
IZ
4090 if (have_branch || paren != ':') {
4091 /* Make a closing node, and hook it on the end. */
4092 switch (paren) {
4093 case ':':
830247a4 4094 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4095 break;
4096 case 1:
830247a4 4097 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4098 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4099 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4100 break;
4101 case '<':
c277df42
IZ
4102 case ',':
4103 case '=':
4104 case '!':
c277df42 4105 *flagp &= ~HASWIDTH;
821b33a5
IZ
4106 /* FALL THROUGH */
4107 case '>':
830247a4 4108 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4109 break;
4110 case 0:
830247a4 4111 ender = reg_node(pRExC_state, END);
c277df42
IZ
4112 break;
4113 }
07be1b83 4114 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4115
9674d46a 4116 if (have_branch && !SIZE_ONLY) {
c277df42 4117 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4118 for (br = ret; br; br = regnext(br)) {
4119 const U8 op = PL_regkind[OP(br)];
4120 if (op == BRANCH) {
07be1b83 4121 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4122 }
4123 else if (op == BRANCHJ) {
07be1b83 4124 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4125 }
c277df42
IZ
4126 }
4127 }
a0d0e21e 4128 }
c277df42
IZ
4129
4130 {
e1ec3a88
AL
4131 const char *p;
4132 static const char parens[] = "=!<,>";
c277df42
IZ
4133
4134 if (paren && (p = strchr(parens, paren))) {
eb160463 4135 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4136 int flag = (p - parens) > 1;
4137
4138 if (paren == '>')
4139 node = SUSPEND, flag = 0;
830247a4 4140 reginsert(pRExC_state, node,ret);
45948336
EP
4141 Set_Node_Cur_Length(ret);
4142 Set_Node_Offset(ret, parse_start + 1);
c277df42 4143 ret->flags = flag;
07be1b83 4144 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4145 }
a0d0e21e
LW
4146 }
4147
4148 /* Check for proper termination. */
ce3e6498 4149 if (paren) {
e2509266 4150 RExC_flags = oregflags;
830247a4
IZ
4151 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4152 RExC_parse = oregcomp_parse;
380a0633 4153 vFAIL("Unmatched (");
ce3e6498 4154 }
a0ed51b3 4155 }
830247a4
IZ
4156 else if (!paren && RExC_parse < RExC_end) {
4157 if (*RExC_parse == ')') {
4158 RExC_parse++;
380a0633 4159 vFAIL("Unmatched )");
a0ed51b3
LW
4160 }
4161 else
b45f050a 4162 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4163 /* NOTREACHED */
4164 }
a687059c 4165
a0d0e21e 4166 return(ret);
a687059c
LW
4167}
4168
4169/*
4170 - regbranch - one alternative of an | operator
4171 *
4172 * Implements the concatenation operator.
4173 */
76e3520e 4174STATIC regnode *
3dab1dad 4175S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4176{
97aff369 4177 dVAR;
c277df42
IZ
4178 register regnode *ret;
4179 register regnode *chain = NULL;
4180 register regnode *latest;
4181 I32 flags = 0, c = 0;
3dab1dad
YO
4182 GET_RE_DEBUG_FLAGS_DECL;
4183 DEBUG_PARSE("brnc");
b81d288d 4184 if (first)
c277df42
IZ
4185 ret = NULL;
4186 else {
b81d288d 4187 if (!SIZE_ONLY && RExC_extralen)
830247a4 4188 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4189 else {
830247a4 4190 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4191 Set_Node_Length(ret, 1);
4192 }
c277df42
IZ
4193 }
4194
b81d288d 4195 if (!first && SIZE_ONLY)
830247a4 4196 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4197
c277df42 4198 *flagp = WORST; /* Tentatively. */
a0d0e21e 4199
830247a4
IZ
4200 RExC_parse--;
4201 nextchar(pRExC_state);
4202 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4203 flags &= ~TRYAGAIN;
3dab1dad 4204 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4205 if (latest == NULL) {
4206 if (flags & TRYAGAIN)
4207 continue;
4208 return(NULL);
a0ed51b3
LW
4209 }
4210 else if (ret == NULL)
c277df42 4211 ret = latest;
a0d0e21e 4212 *flagp |= flags&HASWIDTH;
c277df42 4213 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4214 *flagp |= flags&SPSTART;
4215 else {
830247a4 4216 RExC_naughty++;
3dab1dad 4217 REGTAIL(pRExC_state, chain, latest);
a687059c 4218 }
a0d0e21e 4219 chain = latest;
c277df42
IZ
4220 c++;
4221 }
4222 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4223 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4224 if (ret == NULL)
4225 ret = chain;
4226 }
4227 if (c == 1) {
4228 *flagp |= flags&SIMPLE;
a0d0e21e 4229 }
a687059c 4230
d4c19fe8 4231 return ret;
a687059c
LW
4232}
4233
4234/*
4235 - regpiece - something followed by possible [*+?]
4236 *
4237 * Note that the branching code sequences used for ? and the general cases
4238 * of * and + are somewhat optimized: they use the same NOTHING node as
4239 * both the endmarker for their branch list and the body of the last branch.
4240 * It might seem that this node could be dispensed with entirely, but the
4241 * endmarker role is not redundant.
4242 */
76e3520e 4243STATIC regnode *
3dab1dad 4244S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4245{
97aff369 4246 dVAR;
c277df42 4247 register regnode *ret;
a0d0e21e
LW
4248 register char op;
4249 register char *next;
4250 I32 flags;
1df70142 4251 const char * const origparse = RExC_parse;
a0d0e21e 4252 I32 min;
c277df42 4253 I32 max = REG_INFTY;
fac92740 4254 char *parse_start;
3dab1dad
YO
4255 GET_RE_DEBUG_FLAGS_DECL;
4256 DEBUG_PARSE("piec");
a0d0e21e 4257
3dab1dad 4258 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4259 if (ret == NULL) {
4260 if (flags & TRYAGAIN)
4261 *flagp |= TRYAGAIN;
4262 return(NULL);
4263 }
4264
830247a4 4265 op = *RExC_parse;
a0d0e21e 4266
830247a4 4267 if (op == '{' && regcurly(RExC_parse)) {
3dab1dad 4268 const char *maxpos = NULL;
fac92740 4269 parse_start = RExC_parse; /* MJD */
830247a4 4270 next = RExC_parse + 1;
a0d0e21e
LW
4271 while (isDIGIT(*next) || *next == ',') {
4272 if (*next == ',') {
4273 if (maxpos)
4274 break;
4275 else
4276 maxpos = next;
a687059c 4277 }
a0d0e21e
LW
4278 next++;
4279 }
4280 if (*next == '}') { /* got one */
4281 if (!maxpos)
4282 maxpos = next;
830247a4
IZ
4283 RExC_parse++;
4284 min = atoi(RExC_parse);
a0d0e21e
LW
4285 if (*maxpos == ',')
4286 maxpos++;
4287 else
830247a4 4288 maxpos = RExC_parse;
a0d0e21e
LW
4289 max = atoi(maxpos);
4290 if (!max && *maxpos != '0')
c277df42
IZ
4291 max = REG_INFTY; /* meaning "infinity" */
4292 else if (max >= REG_INFTY)
8615cb43 4293 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4294 RExC_parse = next;
4295 nextchar(pRExC_state);
a0d0e21e
LW
4296
4297 do_curly:
4298 if ((flags&SIMPLE)) {
830247a4
IZ
4299 RExC_naughty += 2 + RExC_naughty / 2;
4300 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4301 Set_Node_Offset(ret, parse_start+1); /* MJD */
4302 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4303 }
4304 else {
3dab1dad 4305 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4306
4307 w->flags = 0;
3dab1dad 4308 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4309 if (!SIZE_ONLY && RExC_extralen) {
4310 reginsert(pRExC_state, LONGJMP,ret);
4311 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4312 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4313 }
830247a4 4314 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4315 /* MJD hk */
4316 Set_Node_Offset(ret, parse_start+1);
2af232bd 4317 Set_Node_Length(ret,
fac92740 4318 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4319
830247a4 4320 if (!SIZE_ONLY && RExC_extralen)
c277df42 4321 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4322 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4323 if (SIZE_ONLY)
830247a4
IZ
4324 RExC_whilem_seen++, RExC_extralen += 3;
4325 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4326 }
c277df42 4327 ret->flags = 0;
a0d0e21e
LW
4328
4329 if (min > 0)
821b33a5
IZ
4330 *flagp = WORST;
4331 if (max > 0)
4332 *flagp |= HASWIDTH;
a0d0e21e 4333 if (max && max < min)
8615cb43 4334 vFAIL("Can't do {n,m} with n > m");
c277df42 4335 if (!SIZE_ONLY) {
eb160463
GS
4336 ARG1_SET(ret, (U16)min);
4337 ARG2_SET(ret, (U16)max);
a687059c 4338 }
a687059c 4339
a0d0e21e 4340 goto nest_check;
a687059c 4341 }
a0d0e21e 4342 }
a687059c 4343
a0d0e21e
LW
4344 if (!ISMULT1(op)) {
4345 *flagp = flags;
a687059c 4346 return(ret);
a0d0e21e 4347 }
bb20fd44 4348
c277df42 4349#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4350
4351 /* if this is reinstated, don't forget to put this back into perldiag:
4352
4353 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4354
4355 (F) The part of the regexp subject to either the * or + quantifier
4356 could match an empty string. The {#} shows in the regular
4357 expression about where the problem was discovered.
4358
4359 */
4360
bb20fd44 4361 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4362 vFAIL("Regexp *+ operand could be empty");
b81d288d 4363#endif
bb20fd44 4364
fac92740 4365 parse_start = RExC_parse;
830247a4 4366 nextchar(pRExC_state);
a0d0e21e 4367
821b33a5 4368 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4369
4370 if (op == '*' && (flags&SIMPLE)) {
830247a4 4371 reginsert(pRExC_state, STAR, ret);
c277df42 4372 ret->flags = 0;
830247a4 4373 RExC_naughty += 4;
a0d0e21e
LW
4374 }
4375 else if (op == '*') {
4376 min = 0;
4377 goto do_curly;
a0ed51b3
LW
4378 }
4379 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4380 reginsert(pRExC_state, PLUS, ret);
c277df42 4381 ret->flags = 0;
830247a4 4382 RExC_naughty += 3;
a0d0e21e
LW
4383 }
4384 else if (op == '+') {
4385 min = 1;
4386 goto do_curly;
a0ed51b3
LW
4387 }
4388 else if (op == '?') {
a0d0e21e
LW
4389 min = 0; max = 1;
4390 goto do_curly;
4391 }
4392 nest_check:
041457d9 4393 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4394 vWARN3(RExC_parse,
b45f050a 4395 "%.*s matches null string many times",
afd78fd5 4396 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4397 origparse);
a0d0e21e
LW
4398 }
4399
830247a4
IZ
4400 if (*RExC_parse == '?') {
4401 nextchar(pRExC_state);
4402 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4403 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4404 }
830247a4
IZ
4405 if (ISMULT2(RExC_parse)) {
4406 RExC_parse++;
b45f050a
JF
4407 vFAIL("Nested quantifiers");
4408 }
a0d0e21e
LW
4409
4410 return(ret);
a687059c
LW
4411}
4412
4413/*
4414 - regatom - the lowest level
4415 *
4416 * Optimization: gobbles an entire sequence of ordinary characters so that
4417 * it can turn them into a single node, which is smaller to store and
4418 * faster to run. Backslashed characters are exceptions, each becoming a
4419 * separate node; the code is simpler that way and it's not worth fixing.
4420 *
7f6f358c
YO
4421 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4422 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4423 */
76e3520e 4424STATIC regnode *
3dab1dad 4425S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4426{
97aff369 4427 dVAR;
cbbf8932 4428 register regnode *ret = NULL;
a0d0e21e 4429 I32 flags;
45948336 4430 char *parse_start = RExC_parse;
3dab1dad
YO
4431 GET_RE_DEBUG_FLAGS_DECL;
4432 DEBUG_PARSE("atom");
a0d0e21e
LW
4433 *flagp = WORST; /* Tentatively. */
4434
4435tryagain:
830247a4 4436 switch (*RExC_parse) {
a0d0e21e 4437 case '^':
830247a4
IZ
4438 RExC_seen_zerolen++;
4439 nextchar(pRExC_state);
e2509266 4440 if (RExC_flags & PMf_MULTILINE)
830247a4 4441 ret = reg_node(pRExC_state, MBOL);
e2509266 4442 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4443 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4444 else
830247a4 4445 ret = reg_node(pRExC_state, BOL);
fac92740 4446 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4447 break;
4448 case '$':
830247a4 4449 nextchar(pRExC_state);
b81d288d 4450 if (*RExC_parse)
830247a4 4451 RExC_seen_zerolen++;
e2509266 4452 if (RExC_flags & PMf_MULTILINE)
830247a4 4453 ret = reg_node(pRExC_state, MEOL);
e2509266 4454 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4455 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4456 else
830247a4 4457 ret = reg_node(pRExC_state, EOL);
fac92740 4458 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4459 break;
4460 case '.':
830247a4 4461 nextchar(pRExC_state);
e2509266 4462 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4463 ret = reg_node(pRExC_state, SANY);
4464 else
4465 ret = reg_node(pRExC_state, REG_ANY);
4466 *flagp |= HASWIDTH|SIMPLE;
830247a4 4467 RExC_naughty++;
fac92740 4468 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4469 break;
4470 case '[':
b45f050a 4471 {
3dab1dad
YO
4472 char * const oregcomp_parse = ++RExC_parse;
4473 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4474 if (*RExC_parse != ']') {
4475 RExC_parse = oregcomp_parse;
b45f050a
JF
4476 vFAIL("Unmatched [");
4477 }
830247a4 4478 nextchar(pRExC_state);
a0d0e21e 4479 *flagp |= HASWIDTH|SIMPLE;
fac92740 4480 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4481 break;
b45f050a 4482 }
a0d0e21e 4483 case '(':
830247a4 4484 nextchar(pRExC_state);
3dab1dad 4485 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4486 if (ret == NULL) {
bf93d4cc 4487 if (flags & TRYAGAIN) {
830247a4 4488 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4489 /* Make parent create an empty node if needed. */
4490 *flagp |= TRYAGAIN;
4491 return(NULL);
4492 }
a0d0e21e 4493 goto tryagain;
bf93d4cc 4494 }
a0d0e21e
LW
4495 return(NULL);
4496 }
c277df42 4497 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4498 break;
4499 case '|':
4500 case ')':
4501 if (flags & TRYAGAIN) {
4502 *flagp |= TRYAGAIN;
4503 return NULL;
4504 }
b45f050a 4505 vFAIL("Internal urp");
a0d0e21e
LW
4506 /* Supposed to be caught earlier. */
4507 break;
85afd4ae 4508 case '{':
830247a4
IZ
4509 if (!regcurly(RExC_parse)) {
4510 RExC_parse++;
85afd4ae
CS
4511 goto defchar;
4512 }
4513 /* FALL THROUGH */
a0d0e21e
LW
4514 case '?':
4515 case '+':
4516 case '*':
830247a4 4517 RExC_parse++;
b45f050a 4518 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4519 break;
4520 case '\\':
830247a4 4521 switch (*++RExC_parse) {
a0d0e21e 4522 case 'A':
830247a4
IZ
4523 RExC_seen_zerolen++;
4524 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4525 *flagp |= SIMPLE;
830247a4 4526 nextchar(pRExC_state);
fac92740 4527 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4528 break;
4529 case 'G':
830247a4
IZ
4530 ret = reg_node(pRExC_state, GPOS);
4531 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4532 *flagp |= SIMPLE;
830247a4 4533 nextchar(pRExC_state);
fac92740 4534 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4535 break;
4536 case 'Z':
830247a4 4537 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4538 *flagp |= SIMPLE;
a1917ab9 4539 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4540 nextchar(pRExC_state);
a0d0e21e 4541 break;
b85d18e9 4542 case 'z':
830247a4 4543 ret = reg_node(pRExC_state, EOS);
b85d18e9 4544 *flagp |= SIMPLE;
830247a4
IZ
4545 RExC_seen_zerolen++; /* Do not optimize RE away */
4546 nextchar(pRExC_state);
fac92740 4547 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4548 break;
4a2d328f 4549 case 'C':
f33976b4
DB
4550 ret = reg_node(pRExC_state, CANY);
4551 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4552 *flagp |= HASWIDTH|SIMPLE;
830247a4 4553 nextchar(pRExC_state);
fac92740 4554 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4555 break;
4556 case 'X':
830247a4 4557 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4558 *flagp |= HASWIDTH;
830247a4 4559 nextchar(pRExC_state);
fac92740 4560 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4561 break;
a0d0e21e 4562 case 'w':
eb160463 4563 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4564 *flagp |= HASWIDTH|SIMPLE;
830247a4 4565 nextchar(pRExC_state);
fac92740 4566 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4567 break;
4568 case 'W':
eb160463 4569 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4570 *flagp |= HASWIDTH|SIMPLE;
830247a4 4571 nextchar(pRExC_state);
fac92740 4572 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4573 break;
4574 case 'b':
830247a4
IZ
4575 RExC_seen_zerolen++;
4576 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4577 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4578 *flagp |= SIMPLE;
830247a4 4579 nextchar(pRExC_state);
fac92740 4580 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4581 break;
4582 case 'B':
830247a4
IZ
4583 RExC_seen_zerolen++;
4584 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4585 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4586 *flagp |= SIMPLE;
830247a4 4587 nextchar(pRExC_state);
fac92740 4588 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4589 break;
4590 case 's':
eb160463 4591 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4592 *flagp |= HASWIDTH|SIMPLE;
830247a4 4593 nextchar(pRExC_state);
fac92740 4594 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4595 break;
4596 case 'S':
eb160463 4597 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4598 *flagp |= HASWIDTH|SIMPLE;
830247a4 4599 nextchar(pRExC_state);
fac92740 4600 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4601 break;
4602 case 'd':
ffc61ed2 4603 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4604 *flagp |= HASWIDTH|SIMPLE;
830247a4 4605 nextchar(pRExC_state);
fac92740 4606 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4607 break;
4608 case 'D':
ffc61ed2 4609 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4610 *flagp |= HASWIDTH|SIMPLE;
830247a4 4611 nextchar(pRExC_state);
fac92740 4612 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4613 break;
a14b48bc
LW
4614 case 'p':
4615 case 'P':
3568d838 4616 {
3dab1dad 4617 char* const oldregxend = RExC_end;
ccb2c380 4618 char* parse_start = RExC_parse - 2;
a14b48bc 4619
830247a4 4620 if (RExC_parse[1] == '{') {
3568d838 4621 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4622 RExC_end = strchr(RExC_parse, '}');
4623 if (!RExC_end) {
3dab1dad 4624 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4625 RExC_parse += 2;
4626 RExC_end = oldregxend;
0da60cf5 4627 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4628 }
830247a4 4629 RExC_end++;
a14b48bc 4630 }
af6f566e 4631 else {
830247a4 4632 RExC_end = RExC_parse + 2;
af6f566e
HS
4633 if (RExC_end > oldregxend)
4634 RExC_end = oldregxend;
4635 }
830247a4 4636 RExC_parse--;
a14b48bc 4637
3dab1dad 4638 ret = regclass(pRExC_state,depth+1);
a14b48bc 4639
830247a4
IZ
4640 RExC_end = oldregxend;
4641 RExC_parse--;
ccb2c380
MP
4642
4643 Set_Node_Offset(ret, parse_start + 2);
4644 Set_Node_Cur_Length(ret);
830247a4 4645 nextchar(pRExC_state);
a14b48bc
LW
4646 *flagp |= HASWIDTH|SIMPLE;
4647 }
4648 break;
a0d0e21e
LW
4649 case 'n':
4650 case 'r':
4651 case 't':
4652 case 'f':
4653 case 'e':
4654 case 'a':
4655 case 'x':
4656 case 'c':
4657 case '0':
4658 goto defchar;
4659 case '1': case '2': case '3': case '4':
4660 case '5': case '6': case '7': case '8': case '9':
4661 {
1df70142 4662 const I32 num = atoi(RExC_parse);
a0d0e21e 4663
830247a4 4664 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4665 goto defchar;
4666 else {
3dab1dad 4667 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4668 while (isDIGIT(*RExC_parse))
4669 RExC_parse++;
b45f050a 4670
eb160463 4671 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4672 vFAIL("Reference to nonexistent group");
830247a4 4673 RExC_sawback = 1;
eb160463
GS
4674 ret = reganode(pRExC_state,
4675 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4676 num);
a0d0e21e 4677 *flagp |= HASWIDTH;
2af232bd 4678
fac92740 4679 /* override incorrect value set in reganode MJD */
2af232bd 4680 Set_Node_Offset(ret, parse_start+1);
fac92740 4681 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4682 RExC_parse--;
4683 nextchar(pRExC_state);
a0d0e21e
LW
4684 }
4685 }
4686 break;
4687 case '\0':
830247a4 4688 if (RExC_parse >= RExC_end)
b45f050a 4689 FAIL("Trailing \\");
a0d0e21e
LW
4690 /* FALL THROUGH */
4691 default:
a0288114 4692 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4693 back into the quick-grab loop below */
45948336 4694 parse_start--;
a0d0e21e
LW
4695 goto defchar;
4696 }
4697 break;
4633a7c4
LW
4698
4699 case '#':
e2509266 4700 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4701 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4702 RExC_parse++;
830247a4 4703 if (RExC_parse < RExC_end)
4633a7c4
LW
4704 goto tryagain;
4705 }
4706 /* FALL THROUGH */
4707
a0d0e21e 4708 default: {
ba210ebe 4709 register STRLEN len;
58ae7d3f 4710 register UV ender;
a0d0e21e 4711 register char *p;
3dab1dad 4712 char *s;
80aecb99 4713 STRLEN foldlen;
89ebb4a3 4714 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4715
4716 parse_start = RExC_parse - 1;
a0d0e21e 4717
830247a4 4718 RExC_parse++;
a0d0e21e
LW
4719
4720 defchar:
58ae7d3f 4721 ender = 0;
eb160463
GS
4722 ret = reg_node(pRExC_state,
4723 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4724 s = STRING(ret);
830247a4
IZ
4725 for (len = 0, p = RExC_parse - 1;
4726 len < 127 && p < RExC_end;
a0d0e21e
LW
4727 len++)
4728 {
3dab1dad 4729 char * const oldp = p;
5b5a24f7 4730
e2509266 4731 if (RExC_flags & PMf_EXTENDED)
830247a4 4732 p = regwhite(p, RExC_end);
a0d0e21e
LW
4733 switch (*p) {
4734 case '^':
4735 case '$':
4736 case '.':
4737 case '[':
4738 case '(':
4739 case ')':
4740 case '|':
4741 goto loopdone;
4742 case '\\':
4743 switch (*++p) {
4744 case 'A':
1ed8eac0
JF
4745 case 'C':
4746 case 'X':
a0d0e21e
LW
4747 case 'G':
4748 case 'Z':
b85d18e9 4749 case 'z':
a0d0e21e
LW
4750 case 'w':
4751 case 'W':
4752 case 'b':
4753 case 'B':
4754 case 's':
4755 case 'S':
4756 case 'd':
4757 case 'D':
a14b48bc
LW
4758 case 'p':
4759 case 'P':
a0d0e21e
LW
4760 --p;
4761 goto loopdone;
4762 case 'n':
4763 ender = '\n';
4764 p++;
a687059c 4765 break;
a0d0e21e
LW
4766 case 'r':
4767 ender = '\r';
4768 p++;
a687059c 4769 break;
a0d0e21e
LW
4770 case 't':
4771 ender = '\t';
4772 p++;
a687059c 4773 break;
a0d0e21e
LW
4774 case 'f':
4775 ender = '\f';
4776 p++;
a687059c 4777 break;
a0d0e21e 4778 case 'e':
c7f1f016 4779 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4780 p++;
a687059c 4781 break;
a0d0e21e 4782 case 'a':
c7f1f016 4783 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4784 p++;
a687059c 4785 break;
a0d0e21e 4786 case 'x':
a0ed51b3 4787 if (*++p == '{') {
1df70142 4788 char* const e = strchr(p, '}');
b81d288d 4789
b45f050a 4790 if (!e) {
830247a4 4791 RExC_parse = p + 1;
b45f050a
JF
4792 vFAIL("Missing right brace on \\x{}");
4793 }
de5f0749 4794 else {
a4c04bdc
NC
4795 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4796 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4797 STRLEN numlen = e - p - 1;
53305cf1 4798 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4799 if (ender > 0xff)
4800 RExC_utf8 = 1;
a0ed51b3
LW
4801 p = e + 1;
4802 }
a0ed51b3
LW
4803 }
4804 else {
a4c04bdc 4805 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4806 STRLEN numlen = 2;
53305cf1 4807 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4808 p += numlen;
4809 }
a687059c 4810 break;
a0d0e21e
LW
4811 case 'c':
4812 p++;
bbce6d69 4813 ender = UCHARAT(p++);
4814 ender = toCTRL(ender);
a687059c 4815 break;
a0d0e21e
LW
4816 case '0': case '1': case '2': case '3':case '4':
4817 case '5': case '6': case '7': case '8':case '9':
4818 if (*p == '0' ||
830247a4 4819 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4820 I32 flags = 0;
1df70142 4821 STRLEN numlen = 3;
53305cf1 4822 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4823 p += numlen;
4824 }
4825 else {
4826 --p;
4827 goto loopdone;
a687059c
LW
4828 }
4829 break;
a0d0e21e 4830 case '\0':
830247a4 4831 if (p >= RExC_end)
b45f050a 4832 FAIL("Trailing \\");
a687059c 4833 /* FALL THROUGH */
a0d0e21e 4834 default:
041457d9 4835 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4836 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4837 goto normal_default;
a0d0e21e
LW
4838 }
4839 break;
a687059c 4840 default:
a0ed51b3 4841 normal_default:
fd400ab9 4842 if (UTF8_IS_START(*p) && UTF) {
1df70142 4843 STRLEN numlen;
5e12f4fb 4844 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4845 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4846 p += numlen;
4847 }
4848 else
4849 ender = *p++;
a0d0e21e 4850 break;
a687059c 4851 }
e2509266 4852 if (RExC_flags & PMf_EXTENDED)
830247a4 4853 p = regwhite(p, RExC_end);
60a8b682
JH
4854 if (UTF && FOLD) {
4855 /* Prime the casefolded buffer. */
ac7e0132 4856 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4857 }
a0d0e21e
LW
4858 if (ISMULT2(p)) { /* Back off on ?+*. */
4859 if (len)
4860 p = oldp;
16ea2a2e 4861 else if (UTF) {
80aecb99 4862 if (FOLD) {
60a8b682 4863 /* Emit all the Unicode characters. */
1df70142 4864 STRLEN numlen;
80aecb99
JH
4865 for (foldbuf = tmpbuf;
4866 foldlen;
4867 foldlen -= numlen) {
4868 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4869 if (numlen > 0) {
71207a34 4870 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4871 s += unilen;
4872 len += unilen;
4873 /* In EBCDIC the numlen
4874 * and unilen can differ. */
9dc45d57 4875 foldbuf += numlen;
47654450
JH
4876 if (numlen >= foldlen)
4877 break;
9dc45d57
JH
4878 }
4879 else
4880 break; /* "Can't happen." */
80aecb99
JH
4881 }
4882 }
4883 else {
71207a34 4884 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4885 if (unilen > 0) {
0ebc6274
JH
4886 s += unilen;
4887 len += unilen;
9dc45d57 4888 }
80aecb99 4889 }
a0ed51b3 4890 }
a0d0e21e
LW
4891 else {
4892 len++;
eb160463 4893 REGC((char)ender, s++);
a0d0e21e
LW
4894 }
4895 break;
a687059c 4896 }
16ea2a2e 4897 if (UTF) {
80aecb99 4898 if (FOLD) {
60a8b682 4899 /* Emit all the Unicode characters. */
1df70142 4900 STRLEN numlen;
80aecb99
JH
4901 for (foldbuf = tmpbuf;
4902 foldlen;
4903 foldlen -= numlen) {
4904 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4905 if (numlen > 0) {
71207a34 4906 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4907 len += unilen;
4908 s += unilen;
4909 /* In EBCDIC the numlen
4910 * and unilen can differ. */
9dc45d57 4911 foldbuf += numlen;
47654450
JH
4912 if (numlen >= foldlen)
4913 break;
9dc45d57
JH
4914 }
4915 else
4916 break;
80aecb99
JH
4917 }
4918 }
4919 else {
71207a34 4920 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4921 if (unilen > 0) {
0ebc6274
JH
4922 s += unilen;
4923 len += unilen;
9dc45d57 4924 }
80aecb99
JH
4925 }
4926 len--;
a0ed51b3
LW
4927 }
4928 else
eb160463 4929 REGC((char)ender, s++);
a0d0e21e
LW
4930 }
4931 loopdone:
830247a4 4932 RExC_parse = p - 1;
fac92740 4933 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4934 nextchar(pRExC_state);
793db0cb
JH
4935 {
4936 /* len is STRLEN which is unsigned, need to copy to signed */
4937 IV iv = len;
4938 if (iv < 0)
4939 vFAIL("Internal disaster");
4940 }
a0d0e21e
LW
4941 if (len > 0)
4942 *flagp |= HASWIDTH;
090f7165 4943 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4944 *flagp |= SIMPLE;
3dab1dad 4945
cd439c50 4946 if (SIZE_ONLY)
830247a4 4947 RExC_size += STR_SZ(len);
3dab1dad
YO
4948 else {
4949 STR_LEN(ret) = len;
830247a4 4950 RExC_emit += STR_SZ(len);
07be1b83 4951 }
3dab1dad 4952 }
a0d0e21e
LW
4953 break;
4954 }
a687059c 4955
60a8b682
JH
4956 /* If the encoding pragma is in effect recode the text of
4957 * any EXACT-kind nodes. */
3dab1dad
YO
4958 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4959 const STRLEN oldlen = STR_LEN(ret);
4960 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
4961
4962 if (RExC_utf8)
4963 SvUTF8_on(sv);
4964 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4965 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4966 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4967
4968 if (SvUTF8(sv))
4969 RExC_utf8 = 1;
4970 if (!SIZE_ONLY) {
a3621e74
YO
4971 GET_RE_DEBUG_FLAGS_DECL;
4972 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4973 (int)oldlen, STRING(ret),
4974 (int)newlen, s));
4975 Copy(s, STRING(ret), newlen, char);
4976 STR_LEN(ret) += newlen - oldlen;
4977 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4978 } else
4979 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4980 }
a72c7584
JH
4981 }
4982
a0d0e21e 4983 return(ret);
a687059c
LW
4984}
4985
873ef191 4986STATIC char *
5f66b61c 4987S_regwhite(char *p, const char *e)
5b5a24f7
CS
4988{
4989 while (p < e) {
4990 if (isSPACE(*p))
4991 ++p;
4992 else if (*p == '#') {
4993 do {
4994 p++;
4995 } while (p < e && *p != '\n');
4996 }
4997 else
4998 break;
4999 }
5000 return p;
5001}
5002
b8c5462f
JH
5003/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5004 Character classes ([:foo:]) can also be negated ([:^foo:]).
5005 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5006 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5007 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5008
5009#define POSIXCC_DONE(c) ((c) == ':')
5010#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5011#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5012
b8c5462f 5013STATIC I32
830247a4 5014S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5015{
97aff369 5016 dVAR;
936ed897 5017 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5018
830247a4 5019 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5020 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5021 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5022 const char c = UCHARAT(RExC_parse);
097eb12c 5023 char* const s = RExC_parse++;
b81d288d 5024
9a86a77b 5025 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5026 RExC_parse++;
5027 if (RExC_parse == RExC_end)
620e46c5 5028 /* Grandfather lone [:, [=, [. */
830247a4 5029 RExC_parse = s;
620e46c5 5030 else {
3dab1dad 5031 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5032 assert(*t == c);
5033
9a86a77b 5034 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5035 const char *posixcc = s + 1;
830247a4 5036 RExC_parse++; /* skip over the ending ] */
3dab1dad 5037
b8c5462f 5038 if (*s == ':') {
1df70142
AL
5039 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5040 const I32 skip = t - posixcc;
80916619
NC
5041
5042 /* Initially switch on the length of the name. */
5043 switch (skip) {
5044 case 4:
3dab1dad
YO
5045 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5046 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5047 break;
80916619
NC
5048 case 5:
5049 /* Names all of length 5. */
5050 /* alnum alpha ascii blank cntrl digit graph lower
5051 print punct space upper */
5052 /* Offset 4 gives the best switch position. */
5053 switch (posixcc[4]) {
5054 case 'a':
3dab1dad
YO
5055 if (memEQ(posixcc, "alph", 4)) /* alpha */
5056 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5057 break;
5058 case 'e':
3dab1dad
YO
5059 if (memEQ(posixcc, "spac", 4)) /* space */
5060 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5061 break;
5062 case 'h':
3dab1dad
YO
5063 if (memEQ(posixcc, "grap", 4)) /* graph */
5064 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5065 break;
5066 case 'i':
3dab1dad
YO
5067 if (memEQ(posixcc, "asci", 4)) /* ascii */
5068 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5069 break;
5070 case 'k':
3dab1dad
YO
5071 if (memEQ(posixcc, "blan", 4)) /* blank */
5072 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5073 break;
5074 case 'l':
3dab1dad
YO
5075 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5076 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5077 break;
5078 case 'm':
3dab1dad
YO
5079 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5080 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5081 break;
5082 case 'r':
3dab1dad
YO
5083 if (memEQ(posixcc, "lowe", 4)) /* lower */
5084 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5085 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5086 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5087 break;
5088 case 't':
3dab1dad
YO
5089 if (memEQ(posixcc, "digi", 4)) /* digit */
5090 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5091 else if (memEQ(posixcc, "prin", 4)) /* print */
5092 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5093 else if (memEQ(posixcc, "punc", 4)) /* punct */
5094 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5095 break;
b8c5462f
JH
5096 }
5097 break;
80916619 5098 case 6:
3dab1dad
YO
5099 if (memEQ(posixcc, "xdigit", 6))
5100 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5101 break;
5102 }
80916619
NC
5103
5104 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5105 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5106 t - s - 1, s + 1);
80916619
NC
5107 assert (posixcc[skip] == ':');
5108 assert (posixcc[skip+1] == ']');
b45f050a 5109 } else if (!SIZE_ONLY) {
b8c5462f 5110 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5111
830247a4 5112 /* adjust RExC_parse so the warning shows after
b45f050a 5113 the class closes */
9a86a77b 5114 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5115 RExC_parse++;
b45f050a
JF
5116 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5117 }
b8c5462f
JH
5118 } else {
5119 /* Maternal grandfather:
5120 * "[:" ending in ":" but not in ":]" */
830247a4 5121 RExC_parse = s;
767d463e 5122 }
620e46c5
JH
5123 }
5124 }
5125
b8c5462f
JH
5126 return namedclass;
5127}
5128
5129STATIC void
830247a4 5130S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5131{
97aff369 5132 dVAR;
3dab1dad 5133 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5134 const char *s = RExC_parse;
5135 const char c = *s++;
b8c5462f 5136
3dab1dad 5137 while (isALNUM(*s))
b8c5462f
JH
5138 s++;
5139 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5140 if (ckWARN(WARN_REGEXP))
5141 vWARN3(s+2,
5142 "POSIX syntax [%c %c] belongs inside character classes",
5143 c, c);
b45f050a
JF
5144
5145 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5146 if (POSIXCC_NOTYET(c)) {
830247a4 5147 /* adjust RExC_parse so the error shows after
b45f050a 5148 the class closes */
9a86a77b 5149 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5150 NOOP;
b45f050a
JF
5151 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5152 }
b8c5462f
JH
5153 }
5154 }
620e46c5
JH
5155}
5156
7f6f358c
YO
5157
5158/*
5159 parse a class specification and produce either an ANYOF node that
5160 matches the pattern. If the pattern matches a single char only and
5161 that char is < 256 then we produce an EXACT node instead.
5162*/
76e3520e 5163STATIC regnode *
3dab1dad 5164S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5165{
97aff369 5166 dVAR;
07be1b83 5167 register UV value;
9a86a77b 5168 register UV nextvalue;
3568d838 5169 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5170 register IV range = 0;
c277df42 5171 register regnode *ret;
ba210ebe 5172 STRLEN numlen;
ffc61ed2 5173 IV namedclass;
cbbf8932 5174 char *rangebegin = NULL;
936ed897 5175 bool need_class = 0;
c445ea15 5176 SV *listsv = NULL;
ffc61ed2 5177 UV n;
9e55ce06 5178 bool optimize_invert = TRUE;
cbbf8932 5179 AV* unicode_alternate = NULL;
1b2d223b
JH
5180#ifdef EBCDIC
5181 UV literal_endpoint = 0;
5182#endif
7f6f358c 5183 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5184
3dab1dad 5185 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5186 case we need to change the emitted regop to an EXACT. */
07be1b83 5187 const char * orig_parse = RExC_parse;
72f13be8 5188 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
5189#ifndef DEBUGGING
5190 PERL_UNUSED_ARG(depth);
5191#endif
72f13be8 5192
3dab1dad 5193 DEBUG_PARSE("clas");
7f6f358c
YO
5194
5195 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5196 ret = reganode(pRExC_state, ANYOF, 0);
5197
5198 if (!SIZE_ONLY)
5199 ANYOF_FLAGS(ret) = 0;
5200
9a86a77b 5201 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5202 RExC_naughty++;
5203 RExC_parse++;
5204 if (!SIZE_ONLY)
5205 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5206 }
a0d0e21e 5207
73060fc4 5208 if (SIZE_ONLY) {
830247a4 5209 RExC_size += ANYOF_SKIP;
73060fc4
JH
5210 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5211 }
936ed897 5212 else {
830247a4 5213 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5214 if (FOLD)
5215 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5216 if (LOC)
5217 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5218 ANYOF_BITMAP_ZERO(ret);
396482e1 5219 listsv = newSVpvs("# comment\n");
a0d0e21e 5220 }
b8c5462f 5221
9a86a77b
JH
5222 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5223
b938889d 5224 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5225 checkposixcc(pRExC_state);
b8c5462f 5226
f064b6ad
HS
5227 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5228 if (UCHARAT(RExC_parse) == ']')
5229 goto charclassloop;
ffc61ed2 5230
9a86a77b 5231 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5232
5233 charclassloop:
5234
5235 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5236
73b437c8 5237 if (!range)
830247a4 5238 rangebegin = RExC_parse;
ffc61ed2 5239 if (UTF) {
5e12f4fb 5240 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5241 RExC_end - RExC_parse,
9f7f3913 5242 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5243 RExC_parse += numlen;
5244 }
5245 else
5246 value = UCHARAT(RExC_parse++);
7f6f358c 5247
9a86a77b
JH
5248 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5249 if (value == '[' && POSIXCC(nextvalue))
830247a4 5250 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5251 else if (value == '\\') {
ffc61ed2 5252 if (UTF) {
5e12f4fb 5253 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5254 RExC_end - RExC_parse,
9f7f3913 5255 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5256 RExC_parse += numlen;
5257 }
5258 else
5259 value = UCHARAT(RExC_parse++);
470c3474 5260 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5261 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5262 * be a problem later if we want switch on Unicode.
5263 * A similar issue a little bit later when switching on
5264 * namedclass. --jhi */
ffc61ed2 5265 switch ((I32)value) {
b8c5462f
JH
5266 case 'w': namedclass = ANYOF_ALNUM; break;
5267 case 'W': namedclass = ANYOF_NALNUM; break;
5268 case 's': namedclass = ANYOF_SPACE; break;
5269 case 'S': namedclass = ANYOF_NSPACE; break;
5270 case 'd': namedclass = ANYOF_DIGIT; break;
5271 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5272 case 'p':
5273 case 'P':
3dab1dad
YO
5274 {
5275 char *e;
af6f566e 5276 if (RExC_parse >= RExC_end)
2a4859cd 5277 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5278 if (*RExC_parse == '{') {
1df70142 5279 const U8 c = (U8)value;
ffc61ed2
JH
5280 e = strchr(RExC_parse++, '}');
5281 if (!e)
0da60cf5 5282 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5283 while (isSPACE(UCHARAT(RExC_parse)))
5284 RExC_parse++;
5285 if (e == RExC_parse)
0da60cf5 5286 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5287 n = e - RExC_parse;
ab13f0c7
JH
5288 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5289 n--;
ffc61ed2
JH
5290 }
5291 else {
5292 e = RExC_parse;
5293 n = 1;
5294 }
5295 if (!SIZE_ONLY) {
ab13f0c7
JH
5296 if (UCHARAT(RExC_parse) == '^') {
5297 RExC_parse++;
5298 n--;
5299 value = value == 'p' ? 'P' : 'p'; /* toggle */
5300 while (isSPACE(UCHARAT(RExC_parse))) {
5301 RExC_parse++;
5302 n--;
5303 }
5304 }
097eb12c
AL
5305 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5306 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5307 }
5308 RExC_parse = e + 1;
5309 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5310 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5311 }
f81125e2 5312 break;
b8c5462f
JH
5313 case 'n': value = '\n'; break;
5314 case 'r': value = '\r'; break;
5315 case 't': value = '\t'; break;
5316 case 'f': value = '\f'; break;
5317 case 'b': value = '\b'; break;
c7f1f016
NIS
5318 case 'e': value = ASCII_TO_NATIVE('\033');break;
5319 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5320 case 'x':
ffc61ed2 5321 if (*RExC_parse == '{') {
a4c04bdc
NC
5322 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5323 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5324 char * const e = strchr(RExC_parse++, '}');
b81d288d 5325 if (!e)
ffc61ed2 5326 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5327
5328 numlen = e - RExC_parse;
5329 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5330 RExC_parse = e + 1;
5331 }
5332 else {
a4c04bdc 5333 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5334 numlen = 2;
5335 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5336 RExC_parse += numlen;
5337 }
b8c5462f
JH
5338 break;
5339 case 'c':
830247a4 5340 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5341 value = toCTRL(value);
5342 break;
5343 case '0': case '1': case '2': case '3': case '4':
5344 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5345 {
5346 I32 flags = 0;
5347 numlen = 3;
5348 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5349 RExC_parse += numlen;
b8c5462f 5350 break;
53305cf1 5351 }
1028017a 5352 default:
041457d9 5353 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5354 vWARN2(RExC_parse,
5355 "Unrecognized escape \\%c in character class passed through",
5356 (int)value);
1028017a 5357 break;
b8c5462f 5358 }
ffc61ed2 5359 } /* end of \blah */
1b2d223b
JH
5360#ifdef EBCDIC
5361 else
5362 literal_endpoint++;
5363#endif
ffc61ed2
JH
5364
5365 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5366
5367 if (!SIZE_ONLY && !need_class)
936ed897 5368 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5369
936ed897 5370 need_class = 1;
ffc61ed2
JH
5371
5372 /* a bad range like a-\d, a-[:digit:] ? */
5373 if (range) {
73b437c8 5374 if (!SIZE_ONLY) {
afd78fd5 5375 if (ckWARN(WARN_REGEXP)) {
097eb12c 5376 const int w =
afd78fd5
JH
5377 RExC_parse >= rangebegin ?
5378 RExC_parse - rangebegin : 0;
830247a4 5379 vWARN4(RExC_parse,
b45f050a 5380 "False [] range \"%*.*s\"",
097eb12c 5381 w, w, rangebegin);
afd78fd5 5382 }
3568d838
JH
5383 if (prevvalue < 256) {
5384 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5385 ANYOF_BITMAP_SET(ret, '-');
5386 }
5387 else {
5388 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5389 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5390 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5391 }
b8c5462f 5392 }
ffc61ed2
JH
5393
5394 range = 0; /* this was not a true range */
73b437c8 5395 }
ffc61ed2 5396
73b437c8 5397 if (!SIZE_ONLY) {
c49a72a9
NC
5398 const char *what = NULL;
5399 char yesno = 0;
5400
3568d838
JH
5401 if (namedclass > OOB_NAMEDCLASS)
5402 optimize_invert = FALSE;
e2962f66
JH
5403 /* Possible truncation here but in some 64-bit environments
5404 * the compiler gets heartburn about switch on 64-bit values.
5405 * A similar issue a little earlier when switching on value.
98f323fa 5406 * --jhi */
e2962f66 5407 switch ((I32)namedclass) {
73b437c8
JH
5408 case ANYOF_ALNUM:
5409 if (LOC)
936ed897 5410 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5411 else {
5412 for (value = 0; value < 256; value++)
5413 if (isALNUM(value))
936ed897 5414 ANYOF_BITMAP_SET(ret, value);
73b437c8 5415 }
c49a72a9
NC
5416 yesno = '+';
5417 what = "Word";
73b437c8
JH
5418 break;
5419 case ANYOF_NALNUM:
5420 if (LOC)
936ed897 5421 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5422 else {
5423 for (value = 0; value < 256; value++)
5424 if (!isALNUM(value))
936ed897 5425 ANYOF_BITMAP_SET(ret, value);
73b437c8 5426 }
c49a72a9
NC
5427 yesno = '!';
5428 what = "Word";
73b437c8 5429 break;
ffc61ed2 5430 case ANYOF_ALNUMC:
73b437c8 5431 if (LOC)
ffc61ed2 5432 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5433 else {
5434 for (value = 0; value < 256; value++)
ffc61ed2 5435 if (isALNUMC(value))
936ed897 5436 ANYOF_BITMAP_SET(ret, value);
73b437c8 5437 }
c49a72a9
NC
5438 yesno = '+';
5439 what = "Alnum";
73b437c8
JH
5440 break;
5441 case ANYOF_NALNUMC:
5442 if (LOC)
936ed897 5443 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5444 else {
5445 for (value = 0; value < 256; value++)
5446 if (!isALNUMC(value))
936ed897 5447 ANYOF_BITMAP_SET(ret, value);
73b437c8 5448 }
c49a72a9
NC
5449 yesno = '!';
5450 what = "Alnum";
73b437c8
JH
5451 break;
5452 case ANYOF_ALPHA:
5453 if (LOC)
936ed897 5454 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5455 else {
5456 for (value = 0; value < 256; value++)
5457 if (isALPHA(value))
936ed897 5458 ANYOF_BITMAP_SET(ret, value);
73b437c8 5459 }
c49a72a9
NC
5460 yesno = '+';
5461 what = "Alpha";
73b437c8
JH
5462 break;
5463 case ANYOF_NALPHA:
5464 if (LOC)
936ed897 5465 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5466 else {
5467 for (value = 0; value < 256; value++)
5468 if (!isALPHA(value))
936ed897 5469 ANYOF_BITMAP_SET(ret, value);
73b437c8 5470 }
c49a72a9
NC
5471 yesno = '!';
5472 what = "Alpha";
73b437c8
JH
5473 break;
5474 case ANYOF_ASCII:
5475 if (LOC)
936ed897 5476 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5477 else {
c7f1f016 5478#ifndef EBCDIC
1ba5c669
JH
5479 for (value = 0; value < 128; value++)
5480 ANYOF_BITMAP_SET(ret, value);
5481#else /* EBCDIC */
ffbc6a93 5482 for (value = 0; value < 256; value++) {
3a3c4447
JH
5483 if (isASCII(value))
5484 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5485 }
1ba5c669 5486#endif /* EBCDIC */
73b437c8 5487 }
c49a72a9
NC
5488 yesno = '+';
5489 what = "ASCII";
73b437c8
JH
5490 break;
5491 case ANYOF_NASCII:
5492 if (LOC)
936ed897 5493 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5494 else {
c7f1f016 5495#ifndef EBCDIC
1ba5c669
JH
5496 for (value = 128; value < 256; value++)
5497 ANYOF_BITMAP_SET(ret, value);
5498#else /* EBCDIC */
ffbc6a93 5499 for (value = 0; value < 256; value++) {
3a3c4447
JH
5500 if (!isASCII(value))
5501 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5502 }
1ba5c669 5503#endif /* EBCDIC */
73b437c8 5504 }
c49a72a9
NC
5505 yesno = '!';
5506 what = "ASCII";
73b437c8 5507 break;
aaa51d5e
JF
5508 case ANYOF_BLANK:
5509 if (LOC)
5510 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5511 else {
5512 for (value = 0; value < 256; value++)
5513 if (isBLANK(value))
5514 ANYOF_BITMAP_SET(ret, value);
5515 }
c49a72a9
NC
5516 yesno = '+';
5517 what = "Blank";
aaa51d5e
JF
5518 break;
5519 case ANYOF_NBLANK:
5520 if (LOC)
5521 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5522 else {
5523 for (value = 0; value < 256; value++)
5524 if (!isBLANK(value))
5525 ANYOF_BITMAP_SET(ret, value);
5526 }
c49a72a9
NC
5527 yesno = '!';
5528 what = "Blank";
aaa51d5e 5529 break;
73b437c8
JH
5530 case ANYOF_CNTRL:
5531 if (LOC)
936ed897 5532 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5533 else {
5534 for (value = 0; value < 256; value++)
5535 if (isCNTRL(value))
936ed897 5536 ANYOF_BITMAP_SET(ret, value);
73b437c8 5537 }
c49a72a9
NC
5538 yesno = '+';
5539 what = "Cntrl";
73b437c8
JH
5540 break;
5541 case ANYOF_NCNTRL:
5542 if (LOC)
936ed897 5543 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5544 else {
5545 for (value = 0; value < 256; value++)
5546 if (!isCNTRL(value))
936ed897 5547 ANYOF_BITMAP_SET(ret, value);
73b437c8 5548 }
c49a72a9
NC
5549 yesno = '!';
5550 what = "Cntrl";
ffc61ed2
JH
5551 break;
5552 case ANYOF_DIGIT:
5553 if (LOC)
5554 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5555 else {
5556 /* consecutive digits assumed */
5557 for (value = '0'; value <= '9'; value++)
5558 ANYOF_BITMAP_SET(ret, value);
5559 }
c49a72a9
NC
5560 yesno = '+';
5561 what = "Digit";
ffc61ed2
JH
5562 break;
5563 case ANYOF_NDIGIT:
5564 if (LOC)
5565 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5566 else {
5567 /* consecutive digits assumed */
5568 for (value = 0; value < '0'; value++)
5569 ANYOF_BITMAP_SET(ret, value);
5570 for (value = '9' + 1; value < 256; value++)
5571 ANYOF_BITMAP_SET(ret, value);
5572 }
c49a72a9
NC
5573 yesno = '!';
5574 what = "Digit";
73b437c8
JH
5575 break;
5576 case ANYOF_GRAPH:
5577 if (LOC)
936ed897 5578 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5579 else {
5580 for (value = 0; value < 256; value++)
5581 if (isGRAPH(value))
936ed897 5582 ANYOF_BITMAP_SET(ret, value);
73b437c8 5583 }
c49a72a9
NC
5584 yesno = '+';
5585 what = "Graph";
73b437c8
JH
5586 break;
5587 case ANYOF_NGRAPH:
5588 if (LOC)
936ed897 5589 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5590 else {
5591 for (value = 0; value < 256; value++)
5592 if (!isGRAPH(value))
936ed897 5593 ANYOF_BITMAP_SET(ret, value);
73b437c8 5594 }
c49a72a9
NC
5595 yesno = '!';
5596 what = "Graph";
73b437c8
JH
5597 break;
5598 case ANYOF_LOWER:
5599 if (LOC)
936ed897 5600 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5601 else {
5602 for (value = 0; value < 256; value++)
5603 if (isLOWER(value))
936ed897 5604 ANYOF_BITMAP_SET(ret, value);
73b437c8 5605 }
c49a72a9
NC
5606 yesno = '+';
5607 what = "Lower";
73b437c8
JH
5608 break;
5609 case ANYOF_NLOWER:
5610 if (LOC)
936ed897 5611 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5612 else {
5613 for (value = 0; value < 256; value++)
5614 if (!isLOWER(value))
936ed897 5615 ANYOF_BITMAP_SET(ret, value);
73b437c8 5616 }
c49a72a9
NC
5617 yesno = '!';
5618 what = "Lower";
73b437c8
JH
5619 break;
5620 case ANYOF_PRINT:
5621 if (LOC)
936ed897 5622 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5623 else {
5624 for (value = 0; value < 256; value++)
5625 if (isPRINT(value))
936ed897 5626 ANYOF_BITMAP_SET(ret, value);
73b437c8 5627 }
c49a72a9
NC
5628 yesno = '+';
5629 what = "Print";
73b437c8
JH
5630 break;
5631 case ANYOF_NPRINT:
5632 if (LOC)
936ed897 5633 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5634 else {
5635 for (value = 0; value < 256; value++)
5636 if (!isPRINT(value))
936ed897 5637 ANYOF_BITMAP_SET(ret, value);
73b437c8 5638 }
c49a72a9
NC
5639 yesno = '!';
5640 what = "Print";
73b437c8 5641 break;
aaa51d5e
JF
5642 case ANYOF_PSXSPC:
5643 if (LOC)
5644 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5645 else {
5646 for (value = 0; value < 256; value++)
5647 if (isPSXSPC(value))
5648 ANYOF_BITMAP_SET(ret, value);
5649 }
c49a72a9
NC
5650 yesno = '+';
5651 what = "Space";
aaa51d5e
JF
5652 break;
5653 case ANYOF_NPSXSPC:
5654 if (LOC)
5655 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5656 else {
5657 for (value = 0; value < 256; value++)
5658 if (!isPSXSPC(value))
5659 ANYOF_BITMAP_SET(ret, value);
5660 }
c49a72a9
NC
5661 yesno = '!';
5662 what = "Space";
aaa51d5e 5663 break;
73b437c8
JH
5664 case ANYOF_PUNCT:
5665 if (LOC)
936ed897 5666 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5667 else {
5668 for (value = 0; value < 256; value++)
5669 if (isPUNCT(value))
936ed897 5670 ANYOF_BITMAP_SET(ret, value);
73b437c8 5671 }
c49a72a9
NC
5672 yesno = '+';
5673 what = "Punct";
73b437c8
JH
5674 break;
5675 case ANYOF_NPUNCT:
5676 if (LOC)
936ed897 5677 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5678 else {
5679 for (value = 0; value < 256; value++)
5680 if (!isPUNCT(value))
936ed897 5681 ANYOF_BITMAP_SET(ret, value);
73b437c8 5682 }
c49a72a9
NC
5683 yesno = '!';
5684 what = "Punct";
ffc61ed2
JH
5685 break;
5686 case ANYOF_SPACE:
5687 if (LOC)
5688 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5689 else {
5690 for (value = 0; value < 256; value++)
5691 if (isSPACE(value))
5692 ANYOF_BITMAP_SET(ret, value);
5693 }
c49a72a9
NC
5694 yesno = '+';
5695 what = "SpacePerl";
ffc61ed2
JH
5696 break;
5697 case ANYOF_NSPACE:
5698 if (LOC)
5699 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5700 else {
5701 for (value = 0; value < 256; value++)
5702 if (!isSPACE(value))
5703 ANYOF_BITMAP_SET(ret, value);
5704 }
c49a72a9
NC
5705 yesno = '!';
5706 what = "SpacePerl";
73b437c8
JH
5707 break;
5708 case ANYOF_UPPER:
5709 if (LOC)
936ed897 5710 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5711 else {
5712 for (value = 0; value < 256; value++)
5713 if (isUPPER(value))
936ed897 5714 ANYOF_BITMAP_SET(ret, value);
73b437c8 5715 }
c49a72a9
NC
5716 yesno = '+';
5717 what = "Upper";
73b437c8
JH
5718 break;
5719 case ANYOF_NUPPER:
5720 if (LOC)
936ed897 5721 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5722 else {
5723 for (value = 0; value < 256; value++)
5724 if (!isUPPER(value))
936ed897 5725 ANYOF_BITMAP_SET(ret, value);
73b437c8 5726 }
c49a72a9
NC
5727 yesno = '!';
5728 what = "Upper";
73b437c8
JH
5729 break;
5730 case ANYOF_XDIGIT:
5731 if (LOC)
936ed897 5732 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5733 else {
5734 for (value = 0; value < 256; value++)
5735 if (isXDIGIT(value))
936ed897 5736 ANYOF_BITMAP_SET(ret, value);
73b437c8 5737 }
c49a72a9
NC
5738 yesno = '+';
5739 what = "XDigit";
73b437c8
JH
5740 break;
5741 case ANYOF_NXDIGIT:
5742 if (LOC)
936ed897 5743 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5744 else {
5745 for (value = 0; value < 256; value++)
5746 if (!isXDIGIT(value))
936ed897 5747 ANYOF_BITMAP_SET(ret, value);
73b437c8 5748 }
c49a72a9
NC
5749 yesno = '!';
5750 what = "XDigit";
73b437c8 5751 break;
f81125e2
JP
5752 case ANYOF_MAX:
5753 /* this is to handle \p and \P */
5754 break;
73b437c8 5755 default:
b45f050a 5756 vFAIL("Invalid [::] class");
73b437c8 5757 break;
b8c5462f 5758 }
c49a72a9
NC
5759 if (what) {
5760 /* Strings such as "+utf8::isWord\n" */
5761 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5762 }
b8c5462f 5763 if (LOC)
936ed897 5764 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5765 continue;
a0d0e21e 5766 }
ffc61ed2
JH
5767 } /* end of namedclass \blah */
5768
a0d0e21e 5769 if (range) {
eb160463 5770 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5771 const int w = RExC_parse - rangebegin;
5772 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5773 range = 0; /* not a valid range */
73b437c8 5774 }
a0d0e21e
LW
5775 }
5776 else {
3568d838 5777 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5778 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5779 RExC_parse[1] != ']') {
5780 RExC_parse++;
ffc61ed2
JH
5781
5782 /* a bad range like \w-, [:word:]- ? */
5783 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5784 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5785 const int w =
afd78fd5
JH
5786 RExC_parse >= rangebegin ?
5787 RExC_parse - rangebegin : 0;
830247a4 5788 vWARN4(RExC_parse,
b45f050a 5789 "False [] range \"%*.*s\"",
097eb12c 5790 w, w, rangebegin);
afd78fd5 5791 }
73b437c8 5792 if (!SIZE_ONLY)
936ed897 5793 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5794 } else
ffc61ed2
JH
5795 range = 1; /* yeah, it's a range! */
5796 continue; /* but do it the next time */
a0d0e21e 5797 }
a687059c 5798 }
ffc61ed2 5799
93733859 5800 /* now is the next time */
07be1b83 5801 /*stored += (value - prevvalue + 1);*/
ae5c130c 5802 if (!SIZE_ONLY) {
3568d838 5803 if (prevvalue < 256) {
1df70142 5804 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5805 IV i;
3568d838 5806#ifdef EBCDIC
1b2d223b
JH
5807 /* In EBCDIC [\x89-\x91] should include
5808 * the \x8e but [i-j] should not. */
5809 if (literal_endpoint == 2 &&
5810 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5811 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5812 {
3568d838
JH
5813 if (isLOWER(prevvalue)) {
5814 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5815 if (isLOWER(i))
5816 ANYOF_BITMAP_SET(ret, i);
5817 } else {
3568d838 5818 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5819 if (isUPPER(i))
5820 ANYOF_BITMAP_SET(ret, i);
5821 }
8ada0baa 5822 }
ffc61ed2 5823 else
8ada0baa 5824#endif
07be1b83
YO
5825 for (i = prevvalue; i <= ceilvalue; i++) {
5826 if (!ANYOF_BITMAP_TEST(ret,i)) {
5827 stored++;
5828 ANYOF_BITMAP_SET(ret, i);
5829 }
5830 }
3568d838 5831 }
a5961de5 5832 if (value > 255 || UTF) {
1df70142
AL
5833 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5834 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 5835 stored+=2; /* can't optimize this class */
ffc61ed2 5836 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5837 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5838 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5839 prevnatvalue, natvalue);
5840 }
5841 else if (prevnatvalue == natvalue) {
5842 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5843 if (FOLD) {
89ebb4a3 5844 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5845 STRLEN foldlen;
1df70142 5846 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5847
c840d2a2
JH
5848 /* If folding and foldable and a single
5849 * character, insert also the folded version
5850 * to the charclass. */
9e55ce06 5851 if (f != value) {
eb160463 5852 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5853 Perl_sv_catpvf(aTHX_ listsv,
5854 "%04"UVxf"\n", f);
5855 else {
5856 /* Any multicharacter foldings
5857 * require the following transform:
5858 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5859 * where E folds into "pq" and F folds
5860 * into "rst", all other characters
5861 * fold to single characters. We save
5862 * away these multicharacter foldings,
5863 * to be later saved as part of the
5864 * additional "s" data. */
5865 SV *sv;
5866
5867 if (!unicode_alternate)
5868 unicode_alternate = newAV();
5869 sv = newSVpvn((char*)foldbuf, foldlen);
5870 SvUTF8_on(sv);
5871 av_push(unicode_alternate, sv);
5872 }
5873 }
254ba52a 5874
60a8b682
JH
5875 /* If folding and the value is one of the Greek
5876 * sigmas insert a few more sigmas to make the
5877 * folding rules of the sigmas to work right.
5878 * Note that not all the possible combinations
5879 * are handled here: some of them are handled
9e55ce06
JH
5880 * by the standard folding rules, and some of
5881 * them (literal or EXACTF cases) are handled
5882 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5883 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5884 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5885 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5886 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5887 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5888 }
5889 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5890 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5891 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5892 }
5893 }
ffc61ed2 5894 }
1b2d223b
JH
5895#ifdef EBCDIC
5896 literal_endpoint = 0;
5897#endif
8ada0baa 5898 }
ffc61ed2
JH
5899
5900 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5901 }
ffc61ed2 5902
936ed897 5903 if (need_class) {
4f66b38d 5904 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5905 if (SIZE_ONLY)
830247a4 5906 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5907 else
830247a4 5908 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5909 }
ffc61ed2 5910
7f6f358c
YO
5911
5912 if (SIZE_ONLY)
5913 return ret;
5914 /****** !SIZE_ONLY AFTER HERE *********/
5915
5916 if( stored == 1 && value < 256
5917 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5918 ) {
5919 /* optimize single char class to an EXACT node
5920 but *only* when its not a UTF/high char */
07be1b83
YO
5921 const char * cur_parse= RExC_parse;
5922 RExC_emit = (regnode *)orig_emit;
5923 RExC_parse = (char *)orig_parse;
7f6f358c
YO
5924 ret = reg_node(pRExC_state,
5925 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 5926 RExC_parse = (char *)cur_parse;
7f6f358c
YO
5927 *STRING(ret)= (char)value;
5928 STR_LEN(ret)= 1;
5929 RExC_emit += STR_SZ(1);
5930 return ret;
5931 }
ae5c130c 5932 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5933 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5934 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5935 ) {
a0ed51b3 5936 for (value = 0; value < 256; ++value) {
936ed897 5937 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5938 UV fold = PL_fold[value];
ffc61ed2
JH
5939
5940 if (fold != value)
5941 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5942 }
5943 }
936ed897 5944 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5945 }
ffc61ed2 5946
ae5c130c 5947 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5948 if (optimize_invert &&
ffc61ed2
JH
5949 /* If the only flag is inversion. */
5950 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5951 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5952 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5953 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5954 }
7f6f358c 5955 {
097eb12c 5956 AV * const av = newAV();
ffc61ed2 5957 SV *rv;
9e55ce06 5958 /* The 0th element stores the character class description
6a0407ee 5959 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5960 * to initialize the appropriate swash (which gets stored in
5961 * the 1st element), and also useful for dumping the regnode.
5962 * The 2nd element stores the multicharacter foldings,
6a0407ee 5963 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5964 av_store(av, 0, listsv);
5965 av_store(av, 1, NULL);
9e55ce06 5966 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5967 rv = newRV_noinc((SV*)av);
19860706 5968 n = add_data(pRExC_state, 1, "s");
830247a4 5969 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5970 ARG_SET(ret, n);
a0ed51b3 5971 }
a0ed51b3
LW
5972 return ret;
5973}
5974
76e3520e 5975STATIC char*
830247a4 5976S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5977{
097eb12c 5978 char* const retval = RExC_parse++;
a0d0e21e 5979
4633a7c4 5980 for (;;) {
830247a4
IZ
5981 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5982 RExC_parse[2] == '#') {
e994fd66
AE
5983 while (*RExC_parse != ')') {
5984 if (RExC_parse == RExC_end)
5985 FAIL("Sequence (?#... not terminated");
830247a4 5986 RExC_parse++;
e994fd66 5987 }
830247a4 5988 RExC_parse++;
4633a7c4
LW
5989 continue;
5990 }
e2509266 5991 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5992 if (isSPACE(*RExC_parse)) {
5993 RExC_parse++;
748a9306
LW
5994 continue;
5995 }
830247a4 5996 else if (*RExC_parse == '#') {
e994fd66
AE
5997 while (RExC_parse < RExC_end)
5998 if (*RExC_parse++ == '\n') break;
748a9306
LW
5999 continue;
6000 }
748a9306 6001 }
4633a7c4 6002 return retval;
a0d0e21e 6003 }
a687059c
LW
6004}
6005
6006/*
c277df42 6007- reg_node - emit a node
a0d0e21e 6008*/
76e3520e 6009STATIC regnode * /* Location. */
830247a4 6010S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6011{
97aff369 6012 dVAR;
c277df42 6013 register regnode *ptr;
504618e9 6014 regnode * const ret = RExC_emit;
07be1b83 6015 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6016
c277df42 6017 if (SIZE_ONLY) {
830247a4
IZ
6018 SIZE_ALIGN(RExC_size);
6019 RExC_size += 1;
a0d0e21e
LW
6020 return(ret);
6021 }
c277df42 6022 NODE_ALIGN_FILL(ret);
a0d0e21e 6023 ptr = ret;
c277df42 6024 FILL_ADVANCE_NODE(ptr, op);
fac92740 6025 if (RExC_offsets) { /* MJD */
07be1b83 6026 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6027 "reg_node", __LINE__,
6028 reg_name[op],
07be1b83
YO
6029 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6030 ? "Overwriting end of array!\n" : "OK",
6031 (UV)(RExC_emit - RExC_emit_start),
6032 (UV)(RExC_parse - RExC_start),
6033 (UV)RExC_offsets[0]));
ccb2c380 6034 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6035 }
07be1b83 6036
830247a4 6037 RExC_emit = ptr;
a687059c 6038
a0d0e21e 6039 return(ret);
a687059c
LW
6040}
6041
6042/*
a0d0e21e
LW
6043- reganode - emit a node with an argument
6044*/
76e3520e 6045STATIC regnode * /* Location. */
830247a4 6046S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6047{
97aff369 6048 dVAR;
c277df42 6049 register regnode *ptr;
504618e9 6050 regnode * const ret = RExC_emit;
07be1b83 6051 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6052
c277df42 6053 if (SIZE_ONLY) {
830247a4
IZ
6054 SIZE_ALIGN(RExC_size);
6055 RExC_size += 2;
a0d0e21e
LW
6056 return(ret);
6057 }
fe14fcc3 6058
c277df42 6059 NODE_ALIGN_FILL(ret);
a0d0e21e 6060 ptr = ret;
c277df42 6061 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6062 if (RExC_offsets) { /* MJD */
07be1b83 6063 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6064 "reganode",
ccb2c380
MP
6065 __LINE__,
6066 reg_name[op],
07be1b83 6067 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6068 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6069 (UV)(RExC_emit - RExC_emit_start),
6070 (UV)(RExC_parse - RExC_start),
6071 (UV)RExC_offsets[0]));
ccb2c380 6072 Set_Cur_Node_Offset;
fac92740
MJD
6073 }
6074
830247a4 6075 RExC_emit = ptr;
fe14fcc3 6076
a0d0e21e 6077 return(ret);
fe14fcc3
LW
6078}
6079
6080/*
cd439c50 6081- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6082*/
71207a34
AL
6083STATIC STRLEN
6084S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6085{
97aff369 6086 dVAR;
71207a34 6087 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6088}
6089
6090/*
a0d0e21e
LW
6091- reginsert - insert an operator in front of already-emitted operand
6092*
6093* Means relocating the operand.
6094*/
76e3520e 6095STATIC void
830247a4 6096S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6097{
97aff369 6098 dVAR;
c277df42
IZ
6099 register regnode *src;
6100 register regnode *dst;
6101 register regnode *place;
504618e9 6102 const int offset = regarglen[(U8)op];
07be1b83 6103 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6104/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6105
6106 if (SIZE_ONLY) {
830247a4 6107 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6108 return;
6109 }
a687059c 6110
830247a4
IZ
6111 src = RExC_emit;
6112 RExC_emit += NODE_STEP_REGNODE + offset;
6113 dst = RExC_emit;
fac92740 6114 while (src > opnd) {
c277df42 6115 StructCopy(--src, --dst, regnode);
fac92740 6116 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6117 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6118 "reg_insert",
ccb2c380
MP
6119 __LINE__,
6120 reg_name[op],
07be1b83
YO
6121 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6122 ? "Overwriting end of array!\n" : "OK",
6123 (UV)(src - RExC_emit_start),
6124 (UV)(dst - RExC_emit_start),
6125 (UV)RExC_offsets[0]));
ccb2c380
MP
6126 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6127 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6128 }
6129 }
6130
a0d0e21e
LW
6131
6132 place = opnd; /* Op node, where operand used to be. */
fac92740 6133 if (RExC_offsets) { /* MJD */
07be1b83 6134 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6135 "reginsert",
ccb2c380
MP
6136 __LINE__,
6137 reg_name[op],
07be1b83 6138 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6139 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6140 (UV)(place - RExC_emit_start),
6141 (UV)(RExC_parse - RExC_start),
fac92740 6142 RExC_offsets[0]));
ccb2c380 6143 Set_Node_Offset(place, RExC_parse);
45948336 6144 Set_Node_Length(place, 1);
fac92740 6145 }
c277df42
IZ
6146 src = NEXTOPER(place);
6147 FILL_ADVANCE_NODE(place, op);
6148 Zero(src, offset, regnode);
a687059c
LW
6149}
6150
6151/*
c277df42 6152- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6153- SEE ALSO: regtail_study
a0d0e21e 6154*/
097eb12c 6155/* TODO: All three parms should be const */
76e3520e 6156STATIC void
3dab1dad 6157S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6158{
97aff369 6159 dVAR;
c277df42 6160 register regnode *scan;
72f13be8 6161 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
6162#ifndef DEBUGGING
6163 PERL_UNUSED_ARG(depth);
6164#endif
a0d0e21e 6165
c277df42 6166 if (SIZE_ONLY)
a0d0e21e
LW
6167 return;
6168
6169 /* Find last node. */
6170 scan = p;
6171 for (;;) {
504618e9 6172 regnode * const temp = regnext(scan);
3dab1dad
YO
6173 DEBUG_PARSE_r({
6174 SV * const mysv=sv_newmortal();
6175 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6176 regprop(RExC_rx, mysv, scan);
6177 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6178 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6179 });
6180 if (temp == NULL)
6181 break;
6182 scan = temp;
6183 }
6184
6185 if (reg_off_by_arg[OP(scan)]) {
6186 ARG_SET(scan, val - scan);
6187 }
6188 else {
6189 NEXT_OFF(scan) = val - scan;
6190 }
6191}
6192
07be1b83 6193#ifdef DEBUGGING
3dab1dad
YO
6194/*
6195- regtail_study - set the next-pointer at the end of a node chain of p to val.
6196- Look for optimizable sequences at the same time.
6197- currently only looks for EXACT chains.
07be1b83
YO
6198
6199This is expermental code. The idea is to use this routine to perform
6200in place optimizations on branches and groups as they are constructed,
6201with the long term intention of removing optimization from study_chunk so
6202that it is purely analytical.
6203
6204Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6205to control which is which.
6206
3dab1dad
YO
6207*/
6208/* TODO: All four parms should be const */
07be1b83 6209
3dab1dad
YO
6210STATIC U8
6211S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6212{
6213 dVAR;
6214 register regnode *scan;
07be1b83
YO
6215 U8 exact = PSEUDO;
6216#ifdef EXPERIMENTAL_INPLACESCAN
6217 I32 min = 0;
6218#endif
6219
3dab1dad
YO
6220 GET_RE_DEBUG_FLAGS_DECL;
6221
07be1b83 6222
3dab1dad
YO
6223 if (SIZE_ONLY)
6224 return exact;
6225
6226 /* Find last node. */
6227
6228 scan = p;
6229 for (;;) {
6230 regnode * const temp = regnext(scan);
07be1b83
YO
6231#ifdef EXPERIMENTAL_INPLACESCAN
6232 if (PL_regkind[OP(scan)] == EXACT)
6233 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6234 return EXACT;
6235#endif
3dab1dad
YO
6236 if ( exact ) {
6237 switch (OP(scan)) {
6238 case EXACT:
6239 case EXACTF:
6240 case EXACTFL:
6241 if( exact == PSEUDO )
6242 exact= OP(scan);
07be1b83
YO
6243 else if ( exact != OP(scan) )
6244 exact= 0;
3dab1dad
YO
6245 case NOTHING:
6246 break;
6247 default:
6248 exact= 0;
6249 }
6250 }
6251 DEBUG_PARSE_r({
6252 SV * const mysv=sv_newmortal();
6253 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6254 regprop(RExC_rx, mysv, scan);
6255 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6256 SvPV_nolen_const(mysv),
6257 reg_name[exact],
6258 REG_NODE_NUM(scan));
6259 });
a0d0e21e
LW
6260 if (temp == NULL)
6261 break;
6262 scan = temp;
6263 }
07be1b83
YO
6264 DEBUG_PARSE_r({
6265 SV * const mysv_val=sv_newmortal();
6266 DEBUG_PARSE_MSG("");
6267 regprop(RExC_rx, mysv_val, val);
6268 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6269 SvPV_nolen_const(mysv_val),
6270 REG_NODE_NUM(val),
6271 val - scan
6272 );
6273 });
c277df42
IZ
6274 if (reg_off_by_arg[OP(scan)]) {
6275 ARG_SET(scan, val - scan);
a0ed51b3
LW
6276 }
6277 else {
c277df42
IZ
6278 NEXT_OFF(scan) = val - scan;
6279 }
3dab1dad
YO
6280
6281 return exact;
a687059c 6282}
07be1b83 6283#endif
a687059c
LW
6284
6285/*
a687059c
LW
6286 - regcurly - a little FSA that accepts {\d+,?\d*}
6287 */
79072805 6288STATIC I32
5f66b61c 6289S_regcurly(register const char *s)
a687059c
LW
6290{
6291 if (*s++ != '{')
6292 return FALSE;
f0fcb552 6293 if (!isDIGIT(*s))
a687059c 6294 return FALSE;
f0fcb552 6295 while (isDIGIT(*s))
a687059c
LW
6296 s++;
6297 if (*s == ',')
6298 s++;
f0fcb552 6299 while (isDIGIT(*s))
a687059c
LW
6300 s++;
6301 if (*s != '}')
6302 return FALSE;
6303 return TRUE;
6304}
6305
a687059c
LW
6306
6307/*
fd181c75 6308 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6309 */
6310void
097eb12c 6311Perl_regdump(pTHX_ const regexp *r)
a687059c 6312{
35ff7856 6313#ifdef DEBUGGING
97aff369 6314 dVAR;
c445ea15 6315 SV * const sv = sv_newmortal();
a687059c 6316
4f639d21 6317 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
6318
6319 /* Header fields of interest. */
c277df42 6320 if (r->anchored_substr)
7b0972df 6321 PerlIO_printf(Perl_debug_log,
a0288114 6322 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 6323 PL_colors[0],
7b0972df 6324 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 6325 SvPVX_const(r->anchored_substr),
3280af22 6326 PL_colors[1],
c277df42 6327 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 6328 (IV)r->anchored_offset);
33b8afdf
JH
6329 else if (r->anchored_utf8)
6330 PerlIO_printf(Perl_debug_log,
a0288114 6331 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
6332 PL_colors[0],
6333 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 6334 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
6335 PL_colors[1],
6336 SvTAIL(r->anchored_utf8) ? "$" : "",
6337 (IV)r->anchored_offset);
c277df42 6338 if (r->float_substr)
7b0972df 6339 PerlIO_printf(Perl_debug_log,
a0288114 6340 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 6341 PL_colors[0],
b81d288d 6342 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 6343 SvPVX_const(r->float_substr),
3280af22 6344 PL_colors[1],
c277df42 6345 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 6346 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
6347 else if (r->float_utf8)
6348 PerlIO_printf(Perl_debug_log,
a0288114 6349 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
6350 PL_colors[0],
6351 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 6352 SvPVX_const(r->float_utf8),
33b8afdf
JH
6353 PL_colors[1],
6354 SvTAIL(r->float_utf8) ? "$" : "",
6355 (IV)r->float_min_offset, (UV)r->float_max_offset);
6356 if (r->check_substr || r->check_utf8)
b81d288d
AB
6357 PerlIO_printf(Perl_debug_log,
6358 r->check_substr == r->float_substr
33b8afdf 6359 && r->check_utf8 == r->float_utf8
c277df42
IZ
6360 ? "(checking floating" : "(checking anchored");
6361 if (r->reganch & ROPT_NOSCAN)
6362 PerlIO_printf(Perl_debug_log, " noscan");
6363 if (r->reganch & ROPT_CHECK_ALL)
6364 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6365 if (r->check_substr || r->check_utf8)
c277df42
IZ
6366 PerlIO_printf(Perl_debug_log, ") ");
6367
46fc3d4c 6368 if (r->regstclass) {
32fc9b6a 6369 regprop(r, sv, r->regstclass);
3f7c398e 6370 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6371 }
774d564b 6372 if (r->reganch & ROPT_ANCH) {
6373 PerlIO_printf(Perl_debug_log, "anchored");
6374 if (r->reganch & ROPT_ANCH_BOL)
6375 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6376 if (r->reganch & ROPT_ANCH_MBOL)
6377 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6378 if (r->reganch & ROPT_ANCH_SBOL)
6379 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6380 if (r->reganch & ROPT_ANCH_GPOS)
6381 PerlIO_printf(Perl_debug_log, "(GPOS)");
6382 PerlIO_putc(Perl_debug_log, ' ');
6383 }
c277df42
IZ
6384 if (r->reganch & ROPT_GPOS_SEEN)
6385 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6386 if (r->reganch & ROPT_SKIP)
760ac839 6387 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6388 if (r->reganch & ROPT_IMPLICIT)
760ac839 6389 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6390 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6391 if (r->reganch & ROPT_EVAL_SEEN)
6392 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6393 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6394#else
96a5add6 6395 PERL_UNUSED_CONTEXT;
65e66c80 6396 PERL_UNUSED_ARG(r);
17c3b450 6397#endif /* DEBUGGING */
a687059c
LW
6398}
6399
6400/*
a0d0e21e
LW
6401- regprop - printable representation of opcode
6402*/
46fc3d4c 6403void
32fc9b6a 6404Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6405{
35ff7856 6406#ifdef DEBUGGING
97aff369 6407 dVAR;
9b155405 6408 register int k;
a0d0e21e 6409
54dc92de 6410 sv_setpvn(sv, "", 0);
9b155405 6411 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6412 /* It would be nice to FAIL() here, but this may be called from
6413 regexec.c, and it would be hard to supply pRExC_state. */
6414 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6415 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6416
3dab1dad 6417 k = PL_regkind[OP(o)];
9b155405 6418
2a782b5b 6419 if (k == EXACT) {
396482e1 6420 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
6421 /* Using is_utf8_string() is a crude hack but it may
6422 * be the best for now since we have no flag "this EXACTish
6423 * node was UTF-8" --jhi */
1df70142 6424 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 6425 const char * const s = do_utf8 ?
c728cb41
JH
6426 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6427 UNI_DISPLAY_REGEX) :
2a782b5b 6428 STRING(o);
e1ec3a88 6429 const int len = do_utf8 ?
2a782b5b
JH
6430 strlen(s) :
6431 STR_LEN(o);
6432 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6433 PL_colors[0],
6434 len, s,
6435 PL_colors[1]);
bb263b4e 6436 } else if (k == TRIE) {
3dab1dad
YO
6437 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6438 /* print the details of the trie in dumpuntil instead, as
4f639d21 6439 * prog->data isn't available here */
a3621e74 6440 } else if (k == CURLY) {
cb434fcc 6441 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6442 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6443 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6444 }
2c2d71f5
JH
6445 else if (k == WHILEM && o->flags) /* Ordinal/of */
6446 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6447 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6448 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6449 else if (k == LOGICAL)
04ebc1ab 6450 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6451 else if (k == ANYOF) {
6452 int i, rangestart = -1;
2d03de9c 6453 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6454
6455 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6456 static const char * const anyofs[] = {
653099ff
GS
6457 "\\w",
6458 "\\W",
6459 "\\s",
6460 "\\S",
6461 "\\d",
6462 "\\D",
6463 "[:alnum:]",
6464 "[:^alnum:]",
6465 "[:alpha:]",
6466 "[:^alpha:]",
6467 "[:ascii:]",
6468 "[:^ascii:]",
6469 "[:ctrl:]",
6470 "[:^ctrl:]",
6471 "[:graph:]",
6472 "[:^graph:]",
6473 "[:lower:]",
6474 "[:^lower:]",
6475 "[:print:]",
6476 "[:^print:]",
6477 "[:punct:]",
6478 "[:^punct:]",
6479 "[:upper:]",
aaa51d5e 6480 "[:^upper:]",
653099ff 6481 "[:xdigit:]",
aaa51d5e
JF
6482 "[:^xdigit:]",
6483 "[:space:]",
6484 "[:^space:]",
6485 "[:blank:]",
6486 "[:^blank:]"
653099ff
GS
6487 };
6488
19860706 6489 if (flags & ANYOF_LOCALE)
396482e1 6490 sv_catpvs(sv, "{loc}");
19860706 6491 if (flags & ANYOF_FOLD)
396482e1 6492 sv_catpvs(sv, "{i}");
653099ff 6493 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6494 if (flags & ANYOF_INVERT)
396482e1 6495 sv_catpvs(sv, "^");
ffc61ed2
JH
6496 for (i = 0; i <= 256; i++) {
6497 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6498 if (rangestart == -1)
6499 rangestart = i;
6500 } else if (rangestart != -1) {
6501 if (i <= rangestart + 3)
6502 for (; rangestart < i; rangestart++)
653099ff 6503 put_byte(sv, rangestart);
ffc61ed2
JH
6504 else {
6505 put_byte(sv, rangestart);
396482e1 6506 sv_catpvs(sv, "-");
ffc61ed2 6507 put_byte(sv, i - 1);
653099ff 6508 }
ffc61ed2 6509 rangestart = -1;
653099ff 6510 }
847a199f 6511 }
ffc61ed2
JH
6512
6513 if (o->flags & ANYOF_CLASS)
bb7a0f54 6514 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6515 if (ANYOF_CLASS_TEST(o,i))
6516 sv_catpv(sv, anyofs[i]);
6517
6518 if (flags & ANYOF_UNICODE)
396482e1 6519 sv_catpvs(sv, "{unicode}");
1aa99e6b 6520 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6521 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6522
6523 {
6524 SV *lv;
32fc9b6a 6525 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6526
ffc61ed2
JH
6527 if (lv) {
6528 if (sw) {
89ebb4a3 6529 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6530
ffc61ed2 6531 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6532 uvchr_to_utf8(s, i);
ffc61ed2 6533
3568d838 6534 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6535 if (rangestart == -1)
6536 rangestart = i;
6537 } else if (rangestart != -1) {
ffc61ed2
JH
6538 if (i <= rangestart + 3)
6539 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6540 const U8 * const e = uvchr_to_utf8(s,rangestart);
6541 U8 *p;
6542 for(p = s; p < e; p++)
ffc61ed2
JH
6543 put_byte(sv, *p);
6544 }
6545 else {
2d03de9c
AL
6546 const U8 *e = uvchr_to_utf8(s,rangestart);
6547 U8 *p;
6548 for (p = s; p < e; p++)
ffc61ed2 6549 put_byte(sv, *p);
396482e1 6550 sv_catpvs(sv, "-");
2d03de9c
AL
6551 e = uvchr_to_utf8(s, i-1);
6552 for (p = s; p < e; p++)
1df70142 6553 put_byte(sv, *p);
ffc61ed2
JH
6554 }
6555 rangestart = -1;
6556 }
19860706 6557 }
ffc61ed2 6558
396482e1 6559 sv_catpvs(sv, "..."); /* et cetera */
19860706 6560 }
fde631ed 6561
ffc61ed2 6562 {
2e0de35c 6563 char *s = savesvpv(lv);
c445ea15 6564 char * const origs = s;
b81d288d 6565
3dab1dad
YO
6566 while (*s && *s != '\n')
6567 s++;
b81d288d 6568
ffc61ed2 6569 if (*s == '\n') {
2d03de9c 6570 const char * const t = ++s;
ffc61ed2
JH
6571
6572 while (*s) {
6573 if (*s == '\n')
6574 *s = ' ';
6575 s++;
6576 }
6577 if (s[-1] == ' ')
6578 s[-1] = 0;
6579
6580 sv_catpv(sv, t);
fde631ed 6581 }
b81d288d 6582
ffc61ed2 6583 Safefree(origs);
fde631ed
JH
6584 }
6585 }
653099ff 6586 }
ffc61ed2 6587
653099ff
GS
6588 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6589 }
9b155405 6590 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 6591 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 6592#else
96a5add6 6593 PERL_UNUSED_CONTEXT;
65e66c80
SP
6594 PERL_UNUSED_ARG(sv);
6595 PERL_UNUSED_ARG(o);
f9049ba1 6596 PERL_UNUSED_ARG(prog);
17c3b450 6597#endif /* DEBUGGING */
35ff7856 6598}
a687059c 6599
cad2e5aa
JH
6600SV *
6601Perl_re_intuit_string(pTHX_ regexp *prog)
6602{ /* Assume that RE_INTUIT is set */
97aff369 6603 dVAR;
a3621e74 6604 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6605 PERL_UNUSED_CONTEXT;
6606
a3621e74 6607 DEBUG_COMPILE_r(
cfd0369c 6608 {
2d03de9c 6609 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6610 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6611
6612 if (!PL_colorset) reginitcolors();
6613 PerlIO_printf(Perl_debug_log,
a0288114 6614 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6615 PL_colors[4],
6616 prog->check_substr ? "" : "utf8 ",
6617 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6618 s,
6619 PL_colors[1],
6620 (strlen(s) > 60 ? "..." : ""));
6621 } );
6622
33b8afdf 6623 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6624}
6625
2b69d0c2 6626void
864dbfa3 6627Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6628{
27da23d5 6629 dVAR;
9e55ce06 6630#ifdef DEBUGGING
c445ea15 6631 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
9e55ce06 6632#endif
fc32ee4a 6633 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6634
7821416a
IZ
6635 if (!r || (--r->refcnt > 0))
6636 return;
be8e71aa 6637 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
c445ea15 6638 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6639 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6640 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6641 const int len = SvCUR(dsv);
9e55ce06
JH
6642 if (!PL_colorset)
6643 reginitcolors();
6644 PerlIO_printf(Perl_debug_log,
a3621e74 6645 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6646 PL_colors[4],PL_colors[5],PL_colors[0],
6647 len, len, s,
6648 PL_colors[1],
6649 len > 60 ? "..." : "");
6650 });
cad2e5aa 6651
43c5f42d
NC
6652 /* gcov results gave these as non-null 100% of the time, so there's no
6653 optimisation in checking them before calling Safefree */
6654 Safefree(r->precomp);
6655 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6656 RX_MATCH_COPY_FREE(r);
f8c7b90f 6657#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6658 if (r->saved_copy)
6659 SvREFCNT_dec(r->saved_copy);
6660#endif
a193d654
GS
6661 if (r->substrs) {
6662 if (r->anchored_substr)
6663 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6664 if (r->anchored_utf8)
6665 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6666 if (r->float_substr)
6667 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6668 if (r->float_utf8)
6669 SvREFCNT_dec(r->float_utf8);
2779dcf1 6670 Safefree(r->substrs);
a193d654 6671 }
c277df42
IZ
6672 if (r->data) {
6673 int n = r->data->count;
f3548bdc
DM
6674 PAD* new_comppad = NULL;
6675 PAD* old_comppad;
4026c95a 6676 PADOFFSET refcnt;
dfad63ad 6677
c277df42 6678 while (--n >= 0) {
261faec3 6679 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6680 switch (r->data->what[n]) {
6681 case 's':
6682 SvREFCNT_dec((SV*)r->data->data[n]);
6683 break;
653099ff
GS
6684 case 'f':
6685 Safefree(r->data->data[n]);
6686 break;
dfad63ad
HS
6687 case 'p':
6688 new_comppad = (AV*)r->data->data[n];
6689 break;
c277df42 6690 case 'o':
dfad63ad 6691 if (new_comppad == NULL)
cea2e8a9 6692 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6693 PAD_SAVE_LOCAL(old_comppad,
6694 /* Watch out for global destruction's random ordering. */
c445ea15 6695 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6696 );
b34c0dd4 6697 OP_REFCNT_LOCK;
4026c95a
SH
6698 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6699 OP_REFCNT_UNLOCK;
6700 if (!refcnt)
9b978d73 6701 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6702
f3548bdc 6703 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6704 SvREFCNT_dec((SV*)new_comppad);
6705 new_comppad = NULL;
c277df42
IZ
6706 break;
6707 case 'n':
9e55ce06 6708 break;
07be1b83 6709 case 'T':
be8e71aa
YO
6710 { /* Aho Corasick add-on structure for a trie node.
6711 Used in stclass optimization only */
07be1b83
YO
6712 U32 refcount;
6713 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6714 OP_REFCNT_LOCK;
6715 refcount = --aho->refcount;
6716 OP_REFCNT_UNLOCK;
6717 if ( !refcount ) {
6718 Safefree(aho->states);
6719 Safefree(aho->fail);
6720 aho->trie=NULL; /* not necessary to free this as it is
6721 handled by the 't' case */
6722 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 6723 Safefree(r->regstclass);
07be1b83
YO
6724 }
6725 }
6726 break;
a3621e74 6727 case 't':
07be1b83 6728 {
be8e71aa 6729 /* trie structure. */
07be1b83
YO
6730 U32 refcount;
6731 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6732 OP_REFCNT_LOCK;
6733 refcount = --trie->refcount;
6734 OP_REFCNT_UNLOCK;
6735 if ( !refcount ) {
6736 Safefree(trie->charmap);
6737 if (trie->widecharmap)
6738 SvREFCNT_dec((SV*)trie->widecharmap);
6739 Safefree(trie->states);
6740 Safefree(trie->trans);
6741 if (trie->bitmap)
6742 Safefree(trie->bitmap);
6743 if (trie->wordlen)
6744 Safefree(trie->wordlen);
a3621e74 6745#ifdef DEBUGGING
be8e71aa
YO
6746 if (RX_DEBUG(r)) {
6747 if (trie->words)
6748 SvREFCNT_dec((SV*)trie->words);
6749 if (trie->revcharmap)
6750 SvREFCNT_dec((SV*)trie->revcharmap);
6751 }
a3621e74 6752#endif
07be1b83 6753 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 6754 }
07be1b83
YO
6755 }
6756 break;
c277df42 6757 default:
830247a4 6758 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6759 }
6760 }
6761 Safefree(r->data->what);
6762 Safefree(r->data);
a0d0e21e
LW
6763 }
6764 Safefree(r->startp);
6765 Safefree(r->endp);
6766 Safefree(r);
a687059c 6767}
c277df42 6768
76234dfb 6769#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6770/*
6771 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6772 */
6773regnode *
864dbfa3 6774Perl_regnext(pTHX_ register regnode *p)
c277df42 6775{
97aff369 6776 dVAR;
c277df42
IZ
6777 register I32 offset;
6778
3280af22 6779 if (p == &PL_regdummy)
c277df42
IZ
6780 return(NULL);
6781
6782 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6783 if (offset == 0)
6784 return(NULL);
6785
c277df42 6786 return(p+offset);
c277df42 6787}
76234dfb 6788#endif
c277df42 6789
01f988be 6790STATIC void
cea2e8a9 6791S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6792{
6793 va_list args;
6794 STRLEN l1 = strlen(pat1);
6795 STRLEN l2 = strlen(pat2);
6796 char buf[512];
06bf62c7 6797 SV *msv;
73d840c0 6798 const char *message;
c277df42
IZ
6799
6800 if (l1 > 510)
6801 l1 = 510;
6802 if (l1 + l2 > 510)
6803 l2 = 510 - l1;
6804 Copy(pat1, buf, l1 , char);
6805 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6806 buf[l1 + l2] = '\n';
6807 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6808#ifdef I_STDARG
6809 /* ANSI variant takes additional second argument */
c277df42 6810 va_start(args, pat2);
8736538c
AS
6811#else
6812 va_start(args);
6813#endif
5a844595 6814 msv = vmess(buf, &args);
c277df42 6815 va_end(args);
cfd0369c 6816 message = SvPV_const(msv,l1);
c277df42
IZ
6817 if (l1 > 512)
6818 l1 = 512;
6819 Copy(message, buf, l1 , char);
197cf9b9 6820 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6821 Perl_croak(aTHX_ "%s", buf);
c277df42 6822}
a0ed51b3
LW
6823
6824/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6825
76234dfb 6826#ifndef PERL_IN_XSUB_RE
a0ed51b3 6827void
864dbfa3 6828Perl_save_re_context(pTHX)
b81d288d 6829{
97aff369 6830 dVAR;
1ade1aa1
NC
6831
6832 struct re_save_state *state;
6833
6834 SAVEVPTR(PL_curcop);
6835 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6836
6837 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6838 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6839 SSPUSHINT(SAVEt_RE_STATE);
6840
46ab3289 6841 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6842
a0ed51b3 6843 PL_reg_start_tmp = 0;
a0ed51b3 6844 PL_reg_start_tmpl = 0;
c445ea15 6845 PL_reg_oldsaved = NULL;
a5db57d6 6846 PL_reg_oldsavedlen = 0;
a5db57d6 6847 PL_reg_maxiter = 0;
a5db57d6 6848 PL_reg_leftiter = 0;
c445ea15 6849 PL_reg_poscache = NULL;
a5db57d6 6850 PL_reg_poscache_size = 0;
1ade1aa1
NC
6851#ifdef PERL_OLD_COPY_ON_WRITE
6852 PL_nrs = NULL;
6853#endif
ada6e8a9 6854
c445ea15
AL
6855 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6856 if (PL_curpm) {
6857 const REGEXP * const rx = PM_GETRE(PL_curpm);
6858 if (rx) {
1df70142 6859 U32 i;
ada6e8a9 6860 for (i = 1; i <= rx->nparens; i++) {
1df70142 6861 char digits[TYPE_CHARS(long)];
d9fad198 6862 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6863 GV *const *const gvp
6864 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6865
b37c2d43
AL
6866 if (gvp) {
6867 GV * const gv = *gvp;
6868 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6869 save_scalar(gv);
49f27e4b 6870 }
ada6e8a9
AMS
6871 }
6872 }
6873 }
a0ed51b3 6874}
76234dfb 6875#endif
51371543 6876
51371543 6877static void
acfe0abc 6878clear_re(pTHX_ void *r)
51371543 6879{
97aff369 6880 dVAR;
51371543
GS
6881 ReREFCNT_dec((regexp *)r);
6882}
ffbc6a93 6883
a28509cc
AL
6884#ifdef DEBUGGING
6885
6886STATIC void
6887S_put_byte(pTHX_ SV *sv, int c)
6888{
6889 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6890 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6891 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6892 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6893 else
6894 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6895}
6896
3dab1dad
YO
6897#define CLEAR_OPTSTART \
6898 if (optstart) STMT_START { \
07be1b83 6899 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
6900 optstart=NULL; \
6901 } STMT_END
6902
6903#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6904
b5a2f8d8
NC
6905STATIC const regnode *
6906S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6907 const regnode *last, SV* sv, I32 l)
a28509cc 6908{
97aff369 6909 dVAR;
a28509cc 6910 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6911 register const regnode *next;
3dab1dad
YO
6912 const regnode *optstart= NULL;
6913 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6914
6915 while (op != END && (!last || node < last)) {
6916 /* While that wasn't END last time... */
6917
6918 NODE_ALIGN(node);
6919 op = OP(node);
6920 if (op == CLOSE)
6921 l--;
b5a2f8d8 6922 next = regnext((regnode *)node);
07be1b83 6923
a28509cc 6924 /* Where, what. */
8e11feef 6925 if (OP(node) == OPTIMIZED) {
e68ec53f 6926 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 6927 optstart = node;
3dab1dad 6928 else
8e11feef 6929 goto after_print;
3dab1dad
YO
6930 } else
6931 CLEAR_OPTSTART;
07be1b83 6932
32fc9b6a 6933 regprop(r, sv, node);
a28509cc
AL
6934 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6935 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6936
6937 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
6938 if (next == NULL) /* Next ptr. */
6939 PerlIO_printf(Perl_debug_log, "(0)");
6940 else
6941 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6942 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6943 }
6944
a28509cc
AL
6945 after_print:
6946 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
6947 assert(next);
6948 {
6949 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
6950 ? regnext((regnode *)next)
6951 : next);
be8e71aa
YO
6952 if (last && nnode > last)
6953 nnode = last;
6954 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6955 }
a28509cc
AL
6956 }
6957 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 6958 assert(next);
3dab1dad 6959 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6960 }
6961 else if ( PL_regkind[(U8)op] == TRIE ) {
6962 const I32 n = ARG(node);
4f639d21 6963 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6964 const I32 arry_len = av_len(trie->words)+1;
6965 I32 word_idx;
6966 PerlIO_printf(Perl_debug_log,
07be1b83 6967 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
8e11feef
RGS
6968 (int)(2*(l+3)),
6969 "",
6970 trie->startstate,
6971 TRIE_WORDCOUNT(trie),
6972 (int)TRIE_CHARCOUNT(trie),
6973 trie->uniquecharcount,
6974 (IV)TRIE_LASTSTATE(trie)-1,
be8e71aa
YO
6975 (int)trie->minlen,
6976 (int)trie->maxlen
8e11feef 6977 );
3dab1dad
YO
6978 if (trie->bitmap) {
6979 int i;
6980 int rangestart= -1;
6981 sv_setpvn(sv, "", 0);
6982 for (i = 0; i <= 256; i++) {
6983 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6984 if (rangestart == -1)
6985 rangestart = i;
6986 } else if (rangestart != -1) {
6987 if (i <= rangestart + 3)
6988 for (; rangestart < i; rangestart++)
6989 put_byte(sv, rangestart);
6990 else {
6991 put_byte(sv, rangestart);
6992 sv_catpvs(sv, "-");
6993 put_byte(sv, i - 1);
6994 }
6995 rangestart = -1;
6996 }
6997 }
07be1b83 6998 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
3dab1dad 6999 } else
07be1b83 7000 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
a28509cc
AL
7001
7002 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 7003 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
7004 if (elem_ptr) {
7005 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
7006 (int)(2*(l+4)), "",
7007 PL_colors[0],
cfd0369c 7008 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
7009 PL_colors[1]
7010 );
a28509cc 7011 }
a28509cc
AL
7012 }
7013
7014 node = NEXTOPER(node);
7015 node += regarglen[(U8)op];
7016
7017 }
7018 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 7019 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7020 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7021 }
7022 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7023 assert(next);
3dab1dad 7024 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7025 next, sv, l + 1);
7026 }
7027 else if ( op == PLUS || op == STAR) {
3dab1dad 7028 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
7029 }
7030 else if (op == ANYOF) {
7031 /* arglen 1 + class block */
7032 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7033 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7034 node = NEXTOPER(node);
7035 }
7036 else if (PL_regkind[(U8)op] == EXACT) {
7037 /* Literal string, where present. */
7038 node += NODE_SZ_STR(node) - 1;
7039 node = NEXTOPER(node);
7040 }
7041 else {
7042 node = NEXTOPER(node);
7043 node += regarglen[(U8)op];
7044 }
7045 if (op == CURLYX || op == OPEN)
7046 l++;
7047 else if (op == WHILEM)
7048 l--;
7049 }
3dab1dad 7050 CLEAR_OPTSTART;
a28509cc
AL
7051 return node;
7052}
7053
7054#endif /* DEBUGGING */
7055
241d1a3b
NC
7056/*
7057 * Local variables:
7058 * c-indentation-style: bsd
7059 * c-basic-offset: 4
7060 * indent-tabs-mode: t
7061 * End:
7062 *
37442d52
RGS
7063 * ex: set ts=8 sts=4 sw=4 noet:
7064 */